From b8ead3dd9fc6337331ce7a79e2cef90adafc0e52 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Mon, 23 Aug 2010 12:20:57 -0400 Subject: [PATCH 01/20] Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 Signed-off-by: Steve Dickson --- config-generic | 12 + kernel.spec | 15 +- linux-2.6-pnfs-compile.patch | 13 + linux-2.6.35-inline.patch | 11 + nfs-35-fc.patch | 7235 ++++++ nfsd-35-fc.patch | 1808 ++ pnfs-all-2.6.35-2010-08-19-f13.patch | 31788 +++++++++++++++++++++++++ 7 files changed, 40880 insertions(+), 2 deletions(-) create mode 100644 linux-2.6-pnfs-compile.patch create mode 100644 linux-2.6.35-inline.patch create mode 100644 nfs-35-fc.patch create mode 100644 nfsd-35-fc.patch create mode 100644 pnfs-all-2.6.35-2010-08-19-f13.patch diff --git a/config-generic b/config-generic index 3b23aabcc..76379c8eb 100644 --- a/config-generic +++ b/config-generic @@ -3322,6 +3322,18 @@ CONFIG_NFSD_V3=y CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFS_FSCACHE=y +# Enable pNFS +CONFIG_PNFS=y +CONFIG_PNFSD=y +CONFIG_PNFSD_LOCAL_EXPORT=y +CONFIG_SPNFS=y +CONFIG_SPNFS_LAYOUTSEGMENTS=y +CONFIG_SPNFS_BLOCK=y +CONFIG_PNFS_OBJLAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_PANLAYOUT=m +CONFIG_PNFS_FILE_LAYOUT=m +# CONFIG_LOCKD=m CONFIG_LOCKD_V4=y CONFIG_EXPORTFS=m diff --git a/kernel.spec b/kernel.spec index 3a5dbce65..70b6f45c8 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) 
# -# % define buildid .local +%define buildid .pnfs_all_2.6.35_2010_08_19 ################################################################### # The buildid can also be specified on the rpmbuild command line @@ -107,7 +107,7 @@ Summary: The Linux kernel # kernel-headers %define with_headers %{?_without_headers: 0} %{?!_without_headers: 1} # kernel-firmware -%define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 0} +%define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 1} # tools/perf %define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 1} # perf noarch subpkg @@ -764,6 +764,12 @@ Patch12440: direct-io-move-aio_complete-into-end_io.patch Patch12450: ext4-move-aio-completion-after-unwritten-extent-conversion.patch Patch12460: xfs-move-aio-completion-after-unwritten-extent-conversion.patch +Patch30000: nfs-35-fc.patch +Patch30001: nfsd-35-fc.patch +Patch30002: pnfs-all-2.6.35-2010-08-19-f13.patch +Patch30003: linux-2.6-pnfs-compile.patch +Patch30004: linux-2.6.35-inline.patch + %endif BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root @@ -1419,6 +1425,11 @@ ApplyPatch direct-io-move-aio_complete-into-end_io.patch ApplyPatch ext4-move-aio-completion-after-unwritten-extent-conversion.patch ApplyPatch xfs-move-aio-completion-after-unwritten-extent-conversion.patch +ApplyPatch nfs-35-fc.patch +ApplyPatch nfsd-35-fc.patch +ApplyPatch pnfs-all-2.6.35-2010-08-19-f13.patch +ApplyPatch linux-2.6-pnfs-compile.patch +ApplyPatch linux-2.6.35-inline.patch # END OF PATCH APPLICATIONS %endif diff --git a/linux-2.6-pnfs-compile.patch b/linux-2.6-pnfs-compile.patch new file mode 100644 index 000000000..7c8cc4248 --- /dev/null +++ b/linux-2.6-pnfs-compile.patch @@ -0,0 +1,13 @@ +diff -up linux-2.6.32.x86_64/fs/nfs/objlayout/pnfs_osd_xdr.h.orig linux-2.6.32.x86_64/fs/nfs/objlayout/pnfs_osd_xdr.h +diff -up linux-2.6.32.x86_64/include/net/inet_connection_sock.h.orig linux-2.6.32.x86_64/include/net/inet_connection_sock.h +--- 
linux-2.6.32.x86_64/include/net/inet_connection_sock.h.orig 2009-12-02 22:51:21.000000000 -0500 ++++ linux-2.6.32.x86_64/include/net/inet_connection_sock.h 2010-04-21 14:26:24.475659551 -0400 +@@ -23,7 +23,7 @@ + #include <net/inet_sock.h> + #include <net/request_sock.h> + +-#define INET_CSK_DEBUG 1 ++//#define INET_CSK_DEBUG 1 + + /* Cancel timers, when they are not required. */ + #undef INET_CSK_CLEAR_TIMERS diff --git a/linux-2.6.35-inline.patch b/linux-2.6.35-inline.patch new file mode 100644 index 000000000..c56d8da5e --- /dev/null +++ b/linux-2.6.35-inline.patch @@ -0,0 +1,11 @@ +diff -up linux-2.6.34.noarch/arch/x86/Makefile.orig linux-2.6.34.noarch/arch/x86/Makefile +--- linux-2.6.34.noarch/arch/x86/Makefile.orig 2010-07-01 13:33:21.859627499 -0400 ++++ linux-2.6.34.noarch/arch/x86/Makefile 2010-07-01 13:36:26.751576450 -0400 +@@ -81,6 +81,7 @@ ifdef CONFIG_CC_STACKPROTECTOR + $(warning stack protector enabled but no compiler support) + endif + endif ++KBUILD_CFLAGS += -fno-inline-functions-called-once + + # Don't unroll struct assignments with kmemcheck enabled + ifeq ($(CONFIG_KMEMCHECK),y) diff --git a/nfs-35-fc.patch b/nfs-35-fc.patch new file mode 100644 index 000000000..c3ad25f65 --- /dev/null +++ b/nfs-35-fc.patch @@ -0,0 +1,7235 @@ +diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-23 11:01:00.352376393 -0400 +@@ -934,7 +934,6 @@ static int nfs_probe_fsinfo(struct nfs_s + } + + fsinfo.fattr = fattr; +- nfs_fattr_init(fattr); + error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); + if (error < 0) + goto out_error; +@@ -1047,13 +1046,18 @@ struct nfs_server *nfs_create_server(con + struct nfs_fh *mntfh) + { + struct nfs_server *server; +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + int error; + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr = 
nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto error; ++ + /* Get a client representation */ + error = nfs_init_server(server, data); + if (error < 0) +@@ -1064,7 +1068,7 @@ struct nfs_server *nfs_create_server(con + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* Probe the root fh to retrieve its FSID */ +- error = nfs_probe_fsinfo(server, mntfh, &fattr); ++ error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + if (server->nfs_client->rpc_ops->version == 3) { +@@ -1077,14 +1081,14 @@ struct nfs_server *nfs_create_server(con + server->namelen = NFS2_MAXNAMLEN; + } + +- if (!(fattr.valid & NFS_ATTR_FATTR)) { +- error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); ++ if (!(fattr->valid & NFS_ATTR_FATTR)) { ++ error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_create_server: getattr error = %d\n", -error); + goto error; + } + } +- memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); ++ memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, +@@ -1096,9 +1100,11 @@ struct nfs_server *nfs_create_server(con + spin_unlock(&nfs_client_lock); + + server->mount_time = jiffies; ++ nfs_free_fattr(fattr); + return server; + + error: ++ nfs_free_fattr(fattr); + nfs_free_server(server); + return ERR_PTR(error); + } +@@ -1340,7 +1346,7 @@ error: + struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh) + { +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + struct nfs_server *server; + int error; + +@@ -1350,6 +1356,11 @@ struct nfs_server *nfs4_create_server(co + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto error; ++ + /* set up the general RPC client */ + error = nfs4_init_server(server, data); + if (error < 0) +@@ -1364,7 +1375,7 @@ struct nfs_server 
*nfs4_create_server(co + goto error; + + /* Probe the root fh to retrieve its FSID */ +- error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); ++ error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto error; + +@@ -1375,7 +1386,7 @@ struct nfs_server *nfs4_create_server(co + + nfs4_session_set_rwsize(server); + +- error = nfs_probe_fsinfo(server, mntfh, &fattr); ++ error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + +@@ -1389,9 +1400,11 @@ struct nfs_server *nfs4_create_server(co + + server->mount_time = jiffies; + dprintk("<-- nfs4_create_server() = %p\n", server); ++ nfs_free_fattr(fattr); + return server; + + error: ++ nfs_free_fattr(fattr); + nfs_free_server(server); + dprintk("<-- nfs4_create_server() = error %d\n", error); + return ERR_PTR(error); +@@ -1405,7 +1418,7 @@ struct nfs_server *nfs4_create_referral_ + { + struct nfs_client *parent_client; + struct nfs_server *server, *parent_server; +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + int error; + + dprintk("--> nfs4_create_referral_server()\n"); +@@ -1414,6 +1427,11 @@ struct nfs_server *nfs4_create_referral_ + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto error; ++ + parent_server = NFS_SB(data->sb); + parent_client = parent_server->nfs_client; + +@@ -1443,12 +1461,12 @@ struct nfs_server *nfs4_create_referral_ + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* Probe the root fh to retrieve its FSID and filehandle */ +- error = nfs4_path_walk(server, mntfh, data->mnt_path); ++ error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto error; + + /* probe the filesystem info for this server filesystem */ +- error = nfs_probe_fsinfo(server, mntfh, &fattr); ++ error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + +@@ -1466,10 +1484,12 @@ struct nfs_server *nfs4_create_referral_ + + server->mount_time = jiffies; + ++ 
nfs_free_fattr(fattr); + dprintk("<-- nfs_create_referral_server() = %p\n", server); + return server; + + error: ++ nfs_free_fattr(fattr); + nfs_free_server(server); + dprintk("<-- nfs4_create_referral_server() = error %d\n", error); + return ERR_PTR(error); +@@ -1485,7 +1505,7 @@ struct nfs_server *nfs_clone_server(stru + struct nfs_fattr *fattr) + { + struct nfs_server *server; +- struct nfs_fattr fattr_fsinfo; ++ struct nfs_fattr *fattr_fsinfo; + int error; + + dprintk("--> nfs_clone_server(,%llx:%llx,)\n", +@@ -1496,6 +1516,11 @@ struct nfs_server *nfs_clone_server(stru + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr_fsinfo = nfs_alloc_fattr(); ++ if (fattr_fsinfo == NULL) ++ goto out_free_server; ++ + /* Copy data from the source */ + server->nfs_client = source->nfs_client; + atomic_inc(&server->nfs_client->cl_count); +@@ -1512,7 +1537,7 @@ struct nfs_server *nfs_clone_server(stru + nfs_init_server_aclclient(server); + + /* probe the filesystem info for this server filesystem */ +- error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo); ++ error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); + if (error < 0) + goto out_free_server; + +@@ -1534,10 +1559,12 @@ struct nfs_server *nfs_clone_server(stru + + server->mount_time = jiffies; + ++ nfs_free_fattr(fattr_fsinfo); + dprintk("<-- nfs_clone_server() = %p\n", server); + return server; + + out_free_server: ++ nfs_free_fattr(fattr_fsinfo); + nfs_free_server(server); + dprintk("<-- nfs_clone_server() = error %d\n", error); + return ERR_PTR(error); +diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-23 11:01:00.352376393 -0400 +@@ -213,7 +213,7 @@ int nfs_inode_set_delegation(struct inod + struct nfs_delegation *freeme = NULL; + int status = 0; + +- delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); ++ 
delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; + memcpy(delegation->stateid.data, res->delegation.data, +diff -up linux-2.6.34.noarch/fs/nfs/dir.c.orig linux-2.6.34.noarch/fs/nfs/dir.c +--- linux-2.6.34.noarch/fs/nfs/dir.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/dir.c 2010-08-23 11:01:00.353376419 -0400 +@@ -530,9 +530,7 @@ static int nfs_readdir(struct file *filp + nfs_readdir_descriptor_t my_desc, + *desc = &my_desc; + struct nfs_entry my_entry; +- struct nfs_fh fh; +- struct nfs_fattr fattr; +- long res; ++ int res = -ENOMEM; + + dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", + dentry->d_parent->d_name.name, dentry->d_name.name, +@@ -554,9 +552,11 @@ static int nfs_readdir(struct file *filp + + my_entry.cookie = my_entry.prev_cookie = 0; + my_entry.eof = 0; +- my_entry.fh = &fh; +- my_entry.fattr = &fattr; +- nfs_fattr_init(&fattr); ++ my_entry.fh = nfs_alloc_fhandle(); ++ my_entry.fattr = nfs_alloc_fattr(); ++ if (my_entry.fh == NULL || my_entry.fattr == NULL) ++ goto out_alloc_failed; ++ + desc->entry = &my_entry; + + nfs_block_sillyrename(dentry); +@@ -598,7 +598,10 @@ out: + nfs_unblock_sillyrename(dentry); + if (res > 0) + res = 0; +- dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", ++out_alloc_failed: ++ nfs_free_fattr(my_entry.fattr); ++ nfs_free_fhandle(my_entry.fh); ++ dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + res); + return res; +@@ -776,9 +779,9 @@ static int nfs_lookup_revalidate(struct + struct inode *dir; + struct inode *inode; + struct dentry *parent; ++ struct nfs_fh *fhandle = NULL; ++ struct nfs_fattr *fattr = NULL; + int error; +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; + + parent = dget_parent(dentry); + dir = parent->d_inode; +@@ -811,14 +814,22 @@ static int nfs_lookup_revalidate(struct + if (NFS_STALE(inode)) + goto out_bad; + +- error = NFS_PROTO(dir)->lookup(dir, 
&dentry->d_name, &fhandle, &fattr); ++ error = -ENOMEM; ++ fhandle = nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ if (fhandle == NULL || fattr == NULL) ++ goto out_error; ++ ++ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + if (error) + goto out_bad; +- if (nfs_compare_fh(NFS_FH(inode), &fhandle)) ++ if (nfs_compare_fh(NFS_FH(inode), fhandle)) + goto out_bad; +- if ((error = nfs_refresh_inode(inode, &fattr)) != 0) ++ if ((error = nfs_refresh_inode(inode, fattr)) != 0) + goto out_bad; + ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); + out_set_verifier: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + out_valid: +@@ -842,11 +853,21 @@ out_zap_parent: + shrink_dcache_parent(dentry); + } + d_drop(dentry); ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + return 0; ++out_error: ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); ++ dput(parent); ++ dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", ++ __func__, dentry->d_parent->d_name.name, ++ dentry->d_name.name, error); ++ return error; + } + + /* +@@ -911,9 +932,9 @@ static struct dentry *nfs_lookup(struct + struct dentry *res; + struct dentry *parent; + struct inode *inode = NULL; ++ struct nfs_fh *fhandle = NULL; ++ struct nfs_fattr *fattr = NULL; + int error; +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; + + dfprintk(VFS, "NFS: lookup(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); +@@ -923,7 +944,6 @@ static struct dentry *nfs_lookup(struct + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + goto out; + +- res = ERR_PTR(-ENOMEM); + dentry->d_op = NFS_PROTO(dir)->dentry_ops; + + /* +@@ -936,17 +956,23 @@ static struct dentry *nfs_lookup(struct + goto out; + } + ++ res = ERR_PTR(-ENOMEM); ++ fhandle = nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ if (fhandle == 
NULL || fattr == NULL) ++ goto out; ++ + parent = dentry->d_parent; + /* Protect against concurrent sillydeletes */ + nfs_block_sillyrename(parent); +- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); ++ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + if (error == -ENOENT) + goto no_entry; + if (error < 0) { + res = ERR_PTR(error); + goto out_unblock_sillyrename; + } +- inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); ++ inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + res = (struct dentry *)inode; + if (IS_ERR(res)) + goto out_unblock_sillyrename; +@@ -962,6 +988,8 @@ no_entry: + out_unblock_sillyrename: + nfs_unblock_sillyrename(parent); + out: ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); + return res; + } + +@@ -1669,28 +1697,33 @@ static void nfs_access_free_entry(struct + smp_mb__after_atomic_dec(); + } + ++static void nfs_access_free_list(struct list_head *head) ++{ ++ struct nfs_access_entry *cache; ++ ++ while (!list_empty(head)) { ++ cache = list_entry(head->next, struct nfs_access_entry, lru); ++ list_del(&cache->lru); ++ nfs_access_free_entry(cache); ++ } ++} ++ + int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) + { + LIST_HEAD(head); + struct nfs_inode *nfsi; + struct nfs_access_entry *cache; + +-restart: ++ if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) ++ return (nr_to_scan == 0) ? 
0 : -1; ++ + spin_lock(&nfs_access_lru_lock); + list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { +- struct rw_semaphore *s_umount; + struct inode *inode; + + if (nr_to_scan-- == 0) + break; +- s_umount = &nfsi->vfs_inode.i_sb->s_umount; +- if (!down_read_trylock(s_umount)) +- continue; +- inode = igrab(&nfsi->vfs_inode); +- if (inode == NULL) { +- up_read(s_umount); +- continue; +- } ++ inode = &nfsi->vfs_inode; + spin_lock(&inode->i_lock); + if (list_empty(&nfsi->access_cache_entry_lru)) + goto remove_lru_entry; +@@ -1704,61 +1737,47 @@ restart: + else { + remove_lru_entry: + list_del_init(&nfsi->access_cache_inode_lru); ++ smp_mb__before_clear_bit(); + clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); ++ smp_mb__after_clear_bit(); + } +- spin_unlock(&inode->i_lock); +- spin_unlock(&nfs_access_lru_lock); +- iput(inode); +- up_read(s_umount); +- goto restart; + } + spin_unlock(&nfs_access_lru_lock); +- while (!list_empty(&head)) { +- cache = list_entry(head.next, struct nfs_access_entry, lru); +- list_del(&cache->lru); +- nfs_access_free_entry(cache); +- } ++ nfs_access_free_list(&head); + return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; + } + +-static void __nfs_access_zap_cache(struct inode *inode) ++static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) + { +- struct nfs_inode *nfsi = NFS_I(inode); + struct rb_root *root_node = &nfsi->access_cache; +- struct rb_node *n, *dispose = NULL; ++ struct rb_node *n; + struct nfs_access_entry *entry; + + /* Unhook entries from the cache */ + while ((n = rb_first(root_node)) != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + rb_erase(n, root_node); +- list_del(&entry->lru); +- n->rb_left = dispose; +- dispose = n; ++ list_move(&entry->lru, head); + } + nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; +- spin_unlock(&inode->i_lock); +- +- /* Now kill them all! 
*/ +- while (dispose != NULL) { +- n = dispose; +- dispose = n->rb_left; +- nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node)); +- } + } + + void nfs_access_zap_cache(struct inode *inode) + { ++ LIST_HEAD(head); ++ ++ if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) ++ return; + /* Remove from global LRU init */ +- if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { +- spin_lock(&nfs_access_lru_lock); ++ spin_lock(&nfs_access_lru_lock); ++ if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_del_init(&NFS_I(inode)->access_cache_inode_lru); +- spin_unlock(&nfs_access_lru_lock); +- } + + spin_lock(&inode->i_lock); +- /* This will release the spinlock */ +- __nfs_access_zap_cache(inode); ++ __nfs_access_zap_cache(NFS_I(inode), &head); ++ spin_unlock(&inode->i_lock); ++ spin_unlock(&nfs_access_lru_lock); ++ nfs_access_free_list(&head); + } + + static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) +@@ -1809,8 +1828,8 @@ out_stale: + nfs_access_free_entry(cache); + return -ENOENT; + out_zap: +- /* This will release the spinlock */ +- __nfs_access_zap_cache(inode); ++ spin_unlock(&inode->i_lock); ++ nfs_access_zap_cache(inode); + return -ENOENT; + } + +@@ -1865,9 +1884,11 @@ static void nfs_access_add_cache(struct + smp_mb__after_atomic_inc(); + + /* Add inode to global LRU list */ +- if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { ++ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { + spin_lock(&nfs_access_lru_lock); +- list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); ++ if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) ++ list_add_tail(&NFS_I(inode)->access_cache_inode_lru, ++ &nfs_access_lru_list); + spin_unlock(&nfs_access_lru_lock); + } + } +diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-23 
11:00:23.790502081 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-23 11:01:00.354376416 -0400 +@@ -162,14 +162,17 @@ static int nfs_revalidate_file_size(stru + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + +- if (server->flags & NFS_MOUNT_NOAC) +- goto force_reval; ++ if (nfs_have_delegated_attributes(inode)) ++ goto out_noreval; ++ + if (filp->f_flags & O_DIRECT) + goto force_reval; +- if (nfsi->npages != 0) +- return 0; +- if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) +- return 0; ++ if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) ++ goto force_reval; ++ if (nfs_attribute_timeout(inode)) ++ goto force_reval; ++out_noreval: ++ return 0; + force_reval: + return __nfs_revalidate_inode(server, inode); + } +diff -up linux-2.6.34.noarch/fs/nfs/fscache.c.orig linux-2.6.34.noarch/fs/nfs/fscache.c +--- linux-2.6.34.noarch/fs/nfs/fscache.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/fscache.c 2010-08-23 11:01:00.355376416 -0400 +@@ -467,7 +467,8 @@ int __nfs_readpages_from_fscache(struct + struct list_head *pages, + unsigned *nr_pages) + { +- int ret, npages = *nr_pages; ++ unsigned npages = *nr_pages; ++ int ret; + + dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", + NFS_I(inode)->fscache, npages, inode); +diff -up linux-2.6.34.noarch/fs/nfs/getroot.c.orig linux-2.6.34.noarch/fs/nfs/getroot.c +--- linux-2.6.34.noarch/fs/nfs/getroot.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/getroot.c 2010-08-23 11:01:00.356376417 -0400 +@@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super + { + struct nfs_server *server = NFS_SB(sb); + struct nfs_fsinfo fsinfo; +- struct nfs_fattr fattr; +- struct dentry *mntroot; ++ struct dentry *ret; + struct inode *inode; + int error; + + /* get the actual root for this mount */ +- fsinfo.fattr = &fattr; ++ fsinfo.fattr = nfs_alloc_fattr(); ++ if (fsinfo.fattr == NULL) 
++ return ERR_PTR(-ENOMEM); + + error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); +- return ERR_PTR(error); ++ ret = ERR_PTR(error); ++ goto out; + } + + inode = nfs_fhget(sb, mntfh, fsinfo.fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); +- return ERR_CAST(inode); ++ ret = ERR_CAST(inode); ++ goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); +- if (error != 0) +- return ERR_PTR(error); ++ if (error != 0) { ++ ret = ERR_PTR(error); ++ goto out; ++ } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ +- mntroot = d_obtain_alias(inode); +- if (IS_ERR(mntroot)) { ++ ret = d_obtain_alias(inode); ++ if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); +- return mntroot; ++ goto out; + } + +- security_d_instantiate(mntroot, inode); +- +- if (!mntroot->d_op) +- mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; ++ security_d_instantiate(ret, inode); + +- return mntroot; ++ if (ret->d_op == NULL) ++ ret->d_op = server->nfs_client->rpc_ops->dentry_ops; ++out: ++ nfs_free_fattr(fsinfo.fattr); ++ return ret; + } + + #ifdef CONFIG_NFS_V4 + +-/* +- * Do a simple pathwalk from the root FH of the server to the nominated target +- * of the mountpoint +- * - give error on symlinks +- * - give error on ".." 
occurring in the path +- * - follow traversals +- */ +-int nfs4_path_walk(struct nfs_server *server, +- struct nfs_fh *mntfh, +- const char *path) ++int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) + { + struct nfs_fsinfo fsinfo; +- struct nfs_fattr fattr; +- struct nfs_fh lastfh; +- struct qstr name; +- int ret; +- +- dprintk("--> nfs4_path_walk(,,%s)\n", path); +- +- fsinfo.fattr = &fattr; +- nfs_fattr_init(&fattr); +- +- /* Eat leading slashes */ +- while (*path == '/') +- path++; ++ int ret = -ENOMEM; ++ ++ dprintk("--> nfs4_get_rootfh()\n"); ++ ++ fsinfo.fattr = nfs_alloc_fattr(); ++ if (fsinfo.fattr == NULL) ++ goto out; + + /* Start by getting the root filehandle from the server */ + ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (ret < 0) { +- dprintk("nfs4_get_root: getroot error = %d\n", -ret); +- return ret; ++ dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); ++ goto out; + } + +- if (!S_ISDIR(fattr.mode)) { +- printk(KERN_ERR "nfs4_get_root:" ++ if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE) ++ || !S_ISDIR(fsinfo.fattr->mode)) { ++ printk(KERN_ERR "nfs4_get_rootfh:" + " getroot encountered non-directory\n"); +- return -ENOTDIR; ++ ret = -ENOTDIR; ++ goto out; + } + +- /* FIXME: It is quite valid for the server to return a referral here */ +- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { +- printk(KERN_ERR "nfs4_get_root:" ++ if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { ++ printk(KERN_ERR "nfs4_get_rootfh:" + " getroot obtained referral\n"); +- return -EREMOTE; ++ ret = -EREMOTE; ++ goto out; + } + +-next_component: +- dprintk("Next: %s\n", path); +- +- /* extract the next bit of the path */ +- if (!*path) +- goto path_walk_complete; +- +- name.name = path; +- while (*path && *path != '/') +- path++; +- name.len = path - (const char *) name.name; +- +- if (name.len > NFS4_MAXNAMLEN) +- return -ENAMETOOLONG; +- +-eat_dot_dir: +- while (*path == '/') +- path++; +- +- if (path[0] == '.' 
&& (path[1] == '/' || !path[1])) { +- path += 2; +- goto eat_dot_dir; +- } +- +- /* FIXME: Why shouldn't the user be able to use ".." in the path? */ +- if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2]) +- ) { +- printk(KERN_ERR "nfs4_get_root:" +- " Mount path contains reference to \"..\"\n"); +- return -EINVAL; +- } +- +- /* lookup the next FH in the sequence */ +- memcpy(&lastfh, mntfh, sizeof(lastfh)); +- +- dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); +- +- ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name, +- mntfh, &fattr); +- if (ret < 0) { +- dprintk("nfs4_get_root: getroot error = %d\n", -ret); +- return ret; +- } +- +- if (!S_ISDIR(fattr.mode)) { +- printk(KERN_ERR "nfs4_get_root:" +- " lookupfh encountered non-directory\n"); +- return -ENOTDIR; +- } +- +- /* FIXME: Referrals are quite valid here too */ +- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { +- printk(KERN_ERR "nfs4_get_root:" +- " lookupfh obtained referral\n"); +- return -EREMOTE; +- } +- +- goto next_component; +- +-path_walk_complete: +- memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); +- dprintk("<-- nfs4_path_walk() = 0\n"); +- return 0; ++ memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); ++out: ++ nfs_free_fattr(fsinfo.fattr); ++ dprintk("<-- nfs4_get_rootfh() = %d\n", ret); ++ return ret; + } + + /* +@@ -239,8 +174,8 @@ path_walk_complete: + struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) + { + struct nfs_server *server = NFS_SB(sb); +- struct nfs_fattr fattr; +- struct dentry *mntroot; ++ struct nfs_fattr *fattr = NULL; ++ struct dentry *ret; + struct inode *inode; + int error; + +@@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct supe + return ERR_PTR(error); + } + ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ return ERR_PTR(-ENOMEM);; ++ + /* get the actual root for this mount */ +- error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 
++ error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); +- return ERR_PTR(error); ++ ret = ERR_PTR(error); ++ goto out; + } + +- inode = nfs_fhget(sb, mntfh, &fattr); ++ inode = nfs_fhget(sb, mntfh, fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); +- return ERR_CAST(inode); ++ ret = ERR_CAST(inode); ++ goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); +- if (error != 0) +- return ERR_PTR(error); ++ if (error != 0) { ++ ret = ERR_PTR(error); ++ goto out; ++ } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ +- mntroot = d_obtain_alias(inode); +- if (IS_ERR(mntroot)) { ++ ret = d_obtain_alias(inode); ++ if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); +- return mntroot; ++ goto out; + } + +- security_d_instantiate(mntroot, inode); ++ security_d_instantiate(ret, inode); + +- if (!mntroot->d_op) +- mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; ++ if (ret->d_op == NULL) ++ ret->d_op = server->nfs_client->rpc_ops->dentry_ops; + ++out: ++ nfs_free_fattr(fattr); + dprintk("<-- nfs4_get_root()\n"); +- return mntroot; ++ return ret; + } + + #endif /* CONFIG_NFS_V4 */ +diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-23 11:01:00.357376378 -0400 +@@ -393,8 +393,8 @@ int + nfs_setattr(struct dentry *dentry, struct iattr *attr) + { + struct inode *inode = dentry->d_inode; +- struct nfs_fattr fattr; +- int error; ++ struct nfs_fattr *fattr; ++ int error = -ENOMEM; + + nfs_inc_stats(inode, NFSIOS_VFSSETATTR); + +@@ -417,14 +417,20 @@ nfs_setattr(struct dentry *dentry, struc + 
filemap_write_and_wait(inode->i_mapping); + nfs_wb_all(inode); + } ++ ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out; + /* + * Return any delegations if we're going to change ACLs + */ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + nfs_inode_return_delegation(inode); +- error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); ++ error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); + if (error == 0) +- nfs_refresh_inode(inode, &fattr); ++ nfs_refresh_inode(inode, fattr); ++ nfs_free_fattr(fattr); ++out: + return error; + } + +@@ -682,7 +688,7 @@ int + __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) + { + int status = -ESTALE; +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr = NULL; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", +@@ -693,8 +699,13 @@ __nfs_revalidate_inode(struct nfs_server + if (NFS_STALE(inode)) + goto out; + ++ status = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out; ++ + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); +- status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); ++ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); + if (status != 0) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + inode->i_sb->s_id, +@@ -707,7 +718,7 @@ __nfs_revalidate_inode(struct nfs_server + goto out; + } + +- status = nfs_refresh_inode(inode, &fattr); ++ status = nfs_refresh_inode(inode, fattr); + if (status) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", + inode->i_sb->s_id, +@@ -723,6 +734,7 @@ __nfs_revalidate_inode(struct nfs_server + (long long)NFS_FILEID(inode)); + + out: ++ nfs_free_fattr(fattr); + return status; + } + +@@ -730,9 +742,14 @@ int nfs_attribute_timeout(struct inode * + { + struct nfs_inode *nfsi = NFS_I(inode); + ++ return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + 
nfsi->attrtimeo); ++} ++ ++static int nfs_attribute_cache_expired(struct inode *inode) ++{ + if (nfs_have_delegated_attributes(inode)) + return 0; +- return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); ++ return nfs_attribute_timeout(inode); + } + + /** +@@ -745,7 +762,7 @@ int nfs_attribute_timeout(struct inode * + int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) + { + if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) +- && !nfs_attribute_timeout(inode)) ++ && !nfs_attribute_cache_expired(inode)) + return NFS_STALE(inode) ? -ESTALE : 0; + return __nfs_revalidate_inode(server, inode); + } +@@ -782,7 +799,8 @@ int nfs_revalidate_mapping(struct inode + int ret = 0; + + if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) +- || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { ++ || nfs_attribute_cache_expired(inode) ++ || NFS_STALE(inode)) { + ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret < 0) + goto out; +@@ -916,6 +934,26 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->gencount = nfs_inc_attr_generation_counter(); + } + ++struct nfs_fattr *nfs_alloc_fattr(void) ++{ ++ struct nfs_fattr *fattr; ++ ++ fattr = kmalloc(sizeof(*fattr), GFP_NOFS); ++ if (fattr != NULL) ++ nfs_fattr_init(fattr); ++ return fattr; ++} ++ ++struct nfs_fh *nfs_alloc_fhandle(void) ++{ ++ struct nfs_fh *fh; ++ ++ fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); ++ if (fh != NULL) ++ fh->size = 0; ++ return fh; ++} ++ + /** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode +diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-23 11:01:00.358564151 -0400 +@@ -244,9 +244,7 @@ extern struct dentry *nfs_get_root(struc + #ifdef CONFIG_NFS_V4 + extern struct dentry 
*nfs4_get_root(struct super_block *, struct nfs_fh *); + +-extern int nfs4_path_walk(struct nfs_server *server, +- struct nfs_fh *mntfh, +- const char *path); ++extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); + #endif + + /* read.c */ +diff -up linux-2.6.34.noarch/fs/nfs/iostat.h.orig linux-2.6.34.noarch/fs/nfs/iostat.h +--- linux-2.6.34.noarch/fs/nfs/iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/iostat.h 2010-08-23 11:01:00.358564151 -0400 +@@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const s + + static inline void nfs_add_server_stats(const struct nfs_server *server, + enum nfs_stat_bytecounters stat, +- unsigned long addend) ++ long addend) + { + this_cpu_add(server->io_stats->bytes[stat], addend); + } + + static inline void nfs_add_stats(const struct inode *inode, + enum nfs_stat_bytecounters stat, +- unsigned long addend) ++ long addend) + { + nfs_add_server_stats(NFS_SERVER(inode), stat, addend); + } +@@ -51,7 +51,7 @@ static inline void nfs_add_stats(const s + #ifdef CONFIG_NFS_FSCACHE + static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, +- unsigned long addend) ++ long addend) + { + this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); + } +diff -up linux-2.6.34.noarch/fs/nfs/namespace.c.orig linux-2.6.34.noarch/fs/nfs/namespace.c +--- linux-2.6.34.noarch/fs/nfs/namespace.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/namespace.c 2010-08-23 11:01:00.359420147 -0400 +@@ -105,8 +105,8 @@ static void * nfs_follow_mountpoint(stru + struct vfsmount *mnt; + struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct dentry *parent; +- struct nfs_fh fh; +- struct nfs_fattr fattr; ++ struct nfs_fh *fh = NULL; ++ struct nfs_fattr *fattr = NULL; + int err; + + dprintk("--> nfs_follow_mountpoint()\n"); +@@ -115,6 +115,12 @@ static void * nfs_follow_mountpoint(stru + if (IS_ROOT(dentry)) + goto out_err; 
+ ++ err = -ENOMEM; ++ fh = nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ if (fh == NULL || fattr == NULL) ++ goto out_err; ++ + dprintk("%s: enter\n", __func__); + dput(nd->path.dentry); + nd->path.dentry = dget(dentry); +@@ -123,16 +129,16 @@ static void * nfs_follow_mountpoint(stru + parent = dget_parent(nd->path.dentry); + err = server->nfs_client->rpc_ops->lookup(parent->d_inode, + &nd->path.dentry->d_name, +- &fh, &fattr); ++ fh, fattr); + dput(parent); + if (err != 0) + goto out_err; + +- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) ++ if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); + else +- mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh, +- &fattr); ++ mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, ++ fattr); + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) + goto out_err; +@@ -151,6 +157,8 @@ static void * nfs_follow_mountpoint(stru + nd->path.dentry = dget(mnt->mnt_root); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + out: ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fh); + dprintk("%s: done, returned %d\n", __func__, err); + + dprintk("<-- nfs_follow_mountpoint() = %d\n", err); +diff -up linux-2.6.34.noarch/fs/nfs/nfs3acl.c.orig linux-2.6.34.noarch/fs/nfs/nfs3acl.c +--- linux-2.6.34.noarch/fs/nfs/nfs3acl.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3acl.c 2010-08-23 11:01:00.359420147 -0400 +@@ -185,7 +185,6 @@ static void nfs3_cache_acls(struct inode + struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) + { + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs_fattr fattr; + struct page *pages[NFSACL_MAXPAGES] = { }; + struct nfs3_getaclargs args = { + .fh = NFS_FH(inode), +@@ -193,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struc + .pages = pages, + }; + struct nfs3_getaclres res = { +- .fattr = &fattr, ++ 0 + }; + struct rpc_message msg = { + .rpc_argp = &args, +@@ -228,7 +227,10 @@ 
struct posix_acl *nfs3_proc_getacl(struc + + dprintk("NFS call getacl\n"); + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; +- nfs_fattr_init(&fattr); ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ return ERR_PTR(-ENOMEM); ++ + status = rpc_call_sync(server->client_acl, &msg, 0); + dprintk("NFS reply getacl: %d\n", status); + +@@ -238,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struc + + switch (status) { + case 0: +- status = nfs_refresh_inode(inode, &fattr); ++ status = nfs_refresh_inode(inode, res.fattr); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: +@@ -278,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struc + getout: + posix_acl_release(res.acl_access); + posix_acl_release(res.acl_default); ++ nfs_free_fattr(res.fattr); + + if (status != 0) { + posix_acl_release(acl); +@@ -290,7 +293,7 @@ static int nfs3_proc_setacls(struct inod + struct posix_acl *dfacl) + { + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + struct page *pages[NFSACL_MAXPAGES]; + struct nfs3_setaclargs args = { + .inode = inode, +@@ -335,8 +338,13 @@ static int nfs3_proc_setacls(struct inod + } + + dprintk("NFS call setacl\n"); ++ status = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out_freepages; ++ + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; +- nfs_fattr_init(&fattr); ++ msg.rpc_resp = fattr; + status = rpc_call_sync(server->client_acl, &msg, 0); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); +@@ -344,7 +352,7 @@ static int nfs3_proc_setacls(struct inod + + switch (status) { + case 0: +- status = nfs_refresh_inode(inode, &fattr); ++ status = nfs_refresh_inode(inode, fattr); + nfs3_cache_acls(inode, acl, dfacl); + break; + case -EPFNOSUPPORT: +@@ -355,6 +363,7 @@ static int nfs3_proc_setacls(struct inod + case -ENOTSUPP: + status = -EOPNOTSUPP; + } ++ nfs_free_fattr(fattr); + out_freepages: + while (args.npages != 0) { + 
args.npages--; +diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-23 11:01:00.360574301 -0400 +@@ -144,14 +144,12 @@ static int + nfs3_proc_lookup(struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) + { +- struct nfs_fattr dir_attr; + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct nfs3_diropres res = { +- .dir_attr = &dir_attr, + .fh = fhandle, + .fattr = fattr + }; +@@ -163,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, stru + int status; + + dprintk("NFS call lookup %s\n", name->name); +- nfs_fattr_init(&dir_attr); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ return -ENOMEM; ++ + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); +- nfs_refresh_inode(dir, &dir_attr); ++ nfs_refresh_inode(dir, res.dir_attr); + if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_argp = fhandle; + msg.rpc_resp = fattr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } ++ nfs_free_fattr(res.dir_attr); + dprintk("NFS reply lookup: %d\n", status); + return status; + } + + static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) + { +- struct nfs_fattr fattr; + struct nfs3_accessargs arg = { + .fh = NFS_FH(inode), + }; +- struct nfs3_accessres res = { +- .fattr = &fattr, +- }; ++ struct nfs3_accessres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], + .rpc_argp = &arg, +@@ -193,7 +192,7 @@ static int nfs3_proc_access(struct inode + .rpc_cred = entry->cred, + }; + int mode = entry->mask; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call access\n"); + +@@ -210,9 +209,13 @@ static int nfs3_proc_access(struct inode + if (mode & 
MAY_EXEC) + arg.access |= NFS3_ACCESS_EXECUTE; + } +- nfs_fattr_init(&fattr); ++ ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); +- nfs_refresh_inode(inode, &fattr); ++ nfs_refresh_inode(inode, res.fattr); + if (status == 0) { + entry->mask = 0; + if (res.access & NFS3_ACCESS_READ) +@@ -222,6 +225,8 @@ static int nfs3_proc_access(struct inode + if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; + } ++ nfs_free_fattr(res.fattr); ++out: + dprintk("NFS reply access: %d\n", status); + return status; + } +@@ -229,7 +234,7 @@ static int nfs3_proc_access(struct inode + static int nfs3_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) + { +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + struct nfs3_readlinkargs args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, +@@ -239,14 +244,19 @@ static int nfs3_proc_readlink(struct ino + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], + .rpc_argp = &args, +- .rpc_resp = &fattr, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call readlink\n"); +- nfs_fattr_init(&fattr); ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out; ++ msg.rpc_resp = fattr; ++ + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); +- nfs_refresh_inode(inode, &fattr); ++ nfs_refresh_inode(inode, fattr); ++ nfs_free_fattr(fattr); ++out: + dprintk("NFS reply readlink: %d\n", status); + return status; + } +@@ -396,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, stru + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call remove %s\n", name->name); +- nfs_fattr_init(&res.dir_attr); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); +- nfs_post_op_update_inode(dir, &res.dir_attr); ++ 
nfs_post_op_update_inode(dir, res.dir_attr); ++ nfs_free_fattr(res.dir_attr); ++out: + dprintk("NFS reply remove: %d\n", status); + return status; + } +@@ -419,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *t + if (nfs3_async_handle_jukebox(task, dir)) + return 0; + res = task->tk_msg.rpc_resp; +- nfs_post_op_update_inode(dir, &res->dir_attr); ++ nfs_post_op_update_inode(dir, res->dir_attr); + return 1; + } + +@@ -427,7 +442,6 @@ static int + nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) + { +- struct nfs_fattr old_dir_attr, new_dir_attr; + struct nfs3_renameargs arg = { + .fromfh = NFS_FH(old_dir), + .fromname = old_name->name, +@@ -436,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, + .toname = new_name->name, + .tolen = new_name->len + }; +- struct nfs3_renameres res = { +- .fromattr = &old_dir_attr, +- .toattr = &new_dir_attr +- }; ++ struct nfs3_renameres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); +- nfs_fattr_init(&old_dir_attr); +- nfs_fattr_init(&new_dir_attr); ++ ++ res.fromattr = nfs_alloc_fattr(); ++ res.toattr = nfs_alloc_fattr(); ++ if (res.fromattr == NULL || res.toattr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); +- nfs_post_op_update_inode(old_dir, &old_dir_attr); +- nfs_post_op_update_inode(new_dir, &new_dir_attr); ++ nfs_post_op_update_inode(old_dir, res.fromattr); ++ nfs_post_op_update_inode(new_dir, res.toattr); ++out: ++ nfs_free_fattr(res.toattr); ++ nfs_free_fattr(res.fromattr); + dprintk("NFS reply rename: %d\n", status); + return status; + } +@@ -460,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, + static int + nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) + { +- struct nfs_fattr dir_attr, fattr; + 
struct nfs3_linkargs arg = { + .fromfh = NFS_FH(inode), + .tofh = NFS_FH(dir), + .toname = name->name, + .tolen = name->len + }; +- struct nfs3_linkres res = { +- .dir_attr = &dir_attr, +- .fattr = &fattr +- }; ++ struct nfs3_linkres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call link %s\n", name->name); +- nfs_fattr_init(&dir_attr); +- nfs_fattr_init(&fattr); ++ res.fattr = nfs_alloc_fattr(); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.fattr == NULL || res.dir_attr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); +- nfs_post_op_update_inode(dir, &dir_attr); +- nfs_post_op_update_inode(inode, &fattr); ++ nfs_post_op_update_inode(dir, res.dir_attr); ++ nfs_post_op_update_inode(inode, res.fattr); ++out: ++ nfs_free_fattr(res.dir_attr); ++ nfs_free_fattr(res.fattr); + dprintk("NFS reply link: %d\n", status); + return status; + } +@@ -554,7 +574,7 @@ out: + static int + nfs3_proc_rmdir(struct inode *dir, struct qstr *name) + { +- struct nfs_fattr dir_attr; ++ struct nfs_fattr *dir_attr; + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, +@@ -563,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struc + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], + .rpc_argp = &arg, +- .rpc_resp = &dir_attr, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call rmdir %s\n", name->name); +- nfs_fattr_init(&dir_attr); ++ dir_attr = nfs_alloc_fattr(); ++ if (dir_attr == NULL) ++ goto out; ++ ++ msg.rpc_resp = dir_attr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); +- nfs_post_op_update_inode(dir, &dir_attr); ++ nfs_post_op_update_inode(dir, dir_attr); ++ nfs_free_fattr(dir_attr); ++out: + dprintk("NFS reply rmdir: %d\n", status); + return status; + } +@@ -589,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, + u64 cookie, struct page *page, 
unsigned int count, int plus) + { + struct inode *dir = dentry->d_inode; +- struct nfs_fattr dir_attr; + __be32 *verf = NFS_COOKIEVERF(dir); + struct nfs3_readdirargs arg = { + .fh = NFS_FH(dir), +@@ -600,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, + .pages = &page + }; + struct nfs3_readdirres res = { +- .dir_attr = &dir_attr, + .verf = verf, + .plus = plus + }; +@@ -610,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, + .rpc_resp = &res, + .rpc_cred = cred + }; +- int status; ++ int status = -ENOMEM; + + if (plus) + msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; +@@ -618,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, + dprintk("NFS call readdir%s %d\n", + plus? "plus" : "", (unsigned int) cookie); + +- nfs_fattr_init(&dir_attr); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + + nfs_invalidate_atime(dir); ++ nfs_refresh_inode(dir, res.dir_attr); + +- nfs_refresh_inode(dir, &dir_attr); ++ nfs_free_fattr(res.dir_attr); ++out: + dprintk("NFS reply readdir: %d\n", status); + return status; + } +diff -up linux-2.6.34.noarch/fs/nfs/nfs3xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs3xdr.c +--- linux-2.6.34.noarch/fs/nfs/nfs3xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3xdr.c 2010-08-23 11:01:00.361593802 -0400 +@@ -762,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, _ + static int + nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) + { +- return nfs3_xdr_wccstat(req, p, &res->dir_attr); ++ return nfs3_xdr_wccstat(req, p, res->dir_attr); + } + + /* +diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-23 11:01:00.362574935 -0400 +@@ -206,14 +206,14 @@ extern ssize_t nfs4_listxattr(struct den + + + /* nfs4proc.c */ +-extern int 
nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); +-extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); ++extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); ++extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); + extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); + extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); + extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); + extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); +-extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); ++extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); + extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); + extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); + extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); +@@ -286,7 +286,7 @@ extern void nfs4_put_lock_state(struct n + extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); + extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); + +-extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); ++extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); + extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); + extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); + extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); +diff -up linux-2.6.34.noarch/fs/nfs/nfs4namespace.c.orig linux-2.6.34.noarch/fs/nfs/nfs4namespace.c +--- 
linux-2.6.34.noarch/fs/nfs/nfs4namespace.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4namespace.c 2010-08-23 11:01:00.363574219 -0400 +@@ -115,6 +115,7 @@ static struct vfsmount *try_location(str + char *page, char *page2, + const struct nfs4_fs_location *location) + { ++ const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + unsigned int maxbuflen; +@@ -126,9 +127,12 @@ static struct vfsmount *try_location(str + mountdata->mnt_path = mnt_path; + maxbuflen = mnt_path - 1 - page2; + ++ mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL); ++ if (mountdata->addr == NULL) ++ return ERR_PTR(-ENOMEM); ++ + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; +- struct sockaddr_storage addr; + + if (buf->len <= 0 || buf->len >= maxbuflen) + continue; +@@ -137,11 +141,10 @@ static struct vfsmount *try_location(str + continue; + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, +- (struct sockaddr *)&addr, sizeof(addr)); ++ mountdata->addr, addr_bufsize); + if (mountdata->addrlen == 0) + continue; + +- mountdata->addr = (struct sockaddr *)&addr; + rpc_set_port(mountdata->addr, NFS_PORT); + + memcpy(page2, buf->data, buf->len); +@@ -156,6 +159,7 @@ static struct vfsmount *try_location(str + if (!IS_ERR(mnt)) + break; + } ++ kfree(mountdata->addr); + return mnt; + } + +@@ -221,8 +225,8 @@ out: + + /* + * nfs_do_refmount - handle crossing a referral on server ++ * @mnt_parent - mountpoint of referral + * @dentry - dentry of referral +- * @nd - nameidata info + * + */ + struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) +diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-23 11:01:00.365544029 -0400 +@@ 
-70,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_ser + static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); ++static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, ++ struct nfs_fattr *fattr, struct iattr *sattr, ++ struct nfs4_state *state); + + /* Prevent leaks of NFSv4 errors into userland */ + static int nfs4_map_errors(int err) +@@ -714,17 +717,18 @@ static void nfs4_init_opendata_res(struc + + static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, + struct nfs4_state_owner *sp, fmode_t fmode, int flags, +- const struct iattr *attrs) ++ const struct iattr *attrs, ++ gfp_t gfp_mask) + { + struct dentry *parent = dget_parent(path->dentry); + struct inode *dir = parent->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *p; + +- p = kzalloc(sizeof(*p), GFP_KERNEL); ++ p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + goto err; +- p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); ++ p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); + if (p->o_arg.seqid == NULL) + goto err_free; + path_get(path); +@@ -1060,7 +1064,7 @@ static struct nfs4_opendata *nfs4_open_r + { + struct nfs4_opendata *opendata; + +- opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL); ++ opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS); + if (opendata == NULL) + return ERR_PTR(-ENOMEM); + opendata->state = state; +@@ -1648,7 +1652,7 @@ static int _nfs4_do_open(struct inode *d + if (path->dentry->d_inode != NULL) + nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); + status = -ENOMEM; +- opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr); ++ opendata = 
nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL); + if (opendata == NULL) + goto err_put_state_owner; + +@@ -1659,15 +1663,24 @@ static int _nfs4_do_open(struct inode *d + if (status != 0) + goto err_opendata_put; + +- if (opendata->o_arg.open_flags & O_EXCL) +- nfs4_exclusive_attrset(opendata, sattr); +- + state = nfs4_opendata_to_nfs4_state(opendata); + status = PTR_ERR(state); + if (IS_ERR(state)) + goto err_opendata_put; + if (server->caps & NFS_CAP_POSIX_LOCK) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); ++ ++ if (opendata->o_arg.open_flags & O_EXCL) { ++ nfs4_exclusive_attrset(opendata, sattr); ++ ++ nfs_fattr_init(opendata->o_res.f_attr); ++ status = nfs4_do_setattr(state->inode, cred, ++ opendata->o_res.f_attr, sattr, ++ state); ++ if (status == 0) ++ nfs_setattr_update_inode(state->inode, sattr); ++ nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); ++ } + nfs4_opendata_put(opendata); + nfs4_put_state_owner(sp); + *res = state; +@@ -1914,7 +1927,7 @@ static const struct rpc_call_ops nfs4_cl + * + * NOTE: Caller must be holding the sp->so_owner semaphore! 
+ */ +-int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) ++int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) + { + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_closedata *calldata; +@@ -1933,7 +1946,7 @@ int nfs4_do_close(struct path *path, str + }; + int status = -ENOMEM; + +- calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); ++ calldata = kzalloc(sizeof(*calldata), gfp_mask); + if (calldata == NULL) + goto out; + calldata->inode = state->inode; +@@ -1941,7 +1954,7 @@ int nfs4_do_close(struct path *path, str + calldata->arg.fh = NFS_FH(state->inode); + calldata->arg.stateid = &state->open_stateid; + /* Serialization for the sequence id */ +- calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); ++ calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); + if (calldata->arg.seqid == NULL) + goto out_free_calldata; + calldata->arg.fmode = 0; +@@ -2404,14 +2417,12 @@ static int nfs4_proc_lookup(struct inode + static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) + { + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs_fattr fattr; + struct nfs4_accessargs args = { + .fh = NFS_FH(inode), + .bitmask = server->attr_bitmask, + }; + struct nfs4_accessres res = { + .server = server, +- .fattr = &fattr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], +@@ -2438,7 +2449,11 @@ static int _nfs4_proc_access(struct inod + if (mode & MAY_EXEC) + args.access |= NFS4_ACCESS_EXECUTE; + } +- nfs_fattr_init(&fattr); ++ ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ return -ENOMEM; ++ + status = nfs4_call_sync(server, &msg, &args, &res, 0); + if (!status) { + entry->mask = 0; +@@ -2448,8 +2463,9 @@ static int _nfs4_proc_access(struct inod + entry->mask |= MAY_WRITE; + if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; +- nfs_refresh_inode(inode, 
&fattr); ++ nfs_refresh_inode(inode, res.fattr); + } ++ nfs_free_fattr(res.fattr); + return status; + } + +@@ -2562,13 +2578,6 @@ nfs4_proc_create(struct inode *dir, stru + } + d_add(dentry, igrab(state->inode)); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +- if (flags & O_EXCL) { +- struct nfs_fattr fattr; +- status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state); +- if (status == 0) +- nfs_setattr_update_inode(state->inode, sattr); +- nfs_post_op_update_inode(state->inode, &fattr); +- } + if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) + status = nfs4_intent_set_file(nd, &path, state, fmode); + else +@@ -2596,14 +2605,19 @@ static int _nfs4_proc_remove(struct inod + .rpc_argp = &args, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; ++ ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ goto out; + +- nfs_fattr_init(&res.dir_attr); + status = nfs4_call_sync(server, &msg, &args, &res, 1); + if (status == 0) { + update_changeattr(dir, &res.cinfo); +- nfs_post_op_update_inode(dir, &res.dir_attr); ++ nfs_post_op_update_inode(dir, res.dir_attr); + } ++ nfs_free_fattr(res.dir_attr); ++out: + return status; + } + +@@ -2638,7 +2652,7 @@ static int nfs4_proc_unlink_done(struct + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); +- nfs_post_op_update_inode(dir, &res->dir_attr); ++ nfs_post_op_update_inode(dir, res->dir_attr); + return 1; + } + +@@ -2653,29 +2667,31 @@ static int _nfs4_proc_rename(struct inod + .new_name = new_name, + .bitmask = server->attr_bitmask, + }; +- struct nfs_fattr old_fattr, new_fattr; + struct nfs4_rename_res res = { + .server = server, +- .old_fattr = &old_fattr, +- .new_fattr = &new_fattr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + +- nfs_fattr_init(res.old_fattr); +- 
nfs_fattr_init(res.new_fattr); +- status = nfs4_call_sync(server, &msg, &arg, &res, 1); ++ res.old_fattr = nfs_alloc_fattr(); ++ res.new_fattr = nfs_alloc_fattr(); ++ if (res.old_fattr == NULL || res.new_fattr == NULL) ++ goto out; + ++ status = nfs4_call_sync(server, &msg, &arg, &res, 1); + if (!status) { + update_changeattr(old_dir, &res.old_cinfo); + nfs_post_op_update_inode(old_dir, res.old_fattr); + update_changeattr(new_dir, &res.new_cinfo); + nfs_post_op_update_inode(new_dir, res.new_fattr); + } ++out: ++ nfs_free_fattr(res.new_fattr); ++ nfs_free_fattr(res.old_fattr); + return status; + } + +@@ -2702,28 +2718,30 @@ static int _nfs4_proc_link(struct inode + .name = name, + .bitmask = server->attr_bitmask, + }; +- struct nfs_fattr fattr, dir_attr; + struct nfs4_link_res res = { + .server = server, +- .fattr = &fattr, +- .dir_attr = &dir_attr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; ++ ++ res.fattr = nfs_alloc_fattr(); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.fattr == NULL || res.dir_attr == NULL) ++ goto out; + +- nfs_fattr_init(res.fattr); +- nfs_fattr_init(res.dir_attr); + status = nfs4_call_sync(server, &msg, &arg, &res, 1); + if (!status) { + update_changeattr(dir, &res.cinfo); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_post_op_update_inode(inode, res.fattr); + } +- ++out: ++ nfs_free_fattr(res.dir_attr); ++ nfs_free_fattr(res.fattr); + return status; + } + +@@ -3146,23 +3164,31 @@ static void nfs4_proc_commit_setup(struc + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + } + ++struct nfs4_renewdata { ++ struct nfs_client *client; ++ unsigned long timestamp; ++}; ++ + /* + * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special + * standalone procedure for queueing an asynchronous RENEW. 
+ */ +-static void nfs4_renew_release(void *data) ++static void nfs4_renew_release(void *calldata) + { +- struct nfs_client *clp = data; ++ struct nfs4_renewdata *data = calldata; ++ struct nfs_client *clp = data->client; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); ++ kfree(data); + } + +-static void nfs4_renew_done(struct rpc_task *task, void *data) ++static void nfs4_renew_done(struct rpc_task *task, void *calldata) + { +- struct nfs_client *clp = data; +- unsigned long timestamp = task->tk_start; ++ struct nfs4_renewdata *data = calldata; ++ struct nfs_client *clp = data->client; ++ unsigned long timestamp = data->timestamp; + + if (task->tk_status < 0) { + /* Unless we're shutting down, schedule state recovery! */ +@@ -3188,11 +3214,17 @@ int nfs4_proc_async_renew(struct nfs_cli + .rpc_argp = clp, + .rpc_cred = cred, + }; ++ struct nfs4_renewdata *data; + + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ if (data == NULL) ++ return -ENOMEM; ++ data->client = clp; ++ data->timestamp = jiffies; + return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, +- &nfs4_renew_ops, clp); ++ &nfs4_renew_ops, data); + } + + int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) +@@ -3494,7 +3526,9 @@ nfs4_async_handle_error(struct rpc_task + return _nfs4_async_handle_error(task, server, server->nfs_client, state); + } + +-int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) ++int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, ++ unsigned short port, struct rpc_cred *cred, ++ struct nfs4_setclientid_res *res) + { + nfs4_verifier sc_verifier; + struct nfs4_setclientid setclientid = { +@@ -3504,7 +3538,7 @@ int nfs4_proc_setclientid(struct nfs_cli + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], + .rpc_argp = &setclientid, +- .rpc_resp = clp, ++ 
.rpc_resp = res, + .rpc_cred = cred, + }; + __be32 *p; +@@ -3547,12 +3581,14 @@ int nfs4_proc_setclientid(struct nfs_cli + return status; + } + +-static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) ++static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, ++ struct nfs4_setclientid_res *arg, ++ struct rpc_cred *cred) + { + struct nfs_fsinfo fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], +- .rpc_argp = clp, ++ .rpc_argp = arg, + .rpc_resp = &fsinfo, + .rpc_cred = cred, + }; +@@ -3570,12 +3606,14 @@ static int _nfs4_proc_setclientid_confir + return status; + } + +-int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) ++int nfs4_proc_setclientid_confirm(struct nfs_client *clp, ++ struct nfs4_setclientid_res *arg, ++ struct rpc_cred *cred) + { + long timeout = 0; + int err; + do { +- err = _nfs4_proc_setclientid_confirm(clp, cred); ++ err = _nfs4_proc_setclientid_confirm(clp, arg, cred); + switch (err) { + case 0: + return err; +@@ -3667,7 +3705,7 @@ static int _nfs4_proc_delegreturn(struct + }; + int status = 0; + +- data = kzalloc(sizeof(*data), GFP_KERNEL); ++ data = kzalloc(sizeof(*data), GFP_NOFS); + if (data == NULL) + return -ENOMEM; + data->args.fhandle = &data->fh; +@@ -3823,7 +3861,7 @@ static struct nfs4_unlockdata *nfs4_allo + struct nfs4_unlockdata *p; + struct inode *inode = lsp->ls_state->inode; + +- p = kzalloc(sizeof(*p), GFP_KERNEL); ++ p = kzalloc(sizeof(*p), GFP_NOFS); + if (p == NULL) + return NULL; + p->arg.fh = NFS_FH(inode); +@@ -3961,7 +3999,7 @@ static int nfs4_proc_unlck(struct nfs4_s + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) + goto out; + lsp = request->fl_u.nfs4_fl.owner; +- seqid = nfs_alloc_seqid(&lsp->ls_seqid); ++ seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); + status = -ENOMEM; + if (seqid == NULL) + goto out; +@@ -3989,22 +4027,23 @@ struct nfs4_lockdata { + }; + + static struct 
nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, +- struct nfs_open_context *ctx, struct nfs4_lock_state *lsp) ++ struct nfs_open_context *ctx, struct nfs4_lock_state *lsp, ++ gfp_t gfp_mask) + { + struct nfs4_lockdata *p; + struct inode *inode = lsp->ls_state->inode; + struct nfs_server *server = NFS_SERVER(inode); + +- p = kzalloc(sizeof(*p), GFP_KERNEL); ++ p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + return NULL; + + p->arg.fh = NFS_FH(inode); + p->arg.fl = &p->fl; +- p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid); ++ p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); + if (p->arg.open_seqid == NULL) + goto out_free; +- p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); ++ p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); + if (p->arg.lock_seqid == NULL) + goto out_free_seqid; + p->arg.lock_stateid = &lsp->ls_stateid; +@@ -4158,7 +4197,8 @@ static int _nfs4_do_setlk(struct nfs4_st + + dprintk("%s: begin!\n", __func__); + data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), +- fl->fl_u.nfs4_fl.owner); ++ fl->fl_u.nfs4_fl.owner, ++ recovery_type == NFS_LOCK_NEW ? 
GFP_KERNEL : GFP_NOFS); + if (data == NULL) + return -ENOMEM; + if (IS_SETLKW(cmd)) +@@ -4647,7 +4687,7 @@ static int nfs4_reset_slot_table(struct + if (max_reqs != tbl->max_slots) { + ret = -ENOMEM; + new = kmalloc(max_reqs * sizeof(struct nfs4_slot), +- GFP_KERNEL); ++ GFP_NOFS); + if (!new) + goto out; + ret = 0; +@@ -4712,7 +4752,7 @@ static int nfs4_init_slot_table(struct n + + dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); + +- slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); ++ slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS); + if (!slot) + goto out; + ret = 0; +@@ -4761,7 +4801,7 @@ struct nfs4_session *nfs4_alloc_session( + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + +- session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL); ++ session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (!session) + return NULL; + +@@ -5105,8 +5145,8 @@ static int nfs41_proc_async_sequence(str + + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; +- args = kzalloc(sizeof(*args), GFP_KERNEL); +- res = kzalloc(sizeof(*res), GFP_KERNEL); ++ args = kzalloc(sizeof(*args), GFP_NOFS); ++ res = kzalloc(sizeof(*res), GFP_NOFS); + if (!args || !res) { + kfree(args); + kfree(res); +@@ -5207,7 +5247,7 @@ static int nfs41_proc_reclaim_complete(s + int status = -ENOMEM; + + dprintk("--> %s\n", __func__); +- calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); ++ calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) + goto out; + calldata->clp = clp; +diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-23 11:01:00.367574218 -0400 +@@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list); + + int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) + { ++ struct nfs4_setclientid_res clid; + unsigned short port; + int 
status; + +@@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client + if (clp->cl_addr.ss_family == AF_INET6) + port = nfs_callback_tcpport6; + +- status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); +- if (status == 0) +- status = nfs4_proc_setclientid_confirm(clp, cred); +- if (status == 0) +- nfs4_schedule_state_renewal(clp); ++ status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); ++ if (status != 0) ++ goto out; ++ status = nfs4_proc_setclientid_confirm(clp, &clid, cred); ++ if (status != 0) ++ goto out; ++ clp->cl_clientid = clid.clientid; ++ nfs4_schedule_state_renewal(clp); ++out: + return status; + } + +@@ -361,7 +366,7 @@ nfs4_alloc_state_owner(void) + { + struct nfs4_state_owner *sp; + +- sp = kzalloc(sizeof(*sp),GFP_KERNEL); ++ sp = kzalloc(sizeof(*sp),GFP_NOFS); + if (!sp) + return NULL; + spin_lock_init(&sp->so_lock); +@@ -435,7 +440,7 @@ nfs4_alloc_open_state(void) + { + struct nfs4_state *state; + +- state = kzalloc(sizeof(*state), GFP_KERNEL); ++ state = kzalloc(sizeof(*state), GFP_NOFS); + if (!state) + return NULL; + atomic_set(&state->count, 1); +@@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_sta + /* + * Close the current file. 
+ */ +-static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait) ++static void __nfs4_close(struct path *path, struct nfs4_state *state, ++ fmode_t fmode, gfp_t gfp_mask, int wait) + { + struct nfs4_state_owner *owner = state->owner; + int call_close = 0; +@@ -578,17 +584,17 @@ static void __nfs4_close(struct path *pa + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); + } else +- nfs4_do_close(path, state, wait); ++ nfs4_do_close(path, state, gfp_mask, wait); + } + + void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) + { +- __nfs4_close(path, state, fmode, 0); ++ __nfs4_close(path, state, fmode, GFP_NOFS, 0); + } + + void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) + { +- __nfs4_close(path, state, fmode, 1); ++ __nfs4_close(path, state, fmode, GFP_KERNEL, 1); + } + + /* +@@ -618,7 +624,7 @@ static struct nfs4_lock_state *nfs4_allo + struct nfs4_lock_state *lsp; + struct nfs_client *clp = state->owner->so_client; + +- lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); ++ lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) + return NULL; + rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); +@@ -754,11 +760,11 @@ void nfs4_copy_stateid(nfs4_stateid *dst + nfs4_put_lock_state(lsp); + } + +-struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) ++struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) + { + struct nfs_seqid *new; + +- new = kmalloc(sizeof(*new), GFP_KERNEL); ++ new = kmalloc(sizeof(*new), gfp_mask); + if (new != NULL) { + new->sequence = counter; + INIT_LIST_HEAD(&new->list); +@@ -1347,7 +1353,7 @@ static int nfs4_recall_slot(struct nfs_c + + nfs4_begin_drain_session(clp); + new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), +- GFP_KERNEL); ++ GFP_NOFS); + if (!new) + return -ENOMEM; + +diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 
+--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-23 11:00:23.792491380 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-23 11:01:00.369544055 -0400 +@@ -1504,14 +1504,14 @@ static void encode_setclientid(struct xd + hdr->replen += decode_setclientid_maxsz; + } + +-static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr) ++static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) + { + __be32 *p; + + p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); +- p = xdr_encode_hyper(p, client_state->cl_clientid); +- xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); ++ p = xdr_encode_hyper(p, arg->clientid); ++ xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE); + hdr->nops++; + hdr->replen += decode_setclientid_confirm_maxsz; + } +@@ -2324,7 +2324,7 @@ static int nfs4_xdr_enc_setclientid(stru + /* + * a SETCLIENTID_CONFIRM request + */ +-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) ++static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg) + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +@@ -2334,7 +2334,7 @@ static int nfs4_xdr_enc_setclientid_conf + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +- encode_setclientid_confirm(&xdr, clp, &hdr); ++ encode_setclientid_confirm(&xdr, arg, &hdr); + encode_putrootfh(&xdr, &hdr); + encode_fsinfo(&xdr, lease_bitmap, &hdr); + encode_nops(&hdr); +@@ -4397,7 +4397,7 @@ out_overflow: + return -EIO; + } + +-static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) ++static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res) + { + __be32 *p; + uint32_t opnum; +@@ -4417,8 +4417,8 @@ static int 
decode_setclientid(struct xdr + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; +- p = xdr_decode_hyper(p, &clp->cl_clientid); +- memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); ++ p = xdr_decode_hyper(p, &res->clientid); ++ memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); + } else if (nfserr == NFSERR_CLID_INUSE) { + uint32_t len; + +@@ -4815,7 +4815,7 @@ static int nfs4_xdr_dec_remove(struct rp + goto out; + if ((status = decode_remove(&xdr, &res->cinfo)) != 0) + goto out; +- decode_getfattr(&xdr, &res->dir_attr, res->server, ++ decode_getfattr(&xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); + out: + return status; +@@ -5498,7 +5498,7 @@ static int nfs4_xdr_dec_renew(struct rpc + * Decode SETCLIENTID response + */ + static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, +- struct nfs_client *clp) ++ struct nfs4_setclientid_res *res) + { + struct xdr_stream xdr; + struct compound_hdr hdr; +@@ -5507,7 +5507,7 @@ static int nfs4_xdr_dec_setclientid(stru + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) +- status = decode_setclientid(&xdr, clp); ++ status = decode_setclientid(&xdr, res); + return status; + } + +diff -up linux-2.6.34.noarch/fs/nfs/nfsroot.c.orig linux-2.6.34.noarch/fs/nfs/nfsroot.c +--- linux-2.6.34.noarch/fs/nfs/nfsroot.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfsroot.c 2010-08-23 11:01:00.371574358 -0400 +@@ -488,7 +488,6 @@ static int __init root_nfs_ports(void) + */ + static int __init root_nfs_get_handle(void) + { +- struct nfs_fh fh; + struct sockaddr_in sin; + unsigned int auth_flav_len = 0; + struct nfs_mount_request request = { +@@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(vo + NFS_MNT3_VERSION : NFS_MNT_VERSION, + .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? 
+ XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP, +- .fh = &fh, + .auth_flav_len = &auth_flav_len, + }; +- int status; ++ int status = -ENOMEM; + ++ request.fh = nfs_alloc_fhandle(); ++ if (!request.fh) ++ goto out; + set_sockaddr(&sin, servaddr, htons(mount_port)); + status = nfs_mount(&request); + if (status < 0) + printk(KERN_ERR "Root-NFS: Server returned error %d " + "while mounting %s\n", status, nfs_export_path); + else { +- nfs_data.root.size = fh.size; +- memcpy(nfs_data.root.data, fh.data, fh.size); ++ nfs_data.root.size = request.fh->size; ++ memcpy(&nfs_data.root.data, request.fh->data, request.fh->size); + } +- ++ nfs_free_fhandle(request.fh); ++out: + return status; + } + +diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-23 11:01:00.371574358 -0400 +@@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_conte + { + struct nfs_page *req; + +- for (;;) { +- /* try to allocate the request struct */ +- req = nfs_page_alloc(); +- if (req != NULL) +- break; +- +- if (fatal_signal_pending(current)) +- return ERR_PTR(-ERESTARTSYS); +- yield(); +- } ++ /* try to allocate the request struct */ ++ req = nfs_page_alloc(); ++ if (req == NULL) ++ return ERR_PTR(-ENOMEM); + + /* Initialize the request struct. Initially, we assume a + * long write-back delay. 
This will be adjusted in +diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-23 11:01:00.372574292 -0400 +@@ -224,35 +224,60 @@ static int nfs_proc_readlink(struct inod + return status; + } + ++struct nfs_createdata { ++ struct nfs_createargs arg; ++ struct nfs_diropok res; ++ struct nfs_fh fhandle; ++ struct nfs_fattr fattr; ++}; ++ ++static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir, ++ struct dentry *dentry, struct iattr *sattr) ++{ ++ struct nfs_createdata *data; ++ ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ ++ if (data != NULL) { ++ data->arg.fh = NFS_FH(dir); ++ data->arg.name = dentry->d_name.name; ++ data->arg.len = dentry->d_name.len; ++ data->arg.sattr = sattr; ++ nfs_fattr_init(&data->fattr); ++ data->fhandle.size = 0; ++ data->res.fh = &data->fhandle; ++ data->res.fattr = &data->fattr; ++ } ++ return data; ++}; ++ ++static void nfs_free_createdata(const struct nfs_createdata *data) ++{ ++ kfree(data); ++} ++ + static int + nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags, struct nameidata *nd) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; +- struct nfs_createargs arg = { +- .fh = NFS_FH(dir), +- .name = dentry->d_name.name, +- .len = dentry->d_name.len, +- .sattr = sattr +- }; +- struct nfs_diropok res = { +- .fh = &fhandle, +- .fattr = &fattr +- }; ++ struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], +- .rpc_argp = &arg, +- .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + +- nfs_fattr_init(&fattr); + dprintk("NFS call create %s\n", dentry->d_name.name); ++ data = nfs_alloc_createdata(dir, dentry, sattr); ++ if (data == NULL) ++ goto out; ++ msg.rpc_argp = &data->arg; ++ msg.rpc_resp = &data->res; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + 
nfs_mark_for_revalidate(dir); + if (status == 0) +- status = nfs_instantiate(dentry, &fhandle, &fattr); ++ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); ++ nfs_free_createdata(data); ++out: + dprintk("NFS reply create: %d\n", status); + return status; + } +@@ -264,24 +289,12 @@ static int + nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + dev_t rdev) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; +- struct nfs_createargs arg = { +- .fh = NFS_FH(dir), +- .name = dentry->d_name.name, +- .len = dentry->d_name.len, +- .sattr = sattr +- }; +- struct nfs_diropok res = { +- .fh = &fhandle, +- .fattr = &fattr +- }; ++ struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], +- .rpc_argp = &arg, +- .rpc_resp = &res, + }; +- int status, mode; ++ umode_t mode; ++ int status = -ENOMEM; + + dprintk("NFS call mknod %s\n", dentry->d_name.name); + +@@ -294,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct + sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ + } + +- nfs_fattr_init(&fattr); ++ data = nfs_alloc_createdata(dir, dentry, sattr); ++ if (data == NULL) ++ goto out; ++ msg.rpc_argp = &data->arg; ++ msg.rpc_resp = &data->res; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + if (status == -EINVAL && S_ISFIFO(mode)) { + sattr->ia_mode = mode; +- nfs_fattr_init(&fattr); ++ nfs_fattr_init(data->res.fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } + if (status == 0) +- status = nfs_instantiate(dentry, &fhandle, &fattr); ++ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); ++ nfs_free_createdata(data); ++out: + dprintk("NFS reply mknod: %d\n", status); + return status; + } +@@ -398,8 +418,8 @@ static int + nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, + unsigned int len, struct iattr *sattr) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; ++ 
struct nfs_fh *fh; ++ struct nfs_fattr *fattr; + struct nfs_symlinkargs arg = { + .fromfh = NFS_FH(dir), + .fromname = dentry->d_name.name, +@@ -412,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, stru + .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK], + .rpc_argp = &arg, + }; +- int status; ++ int status = -ENAMETOOLONG; ++ ++ dprintk("NFS call symlink %s\n", dentry->d_name.name); + + if (len > NFS2_MAXPATHLEN) +- return -ENAMETOOLONG; ++ goto out; + +- dprintk("NFS call symlink %s\n", dentry->d_name.name); ++ fh = nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ status = -ENOMEM; ++ if (fh == NULL || fattr == NULL) ++ goto out; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); +@@ -427,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, stru + * filehandle size to zero indicates to nfs_instantiate that it + * should fill in the data with a LOOKUP call on the wire. + */ +- if (status == 0) { +- nfs_fattr_init(&fattr); +- fhandle.size = 0; +- status = nfs_instantiate(dentry, &fhandle, &fattr); +- } ++ if (status == 0) ++ status = nfs_instantiate(dentry, fh, fattr); + ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fh); ++out: + dprintk("NFS reply symlink: %d\n", status); + return status; + } +@@ -440,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, stru + static int + nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; +- struct nfs_createargs arg = { +- .fh = NFS_FH(dir), +- .name = dentry->d_name.name, +- .len = dentry->d_name.len, +- .sattr = sattr +- }; +- struct nfs_diropok res = { +- .fh = &fhandle, +- .fattr = &fattr +- }; ++ struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], +- .rpc_argp = &arg, +- .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call mkdir %s\n", dentry->d_name.name); +- nfs_fattr_init(&fattr); ++ data = nfs_alloc_createdata(dir, dentry, 
sattr); ++ if (data == NULL) ++ goto out; ++ msg.rpc_argp = &data->arg; ++ msg.rpc_resp = &data->res; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + if (status == 0) +- status = nfs_instantiate(dentry, &fhandle, &fattr); ++ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); ++ nfs_free_createdata(data); ++out: + dprintk("NFS reply mkdir: %d\n", status); + return status; + } +diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-23 11:01:00.373574317 -0400 +@@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool; + + struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) + { +- struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS); ++ struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); + + if (p) { + memset(p, 0, sizeof(*p)); +@@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; + else { +- p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); ++ p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + if (!p->pagevec) { + mempool_free(p, nfs_rdata_mempool); + p = NULL; +diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-23 11:00:23.794511661 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-23 11:01:00.374564179 -0400 +@@ -141,7 +141,6 @@ static const match_table_t nfs_mount_opt + { Opt_resvport, "resvport" }, + { Opt_noresvport, "noresvport" }, + { Opt_fscache, "fsc" }, +- { Opt_fscache_uniq, "fsc=%s" }, + { Opt_nofscache, "nofsc" }, + + { Opt_port, "port=%s" }, +@@ -171,6 +170,7 @@ static const match_table_t nfs_mount_opt + { Opt_mountaddr, "mountaddr=%s" }, + + { Opt_lookupcache, "lookupcache=%s" }, ++ { 
Opt_fscache_uniq, "fsc=%s" }, + + { Opt_err, NULL } + }; +@@ -423,15 +423,19 @@ static int nfs_statfs(struct dentry *den + unsigned char blockbits; + unsigned long blockres; + struct nfs_fh *fh = NFS_FH(dentry->d_inode); +- struct nfs_fattr fattr; +- struct nfs_fsstat res = { +- .fattr = &fattr, +- }; +- int error; ++ struct nfs_fsstat res; ++ int error = -ENOMEM; ++ ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ goto out_err; + + error = server->nfs_client->rpc_ops->statfs(server, fh, &res); ++ ++ nfs_free_fattr(res.fattr); + if (error < 0) + goto out_err; ++ + buf->f_type = NFS_SUPER_MAGIC; + + /* +@@ -1060,14 +1064,6 @@ static int nfs_parse_mount_options(char + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; +- case Opt_fscache_uniq: +- string = match_strdup(args); +- if (!string) +- goto out_nomem; +- kfree(mnt->fscache_uniq); +- mnt->fscache_uniq = string; +- mnt->options |= NFS_OPTION_FSCACHE; +- break; + + /* + * options that take numeric values +@@ -1398,6 +1394,14 @@ static int nfs_parse_mount_options(char + return 0; + }; + break; ++ case Opt_fscache_uniq: ++ string = match_strdup(args); ++ if (string == NULL) ++ goto out_nomem; ++ kfree(mnt->fscache_uniq); ++ mnt->fscache_uniq = string; ++ mnt->options |= NFS_OPTION_FSCACHE; ++ break; + + /* + * Special options +@@ -2186,7 +2190,7 @@ static int nfs_get_sb(struct file_system + int error = -ENOMEM; + + data = nfs_alloc_parsed_mount_data(3); +- mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); ++ mntfh = nfs_alloc_fhandle(); + if (data == NULL || mntfh == NULL) + goto out_free_fh; + +@@ -2261,7 +2265,7 @@ out: + kfree(data->fscache_uniq); + security_free_mnt_opts(&data->lsm_opts); + out_free_fh: +- kfree(mntfh); ++ nfs_free_fhandle(mntfh); + kfree(data); + return error; + +@@ -2570,7 +2574,7 @@ static int nfs4_remote_get_sb(struct fil + }; + int error = -ENOMEM; + +- mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); ++ mntfh = nfs_alloc_fhandle(); + if (data == NULL || mntfh == NULL) + 
goto out_free_fh; + +@@ -2628,7 +2632,7 @@ static int nfs4_remote_get_sb(struct fil + out: + security_free_mnt_opts(&data->lsm_opts); + out_free_fh: +- kfree(mntfh); ++ nfs_free_fhandle(mntfh); + return error; + + out_free: +@@ -2683,41 +2687,120 @@ out_freepage: + free_page((unsigned long)page); + } + ++struct nfs_referral_count { ++ struct list_head list; ++ const struct task_struct *task; ++ unsigned int referral_count; ++}; ++ ++static LIST_HEAD(nfs_referral_count_list); ++static DEFINE_SPINLOCK(nfs_referral_count_list_lock); ++ ++static struct nfs_referral_count *nfs_find_referral_count(void) ++{ ++ struct nfs_referral_count *p; ++ ++ list_for_each_entry(p, &nfs_referral_count_list, list) { ++ if (p->task == current) ++ return p; ++ } ++ return NULL; ++} ++ ++#define NFS_MAX_NESTED_REFERRALS 2 ++ ++static int nfs_referral_loop_protect(void) ++{ ++ struct nfs_referral_count *p, *new; ++ int ret = -ENOMEM; ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ goto out; ++ new->task = current; ++ new->referral_count = 1; ++ ++ ret = 0; ++ spin_lock(&nfs_referral_count_list_lock); ++ p = nfs_find_referral_count(); ++ if (p != NULL) { ++ if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) ++ ret = -ELOOP; ++ else ++ p->referral_count++; ++ } else { ++ list_add(&new->list, &nfs_referral_count_list); ++ new = NULL; ++ } ++ spin_unlock(&nfs_referral_count_list_lock); ++ kfree(new); ++out: ++ return ret; ++} ++ ++static void nfs_referral_loop_unprotect(void) ++{ ++ struct nfs_referral_count *p; ++ ++ spin_lock(&nfs_referral_count_list_lock); ++ p = nfs_find_referral_count(); ++ p->referral_count--; ++ if (p->referral_count == 0) ++ list_del(&p->list); ++ else ++ p = NULL; ++ spin_unlock(&nfs_referral_count_list_lock); ++ kfree(p); ++} ++ + static int nfs_follow_remote_path(struct vfsmount *root_mnt, + const char *export_path, struct vfsmount *mnt_target) + { ++ struct nameidata *nd = NULL; + struct mnt_namespace *ns_private; +- struct nameidata nd; + struct 
super_block *s; + int ret; + ++ nd = kmalloc(sizeof(*nd), GFP_KERNEL); ++ if (nd == NULL) ++ return -ENOMEM; ++ + ns_private = create_mnt_ns(root_mnt); + ret = PTR_ERR(ns_private); + if (IS_ERR(ns_private)) + goto out_mntput; + ++ ret = nfs_referral_loop_protect(); ++ if (ret != 0) ++ goto out_put_mnt_ns; ++ + ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, +- export_path, LOOKUP_FOLLOW, &nd); ++ export_path, LOOKUP_FOLLOW, nd); + ++ nfs_referral_loop_unprotect(); + put_mnt_ns(ns_private); + + if (ret != 0) + goto out_err; + +- s = nd.path.mnt->mnt_sb; ++ s = nd->path.mnt->mnt_sb; + atomic_inc(&s->s_active); + mnt_target->mnt_sb = s; +- mnt_target->mnt_root = dget(nd.path.dentry); ++ mnt_target->mnt_root = dget(nd->path.dentry); + + /* Correct the device pathname */ +- nfs_fix_devname(&nd.path, mnt_target); ++ nfs_fix_devname(&nd->path, mnt_target); + +- path_put(&nd.path); ++ path_put(&nd->path); ++ kfree(nd); + down_write(&s->s_umount); + return 0; ++out_put_mnt_ns: ++ put_mnt_ns(ns_private); + out_mntput: + mntput(root_mnt); + out_err: ++ kfree(nd); + return ret; + } + +@@ -2888,17 +2971,21 @@ static int nfs4_remote_referral_get_sb(s + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; +- struct nfs_fh mntfh; ++ struct nfs_fh *mntfh; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; +- int error; ++ int error = -ENOMEM; + + dprintk("--> nfs4_referral_get_sb()\n"); + ++ mntfh = nfs_alloc_fhandle(); ++ if (mntfh == NULL) ++ goto out_err_nofh; ++ + /* create a new volume representation */ +- server = nfs4_create_referral_server(data, &mntfh); ++ server = nfs4_create_referral_server(data, mntfh); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; +@@ -2930,7 +3017,7 @@ static int nfs4_remote_referral_get_sb(s + nfs_fscache_get_super_cookie(s, NULL, data); + } + +- mntroot = nfs4_get_root(s, &mntfh); ++ mntroot = 
nfs4_get_root(s, mntfh); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; +@@ -2947,12 +3034,15 @@ static int nfs4_remote_referral_get_sb(s + + security_sb_clone_mnt_opts(data->sb, s); + ++ nfs_free_fhandle(mntfh); + dprintk("<-- nfs4_referral_get_sb() = 0\n"); + return 0; + + out_err_nosb: + nfs_free_server(server); + out_err_noserver: ++ nfs_free_fhandle(mntfh); ++out_err_nofh: + dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); + return error; + +@@ -2961,6 +3051,7 @@ error_splat_super: + bdi_unregister(&server->backing_dev_info); + error_splat_bdi: + deactivate_locked_super(s); ++ nfs_free_fhandle(mntfh); + dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); + return error; + } +diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-23 11:01:00.375554592 -0400 +@@ -23,6 +23,7 @@ struct nfs_unlinkdata { + struct nfs_removeres res; + struct inode *dir; + struct rpc_cred *cred; ++ struct nfs_fattr dir_attr; + }; + + /** +@@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct den + } + nfs_sb_active(dir->i_sb); + data->args.fh = NFS_FH(dir); +- nfs_fattr_init(&data->res.dir_attr); ++ nfs_fattr_init(data->res.dir_attr); + + NFS_PROTO(dir)->unlink_setup(&msg, dir); + +@@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, stru + goto out_free; + } + data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ data->res.dir_attr = &data->dir_attr; + + status = -EBUSY; + spin_lock(&dentry->d_lock); +diff -up linux-2.6.34.noarch/include/linux/ktime.h.orig linux-2.6.34.noarch/include/linux/ktime.h +--- linux-2.6.34.noarch/include/linux/ktime.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/ktime.h 2010-08-23 11:01:00.377554285 -0400 +@@ -130,7 +130,7 @@ static inline ktime_t timeval_to_ktime(s + /* Convert ktime_t to nanoseconds - NOP 
in the scalar storage format: */ + #define ktime_to_ns(kt) ((kt).tv64) + +-#else ++#else /* !((BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)) */ + + /* + * Helper macros/inlines to get the ktime_t math right in the timespec +@@ -275,7 +275,7 @@ static inline s64 ktime_to_ns(const ktim + return (s64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec; + } + +-#endif ++#endif /* !((BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)) */ + + /** + * ktime_equal - Compares two ktime_t variables to see if they are equal +@@ -295,6 +295,12 @@ static inline s64 ktime_to_us(const ktim + return (s64) tv.tv_sec * USEC_PER_SEC + tv.tv_usec; + } + ++static inline s64 ktime_to_ms(const ktime_t kt) ++{ ++ struct timeval tv = ktime_to_timeval(kt); ++ return (s64) tv.tv_sec * MSEC_PER_SEC + tv.tv_usec / USEC_PER_MSEC; ++} ++ + static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier) + { + return ktime_to_us(ktime_sub(later, earlier)); +diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-23 11:00:23.822502111 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-23 11:01:00.378563926 -0400 +@@ -356,6 +356,20 @@ extern struct nfs_open_context *nfs_find + extern u64 nfs_compat_user_ino64(u64 fileid); + extern void nfs_fattr_init(struct nfs_fattr *fattr); + ++extern struct nfs_fattr *nfs_alloc_fattr(void); ++ ++static inline void nfs_free_fattr(const struct nfs_fattr *fattr) ++{ ++ kfree(fattr); ++} ++ ++extern struct nfs_fh *nfs_alloc_fhandle(void); ++ ++static inline void nfs_free_fhandle(const struct nfs_fh *fh) ++{ ++ kfree(fh); ++} ++ + /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. 
*/ + extern __be32 root_nfs_parse_addr(char *name); /*__init*/ + extern unsigned long nfs_inc_attr_generation_counter(void); +diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-23 11:01:00.380553887 -0400 +@@ -44,7 +44,6 @@ struct nfs_client { + + #ifdef CONFIG_NFS_V4 + u64 cl_clientid; /* constant */ +- nfs4_verifier cl_confirm; + unsigned long cl_state; + + struct rb_root cl_openowner_id; +diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-23 11:01:00.381564072 -0400 +@@ -386,8 +386,8 @@ struct nfs_removeargs { + + struct nfs_removeres { + const struct nfs_server *server; ++ struct nfs_fattr *dir_attr; + struct nfs4_change_info cinfo; +- struct nfs_fattr dir_attr; + struct nfs4_sequence_res seq_res; + }; + +@@ -824,6 +824,11 @@ struct nfs4_setclientid { + u32 sc_cb_ident; + }; + ++struct nfs4_setclientid_res { ++ u64 clientid; ++ nfs4_verifier confirm; ++}; ++ + struct nfs4_statfs_arg { + const struct nfs_fh * fh; + const u32 * bitmask; +diff -up linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h.orig linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h +--- linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h 2010-08-23 11:01:00.382564026 -0400 +@@ -82,6 +82,7 @@ struct gss_cred { + enum rpc_gss_svc gc_service; + struct gss_cl_ctx *gc_ctx; + struct gss_upcall_msg *gc_upcall; ++ unsigned long gc_upcall_timestamp; + unsigned char gc_machine_cred : 1; + }; + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/auth.h.orig linux-2.6.34.noarch/include/linux/sunrpc/auth.h +--- 
linux-2.6.34.noarch/include/linux/sunrpc/auth.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/auth.h 2010-08-23 11:01:00.382564026 -0400 +@@ -54,6 +54,7 @@ struct rpc_cred { + #define RPCAUTH_CRED_NEW 0 + #define RPCAUTH_CRED_UPTODATE 1 + #define RPCAUTH_CRED_HASHED 2 ++#define RPCAUTH_CRED_NEGATIVE 3 + + #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h.orig linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h +--- linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h 2010-08-23 11:01:00.383574314 -0400 +@@ -35,7 +35,8 @@ int gss_import_sec_context( + const void* input_token, + size_t bufsize, + struct gss_api_mech *mech, +- struct gss_ctx **ctx_id); ++ struct gss_ctx **ctx_id, ++ gfp_t gfp_mask); + u32 gss_get_mic( + struct gss_ctx *ctx_id, + struct xdr_buf *message, +@@ -80,6 +81,8 @@ struct gss_api_mech { + /* pseudoflavors supported by this mechanism: */ + int gm_pf_num; + struct pf_desc * gm_pfs; ++ /* Should the following be a callback operation instead? 
*/ ++ const char *gm_upcall_enctypes; + }; + + /* and must provide the following operations: */ +@@ -87,7 +90,8 @@ struct gss_api_ops { + int (*gss_import_sec_context)( + const void *input_token, + size_t bufsize, +- struct gss_ctx *ctx_id); ++ struct gss_ctx *ctx_id, ++ gfp_t gfp_mask); + u32 (*gss_get_mic)( + struct gss_ctx *ctx_id, + struct xdr_buf *message, +diff -up linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h.orig linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h +--- linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h 2010-08-23 11:01:00.383574314 -0400 +@@ -4,7 +4,7 @@ + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson +@@ -36,17 +36,86 @@ + * + */ + ++#include + #include + #include + #include + ++/* Length of constant used in key derivation */ ++#define GSS_KRB5_K5CLENGTH (5) ++ ++/* Maximum key length (in bytes) for the supported crypto algorithms*/ ++#define GSS_KRB5_MAX_KEYLEN (32) ++ ++/* Maximum checksum function output for the supported crypto algorithms */ ++#define GSS_KRB5_MAX_CKSUM_LEN (20) ++ ++/* Maximum blocksize for the supported crypto algorithms */ ++#define GSS_KRB5_MAX_BLOCKSIZE (16) ++ ++struct krb5_ctx; ++ ++struct gss_krb5_enctype { ++ const u32 etype; /* encryption (key) type */ ++ const u32 ctype; /* checksum type */ ++ const char *name; /* "friendly" name */ ++ const char *encrypt_name; /* crypto encrypt name */ ++ const char *cksum_name; /* crypto checksum name */ ++ const u16 signalg; /* signing algorithm */ ++ const u16 sealalg; /* sealing algorithm */ ++ const u32 blocksize; /* encryption blocksize */ ++ const u32 conflen; /* confounder length ++ (normally the same as ++ the blocksize) 
*/ ++ const u32 cksumlength; /* checksum length */ ++ const u32 keyed_cksum; /* is it a keyed cksum? */ ++ const u32 keybytes; /* raw key len, in bytes */ ++ const u32 keylength; /* final key len, in bytes */ ++ u32 (*encrypt) (struct crypto_blkcipher *tfm, ++ void *iv, void *in, void *out, ++ int length); /* encryption function */ ++ u32 (*decrypt) (struct crypto_blkcipher *tfm, ++ void *iv, void *in, void *out, ++ int length); /* decryption function */ ++ u32 (*mk_key) (const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *in, ++ struct xdr_netobj *out); /* complete key generation */ ++ u32 (*encrypt_v2) (struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, int ec, ++ struct page **pages); /* v2 encryption function */ ++ u32 (*decrypt_v2) (struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, u32 *headskip, ++ u32 *tailskip); /* v2 decryption function */ ++}; ++ ++/* krb5_ctx flags definitions */ ++#define KRB5_CTX_FLAG_INITIATOR 0x00000001 ++#define KRB5_CTX_FLAG_CFX 0x00000002 ++#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY 0x00000004 ++ + struct krb5_ctx { + int initiate; /* 1 = initiating, 0 = accepting */ ++ u32 enctype; ++ u32 flags; ++ const struct gss_krb5_enctype *gk5e; /* enctype-specific info */ + struct crypto_blkcipher *enc; + struct crypto_blkcipher *seq; ++ struct crypto_blkcipher *acceptor_enc; ++ struct crypto_blkcipher *initiator_enc; ++ struct crypto_blkcipher *acceptor_enc_aux; ++ struct crypto_blkcipher *initiator_enc_aux; ++ u8 Ksess[GSS_KRB5_MAX_KEYLEN]; /* session key */ ++ u8 cksum[GSS_KRB5_MAX_KEYLEN]; + s32 endtime; + u32 seq_send; ++ u64 seq_send64; + struct xdr_netobj mech_used; ++ u8 initiator_sign[GSS_KRB5_MAX_KEYLEN]; ++ u8 acceptor_sign[GSS_KRB5_MAX_KEYLEN]; ++ u8 initiator_seal[GSS_KRB5_MAX_KEYLEN]; ++ u8 acceptor_seal[GSS_KRB5_MAX_KEYLEN]; ++ u8 initiator_integ[GSS_KRB5_MAX_KEYLEN]; ++ u8 acceptor_integ[GSS_KRB5_MAX_KEYLEN]; + }; + + extern spinlock_t krb5_seq_lock; +@@ -57,6 +126,18 @@ extern spinlock_t krb5_seq_lock; + 
#define KG_TOK_MIC_MSG 0x0101 + #define KG_TOK_WRAP_MSG 0x0201 + ++#define KG2_TOK_INITIAL 0x0101 ++#define KG2_TOK_RESPONSE 0x0202 ++#define KG2_TOK_MIC 0x0404 ++#define KG2_TOK_WRAP 0x0504 ++ ++#define KG2_TOKEN_FLAG_SENTBYACCEPTOR 0x01 ++#define KG2_TOKEN_FLAG_SEALED 0x02 ++#define KG2_TOKEN_FLAG_ACCEPTORSUBKEY 0x04 ++ ++#define KG2_RESP_FLAG_ERROR 0x0001 ++#define KG2_RESP_FLAG_DELEG_OK 0x0002 ++ + enum sgn_alg { + SGN_ALG_DES_MAC_MD5 = 0x0000, + SGN_ALG_MD2_5 = 0x0001, +@@ -81,6 +162,9 @@ enum seal_alg { + #define CKSUMTYPE_RSA_MD5_DES 0x0008 + #define CKSUMTYPE_NIST_SHA 0x0009 + #define CKSUMTYPE_HMAC_SHA1_DES3 0x000c ++#define CKSUMTYPE_HMAC_SHA1_96_AES128 0x000f ++#define CKSUMTYPE_HMAC_SHA1_96_AES256 0x0010 ++#define CKSUMTYPE_HMAC_MD5_ARCFOUR -138 /* Microsoft md5 hmac cksumtype */ + + /* from gssapi_err_krb5.h */ + #define KG_CCACHE_NOMATCH (39756032L) +@@ -111,11 +195,56 @@ enum seal_alg { + #define ENCTYPE_DES3_CBC_RAW 0x0006 /* DES-3 cbc mode raw */ + #define ENCTYPE_DES_HMAC_SHA1 0x0008 + #define ENCTYPE_DES3_CBC_SHA1 0x0010 ++#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011 ++#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012 ++#define ENCTYPE_ARCFOUR_HMAC 0x0017 ++#define ENCTYPE_ARCFOUR_HMAC_EXP 0x0018 + #define ENCTYPE_UNKNOWN 0x01ff + +-s32 +-make_checksum(char *, char *header, int hdrlen, struct xdr_buf *body, +- int body_offset, struct xdr_netobj *cksum); ++/* ++ * Constants used for key derivation ++ */ ++/* for 3DES */ ++#define KG_USAGE_SEAL (22) ++#define KG_USAGE_SIGN (23) ++#define KG_USAGE_SEQ (24) ++ ++/* from rfc3961 */ ++#define KEY_USAGE_SEED_CHECKSUM (0x99) ++#define KEY_USAGE_SEED_ENCRYPTION (0xAA) ++#define KEY_USAGE_SEED_INTEGRITY (0x55) ++ ++/* from rfc4121 */ ++#define KG_USAGE_ACCEPTOR_SEAL (22) ++#define KG_USAGE_ACCEPTOR_SIGN (23) ++#define KG_USAGE_INITIATOR_SEAL (24) ++#define KG_USAGE_INITIATOR_SIGN (25) ++ ++/* ++ * This compile-time check verifies that we will not exceed the ++ * slack space allotted by the client and server 
auth_gss code ++ * before they call gss_wrap(). ++ */ ++#define GSS_KRB5_MAX_SLACK_NEEDED \ ++ (GSS_KRB5_TOK_HDR_LEN /* gss token header */ \ ++ + GSS_KRB5_MAX_CKSUM_LEN /* gss token checksum */ \ ++ + GSS_KRB5_MAX_BLOCKSIZE /* confounder */ \ ++ + GSS_KRB5_MAX_BLOCKSIZE /* possible padding */ \ ++ + GSS_KRB5_TOK_HDR_LEN /* encrypted hdr in v2 token */\ ++ + GSS_KRB5_MAX_CKSUM_LEN /* encryption hmac */ \ ++ + 4 + 4 /* RPC verifier */ \ ++ + GSS_KRB5_TOK_HDR_LEN \ ++ + GSS_KRB5_MAX_CKSUM_LEN) ++ ++u32 ++make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout); ++ ++u32 ++make_checksum_v2(struct krb5_ctx *, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *key, ++ unsigned int usage, struct xdr_netobj *cksum); + + u32 gss_get_mic_kerberos(struct gss_ctx *, struct xdr_buf *, + struct xdr_netobj *); +@@ -149,11 +278,54 @@ gss_decrypt_xdr_buf(struct crypto_blkcip + int offset); + + s32 +-krb5_make_seq_num(struct crypto_blkcipher *key, ++krb5_make_seq_num(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *key, + int direction, + u32 seqnum, unsigned char *cksum, unsigned char *buf); + + s32 +-krb5_get_seq_num(struct crypto_blkcipher *key, ++krb5_get_seq_num(struct krb5_ctx *kctx, + unsigned char *cksum, + unsigned char *buf, int *direction, u32 *seqnum); ++ ++int ++xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen); ++ ++u32 ++krb5_derive_key(const struct gss_krb5_enctype *gk5e, ++ const struct xdr_netobj *inkey, ++ struct xdr_netobj *outkey, ++ const struct xdr_netobj *in_constant, ++ gfp_t gfp_mask); ++ ++u32 ++gss_krb5_des3_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key); ++ ++u32 ++gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key); ++ ++u32 ++gss_krb5_aes_encrypt(struct krb5_ctx 
*kctx, u32 offset, ++ struct xdr_buf *buf, int ec, ++ struct page **pages); ++ ++u32 ++gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, u32 *plainoffset, ++ u32 *plainlen); ++ ++int ++krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *cipher, ++ unsigned char *cksum); ++ ++int ++krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *cipher, ++ s32 seqnum); ++void ++gss_krb5_make_confounder(char *p, u32 conflen); +diff -up linux-2.6.34.noarch/include/linux/sunrpc/metrics.h.orig linux-2.6.34.noarch/include/linux/sunrpc/metrics.h +--- linux-2.6.34.noarch/include/linux/sunrpc/metrics.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/metrics.h 2010-08-23 11:01:00.384611889 -0400 +@@ -26,6 +26,7 @@ + #define _LINUX_SUNRPC_METRICS_H + + #include ++#include + + #define RPC_IOSTATS_VERS "1.0" + +@@ -58,9 +59,9 @@ struct rpc_iostats { + * and the total time the request spent from init to release + * are measured. 
+ */ +- unsigned long long om_queue, /* jiffies queued for xmit */ +- om_rtt, /* jiffies for RPC RTT */ +- om_execute; /* jiffies for RPC execution */ ++ ktime_t om_queue, /* queued for xmit */ ++ om_rtt, /* RPC RTT */ ++ om_execute; /* RPC execution */ + } ____cacheline_aligned; + + struct rpc_task; +diff -up linux-2.6.34.noarch/include/linux/sunrpc/sched.h.orig linux-2.6.34.noarch/include/linux/sunrpc/sched.h +--- linux-2.6.34.noarch/include/linux/sunrpc/sched.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/sched.h 2010-08-23 11:01:00.385361873 -0400 +@@ -10,6 +10,7 @@ + #define _LINUX_SUNRPC_SCHED_H_ + + #include ++#include + #include + #include + #include +@@ -40,21 +41,15 @@ struct rpc_wait { + * This is the RPC task struct + */ + struct rpc_task { +-#ifdef RPC_DEBUG +- unsigned long tk_magic; /* 0xf00baa */ +-#endif + atomic_t tk_count; /* Reference count */ + struct list_head tk_task; /* global list of tasks */ + struct rpc_clnt * tk_client; /* RPC client */ + struct rpc_rqst * tk_rqstp; /* RPC request */ +- int tk_status; /* result of last operation */ + + /* + * RPC call state + */ + struct rpc_message tk_msg; /* RPC call info */ +- __u8 tk_garb_retry; +- __u8 tk_cred_retry; + + /* + * callback to be executed after waking up +@@ -67,7 +62,6 @@ struct rpc_task { + void * tk_calldata; + + unsigned long tk_timeout; /* timeout for rpc_sleep() */ +- unsigned short tk_flags; /* misc flags */ + unsigned long tk_runstate; /* Task run status */ + struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could + * be any workqueue +@@ -78,17 +72,19 @@ struct rpc_task { + struct rpc_wait tk_wait; /* RPC wait */ + } u; + +- unsigned short tk_timeouts; /* maj timeouts */ +- size_t tk_bytes_sent; /* total bytes sent */ +- unsigned long tk_start; /* RPC task init timestamp */ +- long tk_rtt; /* round-trip time (jiffies) */ ++ ktime_t tk_start; /* RPC task init timestamp */ + + pid_t tk_owner; /* Process id for batching tasks 
*/ +- unsigned char tk_priority : 2;/* Task priority */ ++ int tk_status; /* result of last operation */ ++ unsigned short tk_flags; /* misc flags */ ++ unsigned short tk_timeouts; /* maj timeouts */ + + #ifdef RPC_DEBUG + unsigned short tk_pid; /* debugging aid */ + #endif ++ unsigned char tk_priority : 2,/* Task priority */ ++ tk_garb_retry : 2, ++ tk_cred_retry : 2; + }; + #define tk_xprt tk_client->cl_xprt + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-23 11:01:00.385361873 -0400 +@@ -1,7 +1,10 @@ + /* +- * include/linux/sunrpc/xdr.h ++ * XDR standard data types and function declarations + * + * Copyright (C) 1995-1997 Olaf Kirch ++ * ++ * Based on: ++ * RFC 4506 "XDR: External Data Representation Standard", May 2006 + */ + + #ifndef _SUNRPC_XDR_H_ +@@ -62,7 +65,6 @@ struct xdr_buf { + + unsigned int buflen, /* Total length of storage buffer */ + len; /* Length of XDR encoded message */ +- + }; + + /* +@@ -178,7 +180,7 @@ struct xdr_array2_desc { + }; + + extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base, +- struct xdr_array2_desc *desc); ++ struct xdr_array2_desc *desc); + extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc); + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xprt.h +--- linux-2.6.34.noarch/include/linux/sunrpc/xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xprt.h 2010-08-23 11:01:00.386574704 -0400 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -65,8 +66,6 @@ struct rpc_rqst { + struct rpc_task * rq_task; /* RPC task data */ + __be32 rq_xid; /* request XID */ + int rq_cong; /* has incremented xprt->cong */ +- int 
rq_reply_bytes_recvd; /* number of reply */ +- /* bytes received */ + u32 rq_seqno; /* gss seq no. used on req. */ + int rq_enc_pages_num; + struct page **rq_enc_pages; /* scratch pages for use by +@@ -77,12 +76,16 @@ struct rpc_rqst { + __u32 * rq_buffer; /* XDR encode buffer */ + size_t rq_callsize, + rq_rcvsize; ++ size_t rq_xmit_bytes_sent; /* total bytes sent */ ++ size_t rq_reply_bytes_recvd; /* total reply bytes */ ++ /* received */ + + struct xdr_buf rq_private_buf; /* The receive buffer + * used in the softirq. + */ + unsigned long rq_majortimeo; /* major timeout alarm */ + unsigned long rq_timeout; /* Current timeout value */ ++ ktime_t rq_rtt; /* round-trip time */ + unsigned int rq_retries; /* # of retries */ + unsigned int rq_connect_cookie; + /* A cookie used to track the +@@ -94,7 +97,7 @@ struct rpc_rqst { + */ + u32 rq_bytes_sent; /* Bytes we have sent */ + +- unsigned long rq_xtime; /* when transmitted */ ++ ktime_t rq_xtime; /* transmit time stamp */ + int rq_ntrans; + + #if defined(CONFIG_NFS_V4_1) +@@ -174,8 +177,7 @@ struct rpc_xprt { + /* + * Connection of transports + */ +- unsigned long connect_timeout, +- bind_timeout, ++ unsigned long bind_timeout, + reestablish_timeout; + unsigned int connect_cookie; /* A cookie that gets bumped + every time the transport +@@ -294,7 +296,6 @@ void xprt_set_retrans_timeout_rtt(stru + void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); + void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action); + void xprt_write_space(struct rpc_xprt *xprt); +-void xprt_update_rtt(struct rpc_task *task); + void xprt_adjust_cwnd(struct rpc_task *task, int result); + struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid); + void xprt_complete_rqst(struct rpc_task *task, int copied); +diff -up linux-2.6.34.noarch/net/sunrpc/auth.c.orig linux-2.6.34.noarch/net/sunrpc/auth.c +--- linux-2.6.34.noarch/net/sunrpc/auth.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ 
linux-2.6.34.noarch/net/sunrpc/auth.c 2010-08-23 11:01:00.387574079 -0400 +@@ -236,10 +236,15 @@ rpcauth_prune_expired(struct list_head * + + list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) { + +- /* Enforce a 60 second garbage collection moratorium */ ++ if (nr_to_scan-- == 0) ++ break; ++ /* ++ * Enforce a 60 second garbage collection moratorium ++ * Note that the cred_unused list must be time-ordered. ++ */ + if (time_in_range(cred->cr_expire, expired, jiffies) && + test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) +- continue; ++ return 0; + + list_del_init(&cred->cr_lru); + number_cred_unused--; +@@ -252,13 +257,10 @@ rpcauth_prune_expired(struct list_head * + get_rpccred(cred); + list_add_tail(&cred->cr_lru, free); + rpcauth_unhash_cred_locked(cred); +- nr_to_scan--; + } + spin_unlock(cache_lock); +- if (nr_to_scan == 0) +- break; + } +- return nr_to_scan; ++ return (number_cred_unused / 100) * sysctl_vfs_cache_pressure; + } + + /* +@@ -270,11 +272,12 @@ rpcauth_cache_shrinker(int nr_to_scan, g + LIST_HEAD(free); + int res; + ++ if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) ++ return (nr_to_scan == 0) ? 
0 : -1; + if (list_empty(&cred_unused)) + return 0; + spin_lock(&rpc_credcache_lock); +- nr_to_scan = rpcauth_prune_expired(&free, nr_to_scan); +- res = (number_cred_unused / 100) * sysctl_vfs_cache_pressure; ++ res = rpcauth_prune_expired(&free, nr_to_scan); + spin_unlock(&rpc_credcache_lock); + rpcauth_destroy_credlist(&free); + return res; +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c 2010-08-23 11:01:00.388574680 -0400 +@@ -57,11 +57,14 @@ static const struct rpc_authops authgss_ + static const struct rpc_credops gss_credops; + static const struct rpc_credops gss_nullops; + ++#define GSS_RETRY_EXPIRED 5 ++static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED; ++ + #ifdef RPC_DEBUG + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + +-#define GSS_CRED_SLACK 1024 ++#define GSS_CRED_SLACK (RPC_MAX_AUTH_SIZE * 2) + /* length of a krb5 verifier (48), plus data added before arguments when + * using integrity (two 4-byte integers): */ + #define GSS_VERF_SLACK 100 +@@ -229,7 +232,7 @@ gss_fill_context(const void *p, const vo + p = ERR_PTR(-EFAULT); + goto err; + } +- ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx); ++ ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, GFP_NOFS); + if (ret < 0) { + p = ERR_PTR(ret); + goto err; +@@ -350,6 +353,24 @@ gss_unhash_msg(struct gss_upcall_msg *gs + } + + static void ++gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss_msg) ++{ ++ switch (gss_msg->msg.errno) { ++ case 0: ++ if (gss_msg->ctx == NULL) ++ break; ++ clear_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); ++ gss_cred_set_ctx(&gss_cred->gc_base, gss_msg->ctx); ++ break; ++ case -EKEYEXPIRED: ++ set_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); ++ } ++ 
gss_cred->gc_upcall_timestamp = jiffies; ++ gss_cred->gc_upcall = NULL; ++ rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); ++} ++ ++static void + gss_upcall_callback(struct rpc_task *task) + { + struct gss_cred *gss_cred = container_of(task->tk_msg.rpc_cred, +@@ -358,13 +379,9 @@ gss_upcall_callback(struct rpc_task *tas + struct inode *inode = &gss_msg->inode->vfs_inode; + + spin_lock(&inode->i_lock); +- if (gss_msg->ctx) +- gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_msg->ctx); +- else +- task->tk_status = gss_msg->msg.errno; +- gss_cred->gc_upcall = NULL; +- rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); ++ gss_handle_downcall_result(gss_cred, gss_msg); + spin_unlock(&inode->i_lock); ++ task->tk_status = gss_msg->msg.errno; + gss_release_msg(gss_msg); + } + +@@ -377,11 +394,12 @@ static void gss_encode_v0_msg(struct gss + static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, + struct rpc_clnt *clnt, int machine_cred) + { ++ struct gss_api_mech *mech = gss_msg->auth->mech; + char *p = gss_msg->databuf; + int len = 0; + + gss_msg->msg.len = sprintf(gss_msg->databuf, "mech=%s uid=%d ", +- gss_msg->auth->mech->gm_name, ++ mech->gm_name, + gss_msg->uid); + p += gss_msg->msg.len; + if (clnt->cl_principal) { +@@ -398,6 +416,11 @@ static void gss_encode_v1_msg(struct gss + p += len; + gss_msg->msg.len += len; + } ++ if (mech->gm_upcall_enctypes) { ++ len = sprintf(p, mech->gm_upcall_enctypes); ++ p += len; ++ gss_msg->msg.len += len; ++ } + len = sprintf(p, "\n"); + gss_msg->msg.len += len; + +@@ -507,18 +530,16 @@ gss_refresh_upcall(struct rpc_task *task + spin_lock(&inode->i_lock); + if (gss_cred->gc_upcall != NULL) + rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); +- else if (gss_msg->ctx != NULL) { +- gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_msg->ctx); +- gss_cred->gc_upcall = NULL; +- rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); +- } else if (gss_msg->msg.errno >= 0) { ++ else if 
(gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { + task->tk_timeout = 0; + gss_cred->gc_upcall = gss_msg; + /* gss_upcall_callback will release the reference to gss_upcall_msg */ + atomic_inc(&gss_msg->count); + rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback); +- } else ++ } else { ++ gss_handle_downcall_result(gss_cred, gss_msg); + err = gss_msg->msg.errno; ++ } + spin_unlock(&inode->i_lock); + gss_release_msg(gss_msg); + out: +@@ -1117,6 +1138,23 @@ static int gss_renew_cred(struct rpc_tas + return 0; + } + ++static int gss_cred_is_negative_entry(struct rpc_cred *cred) ++{ ++ if (test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) { ++ unsigned long now = jiffies; ++ unsigned long begin, expire; ++ struct gss_cred *gss_cred; ++ ++ gss_cred = container_of(cred, struct gss_cred, gc_base); ++ begin = gss_cred->gc_upcall_timestamp; ++ expire = begin + gss_expired_cred_retry_delay * HZ; ++ ++ if (time_in_range_open(now, begin, expire)) ++ return 1; ++ } ++ return 0; ++} ++ + /* + * Refresh credentials. XXX - finish + */ +@@ -1126,6 +1164,9 @@ gss_refresh(struct rpc_task *task) + struct rpc_cred *cred = task->tk_msg.rpc_cred; + int ret = 0; + ++ if (gss_cred_is_negative_entry(cred)) ++ return -EKEYEXPIRED; ++ + if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && + !test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) { + ret = gss_renew_cred(task); +@@ -1316,15 +1357,21 @@ gss_wrap_req_priv(struct rpc_cred *cred, + inpages = snd_buf->pages + first; + snd_buf->pages = rqstp->rq_enc_pages; + snd_buf->page_base -= first << PAGE_CACHE_SHIFT; +- /* Give the tail its own page, in case we need extra space in the +- * head when wrapping: */ ++ /* ++ * Give the tail its own page, in case we need extra space in the ++ * head when wrapping: ++ * ++ * call_allocate() allocates twice the slack space required ++ * by the authentication flavor to rq_callsize. ++ * For GSS, slack is GSS_CRED_SLACK. 
++ */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) { + tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); + memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); + snd_buf->tail[0].iov_base = tmp; + } + maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); +- /* RPC_SLACK_SPACE should prevent this ever happening: */ ++ /* slack space should prevent this ever happening: */ + BUG_ON(snd_buf->len > snd_buf->buflen); + status = -EIO; + /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was +@@ -1573,5 +1620,11 @@ static void __exit exit_rpcsec_gss(void) + } + + MODULE_LICENSE("GPL"); ++module_param_named(expired_cred_retry_delay, ++ gss_expired_cred_retry_delay, ++ uint, 0644); ++MODULE_PARM_DESC(expired_cred_retry_delay, "Timeout (in seconds) until " ++ "the RPC engine retries an expired credential"); ++ + module_init(init_rpcsec_gss) + module_exit(exit_rpcsec_gss) +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c 2010-08-23 11:01:00.390553891 -0400 +@@ -1,7 +1,7 @@ + /* + * linux/net/sunrpc/gss_krb5_crypto.c + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson +@@ -41,6 +41,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -58,13 +59,13 @@ krb5_encrypt( + { + u32 ret = -EINVAL; + struct scatterlist sg[1]; +- u8 local_iv[16] = {0}; ++ u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; + struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv }; + + if (length % crypto_blkcipher_blocksize(tfm) != 0) + goto out; + +- if (crypto_blkcipher_ivsize(tfm) > 16) { ++ if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n", + crypto_blkcipher_ivsize(tfm)); + goto out; +@@ -92,13 +93,13 @@ krb5_decrypt( + { + u32 ret = -EINVAL; + struct scatterlist sg[1]; +- u8 local_iv[16] = {0}; ++ u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; + struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv }; + + if (length % crypto_blkcipher_blocksize(tfm) != 0) + goto out; + +- if (crypto_blkcipher_ivsize(tfm) > 16) { ++ if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n", + crypto_blkcipher_ivsize(tfm)); + goto out; +@@ -123,21 +124,155 @@ checksummer(struct scatterlist *sg, void + return crypto_hash_update(desc, sg, sg->length); + } + +-/* checksum the plaintext data and hdrlen bytes of the token header */ +-s32 +-make_checksum(char *cksumname, char *header, int hdrlen, struct xdr_buf *body, +- int body_offset, struct xdr_netobj *cksum) ++static int ++arcfour_hmac_md5_usage_to_salt(unsigned int usage, u8 salt[4]) ++{ ++ unsigned int ms_usage; ++ ++ switch (usage) { ++ case KG_USAGE_SIGN: ++ ms_usage = 15; ++ break; ++ case KG_USAGE_SEAL: ++ ms_usage = 13; ++ break; ++ default: ++ return EINVAL;; ++ } ++ salt[0] = (ms_usage >> 0) & 0xff; ++ salt[1] = (ms_usage >> 8) & 0xff; ++ salt[2] = (ms_usage >> 16) & 0xff; ++ salt[3] = (ms_usage >> 24) & 0xff; ++ ++ return 0; ++} ++ ++static u32 ++make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf 
*body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout) + { +- struct hash_desc desc; /* XXX add to ctx? */ ++ struct hash_desc desc; + struct scatterlist sg[1]; + int err; ++ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ u8 rc4salt[4]; ++ struct crypto_hash *md5; ++ struct crypto_hash *hmac_md5; ++ ++ if (cksumkey == NULL) ++ return GSS_S_FAILURE; ++ ++ if (cksumout->len < kctx->gk5e->cksumlength) { ++ dprintk("%s: checksum buffer length, %u, too small for %s\n", ++ __func__, cksumout->len, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } ++ ++ if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) { ++ dprintk("%s: invalid usage value %u\n", __func__, usage); ++ return GSS_S_FAILURE; ++ } ++ ++ md5 = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(md5)) ++ return GSS_S_FAILURE; ++ ++ hmac_md5 = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac_md5)) { ++ crypto_free_hash(md5); ++ return GSS_S_FAILURE; ++ } ++ ++ desc.tfm = md5; ++ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out; ++ sg_init_one(sg, rc4salt, 4); ++ err = crypto_hash_update(&desc, sg, 4); ++ if (err) ++ goto out; ++ ++ sg_init_one(sg, header, hdrlen); ++ err = crypto_hash_update(&desc, sg, hdrlen); ++ if (err) ++ goto out; ++ err = xdr_process_buf(body, body_offset, body->len - body_offset, ++ checksummer, &desc); ++ if (err) ++ goto out; ++ err = crypto_hash_final(&desc, checksumdata); ++ if (err) ++ goto out; ++ ++ desc.tfm = hmac_md5; ++ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out; ++ err = crypto_hash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength); ++ if (err) ++ goto out; ++ ++ sg_init_one(sg, checksumdata, crypto_hash_digestsize(md5)); ++ err = crypto_hash_digest(&desc, sg, crypto_hash_digestsize(md5), ++ checksumdata); ++ if (err) ++ goto out; ++ ++ memcpy(cksumout->data, checksumdata, 
kctx->gk5e->cksumlength); ++ cksumout->len = kctx->gk5e->cksumlength; ++out: ++ crypto_free_hash(md5); ++ crypto_free_hash(hmac_md5); ++ return err ? GSS_S_FAILURE : 0; ++} ++ ++/* ++ * checksum the plaintext data and hdrlen bytes of the token header ++ * The checksum is performed over the first 8 bytes of the ++ * gss token header and then over the data body ++ */ ++u32 ++make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout) ++{ ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ int err; ++ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ unsigned int checksumlen; ++ ++ if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR) ++ return make_checksum_hmac_md5(kctx, header, hdrlen, ++ body, body_offset, ++ cksumkey, usage, cksumout); ++ ++ if (cksumout->len < kctx->gk5e->cksumlength) { ++ dprintk("%s: checksum buffer length, %u, too small for %s\n", ++ __func__, cksumout->len, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } + +- desc.tfm = crypto_alloc_hash(cksumname, 0, CRYPTO_ALG_ASYNC); ++ desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(desc.tfm)) + return GSS_S_FAILURE; +- cksum->len = crypto_hash_digestsize(desc.tfm); + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + ++ checksumlen = crypto_hash_digestsize(desc.tfm); ++ ++ if (cksumkey != NULL) { ++ err = crypto_hash_setkey(desc.tfm, cksumkey, ++ kctx->gk5e->keylength); ++ if (err) ++ goto out; ++ } ++ + err = crypto_hash_init(&desc); + if (err) + goto out; +@@ -149,15 +284,109 @@ make_checksum(char *cksumname, char *hea + checksummer, &desc); + if (err) + goto out; +- err = crypto_hash_final(&desc, cksum->data); ++ err = crypto_hash_final(&desc, checksumdata); ++ if (err) ++ goto out; + ++ switch (kctx->gk5e->ctype) { ++ case CKSUMTYPE_RSA_MD5: ++ err = kctx->gk5e->encrypt(kctx->seq, NULL, checksumdata, ++ checksumdata, checksumlen); ++ if (err) ++ goto out; ++ 
memcpy(cksumout->data, ++ checksumdata + checksumlen - kctx->gk5e->cksumlength, ++ kctx->gk5e->cksumlength); ++ break; ++ case CKSUMTYPE_HMAC_SHA1_DES3: ++ memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); ++ break; ++ default: ++ BUG(); ++ break; ++ } ++ cksumout->len = kctx->gk5e->cksumlength; ++out: ++ crypto_free_hash(desc.tfm); ++ return err ? GSS_S_FAILURE : 0; ++} ++ ++/* ++ * checksum the plaintext data and hdrlen bytes of the token header ++ * Per rfc4121, sec. 4.2.4, the checksum is performed over the data ++ * body then over the first 16 octets of the MIC token ++ * Inclusion of the header data in the calculation of the ++ * checksum is optional. ++ */ ++u32 ++make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout) ++{ ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ int err; ++ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ unsigned int checksumlen; ++ ++ if (kctx->gk5e->keyed_cksum == 0) { ++ dprintk("%s: expected keyed hash for %s\n", ++ __func__, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } ++ if (cksumkey == NULL) { ++ dprintk("%s: no key supplied for %s\n", ++ __func__, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } ++ ++ desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(desc.tfm)) ++ return GSS_S_FAILURE; ++ checksumlen = crypto_hash_digestsize(desc.tfm); ++ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; ++ ++ err = crypto_hash_setkey(desc.tfm, cksumkey, kctx->gk5e->keylength); ++ if (err) ++ goto out; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out; ++ err = xdr_process_buf(body, body_offset, body->len - body_offset, ++ checksummer, &desc); ++ if (err) ++ goto out; ++ if (header != NULL) { ++ sg_init_one(sg, header, hdrlen); ++ err = crypto_hash_update(&desc, sg, hdrlen); ++ if (err) ++ goto out; ++ } ++ err = crypto_hash_final(&desc, checksumdata); ++ if 
(err) ++ goto out; ++ ++ cksumout->len = kctx->gk5e->cksumlength; ++ ++ switch (kctx->gk5e->ctype) { ++ case CKSUMTYPE_HMAC_SHA1_96_AES128: ++ case CKSUMTYPE_HMAC_SHA1_96_AES256: ++ /* note that this truncates the hash */ ++ memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); ++ break; ++ default: ++ BUG(); ++ break; ++ } + out: + crypto_free_hash(desc.tfm); + return err ? GSS_S_FAILURE : 0; + } + + struct encryptor_desc { +- u8 iv[8]; /* XXX hard-coded blocksize */ ++ u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; + struct blkcipher_desc desc; + int pos; + struct xdr_buf *outbuf; +@@ -198,7 +427,7 @@ encryptor(struct scatterlist *sg, void * + desc->fraglen += sg->length; + desc->pos += sg->length; + +- fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1); + thislen -= fraglen; + + if (thislen == 0) +@@ -256,7 +485,7 @@ gss_encrypt_xdr_buf(struct crypto_blkcip + } + + struct decryptor_desc { +- u8 iv[8]; /* XXX hard-coded blocksize */ ++ u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; + struct blkcipher_desc desc; + struct scatterlist frags[4]; + int fragno; +@@ -278,7 +507,7 @@ decryptor(struct scatterlist *sg, void * + desc->fragno++; + desc->fraglen += sg->length; + +- fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1); + thislen -= fraglen; + + if (thislen == 0) +@@ -325,3 +554,437 @@ gss_decrypt_xdr_buf(struct crypto_blkcip + + return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); + } ++ ++/* ++ * This function makes the assumption that it was ultimately called ++ * from gss_wrap(). ++ * ++ * The client auth_gss code moves any existing tail data into a ++ * separate page before calling gss_wrap. ++ * The server svcauth_gss code ensures that both the head and the ++ * tail have slack space of RPC_MAX_AUTH_SIZE before calling gss_wrap. 
++ * ++ * Even with that guarantee, this function may be called more than ++ * once in the processing of gss_wrap(). The best we can do is ++ * verify at compile-time (see GSS_KRB5_SLACK_CHECK) that the ++ * largest expected shift will fit within RPC_MAX_AUTH_SIZE. ++ * At run-time we can verify that a single invocation of this ++ * function doesn't attempt to use more than RPC_MAX_AUTH_SIZE. ++ */ ++ ++int ++xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen) ++{ ++ u8 *p; ++ ++ if (shiftlen == 0) ++ return 0; ++ ++ BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE); ++ BUG_ON(shiftlen > RPC_MAX_AUTH_SIZE); ++ ++ p = buf->head[0].iov_base + base; ++ ++ memmove(p + shiftlen, p, buf->head[0].iov_len - base); ++ ++ buf->head[0].iov_len += shiftlen; ++ buf->len += shiftlen; ++ ++ return 0; ++} ++ ++static u32 ++gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf, ++ u32 offset, u8 *iv, struct page **pages, int encrypt) ++{ ++ u32 ret; ++ struct scatterlist sg[1]; ++ struct blkcipher_desc desc = { .tfm = cipher, .info = iv }; ++ u8 data[crypto_blkcipher_blocksize(cipher) * 2]; ++ struct page **save_pages; ++ u32 len = buf->len - offset; ++ ++ BUG_ON(len > crypto_blkcipher_blocksize(cipher) * 2); ++ ++ /* ++ * For encryption, we want to read from the cleartext ++ * page cache pages, and write the encrypted data to ++ * the supplied xdr_buf pages.
++ */ ++ save_pages = buf->pages; ++ if (encrypt) ++ buf->pages = pages; ++ ++ ret = read_bytes_from_xdr_buf(buf, offset, data, len); ++ buf->pages = save_pages; ++ if (ret) ++ goto out; ++ ++ sg_init_one(sg, data, len); ++ ++ if (encrypt) ++ ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len); ++ else ++ ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, len); ++ ++ if (ret) ++ goto out; ++ ++ ret = write_bytes_to_xdr_buf(buf, offset, data, len); ++ ++out: ++ return ret; ++} ++ ++u32 ++gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, int ec, struct page **pages) ++{ ++ u32 err; ++ struct xdr_netobj hmac; ++ u8 *cksumkey; ++ u8 *ecptr; ++ struct crypto_blkcipher *cipher, *aux_cipher; ++ int blocksize; ++ struct page **save_pages; ++ int nblocks, nbytes; ++ struct encryptor_desc desc; ++ u32 cbcbytes; ++ unsigned int usage; ++ ++ if (kctx->initiate) { ++ cipher = kctx->initiator_enc; ++ aux_cipher = kctx->initiator_enc_aux; ++ cksumkey = kctx->initiator_integ; ++ usage = KG_USAGE_INITIATOR_SEAL; ++ } else { ++ cipher = kctx->acceptor_enc; ++ aux_cipher = kctx->acceptor_enc_aux; ++ cksumkey = kctx->acceptor_integ; ++ usage = KG_USAGE_ACCEPTOR_SEAL; ++ } ++ blocksize = crypto_blkcipher_blocksize(cipher); ++ ++ /* hide the gss token header and insert the confounder */ ++ offset += GSS_KRB5_TOK_HDR_LEN; ++ if (xdr_extend_head(buf, offset, kctx->gk5e->conflen)) ++ return GSS_S_FAILURE; ++ gss_krb5_make_confounder(buf->head[0].iov_base + offset, kctx->gk5e->conflen); ++ offset -= GSS_KRB5_TOK_HDR_LEN; ++ ++ if (buf->tail[0].iov_base != NULL) { ++ ecptr = buf->tail[0].iov_base + buf->tail[0].iov_len; ++ } else { ++ buf->tail[0].iov_base = buf->head[0].iov_base ++ + buf->head[0].iov_len; ++ buf->tail[0].iov_len = 0; ++ ecptr = buf->tail[0].iov_base; ++ } ++ ++ memset(ecptr, 'X', ec); ++ buf->tail[0].iov_len += ec; ++ buf->len += ec; ++ ++ /* copy plaintext gss token header after filler (if any) */ ++ memcpy(ecptr + ec, buf->head[0].iov_base + 
offset, ++ GSS_KRB5_TOK_HDR_LEN); ++ buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN; ++ buf->len += GSS_KRB5_TOK_HDR_LEN; ++ ++ /* Do the HMAC */ ++ hmac.len = GSS_KRB5_MAX_CKSUM_LEN; ++ hmac.data = buf->tail[0].iov_base + buf->tail[0].iov_len; ++ ++ /* ++ * When we are called, pages points to the real page cache ++ * data -- which we can't go and encrypt! buf->pages points ++ * to scratch pages which we are going to send off to the ++ * client/server. Swap in the plaintext pages to calculate ++ * the hmac. ++ */ ++ save_pages = buf->pages; ++ buf->pages = pages; ++ ++ err = make_checksum_v2(kctx, NULL, 0, buf, ++ offset + GSS_KRB5_TOK_HDR_LEN, ++ cksumkey, usage, &hmac); ++ buf->pages = save_pages; ++ if (err) ++ return GSS_S_FAILURE; ++ ++ nbytes = buf->len - offset - GSS_KRB5_TOK_HDR_LEN; ++ nblocks = (nbytes + blocksize - 1) / blocksize; ++ cbcbytes = 0; ++ if (nblocks > 2) ++ cbcbytes = (nblocks - 2) * blocksize; ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ ++ if (cbcbytes) { ++ desc.pos = offset + GSS_KRB5_TOK_HDR_LEN; ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ desc.pages = pages; ++ desc.outbuf = buf; ++ desc.desc.info = desc.iv; ++ desc.desc.flags = 0; ++ desc.desc.tfm = aux_cipher; ++ ++ sg_init_table(desc.infrags, 4); ++ sg_init_table(desc.outfrags, 4); ++ ++ err = xdr_process_buf(buf, offset + GSS_KRB5_TOK_HDR_LEN, ++ cbcbytes, encryptor, &desc); ++ if (err) ++ goto out_err; ++ } ++ ++ /* Make sure IV carries forward from any CBC results. 
*/ ++ err = gss_krb5_cts_crypt(cipher, buf, ++ offset + GSS_KRB5_TOK_HDR_LEN + cbcbytes, ++ desc.iv, pages, 1); ++ if (err) { ++ err = GSS_S_FAILURE; ++ goto out_err; ++ } ++ ++ /* Now update buf to account for HMAC */ ++ buf->tail[0].iov_len += kctx->gk5e->cksumlength; ++ buf->len += kctx->gk5e->cksumlength; ++ ++out_err: ++ if (err) ++ err = GSS_S_FAILURE; ++ return err; ++} ++ ++u32 ++gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, ++ u32 *headskip, u32 *tailskip) ++{ ++ struct xdr_buf subbuf; ++ u32 ret = 0; ++ u8 *cksum_key; ++ struct crypto_blkcipher *cipher, *aux_cipher; ++ struct xdr_netobj our_hmac_obj; ++ u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN]; ++ u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN]; ++ int nblocks, blocksize, cbcbytes; ++ struct decryptor_desc desc; ++ unsigned int usage; ++ ++ if (kctx->initiate) { ++ cipher = kctx->acceptor_enc; ++ aux_cipher = kctx->acceptor_enc_aux; ++ cksum_key = kctx->acceptor_integ; ++ usage = KG_USAGE_ACCEPTOR_SEAL; ++ } else { ++ cipher = kctx->initiator_enc; ++ aux_cipher = kctx->initiator_enc_aux; ++ cksum_key = kctx->initiator_integ; ++ usage = KG_USAGE_INITIATOR_SEAL; ++ } ++ blocksize = crypto_blkcipher_blocksize(cipher); ++ ++ ++ /* create a segment skipping the header and leaving out the checksum */ ++ xdr_buf_subsegment(buf, &subbuf, offset + GSS_KRB5_TOK_HDR_LEN, ++ (buf->len - offset - GSS_KRB5_TOK_HDR_LEN - ++ kctx->gk5e->cksumlength)); ++ ++ nblocks = (subbuf.len + blocksize - 1) / blocksize; ++ ++ cbcbytes = 0; ++ if (nblocks > 2) ++ cbcbytes = (nblocks - 2) * blocksize; ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ ++ if (cbcbytes) { ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ desc.desc.info = desc.iv; ++ desc.desc.flags = 0; ++ desc.desc.tfm = aux_cipher; ++ ++ sg_init_table(desc.frags, 4); ++ ++ ret = xdr_process_buf(&subbuf, 0, cbcbytes, decryptor, &desc); ++ if (ret) ++ goto out_err; ++ } ++ ++ /* Make sure IV carries forward from any CBC results. 
*/ ++ ret = gss_krb5_cts_crypt(cipher, &subbuf, cbcbytes, desc.iv, NULL, 0); ++ if (ret) ++ goto out_err; ++ ++ ++ /* Calculate our hmac over the plaintext data */ ++ our_hmac_obj.len = sizeof(our_hmac); ++ our_hmac_obj.data = our_hmac; ++ ++ ret = make_checksum_v2(kctx, NULL, 0, &subbuf, 0, ++ cksum_key, usage, &our_hmac_obj); ++ if (ret) ++ goto out_err; ++ ++ /* Get the packet's hmac value */ ++ ret = read_bytes_from_xdr_buf(buf, buf->len - kctx->gk5e->cksumlength, ++ pkt_hmac, kctx->gk5e->cksumlength); ++ if (ret) ++ goto out_err; ++ ++ if (memcmp(pkt_hmac, our_hmac, kctx->gk5e->cksumlength) != 0) { ++ ret = GSS_S_BAD_SIG; ++ goto out_err; ++ } ++ *headskip = kctx->gk5e->conflen; ++ *tailskip = kctx->gk5e->cksumlength; ++out_err: ++ if (ret && ret != GSS_S_BAD_SIG) ++ ret = GSS_S_FAILURE; ++ return ret; ++} ++ ++/* ++ * Compute Kseq given the initial session key and the checksum. ++ * Set the key of the given cipher. ++ */ ++int ++krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, ++ unsigned char *cksum) ++{ ++ struct crypto_hash *hmac; ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ u8 Kseq[GSS_KRB5_MAX_KEYLEN]; ++ u32 zeroconstant = 0; ++ int err; ++ ++ dprintk("%s: entered\n", __func__); ++ ++ hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac)) { ++ dprintk("%s: error %ld, allocating hash '%s'\n", ++ __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); ++ return PTR_ERR(hmac); ++ } ++ ++ desc.tfm = hmac; ++ desc.flags = 0; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out_err; ++ ++ /* Compute intermediate Kseq from session key */ ++ err = crypto_hash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ sg_init_table(sg, 1); ++ sg_set_buf(sg, &zeroconstant, 4); ++ ++ err = crypto_hash_digest(&desc, sg, 4, Kseq); ++ if (err) ++ goto out_err; ++ ++ /* Compute final Kseq from the checksum and intermediate Kseq */ ++ err = 
crypto_hash_setkey(hmac, Kseq, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ sg_set_buf(sg, cksum, 8); ++ ++ err = crypto_hash_digest(&desc, sg, 8, Kseq); ++ if (err) ++ goto out_err; ++ ++ err = crypto_blkcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ err = 0; ++ ++out_err: ++ crypto_free_hash(hmac); ++ dprintk("%s: returning %d\n", __func__, err); ++ return err; ++} ++ ++/* ++ * Compute Kcrypt given the initial session key and the plaintext seqnum. ++ * Set the key of cipher kctx->enc. ++ */ ++int ++krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, ++ s32 seqnum) ++{ ++ struct crypto_hash *hmac; ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ u8 Kcrypt[GSS_KRB5_MAX_KEYLEN]; ++ u8 zeroconstant[4] = {0}; ++ u8 seqnumarray[4]; ++ int err, i; ++ ++ dprintk("%s: entered, seqnum %u\n", __func__, seqnum); ++ ++ hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac)) { ++ dprintk("%s: error %ld, allocating hash '%s'\n", ++ __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); ++ return PTR_ERR(hmac); ++ } ++ ++ desc.tfm = hmac; ++ desc.flags = 0; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out_err; ++ ++ /* Compute intermediate Kcrypt from session key */ ++ for (i = 0; i < kctx->gk5e->keylength; i++) ++ Kcrypt[i] = kctx->Ksess[i] ^ 0xf0; ++ ++ err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ sg_init_table(sg, 1); ++ sg_set_buf(sg, zeroconstant, 4); ++ ++ err = crypto_hash_digest(&desc, sg, 4, Kcrypt); ++ if (err) ++ goto out_err; ++ ++ /* Compute final Kcrypt from the seqnum and intermediate Kcrypt */ ++ err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ seqnumarray[0] = (unsigned char) ((seqnum >> 24) & 0xff); ++ seqnumarray[1] = (unsigned char) ((seqnum >> 16) & 0xff); ++ seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff); ++ 
seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff); ++ ++ sg_set_buf(sg, seqnumarray, 4); ++ ++ err = crypto_hash_digest(&desc, sg, 4, Kcrypt); ++ if (err) ++ goto out_err; ++ ++ err = crypto_blkcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ err = 0; ++ ++out_err: ++ crypto_free_hash(hmac); ++ dprintk("%s: returning %d\n", __func__, err); ++ return err; ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c.orig 2010-08-23 11:01:00.390553891 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c 2010-08-23 11:01:00.391564137 -0400 +@@ -0,0 +1,336 @@ ++/* ++ * COPYRIGHT (c) 2008 ++ * The Regents of the University of Michigan ++ * ALL RIGHTS RESERVED ++ * ++ * Permission is granted to use, copy, create derivative works ++ * and redistribute this software and such derivative works ++ * for any purpose, so long as the name of The University of ++ * Michigan is not used in any advertising or publicity ++ * pertaining to the use of distribution of this software ++ * without specific, written prior authorization. If the ++ * above copyright notice or any other identification of the ++ * University of Michigan is included in any copy of any ++ * portion of this software, then the disclaimer below must ++ * also be included. ++ * ++ * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION ++ * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY ++ * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF ++ * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ++ * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
THE ++ * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE ++ * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR ++ * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING ++ * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN ++ * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGES. ++ */ ++ ++/* ++ * Copyright (C) 1998 by the FundsXpress, INC. ++ * ++ * All rights reserved. ++ * ++ * Export of this software from the United States of America may require ++ * a specific license from the United States Government. It is the ++ * responsibility of any person or organization contemplating export to ++ * obtain such a license before exporting. ++ * ++ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and ++ * distribute this software and its documentation for any purpose and ++ * without fee is hereby granted, provided that the above copyright ++ * notice appear in all copies and that both that copyright notice and ++ * this permission notice appear in supporting documentation, and that ++ * the name of FundsXpress. not be used in advertising or publicity pertaining ++ * to distribution of the software without specific, written prior ++ * permission. FundsXpress makes no representations about the suitability of ++ * this software for any purpose. It is provided "as is" without express ++ * or implied warranty. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED ++ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. ++ */ ++ ++#include <linux/err.h> ++#include <linux/types.h> ++#include <linux/crypto.h> ++#include <linux/sunrpc/gss_krb5.h> ++#include <linux/sunrpc/xdr.h> ++ ++#ifdef RPC_DEBUG ++# define RPCDBG_FACILITY RPCDBG_AUTH ++#endif ++ ++/* ++ * This is the n-fold function as described in rfc3961, sec 5.1 ++ * Taken from MIT Kerberos and modified.
++ */ ++ ++static void krb5_nfold(u32 inbits, const u8 *in, ++ u32 outbits, u8 *out) ++{ ++ int a, b, c, lcm; ++ int byte, i, msbit; ++ ++ /* the code below is more readable if I make these bytes ++ instead of bits */ ++ ++ inbits >>= 3; ++ outbits >>= 3; ++ ++ /* first compute lcm(n,k) */ ++ ++ a = outbits; ++ b = inbits; ++ ++ while (b != 0) { ++ c = b; ++ b = a%b; ++ a = c; ++ } ++ ++ lcm = outbits*inbits/a; ++ ++ /* now do the real work */ ++ ++ memset(out, 0, outbits); ++ byte = 0; ++ ++ /* this will end up cycling through k lcm(k,n)/k times, which ++ is correct */ ++ for (i = lcm-1; i >= 0; i--) { ++ /* compute the msbit in k which gets added into this byte */ ++ msbit = ( ++ /* first, start with the msbit in the first, ++ * unrotated byte */ ++ ((inbits << 3) - 1) ++ /* then, for each byte, shift to the right ++ * for each repetition */ ++ + (((inbits << 3) + 13) * (i/inbits)) ++ /* last, pick out the correct byte within ++ * that shifted repetition */ ++ + ((inbits - (i % inbits)) << 3) ++ ) % (inbits << 3); ++ ++ /* pull out the byte value itself */ ++ byte += (((in[((inbits - 1) - (msbit >> 3)) % inbits] << 8)| ++ (in[((inbits) - (msbit >> 3)) % inbits])) ++ >> ((msbit & 7) + 1)) & 0xff; ++ ++ /* do the addition */ ++ byte += out[i % outbits]; ++ out[i % outbits] = byte & 0xff; ++ ++ /* keep around the carry bit, if any */ ++ byte >>= 8; ++ ++ } ++ ++ /* if there's a carry bit left over, add it back in */ ++ if (byte) { ++ for (i = outbits - 1; i >= 0; i--) { ++ /* do the addition */ ++ byte += out[i]; ++ out[i] = byte & 0xff; ++ ++ /* keep around the carry bit, if any */ ++ byte >>= 8; ++ } ++ } ++} ++ ++/* ++ * This is the DK (derive_key) function as described in rfc3961, sec 5.1 ++ * Taken from MIT Kerberos and modified. 
++ */ ++ ++u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, ++ const struct xdr_netobj *inkey, ++ struct xdr_netobj *outkey, ++ const struct xdr_netobj *in_constant, ++ gfp_t gfp_mask) ++{ ++ size_t blocksize, keybytes, keylength, n; ++ unsigned char *inblockdata, *outblockdata, *rawkey; ++ struct xdr_netobj inblock, outblock; ++ struct crypto_blkcipher *cipher; ++ u32 ret = EINVAL; ++ ++ blocksize = gk5e->blocksize; ++ keybytes = gk5e->keybytes; ++ keylength = gk5e->keylength; ++ ++ if ((inkey->len != keylength) || (outkey->len != keylength)) ++ goto err_return; ++ ++ cipher = crypto_alloc_blkcipher(gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ goto err_return; ++ if (crypto_blkcipher_setkey(cipher, inkey->data, inkey->len)) ++ goto err_return; ++ ++ /* allocate and set up buffers */ ++ ++ ret = ENOMEM; ++ inblockdata = kmalloc(blocksize, gfp_mask); ++ if (inblockdata == NULL) ++ goto err_free_cipher; ++ ++ outblockdata = kmalloc(blocksize, gfp_mask); ++ if (outblockdata == NULL) ++ goto err_free_in; ++ ++ rawkey = kmalloc(keybytes, gfp_mask); ++ if (rawkey == NULL) ++ goto err_free_out; ++ ++ inblock.data = (char *) inblockdata; ++ inblock.len = blocksize; ++ ++ outblock.data = (char *) outblockdata; ++ outblock.len = blocksize; ++ ++ /* initialize the input block */ ++ ++ if (in_constant->len == inblock.len) { ++ memcpy(inblock.data, in_constant->data, inblock.len); ++ } else { ++ krb5_nfold(in_constant->len * 8, in_constant->data, ++ inblock.len * 8, inblock.data); ++ } ++ ++ /* loop encrypting the blocks until enough key bytes are generated */ ++ ++ n = 0; ++ while (n < keybytes) { ++ (*(gk5e->encrypt))(cipher, NULL, inblock.data, ++ outblock.data, inblock.len); ++ ++ if ((keybytes - n) <= outblock.len) { ++ memcpy(rawkey + n, outblock.data, (keybytes - n)); ++ break; ++ } ++ ++ memcpy(rawkey + n, outblock.data, outblock.len); ++ memcpy(inblock.data, outblock.data, outblock.len); ++ n += outblock.len; ++ } ++ ++ /* postprocess 
the key */ ++ ++ inblock.data = (char *) rawkey; ++ inblock.len = keybytes; ++ ++ BUG_ON(gk5e->mk_key == NULL); ++ ret = (*(gk5e->mk_key))(gk5e, &inblock, outkey); ++ if (ret) { ++ dprintk("%s: got %d from mk_key function for '%s'\n", ++ __func__, ret, gk5e->encrypt_name); ++ goto err_free_raw; ++ } ++ ++ /* clean memory, free resources and exit */ ++ ++ ret = 0; ++ ++err_free_raw: ++ memset(rawkey, 0, keybytes); ++ kfree(rawkey); ++err_free_out: ++ memset(outblockdata, 0, blocksize); ++ kfree(outblockdata); ++err_free_in: ++ memset(inblockdata, 0, blocksize); ++ kfree(inblockdata); ++err_free_cipher: ++ crypto_free_blkcipher(cipher); ++err_return: ++ return ret; ++} ++ ++#define smask(step) ((1<<step)-1) ++#define pstep(x, step) (((x)&smask(step))^(((x)>>step)&smask(step))) ++#define parity_char(x) pstep(pstep(pstep((x), 4), 2), 1) ++ ++static void mit_des_fixup_key_parity(u8 key[8]) ++{ ++ int i; ++ for (i = 0; i < 8; i++) { ++ key[i] &= 0xfe; ++ key[i] |= 1^parity_char(key[i]); ++ } ++} ++ ++/* ++ * This is the des3 key derivation postprocess function ++ */ ++u32 gss_krb5_des3_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key) ++{ ++ int i; ++ u32 ret = EINVAL; ++ ++ if (key->len != 24) { ++ dprintk("%s: key->len is %d\n", __func__, key->len); ++ goto err_out; ++ } ++ if (randombits->len != 21) { ++ dprintk("%s: randombits->len is %d\n", ++ __func__, randombits->len); ++ goto err_out; ++ } ++ ++ /* take the seven bytes, move them around into the top 7 bits of the ++ 8 key bytes, then compute the parity bits. Do this three times.
*/ ++ ++ for (i = 0; i < 3; i++) { ++ memcpy(key->data + i*8, randombits->data + i*7, 7); ++ key->data[i*8+7] = (((key->data[i*8]&1)<<1) | ++ ((key->data[i*8+1]&1)<<2) | ++ ((key->data[i*8+2]&1)<<3) | ++ ((key->data[i*8+3]&1)<<4) | ++ ((key->data[i*8+4]&1)<<5) | ++ ((key->data[i*8+5]&1)<<6) | ++ ((key->data[i*8+6]&1)<<7)); ++ ++ mit_des_fixup_key_parity(key->data + i*8); ++ } ++ ret = 0; ++err_out: ++ return ret; ++} ++ ++/* ++ * This is the aes key derivation postprocess function ++ */ ++u32 gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key) ++{ ++ u32 ret = EINVAL; ++ ++ if (key->len != 16 && key->len != 32) { ++ dprintk("%s: key->len is %d\n", __func__, key->len); ++ goto err_out; ++ } ++ if (randombits->len != 16 && randombits->len != 32) { ++ dprintk("%s: randombits->len is %d\n", ++ __func__, randombits->len); ++ goto err_out; ++ } ++ if (randombits->len != key->len) { ++ dprintk("%s: randombits->len is %d, key->len is %d\n", ++ __func__, randombits->len, key->len); ++ goto err_out; ++ } ++ memcpy(key->data, randombits->data, key->len); ++ ret = 0; ++err_out: ++ return ret; ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c 2010-08-23 11:01:00.392564136 -0400 +@@ -1,7 +1,7 @@ + /* + * linux/net/sunrpc/gss_krb5_mech.c + * +- * Copyright (c) 2001 The Regents of the University of Michigan. ++ * Copyright (c) 2001-2008 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson +@@ -48,6 +48,143 @@ + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + ++static struct gss_api_mech gss_kerberos_mech; /* forward declaration */ ++ ++static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { ++ /* ++ * DES (All DES enctypes are mapped to the same gss functionality) ++ */ ++ { ++ .etype = ENCTYPE_DES_CBC_RAW, ++ .ctype = CKSUMTYPE_RSA_MD5, ++ .name = "des-cbc-crc", ++ .encrypt_name = "cbc(des)", ++ .cksum_name = "md5", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = NULL, ++ .signalg = SGN_ALG_DES_MAC_MD5, ++ .sealalg = SEAL_ALG_DES, ++ .keybytes = 7, ++ .keylength = 8, ++ .blocksize = 8, ++ .conflen = 8, ++ .cksumlength = 8, ++ .keyed_cksum = 0, ++ }, ++ /* ++ * RC4-HMAC ++ */ ++ { ++ .etype = ENCTYPE_ARCFOUR_HMAC, ++ .ctype = CKSUMTYPE_HMAC_MD5_ARCFOUR, ++ .name = "rc4-hmac", ++ .encrypt_name = "ecb(arc4)", ++ .cksum_name = "hmac(md5)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = NULL, ++ .signalg = SGN_ALG_HMAC_MD5, ++ .sealalg = SEAL_ALG_MICROSOFT_RC4, ++ .keybytes = 16, ++ .keylength = 16, ++ .blocksize = 1, ++ .conflen = 8, ++ .cksumlength = 8, ++ .keyed_cksum = 1, ++ }, ++ /* ++ * 3DES ++ */ ++ { ++ .etype = ENCTYPE_DES3_CBC_RAW, ++ .ctype = CKSUMTYPE_HMAC_SHA1_DES3, ++ .name = "des3-hmac-sha1", ++ .encrypt_name = "cbc(des3_ede)", ++ .cksum_name = "hmac(sha1)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = gss_krb5_des3_make_key, ++ .signalg = SGN_ALG_HMAC_SHA1_DES3_KD, ++ .sealalg = SEAL_ALG_DES3KD, ++ .keybytes = 21, ++ .keylength = 24, ++ .blocksize = 8, ++ .conflen = 8, ++ .cksumlength = 20, ++ .keyed_cksum = 1, ++ }, ++ /* ++ * AES128 ++ */ ++ { ++ .etype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, ++ .ctype = CKSUMTYPE_HMAC_SHA1_96_AES128, ++ .name = "aes128-cts", ++ .encrypt_name = "cts(cbc(aes))", ++ .cksum_name = "hmac(sha1)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = gss_krb5_aes_make_key, ++ .encrypt_v2 = 
gss_krb5_aes_encrypt, ++ .decrypt_v2 = gss_krb5_aes_decrypt, ++ .signalg = -1, ++ .sealalg = -1, ++ .keybytes = 16, ++ .keylength = 16, ++ .blocksize = 16, ++ .conflen = 16, ++ .cksumlength = 12, ++ .keyed_cksum = 1, ++ }, ++ /* ++ * AES256 ++ */ ++ { ++ .etype = ENCTYPE_AES256_CTS_HMAC_SHA1_96, ++ .ctype = CKSUMTYPE_HMAC_SHA1_96_AES256, ++ .name = "aes256-cts", ++ .encrypt_name = "cts(cbc(aes))", ++ .cksum_name = "hmac(sha1)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = gss_krb5_aes_make_key, ++ .encrypt_v2 = gss_krb5_aes_encrypt, ++ .decrypt_v2 = gss_krb5_aes_decrypt, ++ .signalg = -1, ++ .sealalg = -1, ++ .keybytes = 32, ++ .keylength = 32, ++ .blocksize = 16, ++ .conflen = 16, ++ .cksumlength = 12, ++ .keyed_cksum = 1, ++ }, ++}; ++ ++static const int num_supported_enctypes = ++ ARRAY_SIZE(supported_gss_krb5_enctypes); ++ ++static int ++supported_gss_krb5_enctype(int etype) ++{ ++ int i; ++ for (i = 0; i < num_supported_enctypes; i++) ++ if (supported_gss_krb5_enctypes[i].etype == etype) ++ return 1; ++ return 0; ++} ++ ++static const struct gss_krb5_enctype * ++get_gss_krb5_enctype(int etype) ++{ ++ int i; ++ for (i = 0; i < num_supported_enctypes; i++) ++ if (supported_gss_krb5_enctypes[i].etype == etype) ++ return &supported_gss_krb5_enctypes[i]; ++ return NULL; ++} ++ + static const void * + simple_get_bytes(const void *p, const void *end, void *res, int len) + { +@@ -78,35 +215,45 @@ simple_get_netobj(const void *p, const v + } + + static inline const void * +-get_key(const void *p, const void *end, struct crypto_blkcipher **res) ++get_key(const void *p, const void *end, ++ struct krb5_ctx *ctx, struct crypto_blkcipher **res) + { + struct xdr_netobj key; + int alg; +- char *alg_name; + + p = simple_get_bytes(p, end, &alg, sizeof(alg)); + if (IS_ERR(p)) + goto out_err; ++ ++ switch (alg) { ++ case ENCTYPE_DES_CBC_CRC: ++ case ENCTYPE_DES_CBC_MD4: ++ case ENCTYPE_DES_CBC_MD5: ++ /* Map all these key types to ENCTYPE_DES_CBC_RAW */ ++ 
alg = ENCTYPE_DES_CBC_RAW; ++ break; ++ } ++ ++ if (!supported_gss_krb5_enctype(alg)) { ++ printk(KERN_WARNING "gss_kerberos_mech: unsupported " ++ "encryption key algorithm %d\n", alg); ++ goto out_err; ++ } + p = simple_get_netobj(p, end, &key); + if (IS_ERR(p)) + goto out_err; + +- switch (alg) { +- case ENCTYPE_DES_CBC_RAW: +- alg_name = "cbc(des)"; +- break; +- default: +- printk("gss_kerberos_mech: unsupported algorithm %d\n", alg); +- goto out_err_free_key; +- } +- *res = crypto_alloc_blkcipher(alg_name, 0, CRYPTO_ALG_ASYNC); ++ *res = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); + if (IS_ERR(*res)) { +- printk("gss_kerberos_mech: unable to initialize crypto algorithm %s\n", alg_name); ++ printk(KERN_WARNING "gss_kerberos_mech: unable to initialize " ++ "crypto algorithm %s\n", ctx->gk5e->encrypt_name); + *res = NULL; + goto out_err_free_key; + } + if (crypto_blkcipher_setkey(*res, key.data, key.len)) { +- printk("gss_kerberos_mech: error setting key for crypto algorithm %s\n", alg_name); ++ printk(KERN_WARNING "gss_kerberos_mech: error setting key for " ++ "crypto algorithm %s\n", ctx->gk5e->encrypt_name); + goto out_err_free_tfm; + } + +@@ -123,56 +270,55 @@ out_err: + } + + static int +-gss_import_sec_context_kerberos(const void *p, +- size_t len, +- struct gss_ctx *ctx_id) ++gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) + { +- const void *end = (const void *)((const char *)p + len); +- struct krb5_ctx *ctx; + int tmp; + +- if (!(ctx = kzalloc(sizeof(*ctx), GFP_NOFS))) { +- p = ERR_PTR(-ENOMEM); +- goto out_err; +- } +- + p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; ++ ++ /* Old format supports only DES! 
Any other enctype uses new format */ ++ ctx->enctype = ENCTYPE_DES_CBC_RAW; ++ ++ ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); ++ if (ctx->gk5e == NULL) ++ goto out_err; ++ + /* The downcall format was designed before we completely understood + * the uses of the context fields; so it includes some stuff we + * just give some minimal sanity-checking, and some we ignore + * completely (like the next twenty bytes): */ + if (unlikely(p + 20 > end || p + 20 < p)) +- goto out_err_free_ctx; ++ goto out_err; + p += 20; + p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + if (tmp != SGN_ALG_DES_MAC_MD5) { + p = ERR_PTR(-ENOSYS); +- goto out_err_free_ctx; ++ goto out_err; + } + p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + if (tmp != SEAL_ALG_DES) { + p = ERR_PTR(-ENOSYS); +- goto out_err_free_ctx; ++ goto out_err; + } + p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + p = simple_get_bytes(p, end, &ctx->seq_send, sizeof(ctx->seq_send)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + p = simple_get_netobj(p, end, &ctx->mech_used); + if (IS_ERR(p)) +- goto out_err_free_ctx; +- p = get_key(p, end, &ctx->enc); ++ goto out_err; ++ p = get_key(p, end, ctx, &ctx->enc); + if (IS_ERR(p)) + goto out_err_free_mech; +- p = get_key(p, end, &ctx->seq); ++ p = get_key(p, end, ctx, &ctx->seq); + if (IS_ERR(p)) + goto out_err_free_key1; + if (p != end) { +@@ -180,9 +326,6 @@ gss_import_sec_context_kerberos(const vo + goto out_err_free_key2; + } + +- ctx_id->internal_ctx_id = ctx; +- +- dprintk("RPC: Successfully imported new context.\n"); + return 0; + + out_err_free_key2: +@@ -191,18 +334,378 @@ out_err_free_key1: + crypto_free_blkcipher(ctx->enc); + out_err_free_mech: + kfree(ctx->mech_used.data); +-out_err_free_ctx: +- kfree(ctx); + out_err: + return PTR_ERR(p); + } + 
++struct crypto_blkcipher * ++context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key) ++{ ++ struct crypto_blkcipher *cp; ++ ++ cp = crypto_alloc_blkcipher(cname, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cp)) { ++ dprintk("gss_kerberos_mech: unable to initialize " ++ "crypto algorithm %s\n", cname); ++ return NULL; ++ } ++ if (crypto_blkcipher_setkey(cp, key, ctx->gk5e->keylength)) { ++ dprintk("gss_kerberos_mech: error setting key for " ++ "crypto algorithm %s\n", cname); ++ crypto_free_blkcipher(cp); ++ return NULL; ++ } ++ return cp; ++} ++ ++static inline void ++set_cdata(u8 cdata[GSS_KRB5_K5CLENGTH], u32 usage, u8 seed) ++{ ++ cdata[0] = (usage>>24)&0xff; ++ cdata[1] = (usage>>16)&0xff; ++ cdata[2] = (usage>>8)&0xff; ++ cdata[3] = usage&0xff; ++ cdata[4] = seed; ++} ++ ++static int ++context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask) ++{ ++ struct xdr_netobj c, keyin, keyout; ++ u8 cdata[GSS_KRB5_K5CLENGTH]; ++ u32 err; ++ ++ c.len = GSS_KRB5_K5CLENGTH; ++ c.data = cdata; ++ ++ keyin.data = ctx->Ksess; ++ keyin.len = ctx->gk5e->keylength; ++ keyout.len = ctx->gk5e->keylength; ++ ++ /* seq uses the raw key */ ++ ctx->seq = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name, ++ ctx->Ksess); ++ if (ctx->seq == NULL) ++ goto out_err; ++ ++ ctx->enc = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name, ++ ctx->Ksess); ++ if (ctx->enc == NULL) ++ goto out_free_seq; ++ ++ /* derive cksum */ ++ set_cdata(cdata, KG_USAGE_SIGN, KEY_USAGE_SEED_CHECKSUM); ++ keyout.data = ctx->cksum; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving cksum key\n", ++ __func__, err); ++ goto out_free_enc; ++ } ++ ++ return 0; ++ ++out_free_enc: ++ crypto_free_blkcipher(ctx->enc); ++out_free_seq: ++ crypto_free_blkcipher(ctx->seq); ++out_err: ++ return -EINVAL; ++} ++ ++/* ++ * Note that RC4 depends on deriving keys using the sequence ++ * number or the checksum of a token. 
Therefore, the final keys ++ * cannot be calculated until the token is being constructed! ++ */ ++static int ++context_derive_keys_rc4(struct krb5_ctx *ctx) ++{ ++ struct crypto_hash *hmac; ++ char sigkeyconstant[] = "signaturekey"; ++ int slen = strlen(sigkeyconstant) + 1; /* include null terminator */ ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ int err; ++ ++ dprintk("RPC: %s: entered\n", __func__); ++ /* ++ * derive cksum (aka Ksign) key ++ */ ++ hmac = crypto_alloc_hash(ctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac)) { ++ dprintk("%s: error %ld allocating hash '%s'\n", ++ __func__, PTR_ERR(hmac), ctx->gk5e->cksum_name); ++ err = PTR_ERR(hmac); ++ goto out_err; ++ } ++ ++ err = crypto_hash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength); ++ if (err) ++ goto out_err_free_hmac; ++ ++ sg_init_table(sg, 1); ++ sg_set_buf(sg, sigkeyconstant, slen); ++ ++ desc.tfm = hmac; ++ desc.flags = 0; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out_err_free_hmac; ++ ++ err = crypto_hash_digest(&desc, sg, slen, ctx->cksum); ++ if (err) ++ goto out_err_free_hmac; ++ /* ++ * allocate hash, and blkciphers for data and seqnum encryption ++ */ ++ ctx->enc = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(ctx->enc)) { ++ err = PTR_ERR(ctx->enc); ++ goto out_err_free_hmac; ++ } ++ ++ ctx->seq = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(ctx->seq)) { ++ crypto_free_blkcipher(ctx->enc); ++ err = PTR_ERR(ctx->seq); ++ goto out_err_free_hmac; ++ } ++ ++ dprintk("RPC: %s: returning success\n", __func__); ++ ++ err = 0; ++ ++out_err_free_hmac: ++ crypto_free_hash(hmac); ++out_err: ++ dprintk("RPC: %s: returning %d\n", __func__, err); ++ return err; ++} ++ ++static int ++context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) ++{ ++ struct xdr_netobj c, keyin, keyout; ++ u8 cdata[GSS_KRB5_K5CLENGTH]; ++ u32 err; ++ ++ c.len = GSS_KRB5_K5CLENGTH; ++ c.data = 
cdata; ++ ++ keyin.data = ctx->Ksess; ++ keyin.len = ctx->gk5e->keylength; ++ keyout.len = ctx->gk5e->keylength; ++ ++ /* initiator seal encryption */ ++ set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_ENCRYPTION); ++ keyout.data = ctx->initiator_seal; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving initiator_seal key\n", ++ __func__, err); ++ goto out_err; ++ } ++ ctx->initiator_enc = context_v2_alloc_cipher(ctx, ++ ctx->gk5e->encrypt_name, ++ ctx->initiator_seal); ++ if (ctx->initiator_enc == NULL) ++ goto out_err; ++ ++ /* acceptor seal encryption */ ++ set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, KEY_USAGE_SEED_ENCRYPTION); ++ keyout.data = ctx->acceptor_seal; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving acceptor_seal key\n", ++ __func__, err); ++ goto out_free_initiator_enc; ++ } ++ ctx->acceptor_enc = context_v2_alloc_cipher(ctx, ++ ctx->gk5e->encrypt_name, ++ ctx->acceptor_seal); ++ if (ctx->acceptor_enc == NULL) ++ goto out_free_initiator_enc; ++ ++ /* initiator sign checksum */ ++ set_cdata(cdata, KG_USAGE_INITIATOR_SIGN, KEY_USAGE_SEED_CHECKSUM); ++ keyout.data = ctx->initiator_sign; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving initiator_sign key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ /* acceptor sign checksum */ ++ set_cdata(cdata, KG_USAGE_ACCEPTOR_SIGN, KEY_USAGE_SEED_CHECKSUM); ++ keyout.data = ctx->acceptor_sign; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving acceptor_sign key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ /* initiator seal integrity */ ++ set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_INTEGRITY); ++ keyout.data = ctx->initiator_integ; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); 
++ if (err) { ++ dprintk("%s: Error %d deriving initiator_integ key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ /* acceptor seal integrity */ ++ set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, KEY_USAGE_SEED_INTEGRITY); ++ keyout.data = ctx->acceptor_integ; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving acceptor_integ key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ switch (ctx->enctype) { ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ ctx->initiator_enc_aux = ++ context_v2_alloc_cipher(ctx, "cbc(aes)", ++ ctx->initiator_seal); ++ if (ctx->initiator_enc_aux == NULL) ++ goto out_free_acceptor_enc; ++ ctx->acceptor_enc_aux = ++ context_v2_alloc_cipher(ctx, "cbc(aes)", ++ ctx->acceptor_seal); ++ if (ctx->acceptor_enc_aux == NULL) { ++ crypto_free_blkcipher(ctx->initiator_enc_aux); ++ goto out_free_acceptor_enc; ++ } ++ } ++ ++ return 0; ++ ++out_free_acceptor_enc: ++ crypto_free_blkcipher(ctx->acceptor_enc); ++out_free_initiator_enc: ++ crypto_free_blkcipher(ctx->initiator_enc); ++out_err: ++ return -EINVAL; ++} ++ ++static int ++gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, ++ gfp_t gfp_mask) ++{ ++ int keylen; ++ ++ p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags)); ++ if (IS_ERR(p)) ++ goto out_err; ++ ctx->initiate = ctx->flags & KRB5_CTX_FLAG_INITIATOR; ++ ++ p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); ++ if (IS_ERR(p)) ++ goto out_err; ++ p = simple_get_bytes(p, end, &ctx->seq_send64, sizeof(ctx->seq_send64)); ++ if (IS_ERR(p)) ++ goto out_err; ++ /* set seq_send for use by "older" enctypes */ ++ ctx->seq_send = ctx->seq_send64; ++ if (ctx->seq_send64 != ctx->seq_send) { ++ dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__, ++ (long unsigned)ctx->seq_send64, ctx->seq_send); ++ return -EINVAL; /* p is not an ERR_PTR here, so PTR_ERR(p) would be garbage */ ++ } ++ p = simple_get_bytes(p, end, &ctx->enctype, 
sizeof(ctx->enctype)); ++ if (IS_ERR(p)) ++ goto out_err; ++ /* Map ENCTYPE_DES3_CBC_SHA1 to ENCTYPE_DES3_CBC_RAW */ ++ if (ctx->enctype == ENCTYPE_DES3_CBC_SHA1) ++ ctx->enctype = ENCTYPE_DES3_CBC_RAW; ++ ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); ++ if (ctx->gk5e == NULL) { ++ dprintk("gss_kerberos_mech: unsupported krb5 enctype %u\n", ++ ctx->enctype); ++ p = ERR_PTR(-EINVAL); ++ goto out_err; ++ } ++ keylen = ctx->gk5e->keylength; ++ ++ p = simple_get_bytes(p, end, ctx->Ksess, keylen); ++ if (IS_ERR(p)) ++ goto out_err; ++ ++ if (p != end) { ++ p = ERR_PTR(-EINVAL); ++ goto out_err; ++ } ++ ++ ctx->mech_used.data = kmemdup(gss_kerberos_mech.gm_oid.data, ++ gss_kerberos_mech.gm_oid.len, gfp_mask); ++ if (unlikely(ctx->mech_used.data == NULL)) { ++ p = ERR_PTR(-ENOMEM); ++ goto out_err; ++ } ++ ctx->mech_used.len = gss_kerberos_mech.gm_oid.len; ++ ++ switch (ctx->enctype) { ++ case ENCTYPE_DES3_CBC_RAW: ++ return context_derive_keys_des3(ctx, gfp_mask); ++ case ENCTYPE_ARCFOUR_HMAC: ++ return context_derive_keys_rc4(ctx); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return context_derive_keys_new(ctx, gfp_mask); ++ default: ++ return -EINVAL; ++ } ++ ++out_err: ++ return PTR_ERR(p); ++} ++ ++static int ++gss_import_sec_context_kerberos(const void *p, size_t len, ++ struct gss_ctx *ctx_id, ++ gfp_t gfp_mask) ++{ ++ const void *end = (const void *)((const char *)p + len); ++ struct krb5_ctx *ctx; ++ int ret; ++ ++ ctx = kzalloc(sizeof(*ctx), gfp_mask); ++ if (ctx == NULL) ++ return -ENOMEM; ++ ++ if (len == 85) ++ ret = gss_import_v1_context(p, end, ctx); ++ else ++ ret = gss_import_v2_context(p, end, ctx, gfp_mask); ++ ++ if (ret == 0) ++ ctx_id->internal_ctx_id = ctx; ++ else ++ kfree(ctx); ++ ++ dprintk("RPC: %s: returning %d\n", __func__, ret); ++ return ret; ++} ++ + static void + gss_delete_sec_context_kerberos(void *internal_ctx) { + struct krb5_ctx *kctx = internal_ctx; + + crypto_free_blkcipher(kctx->seq); + 
crypto_free_blkcipher(kctx->enc); ++ crypto_free_blkcipher(kctx->acceptor_enc); ++ crypto_free_blkcipher(kctx->initiator_enc); ++ crypto_free_blkcipher(kctx->acceptor_enc_aux); ++ crypto_free_blkcipher(kctx->initiator_enc_aux); + kfree(kctx->mech_used.data); + kfree(kctx); + } +@@ -241,6 +744,7 @@ static struct gss_api_mech gss_kerberos_ + .gm_ops = &gss_kerberos_ops, + .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), + .gm_pfs = gss_kerberos_pfs, ++ .gm_upcall_enctypes = "enctypes=18,17,16,23,3,1,2 ", + }; + + static int __init init_kerberos_module(void) +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c 2010-08-23 11:01:00.392564136 -0400 +@@ -3,7 +3,7 @@ + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5seal.c + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson +@@ -70,53 +70,154 @@ + + DEFINE_SPINLOCK(krb5_seq_lock); + +-u32 +-gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, ++static char * ++setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) ++{ ++ __be16 *ptr, *krb5_hdr; ++ int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; ++ ++ token->len = g_token_size(&ctx->mech_used, body_size); ++ ++ ptr = (__be16 *)token->data; ++ g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr); ++ ++ /* ptr now at start of header described in rfc 1964, section 1.2.1: */ ++ krb5_hdr = ptr; ++ *ptr++ = KG_TOK_MIC_MSG; ++ *ptr++ = cpu_to_le16(ctx->gk5e->signalg); ++ *ptr++ = SEAL_ALG_NONE; ++ *ptr++ = 0xffff; ++ ++ return (char *)krb5_hdr; ++} ++ ++static void * ++setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) ++{ ++ __be16 *ptr, *krb5_hdr; ++ u8 *p, flags = 0x00; ++ ++ if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0) ++ flags |= 0x01; ++ if (ctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) ++ flags |= 0x04; ++ ++ /* Per rfc 4121, sec 4.2.6.1, there is no header, ++ * just start the token */ ++ krb5_hdr = ptr = (__be16 *)token->data; ++ ++ *ptr++ = KG2_TOK_MIC; ++ p = (u8 *)ptr; ++ *p++ = flags; ++ *p++ = 0xff; ++ ptr = (__be16 *)p; ++ *ptr++ = 0xffff; ++ *ptr++ = 0xffff; ++ ++ token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; ++ return krb5_hdr; ++} ++ ++static u32 ++gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, + struct xdr_netobj *token) + { +- struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; +- unsigned char *ptr, *msg_start; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; ++ void *ptr; + s32 now; + u32 seq_send; ++ u8 *cksumkey; + +- dprintk("RPC: gss_krb5_seal\n"); ++ dprintk("RPC: %s\n", __func__); + BUG_ON(ctx == NULL); + + now = get_seconds(); + +- token->len = 
g_token_size(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8); ++ ptr = setup_token(ctx, token); + +- ptr = token->data; +- g_make_token_header(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8, &ptr); ++ if (ctx->gk5e->keyed_cksum) ++ cksumkey = ctx->cksum; ++ else ++ cksumkey = NULL; + +- /* ptr now at header described in rfc 1964, section 1.2.1: */ +- ptr[0] = (unsigned char) ((KG_TOK_MIC_MSG >> 8) & 0xff); +- ptr[1] = (unsigned char) (KG_TOK_MIC_MSG & 0xff); ++ if (make_checksum(ctx, ptr, 8, text, 0, cksumkey, ++ KG_USAGE_SIGN, &md5cksum)) ++ return GSS_S_FAILURE; + +- msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8; ++ memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); + +- *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); +- memset(ptr + 4, 0xff, 4); ++ spin_lock(&krb5_seq_lock); ++ seq_send = ctx->seq_send++; ++ spin_unlock(&krb5_seq_lock); + +- if (make_checksum("md5", ptr, 8, text, 0, &md5cksum)) ++ if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, ++ seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) + return GSS_S_FAILURE; + +- if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, +- md5cksum.data, md5cksum.len)) +- return GSS_S_FAILURE; ++ return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; ++} ++ ++u32 ++gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, ++ struct xdr_netobj *token) ++{ ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj cksumobj = { .len = sizeof(cksumdata), ++ .data = cksumdata}; ++ void *krb5_hdr; ++ s32 now; ++ u64 seq_send; ++ u8 *cksumkey; ++ unsigned int cksum_usage; ++ ++ dprintk("RPC: %s\n", __func__); + +- memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); ++ krb5_hdr = setup_token_v2(ctx, token); + ++ /* Set up the sequence number. 
Now 64-bits in clear ++ * text and w/o direction indicator */ + spin_lock(&krb5_seq_lock); +- seq_send = ctx->seq_send++; ++ seq_send = ctx->seq_send64++; + spin_unlock(&krb5_seq_lock); ++ *((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send); + +- if (krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, +- seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, +- ptr + 8)) ++ if (ctx->initiate) { ++ cksumkey = ctx->initiator_sign; ++ cksum_usage = KG_USAGE_INITIATOR_SIGN; ++ } else { ++ cksumkey = ctx->acceptor_sign; ++ cksum_usage = KG_USAGE_ACCEPTOR_SIGN; ++ } ++ ++ if (make_checksum_v2(ctx, krb5_hdr, GSS_KRB5_TOK_HDR_LEN, ++ text, 0, cksumkey, cksum_usage, &cksumobj)) + return GSS_S_FAILURE; + ++ memcpy(krb5_hdr + GSS_KRB5_TOK_HDR_LEN, cksumobj.data, cksumobj.len); ++ ++ now = get_seconds(); ++ + return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; + } ++ ++u32 ++gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, ++ struct xdr_netobj *token) ++{ ++ struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; ++ ++ switch (ctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_get_mic_v1(ctx, text, token); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_get_mic_v2(ctx, text, token); ++ } ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c 2010-08-23 11:01:00.393496180 -0400 +@@ -39,14 +39,51 @@ + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + ++static s32 ++krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, ++ unsigned char *cksum, unsigned char *buf) ++{ ++ struct crypto_blkcipher *cipher; ++ unsigned char plain[8]; ++ s32 code; ++ ++ dprintk("RPC: %s:\n", 
__func__); ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return PTR_ERR(cipher); ++ ++ plain[0] = (unsigned char) ((seqnum >> 24) & 0xff); ++ plain[1] = (unsigned char) ((seqnum >> 16) & 0xff); ++ plain[2] = (unsigned char) ((seqnum >> 8) & 0xff); ++ plain[3] = (unsigned char) ((seqnum >> 0) & 0xff); ++ plain[4] = direction; ++ plain[5] = direction; ++ plain[6] = direction; ++ plain[7] = direction; ++ ++ code = krb5_rc4_setup_seq_key(kctx, cipher, cksum); ++ if (code) ++ goto out; ++ ++ code = krb5_encrypt(cipher, cksum, plain, buf, 8); ++out: ++ crypto_free_blkcipher(cipher); ++ return code; ++} + s32 +-krb5_make_seq_num(struct crypto_blkcipher *key, ++krb5_make_seq_num(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *key, + int direction, + u32 seqnum, + unsigned char *cksum, unsigned char *buf) + { + unsigned char plain[8]; + ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) ++ return krb5_make_rc4_seq_num(kctx, direction, seqnum, ++ cksum, buf); ++ + plain[0] = (unsigned char) (seqnum & 0xff); + plain[1] = (unsigned char) ((seqnum >> 8) & 0xff); + plain[2] = (unsigned char) ((seqnum >> 16) & 0xff); +@@ -60,17 +97,59 @@ krb5_make_seq_num(struct crypto_blkciphe + return krb5_encrypt(key, cksum, plain, buf, 8); + } + ++static s32 ++krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, ++ unsigned char *buf, int *direction, s32 *seqnum) ++{ ++ struct crypto_blkcipher *cipher; ++ unsigned char plain[8]; ++ s32 code; ++ ++ dprintk("RPC: %s:\n", __func__); ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return PTR_ERR(cipher); ++ ++ code = krb5_rc4_setup_seq_key(kctx, cipher, cksum); ++ if (code) ++ goto out; ++ ++ code = krb5_decrypt(cipher, cksum, buf, plain, 8); ++ if (code) ++ goto out; ++ ++ if ((plain[4] != plain[5]) || (plain[4] != plain[6]) ++ || (plain[4] != plain[7])) { ++ code = (s32)KG_BAD_SEQ; ++ goto out; ++ } ++ ++ 
*direction = plain[4]; ++ ++ *seqnum = ((plain[0] << 24) | (plain[1] << 16) | ++ (plain[2] << 8) | (plain[3])); ++out: ++ crypto_free_blkcipher(cipher); ++ return code; ++} ++ + s32 +-krb5_get_seq_num(struct crypto_blkcipher *key, ++krb5_get_seq_num(struct krb5_ctx *kctx, + unsigned char *cksum, + unsigned char *buf, + int *direction, u32 *seqnum) + { + s32 code; + unsigned char plain[8]; ++ struct crypto_blkcipher *key = kctx->seq; + + dprintk("RPC: krb5_get_seq_num:\n"); + ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) ++ return krb5_get_rc4_seq_num(kctx, cksum, buf, ++ direction, seqnum); ++ + if ((code = krb5_decrypt(key, cksum, buf, plain, 8))) + return code; + +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c 2010-08-23 11:01:00.393496180 -0400 +@@ -3,7 +3,7 @@ + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5unseal.c + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson +@@ -70,20 +70,21 @@ + /* read_token is a mic token, and message_buffer is the data that the mic was + * supposedly taken over. 
*/ + +-u32 +-gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, ++static u32 ++gss_verify_mic_v1(struct krb5_ctx *ctx, + struct xdr_buf *message_buffer, struct xdr_netobj *read_token) + { +- struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; + int signalg; + int sealalg; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; + s32 now; + int direction; + u32 seqnum; + unsigned char *ptr = (unsigned char *)read_token->data; + int bodysize; ++ u8 *cksumkey; + + dprintk("RPC: krb5_read_token\n"); + +@@ -98,7 +99,7 @@ gss_verify_mic_kerberos(struct gss_ctx * + /* XXX sanity-check bodysize?? */ + + signalg = ptr[2] + (ptr[3] << 8); +- if (signalg != SGN_ALG_DES_MAC_MD5) ++ if (signalg != ctx->gk5e->signalg) + return GSS_S_DEFECTIVE_TOKEN; + + sealalg = ptr[4] + (ptr[5] << 8); +@@ -108,13 +109,17 @@ gss_verify_mic_kerberos(struct gss_ctx * + if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) + return GSS_S_DEFECTIVE_TOKEN; + +- if (make_checksum("md5", ptr, 8, message_buffer, 0, &md5cksum)) +- return GSS_S_FAILURE; ++ if (ctx->gk5e->keyed_cksum) ++ cksumkey = ctx->cksum; ++ else ++ cksumkey = NULL; + +- if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, 16)) ++ if (make_checksum(ctx, ptr, 8, message_buffer, 0, ++ cksumkey, KG_USAGE_SIGN, &md5cksum)) + return GSS_S_FAILURE; + +- if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) ++ if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, ++ ctx->gk5e->cksumlength)) + return GSS_S_BAD_SIG; + + /* it got through unscathed. 
Make sure the context is unexpired */ +@@ -126,7 +131,8 @@ gss_verify_mic_kerberos(struct gss_ctx * + + /* do sequencing checks */ + +- if (krb5_get_seq_num(ctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, &direction, &seqnum)) ++ if (krb5_get_seq_num(ctx, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, ++ &direction, &seqnum)) + return GSS_S_FAILURE; + + if ((ctx->initiate && direction != 0xff) || +@@ -135,3 +141,86 @@ gss_verify_mic_kerberos(struct gss_ctx * + + return GSS_S_COMPLETE; + } ++ ++static u32 ++gss_verify_mic_v2(struct krb5_ctx *ctx, ++ struct xdr_buf *message_buffer, struct xdr_netobj *read_token) ++{ ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj cksumobj = {.len = sizeof(cksumdata), ++ .data = cksumdata}; ++ s32 now; ++ u64 seqnum; ++ u8 *ptr = read_token->data; ++ u8 *cksumkey; ++ u8 flags; ++ int i; ++ unsigned int cksum_usage; ++ ++ dprintk("RPC: %s\n", __func__); ++ ++ if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_MIC) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ flags = ptr[2]; ++ if ((!ctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) || ++ (ctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR))) ++ return GSS_S_BAD_SIG; ++ ++ if (flags & KG2_TOKEN_FLAG_SEALED) { ++ dprintk("%s: token has unexpected sealed flag\n", __func__); ++ return GSS_S_FAILURE; ++ } ++ ++ for (i = 3; i < 8; i++) ++ if (ptr[i] != 0xff) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ if (ctx->initiate) { ++ cksumkey = ctx->acceptor_sign; ++ cksum_usage = KG_USAGE_ACCEPTOR_SIGN; ++ } else { ++ cksumkey = ctx->initiator_sign; ++ cksum_usage = KG_USAGE_INITIATOR_SIGN; ++ } ++ ++ if (make_checksum_v2(ctx, ptr, GSS_KRB5_TOK_HDR_LEN, message_buffer, 0, ++ cksumkey, cksum_usage, &cksumobj)) ++ return GSS_S_FAILURE; ++ ++ if (memcmp(cksumobj.data, ptr + GSS_KRB5_TOK_HDR_LEN, ++ ctx->gk5e->cksumlength)) ++ return GSS_S_BAD_SIG; ++ ++ /* it got through unscathed. 
Make sure the context is unexpired */ ++ now = get_seconds(); ++ if (now > ctx->endtime) ++ return GSS_S_CONTEXT_EXPIRED; ++ ++ /* do sequencing checks */ ++ ++ seqnum = be64_to_cpup((__be64 *)(ptr + 8)); /* seq num sits at byte offset 8, as written by gss_get_mic_v2 */ ++ ++ return GSS_S_COMPLETE; ++} ++ ++u32 ++gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, ++ struct xdr_buf *message_buffer, ++ struct xdr_netobj *read_token) ++{ ++ struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; ++ ++ switch (ctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_verify_mic_v1(ctx, message_buffer, read_token); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_verify_mic_v2(ctx, message_buffer, read_token); ++ } ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c 2010-08-23 11:01:00.394576083 -0400 +@@ -1,3 +1,33 @@ ++/* ++ * COPYRIGHT (c) 2008 ++ * The Regents of the University of Michigan ++ * ALL RIGHTS RESERVED ++ * ++ * Permission is granted to use, copy, create derivative works ++ * and redistribute this software and such derivative works ++ * for any purpose, so long as the name of The University of ++ * Michigan is not used in any advertising or publicity ++ * pertaining to the use of distribution of this software ++ * without specific, written prior authorization. If the ++ * above copyright notice or any other identification of the ++ * University of Michigan is included in any copy of any ++ * portion of this software, then the disclaimer below must ++ * also be included. 
++ * ++ * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION ++ * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY ++ * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF ++ * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ++ * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ++ * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE ++ * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR ++ * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING ++ * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN ++ * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGES. ++ */ ++ + #include + #include + #include +@@ -12,10 +42,7 @@ + static inline int + gss_krb5_padding(int blocksize, int length) + { +- /* Most of the code is block-size independent but currently we +- * use only 8: */ +- BUG_ON(blocksize != 8); +- return 8 - (length & 7); ++ return blocksize - (length % blocksize); + } + + static inline void +@@ -86,8 +113,8 @@ out: + return 0; + } + +-static void +-make_confounder(char *p, u32 conflen) ++void ++gss_krb5_make_confounder(char *p, u32 conflen) + { + static u64 i = 0; + u64 *q = (u64 *)p; +@@ -127,69 +154,73 @@ make_confounder(char *p, u32 conflen) + + /* XXX factor out common code with seal/unseal. 
*/ + +-u32 +-gss_wrap_kerberos(struct gss_ctx *ctx, int offset, ++static u32 ++gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, + struct xdr_buf *buf, struct page **pages) + { +- struct krb5_ctx *kctx = ctx->internal_ctx_id; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; + int blocksize = 0, plainlen; + unsigned char *ptr, *msg_start; + s32 now; + int headlen; + struct page **tmp_pages; + u32 seq_send; ++ u8 *cksumkey; ++ u32 conflen = kctx->gk5e->conflen; + +- dprintk("RPC: gss_wrap_kerberos\n"); ++ dprintk("RPC: %s\n", __func__); + + now = get_seconds(); + + blocksize = crypto_blkcipher_blocksize(kctx->enc); + gss_krb5_add_padding(buf, offset, blocksize); + BUG_ON((buf->len - offset) % blocksize); +- plainlen = blocksize + buf->len - offset; ++ plainlen = conflen + buf->len - offset; + +- headlen = g_token_size(&kctx->mech_used, 24 + plainlen) - +- (buf->len - offset); ++ headlen = g_token_size(&kctx->mech_used, ++ GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength + plainlen) - ++ (buf->len - offset); + + ptr = buf->head[0].iov_base + offset; + /* shift data to make room for header. */ ++ xdr_extend_head(buf, offset, headlen); ++ + /* XXX Would be cleverer to encrypt while copying. */ +- /* XXX bounds checking, slack, etc. 
*/ +- memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset); +- buf->head[0].iov_len += headlen; +- buf->len += headlen; + BUG_ON((buf->len - offset - headlen) % blocksize); + + g_make_token_header(&kctx->mech_used, +- GSS_KRB5_TOK_HDR_LEN + 8 + plainlen, &ptr); ++ GSS_KRB5_TOK_HDR_LEN + ++ kctx->gk5e->cksumlength + plainlen, &ptr); + + + /* ptr now at header described in rfc 1964, section 1.2.1: */ + ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff); + ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff); + +- msg_start = ptr + 24; ++ msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength; + +- *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); ++ *(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); + memset(ptr + 4, 0xff, 4); +- *(__be16 *)(ptr + 4) = htons(SEAL_ALG_DES); ++ *(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); + +- make_confounder(msg_start, blocksize); ++ gss_krb5_make_confounder(msg_start, conflen); ++ ++ if (kctx->gk5e->keyed_cksum) ++ cksumkey = kctx->cksum; ++ else ++ cksumkey = NULL; + + /* XXXJBF: UGH!: */ + tmp_pages = buf->pages; + buf->pages = pages; +- if (make_checksum("md5", ptr, 8, buf, +- offset + headlen - blocksize, &md5cksum)) ++ if (make_checksum(kctx, ptr, 8, buf, offset + headlen - conflen, ++ cksumkey, KG_USAGE_SEAL, &md5cksum)) + return GSS_S_FAILURE; + buf->pages = tmp_pages; + +- if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, +- md5cksum.data, md5cksum.len)) +- return GSS_S_FAILURE; +- memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); ++ memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); + + spin_lock(&krb5_seq_lock); + seq_send = kctx->seq_send++; +@@ -197,25 +228,42 @@ gss_wrap_kerberos(struct gss_ctx *ctx, i + + /* XXX would probably be more efficient to compute checksum + * and encrypt at the same time: */ +- if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, ++ if ((krb5_make_seq_num(kctx, kctx->seq, kctx->initiate ? 
0 : 0xff, + seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))) + return GSS_S_FAILURE; + +- if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, +- pages)) +- return GSS_S_FAILURE; ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { ++ struct crypto_blkcipher *cipher; ++ int err; ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return GSS_S_FAILURE; ++ ++ krb5_rc4_setup_enc_key(kctx, cipher, seq_send); ++ ++ err = gss_encrypt_xdr_buf(cipher, buf, ++ offset + headlen - conflen, pages); ++ crypto_free_blkcipher(cipher); ++ if (err) ++ return GSS_S_FAILURE; ++ } else { ++ if (gss_encrypt_xdr_buf(kctx->enc, buf, ++ offset + headlen - conflen, pages)) ++ return GSS_S_FAILURE; ++ } + + return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; + } + +-u32 +-gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) ++static u32 ++gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) + { +- struct krb5_ctx *kctx = ctx->internal_ctx_id; + int signalg; + int sealalg; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; + s32 now; + int direction; + s32 seqnum; +@@ -224,6 +272,9 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + void *data_start, *orig_start; + int data_len; + int blocksize; ++ u32 conflen = kctx->gk5e->conflen; ++ int crypt_offset; ++ u8 *cksumkey; + + dprintk("RPC: gss_unwrap_kerberos\n"); + +@@ -241,29 +292,65 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + /* get the sign and seal algorithms */ + + signalg = ptr[2] + (ptr[3] << 8); +- if (signalg != SGN_ALG_DES_MAC_MD5) ++ if (signalg != kctx->gk5e->signalg) + return GSS_S_DEFECTIVE_TOKEN; + + sealalg = ptr[4] + (ptr[5] << 8); +- if (sealalg != SEAL_ALG_DES) ++ if (sealalg != kctx->gk5e->sealalg) + return GSS_S_DEFECTIVE_TOKEN; + + 
if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) + return GSS_S_DEFECTIVE_TOKEN; + +- if (gss_decrypt_xdr_buf(kctx->enc, buf, +- ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base)) +- return GSS_S_DEFECTIVE_TOKEN; ++ /* ++ * Data starts after token header and checksum. ptr points ++ * to the beginning of the token header ++ */ ++ crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) - ++ (unsigned char *)buf->head[0].iov_base; ++ ++ /* ++ * Need plaintext seqnum to derive encryption key for arcfour-hmac ++ */ ++ if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN, ++ ptr + 8, &direction, &seqnum)) ++ return GSS_S_BAD_SIG; + +- if (make_checksum("md5", ptr, 8, buf, +- ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base, &md5cksum)) +- return GSS_S_FAILURE; ++ if ((kctx->initiate && direction != 0xff) || ++ (!kctx->initiate && direction != 0)) ++ return GSS_S_BAD_SIG; ++ ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { ++ struct crypto_blkcipher *cipher; ++ int err; ++ ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return GSS_S_FAILURE; ++ ++ krb5_rc4_setup_enc_key(kctx, cipher, seqnum); ++ ++ err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset); ++ crypto_free_blkcipher(cipher); ++ if (err) ++ return GSS_S_DEFECTIVE_TOKEN; ++ } else { ++ if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset)) ++ return GSS_S_DEFECTIVE_TOKEN; ++ } + +- if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, +- md5cksum.data, md5cksum.len)) ++ if (kctx->gk5e->keyed_cksum) ++ cksumkey = kctx->cksum; ++ else ++ cksumkey = NULL; ++ ++ if (make_checksum(kctx, ptr, 8, buf, crypt_offset, ++ cksumkey, KG_USAGE_SEAL, &md5cksum)) + return GSS_S_FAILURE; + +- if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) ++ if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, ++ kctx->gk5e->cksumlength)) + return GSS_S_BAD_SIG; + + /* it got through unscathed. 
Make sure the context is unexpired */ +@@ -275,19 +362,12 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + + /* do sequencing checks */ + +- if (krb5_get_seq_num(kctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, +- &direction, &seqnum)) +- return GSS_S_BAD_SIG; +- +- if ((kctx->initiate && direction != 0xff) || +- (!kctx->initiate && direction != 0)) +- return GSS_S_BAD_SIG; +- + /* Copy the data back to the right position. XXX: Would probably be + * better to copy and encrypt at the same time. */ + + blocksize = crypto_blkcipher_blocksize(kctx->enc); +- data_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8 + blocksize; ++ data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) + ++ conflen; + orig_start = buf->head[0].iov_base + offset; + data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; + memmove(orig_start, data_start, data_len); +@@ -299,3 +379,209 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + + return GSS_S_COMPLETE; + } ++ ++/* ++ * We cannot currently handle tokens with rotated data. We need a ++ * generalized routine to rotate the data in place. It is anticipated ++ * that we won't encounter rotated data in the general case. 
++ */ ++static u32 ++rotate_left(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, u16 rrc) ++{ ++ unsigned int realrrc = rrc % (buf->len - offset - GSS_KRB5_TOK_HDR_LEN); ++ ++ if (realrrc == 0) ++ return 0; ++ ++ dprintk("%s: cannot process token with rotated data: " ++ "rrc %u, realrrc %u\n", __func__, rrc, realrrc); ++ return 1; ++} ++ ++static u32 ++gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, struct page **pages) ++{ ++ int blocksize; ++ u8 *ptr, *plainhdr; ++ s32 now; ++ u8 flags = 0x00; ++ __be16 *be16ptr, ec = 0; ++ __be64 *be64ptr; ++ u32 err; ++ ++ dprintk("RPC: %s\n", __func__); ++ ++ if (kctx->gk5e->encrypt_v2 == NULL) ++ return GSS_S_FAILURE; ++ ++ /* make room for gss token header */ ++ if (xdr_extend_head(buf, offset, GSS_KRB5_TOK_HDR_LEN)) ++ return GSS_S_FAILURE; ++ ++ /* construct gss token header */ ++ ptr = plainhdr = buf->head[0].iov_base + offset; ++ *ptr++ = (unsigned char) ((KG2_TOK_WRAP>>8) & 0xff); ++ *ptr++ = (unsigned char) (KG2_TOK_WRAP & 0xff); ++ ++ if ((kctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0) ++ flags |= KG2_TOKEN_FLAG_SENTBYACCEPTOR; ++ if ((kctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) != 0) ++ flags |= KG2_TOKEN_FLAG_ACCEPTORSUBKEY; ++ /* We always do confidentiality in wrap tokens */ ++ flags |= KG2_TOKEN_FLAG_SEALED; ++ ++ *ptr++ = flags; ++ *ptr++ = 0xff; ++ be16ptr = (__be16 *)ptr; ++ ++ blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc); ++ *be16ptr++ = cpu_to_be16(ec); ++ /* "inner" token header always uses 0 for RRC */ ++ *be16ptr++ = cpu_to_be16(0); ++ ++ be64ptr = (__be64 *)be16ptr; ++ spin_lock(&krb5_seq_lock); ++ *be64ptr = cpu_to_be64(kctx->seq_send64++); ++ spin_unlock(&krb5_seq_lock); ++ ++ err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages); ++ if (err) ++ return err; ++ ++ now = get_seconds(); ++ return (kctx->endtime < now) ? 
GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; ++} ++ ++static u32 ++gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) ++{ ++ s32 now; ++ u64 seqnum; ++ u8 *ptr; ++ u8 flags = 0x00; ++ u16 ec, rrc; ++ int err; ++ u32 headskip, tailskip; ++ u8 decrypted_hdr[GSS_KRB5_TOK_HDR_LEN]; ++ unsigned int movelen; ++ ++ ++ dprintk("RPC: %s\n", __func__); ++ ++ if (kctx->gk5e->decrypt_v2 == NULL) ++ return GSS_S_FAILURE; ++ ++ ptr = buf->head[0].iov_base + offset; ++ ++ if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_WRAP) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ flags = ptr[2]; ++ if ((!kctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) || ++ (kctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR))) ++ return GSS_S_BAD_SIG; ++ ++ if ((flags & KG2_TOKEN_FLAG_SEALED) == 0) { ++ dprintk("%s: token missing expected sealed flag\n", __func__); ++ return GSS_S_DEFECTIVE_TOKEN; ++ } ++ ++ if (ptr[3] != 0xff) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ ec = be16_to_cpup((__be16 *)(ptr + 4)); ++ rrc = be16_to_cpup((__be16 *)(ptr + 6)); ++ ++ seqnum = be64_to_cpup((__be64 *)(ptr + 8)); ++ ++ if (rrc != 0) { ++ err = rotate_left(kctx, offset, buf, rrc); ++ if (err) ++ return GSS_S_FAILURE; ++ } ++ ++ err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf, ++ &headskip, &tailskip); ++ if (err) ++ return GSS_S_FAILURE; ++ ++ /* ++ * Retrieve the decrypted gss token header and verify ++ * it against the original ++ */ ++ err = read_bytes_from_xdr_buf(buf, ++ buf->len - GSS_KRB5_TOK_HDR_LEN - tailskip, ++ decrypted_hdr, GSS_KRB5_TOK_HDR_LEN); ++ if (err) { ++ dprintk("%s: error %u getting decrypted_hdr\n", __func__, err); ++ return GSS_S_FAILURE; ++ } ++ if (memcmp(ptr, decrypted_hdr, 6) ++ || memcmp(ptr + 8, decrypted_hdr + 8, 8)) { ++ dprintk("%s: token hdr, plaintext hdr mismatch!\n", __func__); ++ return GSS_S_FAILURE; ++ } ++ ++ /* do sequencing checks */ ++ ++ /* it got through unscathed. 
Make sure the context is unexpired */ ++ now = get_seconds(); ++ if (now > kctx->endtime) ++ return GSS_S_CONTEXT_EXPIRED; ++ ++ /* ++ * Move the head data back to the right position in xdr_buf. ++ * We ignore any "ec" data since it might be in the head or ++ * the tail, and we really don't need to deal with it. ++ * Note that buf->head[0].iov_len may indicate the available ++ * head buffer space rather than that actually occupied. ++ */ ++ movelen = min_t(unsigned int, buf->head[0].iov_len, buf->len); ++ movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip; ++ BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen > ++ buf->head[0].iov_len); ++ memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen); ++ buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip; ++ buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip; ++ ++ return GSS_S_COMPLETE; ++} ++ ++u32 ++gss_wrap_kerberos(struct gss_ctx *gctx, int offset, ++ struct xdr_buf *buf, struct page **pages) ++{ ++ struct krb5_ctx *kctx = gctx->internal_ctx_id; ++ ++ switch (kctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_wrap_kerberos_v1(kctx, offset, buf, pages); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_wrap_kerberos_v2(kctx, offset, buf, pages); ++ } ++} ++ ++u32 ++gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf) ++{ ++ struct krb5_ctx *kctx = gctx->internal_ctx_id; ++ ++ switch (kctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_unwrap_kerberos_v1(kctx, offset, buf); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_unwrap_kerberos_v2(kctx, offset, buf); ++ } ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c +--- 
linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c 2010-08-23 11:01:00.395574706 -0400 +@@ -249,14 +249,15 @@ EXPORT_SYMBOL_GPL(gss_mech_put); + int + gss_import_sec_context(const void *input_token, size_t bufsize, + struct gss_api_mech *mech, +- struct gss_ctx **ctx_id) ++ struct gss_ctx **ctx_id, ++ gfp_t gfp_mask) + { +- if (!(*ctx_id = kzalloc(sizeof(**ctx_id), GFP_KERNEL))) ++ if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask))) + return -ENOMEM; + (*ctx_id)->mech_type = gss_mech_get(mech); + + return mech->gm_ops +- ->gss_import_sec_context(input_token, bufsize, *ctx_id); ++ ->gss_import_sec_context(input_token, bufsize, *ctx_id, gfp_mask); + } + + /* gss_get_mic: compute a mic over message and return mic_token. */ +@@ -285,6 +286,20 @@ gss_verify_mic(struct gss_ctx *context_ + mic_token); + } + ++/* ++ * This function is called from both the client and server code. ++ * Each makes guarantees about how much "slack" space is available ++ * for the underlying function in "buf"'s head and tail while ++ * performing the wrap. ++ * ++ * The client and server code allocate RPC_MAX_AUTH_SIZE extra ++ * space in both the head and tail which is available for use by ++ * the wrap function. ++ * ++ * Underlying functions should verify they do not use more than ++ * RPC_MAX_AUTH_SIZE of extra space in either the head or tail ++ * when performing the wrap. 
++ */ + u32 + gss_wrap(struct gss_ctx *ctx_id, + int offset, +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c 2010-08-23 11:01:00.396574085 -0400 +@@ -84,13 +84,14 @@ simple_get_netobj(const void *p, const v + + static int + gss_import_sec_context_spkm3(const void *p, size_t len, +- struct gss_ctx *ctx_id) ++ struct gss_ctx *ctx_id, ++ gfp_t gfp_mask) + { + const void *end = (const void *)((const char *)p + len); + struct spkm3_ctx *ctx; + int version; + +- if (!(ctx = kzalloc(sizeof(*ctx), GFP_NOFS))) ++ if (!(ctx = kzalloc(sizeof(*ctx), gfp_mask))) + goto out_err; + + p = simple_get_bytes(p, end, &version, sizeof(version)); +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile 2010-08-23 11:01:00.387574079 -0400 +@@ -10,7 +10,7 @@ auth_rpcgss-objs := auth_gss.o gss_gener + obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o + + rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ +- gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o ++ gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o + + obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o + +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c 2010-08-23 11:01:00.396574085 -0400 +@@ -494,7 +494,7 @@ static int rsc_parse(struct cache_detail + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; 
+- status = gss_import_sec_context(buf, len, gm, &rsci.mechctx); ++ status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, GFP_KERNEL); + if (status) + goto out; + +@@ -1315,6 +1315,14 @@ svcauth_gss_wrap_resp_priv(struct svc_rq + inpages = resbuf->pages; + /* XXX: Would be better to write some xdr helper functions for + * nfs{2,3,4}xdr.c that place the data right, instead of copying: */ ++ ++ /* ++ * If there is currently tail data, make sure there is ++ * room for the head, tail, and 2 * RPC_MAX_AUTH_SIZE in ++ * the page, and move the current tail data such that ++ * there is RPC_MAX_AUTH_SIZE slack space available in ++ * both the head and tail. ++ */ + if (resbuf->tail[0].iov_base) { + BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base + + PAGE_SIZE); +@@ -1327,6 +1335,13 @@ svcauth_gss_wrap_resp_priv(struct svc_rq + resbuf->tail[0].iov_len); + resbuf->tail[0].iov_base += RPC_MAX_AUTH_SIZE; + } ++ /* ++ * If there is no current tail data, make sure there is ++ * room for the head data, and 2 * RPC_MAX_AUTH_SIZE in the ++ * allotted page, and set up tail information such that there ++ * is RPC_MAX_AUTH_SIZE slack space available in both the ++ * head and tail. 
++ */ + if (resbuf->tail[0].iov_base == NULL) { + if (resbuf->head[0].iov_len + 2*RPC_MAX_AUTH_SIZE > PAGE_SIZE) + return -ENOMEM; +diff -up linux-2.6.34.noarch/net/sunrpc/clnt.c.orig linux-2.6.34.noarch/net/sunrpc/clnt.c +--- linux-2.6.34.noarch/net/sunrpc/clnt.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/clnt.c 2010-08-23 11:01:00.397622347 -0400 +@@ -556,26 +556,16 @@ static const struct rpc_call_ops rpc_def + */ + struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) + { +- struct rpc_task *task, *ret; ++ struct rpc_task *task; + + task = rpc_new_task(task_setup_data); +- if (task == NULL) { +- rpc_release_calldata(task_setup_data->callback_ops, +- task_setup_data->callback_data); +- ret = ERR_PTR(-ENOMEM); ++ if (IS_ERR(task)) + goto out; +- } + +- if (task->tk_status != 0) { +- ret = ERR_PTR(task->tk_status); +- rpc_put_task(task); +- goto out; +- } + atomic_inc(&task->tk_count); + rpc_execute(task); +- ret = task; + out: +- return ret; ++ return task; + } + EXPORT_SYMBOL_GPL(rpc_run_task); + +@@ -657,9 +647,8 @@ struct rpc_task *rpc_run_bc_task(struct + * Create an rpc_task to send the data + */ + task = rpc_new_task(&task_setup_data); +- if (!task) { ++ if (IS_ERR(task)) { + xprt_free_bc_request(req); +- task = ERR_PTR(-ENOMEM); + goto out; + } + task->tk_rqstp = req; +diff -up linux-2.6.34.noarch/net/sunrpc/sched.c.orig linux-2.6.34.noarch/net/sunrpc/sched.c +--- linux-2.6.34.noarch/net/sunrpc/sched.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/sched.c 2010-08-23 11:01:00.398564598 -0400 +@@ -25,7 +25,6 @@ + + #ifdef RPC_DEBUG + #define RPCDBG_FACILITY RPCDBG_SCHED +-#define RPC_TASK_MAGIC_ID 0xf00baa + #endif + + /* +@@ -237,7 +236,6 @@ static void rpc_task_set_debuginfo(struc + { + static atomic_t rpc_pid; + +- task->tk_magic = RPC_TASK_MAGIC_ID; + task->tk_pid = atomic_inc_return(&rpc_pid); + } + #else +@@ -360,9 +358,6 @@ static void __rpc_do_wake_up_task(struct + 
dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", + task->tk_pid, jiffies); + +-#ifdef RPC_DEBUG +- BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); +-#endif + /* Has the task been executed yet? If not, we cannot wake it up! */ + if (!RPC_IS_ACTIVATED(task)) { + printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); +@@ -834,7 +829,7 @@ static void rpc_init_task(struct rpc_tas + } + + /* starting timestamp */ +- task->tk_start = jiffies; ++ task->tk_start = ktime_get(); + + dprintk("RPC: new task initialized, procpid %u\n", + task_pid_nr(current)); +@@ -856,16 +851,23 @@ struct rpc_task *rpc_new_task(const stru + + if (task == NULL) { + task = rpc_alloc_task(); +- if (task == NULL) +- goto out; ++ if (task == NULL) { ++ rpc_release_calldata(setup_data->callback_ops, ++ setup_data->callback_data); ++ return ERR_PTR(-ENOMEM); ++ } + flags = RPC_TASK_DYNAMIC; + } + + rpc_init_task(task, setup_data); ++ if (task->tk_status < 0) { ++ int err = task->tk_status; ++ rpc_put_task(task); ++ return ERR_PTR(err); ++ } + + task->tk_flags |= flags; + dprintk("RPC: allocated task %p\n", task); +-out: + return task; + } + +@@ -909,9 +911,6 @@ EXPORT_SYMBOL_GPL(rpc_put_task); + + static void rpc_release_task(struct rpc_task *task) + { +-#ifdef RPC_DEBUG +- BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); +-#endif + dprintk("RPC: %5u release task\n", task->tk_pid); + + if (!list_empty(&task->tk_task)) { +@@ -923,9 +922,6 @@ static void rpc_release_task(struct rpc_ + } + BUG_ON (RPC_IS_QUEUED(task)); + +-#ifdef RPC_DEBUG +- task->tk_magic = 0; +-#endif + /* Wake up anyone who is waiting for task completion */ + rpc_mark_complete_task(task); + +diff -up linux-2.6.34.noarch/net/sunrpc/stats.c.orig linux-2.6.34.noarch/net/sunrpc/stats.c +--- linux-2.6.34.noarch/net/sunrpc/stats.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/stats.c 2010-08-23 11:01:00.399574225 -0400 +@@ -144,7 +144,7 @@ void rpc_count_iostats(struct rpc_task * + struct rpc_rqst 
*req = task->tk_rqstp; + struct rpc_iostats *stats; + struct rpc_iostats *op_metrics; +- long rtt, execute, queue; ++ ktime_t delta; + + if (!task->tk_client || !task->tk_client->cl_metrics || !req) + return; +@@ -156,23 +156,16 @@ void rpc_count_iostats(struct rpc_task * + op_metrics->om_ntrans += req->rq_ntrans; + op_metrics->om_timeouts += task->tk_timeouts; + +- op_metrics->om_bytes_sent += task->tk_bytes_sent; ++ op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; + op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; + +- queue = (long)req->rq_xtime - task->tk_start; +- if (queue < 0) +- queue = -queue; +- op_metrics->om_queue += queue; +- +- rtt = task->tk_rtt; +- if (rtt < 0) +- rtt = -rtt; +- op_metrics->om_rtt += rtt; +- +- execute = (long)jiffies - task->tk_start; +- if (execute < 0) +- execute = -execute; +- op_metrics->om_execute += execute; ++ delta = ktime_sub(req->rq_xtime, task->tk_start); ++ op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta); ++ ++ op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); ++ ++ delta = ktime_sub(ktime_get(), task->tk_start); ++ op_metrics->om_execute = ktime_add(op_metrics->om_execute, delta); + } + + static void _print_name(struct seq_file *seq, unsigned int op, +@@ -186,8 +179,6 @@ static void _print_name(struct seq_file + seq_printf(seq, "\t%12u: ", op); + } + +-#define MILLISECS_PER_JIFFY (1000 / HZ) +- + void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) + { + struct rpc_iostats *stats = clnt->cl_metrics; +@@ -214,9 +205,9 @@ void rpc_print_iostats(struct seq_file * + metrics->om_timeouts, + metrics->om_bytes_sent, + metrics->om_bytes_recv, +- metrics->om_queue * MILLISECS_PER_JIFFY, +- metrics->om_rtt * MILLISECS_PER_JIFFY, +- metrics->om_execute * MILLISECS_PER_JIFFY); ++ ktime_to_ms(metrics->om_queue), ++ ktime_to_ms(metrics->om_rtt), ++ ktime_to_ms(metrics->om_execute)); + } + } + EXPORT_SYMBOL_GPL(rpc_print_iostats); +diff -up 
linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-23 11:01:00.400574086 -0400 +@@ -762,6 +762,7 @@ int write_bytes_to_xdr_buf(struct xdr_bu + __write_bytes_to_xdr_buf(&subbuf, obj, len); + return 0; + } ++EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); + + int + xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +diff -up linux-2.6.34.noarch/net/sunrpc/xprt.c.orig linux-2.6.34.noarch/net/sunrpc/xprt.c +--- linux-2.6.34.noarch/net/sunrpc/xprt.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprt.c 2010-08-23 11:01:00.401372963 -0400 +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -62,7 +63,6 @@ + * Local functions + */ + static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); +-static inline void do_xprt_reserve(struct rpc_task *); + static void xprt_connect_status(struct rpc_task *task); + static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); + +@@ -711,12 +711,16 @@ void xprt_connect(struct rpc_task *task) + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; + +- task->tk_timeout = xprt->connect_timeout; ++ task->tk_timeout = task->tk_rqstp->rq_timeout; + rpc_sleep_on(&xprt->pending, task, xprt_connect_status); ++ ++ if (test_bit(XPRT_CLOSING, &xprt->state)) ++ return; ++ if (xprt_test_and_set_connecting(xprt)) ++ return; + xprt->stat.connect_start = jiffies; + xprt->ops->connect(task); + } +- return; + } + + static void xprt_connect_status(struct rpc_task *task) +@@ -771,25 +775,19 @@ struct rpc_rqst *xprt_lookup_rqst(struct + } + EXPORT_SYMBOL_GPL(xprt_lookup_rqst); + +-/** +- * xprt_update_rtt - update an RPC client's RTT state after receiving a reply +- * @task: RPC request that recently completed +- * +- */ +-void xprt_update_rtt(struct rpc_task *task) ++static void xprt_update_rtt(struct 
rpc_task *task) + { + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + unsigned timer = task->tk_msg.rpc_proc->p_timer; ++ long m = usecs_to_jiffies(ktime_to_us(req->rq_rtt)); + + if (timer) { + if (req->rq_ntrans == 1) +- rpc_update_rtt(rtt, timer, +- (long)jiffies - req->rq_xtime); ++ rpc_update_rtt(rtt, timer, m); + rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); + } + } +-EXPORT_SYMBOL_GPL(xprt_update_rtt); + + /** + * xprt_complete_rqst - called when reply processing is complete +@@ -807,7 +805,9 @@ void xprt_complete_rqst(struct rpc_task + task->tk_pid, ntohl(req->rq_xid), copied); + + xprt->stat.recvs++; +- task->tk_rtt = (long)jiffies - req->rq_xtime; ++ req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime); ++ if (xprt->ops->timer != NULL) ++ xprt_update_rtt(task); + + list_del_init(&req->rq_list); + req->rq_private_buf.len = copied; +@@ -906,7 +906,7 @@ void xprt_transmit(struct rpc_task *task + return; + + req->rq_connect_cookie = xprt->connect_cookie; +- req->rq_xtime = jiffies; ++ req->rq_xtime = ktime_get(); + status = xprt->ops->send_request(task); + if (status != 0) { + task->tk_status = status; +@@ -935,7 +935,7 @@ void xprt_transmit(struct rpc_task *task + spin_unlock_bh(&xprt->transport_lock); + } + +-static inline void do_xprt_reserve(struct rpc_task *task) ++static void xprt_alloc_slot(struct rpc_task *task) + { + struct rpc_xprt *xprt = task->tk_xprt; + +@@ -955,6 +955,16 @@ static inline void do_xprt_reserve(struc + rpc_sleep_on(&xprt->backlog, task, NULL); + } + ++static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) ++{ ++ memset(req, 0, sizeof(*req)); /* mark unused */ ++ ++ spin_lock(&xprt->reserve_lock); ++ list_add(&req->rq_list, &xprt->free); ++ rpc_wake_up_next(&xprt->backlog); ++ spin_unlock(&xprt->reserve_lock); ++} ++ + /** + * xprt_reserve - allocate an RPC request slot + * @task: RPC task requesting a slot allocation +@@ -968,7 +978,7 @@ void xprt_reserve(struct rpc_task 
*task) + + task->tk_status = -EIO; + spin_lock(&xprt->reserve_lock); +- do_xprt_reserve(task); ++ xprt_alloc_slot(task); + spin_unlock(&xprt->reserve_lock); + } + +@@ -1006,14 +1016,10 @@ void xprt_release(struct rpc_task *task) + { + struct rpc_xprt *xprt; + struct rpc_rqst *req; +- int is_bc_request; + + if (!(req = task->tk_rqstp)) + return; + +- /* Preallocated backchannel request? */ +- is_bc_request = bc_prealloc(req); +- + xprt = req->rq_xprt; + rpc_count_iostats(task); + spin_lock_bh(&xprt->transport_lock); +@@ -1027,21 +1033,16 @@ void xprt_release(struct rpc_task *task) + mod_timer(&xprt->timer, + xprt->last_used + xprt->idle_timeout); + spin_unlock_bh(&xprt->transport_lock); +- if (!bc_prealloc(req)) ++ if (req->rq_buffer) + xprt->ops->buf_free(req->rq_buffer); + task->tk_rqstp = NULL; + if (req->rq_release_snd_buf) + req->rq_release_snd_buf(req); + + dprintk("RPC: %5u release request %p\n", task->tk_pid, req); +- if (likely(!is_bc_request)) { +- memset(req, 0, sizeof(*req)); /* mark unused */ +- +- spin_lock(&xprt->reserve_lock); +- list_add(&req->rq_list, &xprt->free); +- rpc_wake_up_next(&xprt->backlog); +- spin_unlock(&xprt->reserve_lock); +- } else ++ if (likely(!bc_prealloc(req))) ++ xprt_free_slot(xprt, req); ++ else + xprt_free_bc_request(req); + } + +diff -up linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c.orig linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c +--- linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c 2010-08-23 11:01:00.402563985 -0400 +@@ -305,7 +305,6 @@ xprt_setup_rdma(struct xprt_create *args + /* 60 second timeout, no retries */ + xprt->timeout = &xprt_rdma_default_timeout; + xprt->bind_timeout = (60U * HZ); +- xprt->connect_timeout = (60U * HZ); + xprt->reestablish_timeout = (5U * HZ); + xprt->idle_timeout = (5U * 60 * HZ); + +@@ -449,21 +448,19 @@ xprt_rdma_connect(struct rpc_task *task) + struct rpc_xprt *xprt = (struct 
rpc_xprt *)task->tk_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + +- if (!xprt_test_and_set_connecting(xprt)) { +- if (r_xprt->rx_ep.rep_connected != 0) { +- /* Reconnect */ +- schedule_delayed_work(&r_xprt->rdma_connect, +- xprt->reestablish_timeout); +- xprt->reestablish_timeout <<= 1; +- if (xprt->reestablish_timeout > (30 * HZ)) +- xprt->reestablish_timeout = (30 * HZ); +- else if (xprt->reestablish_timeout < (5 * HZ)) +- xprt->reestablish_timeout = (5 * HZ); +- } else { +- schedule_delayed_work(&r_xprt->rdma_connect, 0); +- if (!RPC_IS_ASYNC(task)) +- flush_scheduled_work(); +- } ++ if (r_xprt->rx_ep.rep_connected != 0) { ++ /* Reconnect */ ++ schedule_delayed_work(&r_xprt->rdma_connect, ++ xprt->reestablish_timeout); ++ xprt->reestablish_timeout <<= 1; ++ if (xprt->reestablish_timeout > (30 * HZ)) ++ xprt->reestablish_timeout = (30 * HZ); ++ else if (xprt->reestablish_timeout < (5 * HZ)) ++ xprt->reestablish_timeout = (5 * HZ); ++ } else { ++ schedule_delayed_work(&r_xprt->rdma_connect, 0); ++ if (!RPC_IS_ASYNC(task)) ++ flush_scheduled_work(); + } + } + +@@ -677,7 +674,7 @@ xprt_rdma_send_request(struct rpc_task * + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + +- task->tk_bytes_sent += rqst->rq_snd_buf.len; ++ rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; + rqst->rq_bytes_sent = 0; + return 0; + +diff -up linux-2.6.34.noarch/net/sunrpc/xprtsock.c.orig linux-2.6.34.noarch/net/sunrpc/xprtsock.c +--- linux-2.6.34.noarch/net/sunrpc/xprtsock.c.orig 2010-08-23 11:00:23.890501549 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtsock.c 2010-08-23 11:01:00.403564023 -0400 +@@ -138,20 +138,6 @@ static ctl_table sunrpc_table[] = { + #endif + + /* +- * Time out for an RPC UDP socket connect. UDP socket connects are +- * synchronous, but we set a timeout anyway in case of resource +- * exhaustion on the local host. 
+- */ +-#define XS_UDP_CONN_TO (5U * HZ) +- +-/* +- * Wait duration for an RPC TCP connection to be established. Solaris +- * NFS over TCP uses 60 seconds, for example, which is in line with how +- * long a server takes to reboot. +- */ +-#define XS_TCP_CONN_TO (60U * HZ) +- +-/* + * Wait duration for a reply from the RPC portmapper. + */ + #define XS_BIND_TO (60U * HZ) +@@ -543,7 +529,7 @@ static int xs_udp_send_request(struct rp + xdr->len - req->rq_bytes_sent, status); + + if (status >= 0) { +- task->tk_bytes_sent += status; ++ req->rq_xmit_bytes_sent += status; + if (status >= req->rq_slen) + return 0; + /* Still some bytes left; set up for a retry later. */ +@@ -639,7 +625,7 @@ static int xs_tcp_send_request(struct rp + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. */ + req->rq_bytes_sent += status; +- task->tk_bytes_sent += status; ++ req->rq_xmit_bytes_sent += status; + if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_bytes_sent = 0; + return 0; +@@ -859,7 +845,6 @@ static void xs_udp_data_ready(struct soc + dst_confirm(skb_dst(skb)); + + xprt_adjust_cwnd(task, copied); +- xprt_update_rtt(task); + xprt_complete_rqst(task, copied); + + out_unlock: +@@ -2022,9 +2007,6 @@ static void xs_connect(struct rpc_task * + struct rpc_xprt *xprt = task->tk_xprt; + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + +- if (xprt_test_and_set_connecting(xprt)) +- return; +- + if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { + dprintk("RPC: xs_connect delayed xprt %p for %lu " + "seconds\n", +@@ -2044,16 +2026,6 @@ static void xs_connect(struct rpc_task * + } + } + +-static void xs_tcp_connect(struct rpc_task *task) +-{ +- struct rpc_xprt *xprt = task->tk_xprt; +- +- /* Exit if we need to wait for socket shutdown to complete */ +- if (test_bit(XPRT_CLOSING, &xprt->state)) +- return; +- xs_connect(task); +-} +- + /** + * xs_udp_print_stats - display UDP socket-specifc stats + * @xprt: rpc_xprt 
struct containing statistics +@@ -2252,7 +2224,7 @@ static struct rpc_xprt_ops xs_tcp_ops = + .release_xprt = xs_tcp_release_xprt, + .rpcbind = rpcb_getport_async, + .set_port = xs_set_port, +- .connect = xs_tcp_connect, ++ .connect = xs_connect, + .buf_alloc = rpc_malloc, + .buf_free = rpc_free, + .send_request = xs_tcp_send_request, +@@ -2343,7 +2315,6 @@ static struct rpc_xprt *xs_setup_udp(str + xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + + xprt->bind_timeout = XS_BIND_TO; +- xprt->connect_timeout = XS_UDP_CONN_TO; + xprt->reestablish_timeout = XS_UDP_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + +@@ -2418,7 +2389,6 @@ static struct rpc_xprt *xs_setup_tcp(str + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; + + xprt->bind_timeout = XS_BIND_TO; +- xprt->connect_timeout = XS_TCP_CONN_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + +@@ -2478,9 +2448,6 @@ static struct rpc_xprt *xs_setup_bc_tcp( + struct sock_xprt *transport; + struct svc_sock *bc_sock; + +- if (!args->bc_xprt) +- ERR_PTR(-EINVAL); +- + xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries); + if (IS_ERR(xprt)) + return xprt; +@@ -2494,7 +2461,6 @@ static struct rpc_xprt *xs_setup_bc_tcp( + /* backchannel */ + xprt_set_bound(xprt); + xprt->bind_timeout = 0; +- xprt->connect_timeout = 0; + xprt->reestablish_timeout = 0; + xprt->idle_timeout = 0; + diff --git a/nfsd-35-fc.patch b/nfsd-35-fc.patch new file mode 100644 index 000000000..ef99b4995 --- /dev/null +++ b/nfsd-35-fc.patch @@ -0,0 +1,1808 @@ +diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt +--- linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 09:57:18.233564439 -0400 +@@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | + | READ | REQ | | Section 18.22 | 
+ | READDIR | REQ | | Section 18.23 | + | READLINK | OPT | | Section 18.24 | +-NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | ++ | RECLAIM_COMPLETE | REQ | | Section 18.51 | + | RELEASE_LOCKOWNER | MNI | | N/A | + | REMOVE | REQ | | Section 18.25 | + | RENAME | REQ | | Section 18.26 | +diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 09:57:18.234564075 -0400 +@@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_ca + .alloc = expkey_alloc, + }; + +-static struct svc_expkey * +-svc_expkey_lookup(struct svc_expkey *item) ++static int ++svc_expkey_hash(struct svc_expkey *item) + { +- struct cache_head *ch; + int hash = item->ek_fsidtype; + char * cp = (char*)item->ek_fsid; + int len = key_len(item->ek_fsidtype); +@@ -270,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *ite + hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); + hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); + hash &= EXPKEY_HASHMASK; ++ return hash; ++} ++ ++static struct svc_expkey * ++svc_expkey_lookup(struct svc_expkey *item) ++{ ++ struct cache_head *ch; ++ int hash = svc_expkey_hash(item); + + ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h, + hash); +@@ -283,13 +290,7 @@ static struct svc_expkey * + svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) + { + struct cache_head *ch; +- int hash = new->ek_fsidtype; +- char * cp = (char*)new->ek_fsid; +- int len = key_len(new->ek_fsidtype); +- +- hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); +- hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS); +- hash &= EXPKEY_HASHMASK; ++ int hash = svc_expkey_hash(new); + + ch = sunrpc_cache_update(&svc_expkey_cache, &new->h, + &old->h, hash); +@@ -738,14 +739,22 @@ struct cache_detail svc_export_cache = { + .alloc = svc_export_alloc, + }; + +-static struct svc_export * +-svc_export_lookup(struct svc_export *exp) ++static 
int ++svc_export_hash(struct svc_export *exp) + { +- struct cache_head *ch; + int hash; ++ + hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); ++ return hash; ++} ++ ++static struct svc_export * ++svc_export_lookup(struct svc_export *exp) ++{ ++ struct cache_head *ch; ++ int hash = svc_export_hash(exp); + + ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h, + hash); +@@ -759,10 +768,7 @@ static struct svc_export * + svc_export_update(struct svc_export *new, struct svc_export *old) + { + struct cache_head *ch; +- int hash; +- hash = hash_ptr(old->ex_client, EXPORT_HASHBITS); +- hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS); +- hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS); ++ int hash = svc_export_hash(old); + + ch = sunrpc_cache_update(&svc_export_cache, &new->h, + &old->h, +@@ -1071,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp) + err = 0; + finish: + kfree(new.ex_pathname); +- if (exp) ++ if (!IS_ERR_OR_NULL(exp)) + exp_put(exp); +- if (fsid_key && !IS_ERR(fsid_key)) ++ if (!IS_ERR_OR_NULL(fsid_key)) + cache_put(&fsid_key->h, &svc_expkey_cache); + path_put(&path); + out_put_clp: +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 10:00:37.257414684 -0400 +@@ -79,11 +79,6 @@ enum nfs_cb_opnum4 { + cb_sequence_dec_sz + \ + op_dec_sz) + +-struct nfs4_rpc_args { +- void *args_op; +- struct nfsd4_cb_sequence args_seq; +-}; +- + /* + * Generic encode routines from fs/nfs/nfs4xdr.c + */ +@@ -428,13 +423,19 @@ static struct rpc_procinfo nfs4_cb_p + }; + + static struct rpc_version nfs_cb_version4 = { ++/* ++ * Note on the callback rpc program version number: despite language in rfc ++ * 5661 section 18.36.3 requiring servers to use 4 in this 
field, the ++ * official xdr descriptions for both 4.0 and 4.1 specify version 1, and ++ * in practice that appears to be what implementations use. The section ++ * 18.36.3 language is expected to be fixed in an erratum. ++ */ + .number = 1, + .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), + .procs = nfs4_cb_procedures + }; + + static struct rpc_version * nfs_cb_version[] = { +- NULL, + &nfs_cb_version4, + }; + +@@ -456,15 +457,14 @@ static struct rpc_program cb_program = { + + static int max_cb_time(void) + { +- return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ; ++ return max(nfsd4_lease/10, (time_t)1) * HZ; + } + + /* Reference counting, callback cleanup, etc., all look racy as heck. +- * And why is cb_set an atomic? */ ++ * And why is cl_cb_set an atomic? */ + +-int setup_callback_client(struct nfs4_client *clp) ++int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb) + { +- struct nfs4_cb_conn *cb = &clp->cl_cb_conn; + struct rpc_timeout timeparms = { + .to_initval = max_cb_time(), + .to_retries = 0, +@@ -476,7 +476,7 @@ int setup_callback_client(struct nfs4_cl + .timeout = &timeparms, + .program = &cb_program, + .prognumber = cb->cb_prog, +- .version = nfs_cb_version[1]->number, ++ .version = 0, + .authflavor = clp->cl_flavor, + .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), + .client_name = clp->cl_principal, +@@ -486,7 +486,7 @@ int setup_callback_client(struct nfs4_cl + if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) + return -EINVAL; + if (cb->cb_minorversion) { +- args.bc_xprt = clp->cl_cb_xprt; ++ args.bc_xprt = cb->cb_xprt; + args.protocol = XPRT_TRANSPORT_BC_TCP; + } + /* Create RPC client */ +@@ -496,7 +496,7 @@ int setup_callback_client(struct nfs4_cl + PTR_ERR(client)); + return PTR_ERR(client); + } +- cb->cb_client = client; ++ nfsd4_set_callback_client(clp, client); + return 0; + + } +@@ -514,8 +514,7 @@ static void nfsd4_cb_probe_done(struct r + if (task->tk_status) + warn_no_callback_path(clp, 
task->tk_status); + else +- atomic_set(&clp->cl_cb_conn.cb_set, 1); +- put_nfs4_client(clp); ++ atomic_set(&clp->cl_cb_set, 1); + } + + static const struct rpc_call_ops nfsd4_cb_probe_ops = { +@@ -537,7 +536,6 @@ int set_callback_cred(void) + + void do_probe_callback(struct nfs4_client *clp) + { +- struct nfs4_cb_conn *cb = &clp->cl_cb_conn; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], + .rpc_argp = clp, +@@ -545,34 +543,28 @@ void do_probe_callback(struct nfs4_clien + }; + int status; + +- status = rpc_call_async(cb->cb_client, &msg, ++ status = rpc_call_async(clp->cl_cb_client, &msg, + RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + &nfsd4_cb_probe_ops, (void *)clp); +- if (status) { ++ if (status) + warn_no_callback_path(clp, status); +- put_nfs4_client(clp); +- } + } + + /* + * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... + */ +-void +-nfsd4_probe_callback(struct nfs4_client *clp) ++void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb) + { + int status; + +- BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set)); ++ BUG_ON(atomic_read(&clp->cl_cb_set)); + +- status = setup_callback_client(clp); ++ status = setup_callback_client(clp, cb); + if (status) { + warn_no_callback_path(clp, status); + return; + } + +- /* the task holds a reference to the nfs4_client struct */ +- atomic_inc(&clp->cl_count); +- + do_probe_callback(clp); + } + +@@ -658,18 +650,32 @@ static void nfsd4_cb_done(struct rpc_tas + } + } + ++ + static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) + { + struct nfs4_delegation *dp = calldata; + struct nfs4_client *clp = dp->dl_client; ++ struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + + nfsd4_cb_done(task, calldata); + ++ if (current_rpc_client == NULL) { ++ /* We're shutting down; give up. */ ++ /* XXX: err, or is it ok just to fall through ++ * and rpc_restart_call? */ ++ return; ++ } ++ + switch (task->tk_status) { + case -EIO: + /* Network partition?
*/ +- atomic_set(&clp->cl_cb_conn.cb_set, 0); ++ atomic_set(&clp->cl_cb_set, 0); + warn_no_callback_path(clp, task->tk_status); ++ if (current_rpc_client != task->tk_client) { ++ /* queue a callback on the new connection: */ ++ nfsd4_cb_recall(dp); ++ return; ++ } + case -EBADHANDLE: + case -NFS4ERR_BAD_STATEID: + /* Race: client probably got cb_recall +@@ -677,7 +683,7 @@ static void nfsd4_cb_recall_done(struct + break; + default: + /* success, or error we can't handle */ +- goto done; ++ return; + } + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); +@@ -685,20 +691,16 @@ static void nfsd4_cb_recall_done(struct + rpc_restart_call(task); + return; + } else { +- atomic_set(&clp->cl_cb_conn.cb_set, 0); ++ atomic_set(&clp->cl_cb_set, 0); + warn_no_callback_path(clp, task->tk_status); + } +-done: +- kfree(task->tk_msg.rpc_argp); + } + + static void nfsd4_cb_recall_release(void *calldata) + { + struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; + + nfs4_put_delegation(dp); +- put_nfs4_client(clp); + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +@@ -707,33 +709,75 @@ static const struct rpc_call_ops nfsd4_c + .rpc_release = nfsd4_cb_recall_release, + }; + ++static struct workqueue_struct *callback_wq; ++ ++int nfsd4_create_callback_queue(void) ++{ ++ callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); ++ if (!callback_wq) ++ return -ENOMEM; ++ return 0; ++} ++ ++void nfsd4_destroy_callback_queue(void) ++{ ++ destroy_workqueue(callback_wq); ++} ++ ++/* must be called under the state lock */ ++void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new) ++{ ++ struct rpc_clnt *old = clp->cl_cb_client; ++ ++ clp->cl_cb_client = new; ++ /* ++ * After this, any work that saw the old value of cl_cb_client will ++ * be gone: ++ */ ++ flush_workqueue(callback_wq); ++ /* So we can safely shut it down: */ ++ if (old) ++ rpc_shutdown_client(old); ++} ++ + /* + * called with dp->dl_count inc'ed. 
+ */ +-void +-nfsd4_cb_recall(struct nfs4_delegation *dp) ++static void _nfsd4_cb_recall(struct nfs4_delegation *dp) + { + struct nfs4_client *clp = dp->dl_client; +- struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; +- struct nfs4_rpc_args *args; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args = &dp->dl_recall.cb_args; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], + .rpc_cred = callback_cred + }; +- int status = -ENOMEM; ++ int status; ++ ++ if (clnt == NULL) ++ return; /* Client is shutting down; give up. */ + +- args = kzalloc(sizeof(*args), GFP_KERNEL); +- if (!args) +- goto out; + args->args_op = dp; + msg.rpc_argp = args; + dp->dl_retries = 1; + status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, + &nfsd4_cb_recall_ops, dp); +-out: +- if (status) { +- kfree(args); +- put_nfs4_client(clp); ++ if (status) + nfs4_put_delegation(dp); +- } ++} ++ ++void nfsd4_do_callback_rpc(struct work_struct *w) ++{ ++ /* XXX: for now, just send off delegation recall. */ ++ /* In future, generalize to handle any sort of callback. */ ++ struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work); ++ struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall); ++ ++ _nfsd4_cb_recall(dp); ++} ++ ++ ++void nfsd4_cb_recall(struct nfs4_delegation *dp) ++{ ++ queue_work(callback_wq, &dp->dl_recall.cb_work); + } +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 09:57:18.237376763 -0400 +@@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[ + static const char *nfsd4_op_name(unsigned opnum); + + /* +- * Enforce NFSv4.1 COMPOUND ordering rules. 
++ * Enforce NFSv4.1 COMPOUND ordering rules: + * +- * TODO: +- * - enforce NFS4ERR_NOT_ONLY_OP, +- * - DESTROY_SESSION MUST be the final operation in the COMPOUND request. ++ * Also note, enforced elsewhere: ++ * - SEQUENCE other than as first op results in ++ * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) ++ * - BIND_CONN_TO_SESSION must be the only op in its compound ++ * (Will be enforced in nfsd4_bind_conn_to_session().) ++ * - DESTROY_SESSION must be the final operation in a compound, if ++ * sessionid's in SEQUENCE and DESTROY_SESSION are the same. ++ * (Enforced in nfsd4_destroy_session().) + */ +-static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args) ++static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) + { +- if (args->minorversion && args->opcnt > 0) { +- struct nfsd4_op *op = &args->ops[0]; +- return (op->status == nfserr_op_illegal) || +- (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP); +- } +- return true; ++ struct nfsd4_op *op = &args->ops[0]; ++ ++ /* These ordering requirements don't apply to NFSv4.0: */ ++ if (args->minorversion == 0) ++ return nfs_ok; ++ /* This is weird, but OK, not our problem: */ ++ if (args->opcnt == 0) ++ return nfs_ok; ++ if (op->status == nfserr_op_illegal) ++ return nfs_ok; ++ if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP)) ++ return nfserr_op_not_in_session; ++ if (op->opnum == OP_SEQUENCE) ++ return nfs_ok; ++ if (args->opcnt != 1) ++ return nfserr_not_only_op; ++ return nfs_ok; + } + + /* +@@ -1012,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqs + resp->rqstp = rqstp; + resp->cstate.minorversion = args->minorversion; + resp->cstate.replay_owner = NULL; ++ resp->cstate.session = NULL; + fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); + fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + /* Use the deferral mechanism only for NFSv4.0 compounds */ +@@ -1024,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqs + if (args->minorversion > 
nfsd_supported_minorversion) + goto out; + +- if (!nfs41_op_ordering_ok(args)) { ++ status = nfs41_check_op_ordering(args); ++ if (status) { + op = &args->ops[0]; +- op->status = nfserr_sequence_pos; ++ op->status = status; + goto encode_op; + } + +- status = nfs_ok; + while (!status && resp->opcnt < args->opcnt) { + op = &args->ops[resp->opcnt++]; + +@@ -1295,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[ + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_SEQUENCE", + }, ++ [OP_RECLAIM_COMPLETE] = { ++ .op_func = (nfsd4op_func)nfsd4_reclaim_complete, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_RECLAIM_COMPLETE", ++ }, + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 09:57:18.240356512 -0400 +@@ -45,8 +45,8 @@ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +-static time_t lease_time = 90; /* default lease time */ +-static time_t user_lease_time = 90; ++time_t nfsd4_lease = 90; /* default lease time */ ++time_t nfsd4_grace = 90; + static time_t boot_time; + static u32 current_ownerid = 1; + static u32 current_fileid = 1; +@@ -190,7 +190,7 @@ alloc_init_deleg(struct nfs4_client *clp + dp->dl_vfs_file = stp->st_vfs_file; + dp->dl_type = type; + dp->dl_ident = cb->cb_ident; +- dp->dl_stateid.si_boot = get_seconds(); ++ dp->dl_stateid.si_boot = boot_time; + dp->dl_stateid.si_stateownerid = current_delegid++; + dp->dl_stateid.si_fileid = 0; + dp->dl_stateid.si_generation = 0; +@@ -199,6 +199,7 @@ alloc_init_deleg(struct nfs4_client *clp + atomic_set(&dp->dl_count, 1); + list_add(&dp->dl_perfile, &fp->fi_delegations); + list_add(&dp->dl_perclnt, &clp->cl_delegations); ++ INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); + return dp; + } + +@@ -249,6 +250,9 @@ 
unhash_delegation(struct nfs4_delegation + * SETCLIENTID state + */ + ++/* client_lock protects the client lru list and session hash table */ ++static DEFINE_SPINLOCK(client_lock); ++ + /* Hash tables for nfs4_clientid state */ + #define CLIENT_HASH_BITS 4 + #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) +@@ -367,7 +371,6 @@ static void release_openowner(struct nfs + nfs4_put_stateowner(sop); + } + +-static DEFINE_SPINLOCK(sessionid_lock); + #define SESSION_HASH_SIZE 512 + static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; + +@@ -565,10 +568,10 @@ alloc_init_session(struct svc_rqst *rqst + + new->se_flags = cses->flags; + kref_init(&new->se_ref); +- spin_lock(&sessionid_lock); ++ spin_lock(&client_lock); + list_add(&new->se_hash, &sessionid_hashtbl[idx]); + list_add(&new->se_perclnt, &clp->cl_sessions); +- spin_unlock(&sessionid_lock); ++ spin_unlock(&client_lock); + + status = nfs_ok; + out: +@@ -579,7 +582,7 @@ out_free: + goto out; + } + +-/* caller must hold sessionid_lock */ ++/* caller must hold client_lock */ + static struct nfsd4_session * + find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) + { +@@ -602,7 +605,7 @@ find_in_sessionid_hashtbl(struct nfs4_se + return NULL; + } + +-/* caller must hold sessionid_lock */ ++/* caller must hold client_lock */ + static void + unhash_session(struct nfsd4_session *ses) + { +@@ -610,15 +613,6 @@ unhash_session(struct nfsd4_session *ses + list_del(&ses->se_perclnt); + } + +-static void +-release_session(struct nfsd4_session *ses) +-{ +- spin_lock(&sessionid_lock); +- unhash_session(ses); +- spin_unlock(&sessionid_lock); +- nfsd4_put_session(ses); +-} +- + void + free_session(struct kref *kref) + { +@@ -634,9 +628,18 @@ free_session(struct kref *kref) + kfree(ses); + } + ++/* must be called under the client_lock */ + static inline void +-renew_client(struct nfs4_client *clp) ++renew_client_locked(struct nfs4_client *clp) + { ++ if (is_client_expired(clp)) { ++ dprintk("%s: client (clientid 
%08x/%08x) already expired\n", ++ __func__, ++ clp->cl_clientid.cl_boot, ++ clp->cl_clientid.cl_id); ++ return; ++ } ++ + /* + * Move client to the end to the LRU list. + */ +@@ -647,6 +650,14 @@ renew_client(struct nfs4_client *clp) + clp->cl_time = get_seconds(); + } + ++static inline void ++renew_client(struct nfs4_client *clp) ++{ ++ spin_lock(&client_lock); ++ renew_client_locked(clp); ++ spin_unlock(&client_lock); ++} ++ + /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ + static int + STALE_CLIENTID(clientid_t *clid) +@@ -680,27 +691,9 @@ static struct nfs4_client *alloc_client( + return clp; + } + +-static void +-shutdown_callback_client(struct nfs4_client *clp) +-{ +- struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; +- +- if (clnt) { +- /* +- * Callback threads take a reference on the client, so there +- * should be no outstanding callbacks at this point. +- */ +- clp->cl_cb_conn.cb_client = NULL; +- rpc_shutdown_client(clnt); +- } +-} +- + static inline void + free_client(struct nfs4_client *clp) + { +- shutdown_callback_client(clp); +- if (clp->cl_cb_xprt) +- svc_xprt_put(clp->cl_cb_xprt); + if (clp->cl_cred.cr_group_info) + put_group_info(clp->cl_cred.cr_group_info); + kfree(clp->cl_principal); +@@ -709,10 +702,34 @@ free_client(struct nfs4_client *clp) + } + + void +-put_nfs4_client(struct nfs4_client *clp) ++release_session_client(struct nfsd4_session *session) + { +- if (atomic_dec_and_test(&clp->cl_count)) ++ struct nfs4_client *clp = session->se_client; ++ ++ if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock)) ++ return; ++ if (is_client_expired(clp)) { + free_client(clp); ++ session->se_client = NULL; ++ } else ++ renew_client_locked(clp); ++ spin_unlock(&client_lock); ++ nfsd4_put_session(session); ++} ++ ++/* must be called under the client_lock */ ++static inline void ++unhash_client_locked(struct nfs4_client *clp) ++{ ++ mark_client_expired(clp); ++ list_del(&clp->cl_lru); ++ while (!list_empty(&clp->cl_sessions)) { ++ 
struct nfsd4_session *ses; ++ ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, ++ se_perclnt); ++ unhash_session(ses); ++ nfsd4_put_session(ses); ++ } + } + + static void +@@ -722,9 +739,6 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + +- dprintk("NFSD: expire_client cl_count %d\n", +- atomic_read(&clp->cl_count)); +- + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -740,20 +754,20 @@ expire_client(struct nfs4_client *clp) + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } +- list_del(&clp->cl_idhash); +- list_del(&clp->cl_strhash); +- list_del(&clp->cl_lru); + while (!list_empty(&clp->cl_openowners)) { + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } +- while (!list_empty(&clp->cl_sessions)) { +- struct nfsd4_session *ses; +- ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, +- se_perclnt); +- release_session(ses); +- } +- put_nfs4_client(clp); ++ nfsd4_set_callback_client(clp, NULL); ++ if (clp->cl_cb_conn.cb_xprt) ++ svc_xprt_put(clp->cl_cb_conn.cb_xprt); ++ list_del(&clp->cl_idhash); ++ list_del(&clp->cl_strhash); ++ spin_lock(&client_lock); ++ unhash_client_locked(clp); ++ if (atomic_read(&clp->cl_refcount) == 0) ++ free_client(clp); ++ spin_unlock(&client_lock); + } + + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) +@@ -839,14 +853,15 @@ static struct nfs4_client *create_client + } + + memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); +- atomic_set(&clp->cl_count, 1); +- atomic_set(&clp->cl_cb_conn.cb_set, 0); ++ atomic_set(&clp->cl_refcount, 0); ++ atomic_set(&clp->cl_cb_set, 0); + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_sessions); + INIT_LIST_HEAD(&clp->cl_lru); ++ clp->cl_time = 
get_seconds(); + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); + copy_verf(clp, verf); +@@ -877,8 +892,7 @@ add_to_unconfirmed(struct nfs4_client *c + list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]); + idhashval = clientid_hashval(clp->cl_clientid.cl_id); + list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]); +- list_add_tail(&clp->cl_lru, &client_lru); +- clp->cl_time = get_seconds(); ++ renew_client(clp); + } + + static void +@@ -888,10 +902,9 @@ move_to_confirmed(struct nfs4_client *cl + unsigned int strhashval; + + dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); +- list_del_init(&clp->cl_strhash); + list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); + strhashval = clientstr_hashval(clp->cl_recdir); +- list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); ++ list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); + renew_client(clp); + } + +@@ -1327,15 +1340,9 @@ nfsd4_create_session(struct svc_rqst *rq + cs_slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + +- /* +- * We do not support RDMA or persistent sessions +- */ +- cr_ses->flags &= ~SESSION4_PERSIST; +- cr_ses->flags &= ~SESSION4_RDMA; +- + if (cr_ses->flags & SESSION4_BACK_CHAN) { +- unconf->cl_cb_xprt = rqstp->rq_xprt; +- svc_xprt_get(unconf->cl_cb_xprt); ++ unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; ++ svc_xprt_get(rqstp->rq_xprt); + rpc_copy_addr( + (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, + sa); +@@ -1344,7 +1351,7 @@ nfsd4_create_session(struct svc_rqst *rq + cstate->minorversion; + unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; + unconf->cl_cb_seq_nr = 1; +- nfsd4_probe_callback(unconf); ++ nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); + } + conf = unconf; + } else { +@@ -1352,6 +1359,12 @@ nfsd4_create_session(struct svc_rqst *rq + goto out; + } + ++ /* ++ * We do not support RDMA or persistent sessions ++ */ ++ cr_ses->flags &= ~SESSION4_PERSIST; ++ cr_ses->flags 
&= ~SESSION4_RDMA; ++ + status = alloc_init_session(rqstp, conf, cr_ses); + if (status) + goto out; +@@ -1369,6 +1382,21 @@ out: + return status; + } + ++static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) ++{ ++ struct nfsd4_compoundres *resp = rqstp->rq_resp; ++ struct nfsd4_compoundargs *argp = rqstp->rq_argp; ++ ++ return argp->opcnt == resp->opcnt; ++} ++ ++static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) ++{ ++ if (!session) ++ return 0; ++ return !memcmp(sid, &session->se_sessionid, sizeof(*sid)); ++} ++ + __be32 + nfsd4_destroy_session(struct svc_rqst *r, + struct nfsd4_compound_state *cstate, +@@ -1384,19 +1412,25 @@ nfsd4_destroy_session(struct svc_rqst *r + * - Do we need to clear any callback info from previous session? + */ + ++ if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { ++ if (!nfsd4_last_compound_op(r)) ++ return nfserr_not_only_op; ++ } + dump_sessionid(__func__, &sessionid->sessionid); +- spin_lock(&sessionid_lock); ++ spin_lock(&client_lock); + ses = find_in_sessionid_hashtbl(&sessionid->sessionid); + if (!ses) { +- spin_unlock(&sessionid_lock); ++ spin_unlock(&client_lock); + goto out; + } + + unhash_session(ses); +- spin_unlock(&sessionid_lock); ++ spin_unlock(&client_lock); + ++ nfs4_lock_state(); + /* wait for callbacks */ +- shutdown_callback_client(ses->se_client); ++ nfsd4_set_callback_client(ses->se_client, NULL); ++ nfs4_unlock_state(); + nfsd4_put_session(ses); + status = nfs_ok; + out: +@@ -1417,7 +1451,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, + if (resp->opcnt != 1) + return nfserr_sequence_pos; + +- spin_lock(&sessionid_lock); ++ spin_lock(&client_lock); + status = nfserr_badsession; + session = find_in_sessionid_hashtbl(&seq->sessionid); + if (!session) +@@ -1456,23 +1490,47 @@ nfsd4_sequence(struct svc_rqst *rqstp, + cstate->slot = slot; + cstate->session = session; + +- /* Hold a session reference until done processing the compound: +- * 
nfsd4_put_session called only if the cstate slot is set. +- */ +- nfsd4_get_session(session); + out: +- spin_unlock(&sessionid_lock); +- /* Renew the clientid on success and on replay */ ++ /* Hold a session reference until done processing the compound. */ + if (cstate->session) { +- nfs4_lock_state(); +- renew_client(session->se_client); +- nfs4_unlock_state(); ++ nfsd4_get_session(cstate->session); ++ atomic_inc(&session->se_client->cl_refcount); + } ++ spin_unlock(&client_lock); + dprintk("%s: return %d\n", __func__, ntohl(status)); + return status; + } + + __be32 ++nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) ++{ ++ if (rc->rca_one_fs) { ++ if (!cstate->current_fh.fh_dentry) ++ return nfserr_nofilehandle; ++ /* ++ * We don't take advantage of the rca_one_fs case. ++ * That's OK, it's optional, we can safely ignore it. ++ */ ++ return nfs_ok; ++ } ++ nfs4_lock_state(); ++ if (is_client_expired(cstate->session->se_client)) { ++ nfs4_unlock_state(); ++ /* ++ * The following error isn't really legal. ++ * But we only get here if the client just explicitly ++ * destroyed the client. Surely it no longer cares what ++ * error it gets back on an operation for the dead ++ * client. ++ */ ++ return nfserr_stale_clientid; ++ } ++ nfsd4_create_clid_dir(cstate->session->se_client); ++ nfs4_unlock_state(); ++ return nfs_ok; ++} ++ ++__be32 + nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_setclientid *setclid) + { +@@ -1631,9 +1689,8 @@ nfsd4_setclientid_confirm(struct svc_rqs + if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) + status = nfserr_clid_inuse; + else { +- /* XXX: We just turn off callbacks until we can handle +- * change request correctly. 
*/ +- atomic_set(&conf->cl_cb_conn.cb_set, 0); ++ atomic_set(&conf->cl_cb_set, 0); ++ nfsd4_probe_callback(conf, &unconf->cl_cb_conn); + expire_client(unconf); + status = nfs_ok; + +@@ -1667,7 +1724,7 @@ nfsd4_setclientid_confirm(struct svc_rqs + } + move_to_confirmed(unconf); + conf = unconf; +- nfsd4_probe_callback(conf); ++ nfsd4_probe_callback(conf, &conf->cl_cb_conn); + status = nfs_ok; + } + } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) +@@ -1700,12 +1757,12 @@ alloc_init_file(struct inode *ino) + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); +- spin_lock(&recall_lock); +- list_add(&fp->fi_hash, &file_hashtbl[hashval]); +- spin_unlock(&recall_lock); + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; ++ spin_lock(&recall_lock); ++ list_add(&fp->fi_hash, &file_hashtbl[hashval]); ++ spin_unlock(&recall_lock); + return fp; + } + return NULL; +@@ -1827,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, s + stp->st_stateowner = sop; + get_nfs4_file(fp); + stp->st_file = fp; +- stp->st_stateid.si_boot = get_seconds(); ++ stp->st_stateid.si_boot = boot_time; + stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stateid.si_generation = 0; +@@ -2028,7 +2085,6 @@ void nfsd_break_deleg_cb(struct file_loc + * lock) we know the server hasn't removed the lease yet, we know + * it's safe to take a reference: */ + atomic_inc(&dp->dl_count); +- atomic_inc(&dp->dl_client->cl_count); + + spin_lock(&recall_lock); + list_add_tail(&dp->dl_recall_lru, &del_recall_lru); +@@ -2347,7 +2403,7 @@ nfs4_open_delegation(struct svc_fh *fh, + { + struct nfs4_delegation *dp; + struct nfs4_stateowner *sop = stp->st_stateowner; +- struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn; ++ int cb_up = atomic_read(&sop->so_client->cl_cb_set); + struct file_lock fl, *flp = &fl; + int status, flag = 0; + +@@ -2355,7 +2411,7 @@ 
nfs4_open_delegation(struct svc_fh *fh, + open->op_recall = 0; + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_PREVIOUS: +- if (!atomic_read(&cb->cb_set)) ++ if (!cb_up) + open->op_recall = 1; + flag = open->op_delegate_type; + if (flag == NFS4_OPEN_DELEGATE_NONE) +@@ -2366,7 +2422,7 @@ nfs4_open_delegation(struct svc_fh *fh, + * had the chance to reclaim theirs.... */ + if (locks_in_grace()) + goto out; +- if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) ++ if (!cb_up || !sop->so_confirmed) + goto out; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + flag = NFS4_OPEN_DELEGATE_WRITE; +@@ -2483,10 +2539,8 @@ nfsd4_process_open2(struct svc_rqst *rqs + } + memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + +- if (nfsd4_has_session(&resp->cstate)) { ++ if (nfsd4_has_session(&resp->cstate)) + open->op_stateowner->so_confirmed = 1; +- nfsd4_create_clid_dir(open->op_stateowner->so_client); +- } + + /* + * Attempt to hand out a delegation. No error return, because the +@@ -2537,7 +2591,7 @@ nfsd4_renew(struct svc_rqst *rqstp, stru + renew_client(clp); + status = nfserr_cb_path_down; + if (!list_empty(&clp->cl_delegations) +- && !atomic_read(&clp->cl_cb_conn.cb_set)) ++ && !atomic_read(&clp->cl_cb_set)) + goto out; + status = nfs_ok; + out: +@@ -2554,6 +2608,12 @@ nfsd4_end_grace(void) + dprintk("NFSD: end of grace period\n"); + nfsd4_recdir_purge_old(); + locks_end_grace(&nfsd4_manager); ++ /* ++ * Now that every NFSv4 client has had the chance to recover and ++ * to see the (possibly new, possibly shorter) lease time, we ++ * can safely set the next grace time to the current lease time: ++ */ ++ nfsd4_grace = nfsd4_lease; + } + + static time_t +@@ -2563,15 +2623,17 @@ nfs4_laundromat(void) + struct nfs4_stateowner *sop; + struct nfs4_delegation *dp; + struct list_head *pos, *next, reaplist; +- time_t cutoff = get_seconds() - NFSD_LEASE_TIME; +- time_t t, clientid_val = NFSD_LEASE_TIME; +- time_t u, test_val = NFSD_LEASE_TIME; ++ time_t 
cutoff = get_seconds() - nfsd4_lease; ++ time_t t, clientid_val = nfsd4_lease; ++ time_t u, test_val = nfsd4_lease; + + nfs4_lock_state(); + + dprintk("NFSD: laundromat service - starting\n"); + if (locks_in_grace()) + nfsd4_end_grace(); ++ INIT_LIST_HEAD(&reaplist); ++ spin_lock(&client_lock); + list_for_each_safe(pos, next, &client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { +@@ -2580,12 +2642,22 @@ nfs4_laundromat(void) + clientid_val = t; + break; + } ++ if (atomic_read(&clp->cl_refcount)) { ++ dprintk("NFSD: client in use (clientid %08x)\n", ++ clp->cl_clientid.cl_id); ++ continue; ++ } ++ unhash_client_locked(clp); ++ list_add(&clp->cl_lru, &reaplist); ++ } ++ spin_unlock(&client_lock); ++ list_for_each_safe(pos, next, &reaplist) { ++ clp = list_entry(pos, struct nfs4_client, cl_lru); + dprintk("NFSD: purging unused client (clientid %08x)\n", + clp->cl_clientid.cl_id); + nfsd4_remove_clid_dir(clp); + expire_client(clp); + } +- INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + list_for_each_safe(pos, next, &del_recall_lru) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); +@@ -2605,7 +2677,7 @@ nfs4_laundromat(void) + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } +- test_val = NFSD_LEASE_TIME; ++ test_val = nfsd4_lease; + list_for_each_safe(pos, next, &close_lru) { + sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); + if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { +@@ -2661,39 +2733,11 @@ nfs4_check_fh(struct svc_fh *fhp, struct + static int + STALE_STATEID(stateid_t *stateid) + { +- if (time_after((unsigned long)boot_time, +- (unsigned long)stateid->si_boot)) { +- dprintk("NFSD: stale stateid " STATEID_FMT "!\n", +- STATEID_VAL(stateid)); +- return 1; +- } +- return 0; +-} +- +-static int +-EXPIRED_STATEID(stateid_t *stateid) +-{ +- if (time_before((unsigned long)boot_time, +- ((unsigned 
long)stateid->si_boot)) && +- time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) { +- dprintk("NFSD: expired stateid " STATEID_FMT "!\n", +- STATEID_VAL(stateid)); +- return 1; +- } +- return 0; +-} +- +-static __be32 +-stateid_error_map(stateid_t *stateid) +-{ +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; +- if (EXPIRED_STATEID(stateid)) +- return nfserr_expired; +- +- dprintk("NFSD: bad stateid " STATEID_FMT "!\n", ++ if (stateid->si_boot == boot_time) ++ return 0; ++ dprintk("NFSD: stale stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); +- return nfserr_bad_stateid; ++ return 1; + } + + static inline int +@@ -2817,10 +2861,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + status = nfserr_bad_stateid; + if (is_delegation_stateid(stateid)) { + dp = find_delegation_stateid(ino, stateid); +- if (!dp) { +- status = stateid_error_map(stateid); ++ if (!dp) + goto out; +- } + status = check_stateid_generation(stateid, &dp->dl_stateid, + flags); + if (status) +@@ -2833,10 +2875,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + *filpp = dp->dl_vfs_file; + } else { /* open or lock stateid */ + stp = find_stateid(stateid, flags); +- if (!stp) { +- status = stateid_error_map(stateid); ++ if (!stp) + goto out; +- } + if (nfs4_check_fh(current_fh, stp)) + goto out; + if (!stp->st_stateowner->so_confirmed) +@@ -2908,7 +2948,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + */ + sop = search_close_lru(stateid->si_stateownerid, flags); + if (sop == NULL) +- return stateid_error_map(stateid); ++ return nfserr_bad_stateid; + *sopp = sop; + goto check_replay; + } +@@ -3175,10 +3215,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (!is_delegation_stateid(stateid)) + goto out; + dp = find_delegation_stateid(inode, stateid); +- if (!dp) { +- status = stateid_error_map(stateid); ++ if (!dp) + goto out; +- } + status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + if (status) + goto out; +@@ -3404,7 +3442,7 @@ 
alloc_init_lock_stateid(struct nfs4_stat + stp->st_stateowner = sop; + get_nfs4_file(fp); + stp->st_file = fp; +- stp->st_stateid.si_boot = get_seconds(); ++ stp->st_stateid.si_boot = boot_time; + stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stateid.si_generation = 0; +@@ -3976,12 +4014,6 @@ nfsd4_load_reboot_recovery_data(void) + printk("NFSD: Failure reading reboot recovery data\n"); + } + +-unsigned long +-get_nfs4_grace_period(void) +-{ +- return max(user_lease_time, lease_time) * HZ; +-} +- + /* + * Since the lifetime of a delegation isn't limited to that of an open, a + * client may quite reasonably hang on to a delegation as long as it has +@@ -4008,20 +4040,27 @@ set_max_delegations(void) + static int + __nfs4_state_start(void) + { +- unsigned long grace_time; ++ int ret; + + boot_time = get_seconds(); +- grace_time = get_nfs4_grace_period(); +- lease_time = user_lease_time; + locks_start_grace(&nfsd4_manager); + printk(KERN_INFO "NFSD: starting %ld-second grace period\n", +- grace_time/HZ); ++ nfsd4_grace); ++ ret = set_callback_cred(); ++ if (ret) ++ return -ENOMEM; + laundry_wq = create_singlethread_workqueue("nfsd4"); + if (laundry_wq == NULL) + return -ENOMEM; +- queue_delayed_work(laundry_wq, &laundromat_work, grace_time); ++ ret = nfsd4_create_callback_queue(); ++ if (ret) ++ goto out_free_laundry; ++ queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ); + set_max_delegations(); +- return set_callback_cred(); ++ return 0; ++out_free_laundry: ++ destroy_workqueue(laundry_wq); ++ return ret; + } + + int +@@ -4039,12 +4078,6 @@ nfs4_state_start(void) + return 0; + } + +-time_t +-nfs4_lease_time(void) +-{ +- return lease_time; +-} +- + static void + __nfs4_state_shutdown(void) + { +@@ -4089,6 +4122,7 @@ nfs4_state_shutdown(void) + nfs4_lock_state(); + nfs4_release_reclaim(); + __nfs4_state_shutdown(); ++ nfsd4_destroy_callback_queue(); + nfs4_unlock_state(); + } + +@@ -4128,21 +4162,3 @@ 
nfs4_recoverydir(void) + { + return user_recovery_dirname; + } +- +-/* +- * Called when leasetime is changed. +- * +- * The only way the protocol gives us to handle on-the-fly lease changes is to +- * simulate a reboot. Instead of doing that, we just wait till the next time +- * we start to register any changes in lease time. If the administrator +- * really wants to change the lease time *now*, they can go ahead and bring +- * nfsd down and then back up again after changing the lease time. +- * +- * user_lease_time is protected by nfsd_mutex since it's only really accessed +- * when nfsd is starting +- */ +-void +-nfs4_reset_lease(time_t leasetime) +-{ +- user_lease_time = leasetime; +-} +diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 09:57:20.629370282 -0400 +@@ -46,6 +46,7 @@ enum { + */ + #ifdef CONFIG_NFSD_V4 + NFSD_Leasetime, ++ NFSD_Gracetime, + NFSD_RecoveryDir, + #endif + }; +@@ -70,6 +71,7 @@ static ssize_t write_ports(struct file * + static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); + #ifdef CONFIG_NFSD_V4 + static ssize_t write_leasetime(struct file *file, char *buf, size_t size); ++static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif + +@@ -91,6 +93,7 @@ static ssize_t (*write_op[])(struct file + [NFSD_MaxBlkSize] = write_maxblksize, + #ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = write_leasetime, ++ [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif + }; +@@ -1204,29 +1207,45 @@ static ssize_t write_maxblksize(struct f + } + + #ifdef CONFIG_NFSD_V4 +-extern time_t nfs4_leasetime(void); +- +-static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) ++static ssize_t __nfsd4_write_time(struct file 
*file, char *buf, size_t size, time_t *time) + { +- /* if size > 10 seconds, call +- * nfs4_reset_lease() then write out the new lease (seconds) as reply +- */ + char *mesg = buf; +- int rv, lease; ++ int rv, i; + + if (size > 0) { + if (nfsd_serv) + return -EBUSY; +- rv = get_int(&mesg, &lease); ++ rv = get_int(&mesg, &i); + if (rv) + return rv; +- if (lease < 10 || lease > 3600) ++ /* ++ * Some sanity checking. We don't have a reason for ++ * these particular numbers, but problems with the ++ * extremes are: ++ * - Too short: the briefest network outage may ++ * cause clients to lose all their locks. Also, ++ * the frequent polling may be wasteful. ++ * - Too long: do you really want reboot recovery ++ * to take more than an hour? Or to make other ++ * clients wait an hour before being able to ++ * revoke a dead client's locks? ++ */ ++ if (i < 10 || i > 3600) + return -EINVAL; +- nfs4_reset_lease(lease); ++ *time = i; + } + +- return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", +- nfs4_lease_time()); ++ return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time); ++} ++ ++static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __nfsd4_write_time(file, buf, size, time); ++ mutex_unlock(&nfsd_mutex); ++ return rv; + } + + /** +@@ -1252,12 +1271,22 @@ static ssize_t __write_leasetime(struct + */ + static ssize_t write_leasetime(struct file *file, char *buf, size_t size) + { +- ssize_t rv; ++ return nfsd4_write_time(file, buf, size, &nfsd4_lease); ++} + +- mutex_lock(&nfsd_mutex); +- rv = __write_leasetime(file, buf, size); +- mutex_unlock(&nfsd_mutex); +- return rv; ++/** ++ * write_gracetime - Set or report current NFSv4 grace period time ++ * ++ * As above, but sets the time of the NFSv4 grace period. ++ * ++ * Note this should never be set to less than the *previous* ++ * lease-period time, but we don't try to enforce this. 
(In the common ++ * case (a new boot), we don't know what the previous lease time was ++ * anyway.) ++ */ ++static ssize_t write_gracetime(struct file *file, char *buf, size_t size) ++{ ++ return nfsd4_write_time(file, buf, size, &nfsd4_grace); + } + + extern char *nfs4_recoverydir(void); +@@ -1351,6 +1380,7 @@ static int nfsd_fill_super(struct super_ + [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, + #ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, ++ [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + #endif + /* last one */ {""} +diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 09:57:20.629370282 -0400 +@@ -82,7 +82,6 @@ int nfs4_state_init(void); + void nfsd4_free_slabs(void); + int nfs4_state_start(void); + void nfs4_state_shutdown(void); +-time_t nfs4_lease_time(void); + void nfs4_reset_lease(time_t leasetime); + int nfs4_reset_recoverydir(char *recdir); + #else +@@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) + static inline void nfsd4_free_slabs(void) { } + static inline int nfs4_state_start(void) { return 0; } + static inline void nfs4_state_shutdown(void) { } +-static inline time_t nfs4_lease_time(void) { return 0; } + static inline void nfs4_reset_lease(time_t leasetime) { } + static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } + #endif +@@ -229,6 +227,9 @@ extern struct timeval nfssvc_boot; + + #ifdef CONFIG_NFSD_V4 + ++extern time_t nfsd4_lease; ++extern time_t nfsd4_grace; ++ + /* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. 
otherwise, + * we might process an operation with side effects, and be unable to +@@ -247,7 +248,6 @@ extern struct timeval nfssvc_boot; + #define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ + #define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ + +-#define NFSD_LEASE_TIME (nfs4_lease_time()) + #define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ + + /* +diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 09:57:21.807501619 -0400 +@@ -70,6 +70,16 @@ struct nfsd4_cb_sequence { + struct nfs4_client *cbs_clp; + }; + ++struct nfs4_rpc_args { ++ void *args_op; ++ struct nfsd4_cb_sequence args_seq; ++}; ++ ++struct nfsd4_callback { ++ struct nfs4_rpc_args cb_args; ++ struct work_struct cb_work; ++}; ++ + struct nfs4_delegation { + struct list_head dl_perfile; + struct list_head dl_perclnt; +@@ -86,6 +96,7 @@ struct nfs4_delegation { + stateid_t dl_stateid; + struct knfsd_fh dl_fh; + int dl_retries; ++ struct nfsd4_callback dl_recall; + }; + + /* client delegation callback info */ +@@ -96,9 +107,7 @@ struct nfs4_cb_conn { + u32 cb_prog; + u32 cb_minorversion; + u32 cb_ident; /* minorversion 0 only */ +- /* RPC client info */ +- atomic_t cb_set; /* successful CB_NULL call */ +- struct rpc_clnt * cb_client; ++ struct svc_xprt *cb_xprt; /* minorversion 1 only */ + }; + + /* Maximum number of slots per session. 
160 is useful for long haul TCP */ +@@ -157,7 +166,7 @@ struct nfsd4_session { + struct list_head se_hash; /* hash by sessionid */ + struct list_head se_perclnt; + u32 se_flags; +- struct nfs4_client *se_client; /* for expire_client */ ++ struct nfs4_client *se_client; + struct nfs4_sessionid se_sessionid; + struct nfsd4_channel_attrs se_fchannel; + struct nfsd4_channel_attrs se_bchannel; +@@ -212,25 +221,41 @@ struct nfs4_client { + struct svc_cred cl_cred; /* setclientid principal */ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ +- struct nfs4_cb_conn cl_cb_conn; /* callback info */ +- atomic_t cl_count; /* ref count */ + u32 cl_firststate; /* recovery dir creation */ + ++ /* for v4.0 and v4.1 callbacks: */ ++ struct nfs4_cb_conn cl_cb_conn; ++ struct rpc_clnt *cl_cb_client; ++ atomic_t cl_cb_set; ++ + /* for nfs41 */ + struct list_head cl_sessions; + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ + u32 cl_exchange_flags; + struct nfs4_sessionid cl_sessionid; ++ /* number of rpc's in progress over an associated session: */ ++ atomic_t cl_refcount; + + /* for nfs41 callbacks */ + /* We currently support a single back channel with a single slot */ + unsigned long cl_cb_slot_busy; + u32 cl_cb_seq_nr; +- struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */ + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ + }; + ++static inline void ++mark_client_expired(struct nfs4_client *clp) ++{ ++ clp->cl_time = 0; ++} ++ ++static inline bool ++is_client_expired(struct nfs4_client *clp) ++{ ++ return clp->cl_time == 0; ++} ++ + /* struct nfs4_client_reset + * one per old client. Populates reset_str_hashtbl. 
Filled from conf_id_hashtbl + * upon lease reset, or from upcall to state_daemon (to read in state +@@ -377,11 +402,14 @@ extern void nfs4_lock_state(void); + extern void nfs4_unlock_state(void); + extern int nfs4_in_grace(void); + extern __be32 nfs4_check_open_reclaim(clientid_t *clid); +-extern void put_nfs4_client(struct nfs4_client *clp); + extern void nfs4_free_stateowner(struct kref *kref); + extern int set_callback_cred(void); +-extern void nfsd4_probe_callback(struct nfs4_client *clp); ++extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); ++extern void nfsd4_do_callback_rpc(struct work_struct *); + extern void nfsd4_cb_recall(struct nfs4_delegation *dp); ++extern int nfsd4_create_callback_queue(void); ++extern void nfsd4_destroy_callback_queue(void); ++extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *); + extern void nfs4_put_delegation(struct nfs4_delegation *dp); + extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); + extern void nfsd4_init_recdir(char *recdir_name); +@@ -392,6 +420,7 @@ extern int nfs4_has_reclaimed_state(cons + extern void nfsd4_recdir_purge_old(void); + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); ++extern void release_session_client(struct nfsd4_session *); + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 09:57:23.994379831 -0400 +@@ -381,6 +381,10 @@ struct nfsd4_destroy_session { + struct nfs4_sessionid sessionid; + }; + ++struct nfsd4_reclaim_complete { ++ u32 rca_one_fs; ++}; ++ + struct nfsd4_op { + int opnum; + __be32 status; +@@ -421,6 +425,7 @@ struct nfsd4_op { + struct nfsd4_create_session create_session; + struct 
nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; ++ struct nfsd4_reclaim_complete reclaim_complete; + } u; + struct nfs4_replay * replay; + }; +@@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(stru + extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq); + extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, +- struct nfsd4_compound_state *, +-struct nfsd4_exchange_id *); +- extern __be32 nfsd4_create_session(struct svc_rqst *, ++ struct nfsd4_compound_state *, struct nfsd4_exchange_id *); ++extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_create_session *); + extern __be32 nfsd4_sequence(struct svc_rqst *, +@@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_ + extern __be32 nfsd4_destroy_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_destroy_session *); ++__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); + extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, + struct nfsd4_open *open); + extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 09:57:23.994379831 -0400 +@@ -40,12 +40,12 @@ struct nfs_fhbase_old { + * This is the new flexible, extensible style NFSv2/v3 file handle. + * by Neil Brown - March 2000 + * +- * The file handle is seens as a list of 4byte words. +- * The first word contains a version number (1) and four descriptor bytes ++ * The file handle starts with a sequence of four-byte words. ++ * The first word contains a version number (1) and three descriptor bytes + * that tell how the remaining 3 variable length fields should be handled. 
+ * These three bytes are auth_type, fsid_type and fileid_type. + * +- * All 4byte values are in host-byte-order. ++ * All four-byte values are in host-byte-order. + * + * The auth_type field specifies how the filehandle can be authenticated + * This might allow a file to be confirmed to be in a writable part of a +diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sunrpc/cache.c +--- linux-2.6.34.noarch/net/sunrpc/cache.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 09:57:23.995376793 -0400 +@@ -49,11 +49,17 @@ static void cache_init(struct cache_head + h->last_refresh = now; + } + ++static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h) ++{ ++ return (h->expiry_time < get_seconds()) || ++ (detail->flush_time > h->last_refresh); ++} ++ + struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, + struct cache_head *key, int hash) + { + struct cache_head **head, **hp; +- struct cache_head *new = NULL; ++ struct cache_head *new = NULL, *freeme = NULL; + + head = &detail->hash_table[hash]; + +@@ -62,6 +68,9 @@ struct cache_head *sunrpc_cache_lookup(s + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { ++ if (cache_is_expired(detail, tmp)) ++ /* This entry is expired, we will discard it. 
*/ ++ break; + cache_get(tmp); + read_unlock(&detail->hash_lock); + return tmp; +@@ -86,6 +95,13 @@ struct cache_head *sunrpc_cache_lookup(s + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { ++ if (cache_is_expired(detail, tmp)) { ++ *hp = tmp->next; ++ tmp->next = NULL; ++ detail->entries --; ++ freeme = tmp; ++ break; ++ } + cache_get(tmp); + write_unlock(&detail->hash_lock); + cache_put(new, detail); +@@ -98,6 +114,8 @@ struct cache_head *sunrpc_cache_lookup(s + cache_get(new); + write_unlock(&detail->hash_lock); + ++ if (freeme) ++ cache_put(freeme, detail); + return new; + } + EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); +@@ -183,10 +201,7 @@ static int cache_make_upcall(struct cach + + static inline int cache_is_valid(struct cache_detail *detail, struct cache_head *h) + { +- if (!test_bit(CACHE_VALID, &h->flags) || +- h->expiry_time < get_seconds()) +- return -EAGAIN; +- else if (detail->flush_time > h->last_refresh) ++ if (!test_bit(CACHE_VALID, &h->flags)) + return -EAGAIN; + else { + /* entry is valid */ +diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/sunrpc/svcsock.c +--- linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 09:57:23.997368707 -0400 +@@ -547,7 +547,6 @@ static int svc_udp_recvfrom(struct svc_r + dprintk("svc: recvfrom returned error %d\n", -err); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + } +- svc_xprt_received(&svsk->sk_xprt); + return -EAGAIN; + } + len = svc_addr_len(svc_addr(rqstp)); +@@ -562,11 +561,6 @@ static int svc_udp_recvfrom(struct svc_r + svsk->sk_sk->sk_stamp = skb->tstamp; + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ + +- /* +- * Maybe more packets - kick another thread ASAP. 
+- */ +- svc_xprt_received(&svsk->sk_xprt); +- + len = skb->len - sizeof(struct udphdr); + rqstp->rq_arg.len = len; + +@@ -917,7 +911,6 @@ static int svc_tcp_recv_record(struct sv + if (len < want) { + dprintk("svc: short recvfrom while reading record " + "length (%d of %d)\n", len, want); +- svc_xprt_received(&svsk->sk_xprt); + goto err_again; /* record header not complete */ + } + +@@ -953,7 +946,6 @@ static int svc_tcp_recv_record(struct sv + if (len < svsk->sk_reclen) { + dprintk("svc: incomplete TCP record (%d of %d)\n", + len, svsk->sk_reclen); +- svc_xprt_received(&svsk->sk_xprt); + goto err_again; /* record not complete */ + } + len = svsk->sk_reclen; +@@ -961,10 +953,8 @@ static int svc_tcp_recv_record(struct sv + + return len; + error: +- if (len == -EAGAIN) { ++ if (len == -EAGAIN) + dprintk("RPC: TCP recv_record got EAGAIN\n"); +- svc_xprt_received(&svsk->sk_xprt); +- } + return len; + err_delete: + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); +@@ -1110,7 +1100,6 @@ out: + svsk->sk_tcplen = 0; + + svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt); +- svc_xprt_received(&svsk->sk_xprt); + if (serv->sv_stats) + serv->sv_stats->nettcpcnt++; + +@@ -1119,7 +1108,6 @@ out: + err_again: + if (len == -EAGAIN) { + dprintk("RPC: TCP recvfrom got EAGAIN\n"); +- svc_xprt_received(&svsk->sk_xprt); + return len; + } + error: +diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/sunrpc/svc_xprt.c +--- linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 09:57:23.996377209 -0400 +@@ -744,8 +744,10 @@ int svc_recv(struct svc_rqst *rqstp, lon + if (rqstp->rq_deferred) { + svc_xprt_received(xprt); + len = svc_deferred_recv(rqstp); +- } else ++ } else { + len = xprt->xpt_ops->xpo_recvfrom(rqstp); ++ svc_xprt_received(xprt); ++ } + dprintk("svc: got len=%d\n", len); + } + +@@ -893,12 +895,12 @@ void svc_delete_xprt(struct svc_xprt *xp + */ + if (test_bit(XPT_TEMP, 
&xprt->xpt_flags)) + serv->sv_tmpcnt--; ++ spin_unlock_bh(&serv->sv_lock); + + while ((dr = svc_deferred_dequeue(xprt)) != NULL) + kfree(dr); + + svc_xprt_put(xprt); +- spin_unlock_bh(&serv->sv_lock); + } + + void svc_close_xprt(struct svc_xprt *xprt) +diff -up linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +--- linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 09:57:23.998377481 -0400 +@@ -566,7 +566,6 @@ static int rdma_read_complete(struct svc + ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, + rqstp->rq_arg.head[0].iov_len); + +- svc_xprt_received(rqstp->rq_xprt); + return ret; + } + +@@ -665,7 +664,6 @@ int svc_rdma_recvfrom(struct svc_rqst *r + rqstp->rq_arg.head[0].iov_len); + rqstp->rq_prot = IPPROTO_MAX; + svc_xprt_copy_addrs(rqstp, xprt); +- svc_xprt_received(xprt); + return ret; + + close_out: +@@ -678,6 +676,5 @@ int svc_rdma_recvfrom(struct svc_rqst *r + */ + set_bit(XPT_CLOSE, &xprt->xpt_flags); + defer: +- svc_xprt_received(xprt); + return 0; + } diff --git a/pnfs-all-2.6.35-2010-08-19-f13.patch b/pnfs-all-2.6.35-2010-08-19-f13.patch new file mode 100644 index 000000000..a9d78ba0e --- /dev/null +++ b/pnfs-all-2.6.35-2010-08-19-f13.patch @@ -0,0 +1,31788 @@ +diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-23 12:08:27.310584826 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-23 12:09:03.273553977 -0400 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include "init.h" + #include "kern_constants.h" + #include "os.h" +diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c +--- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ 
linux-2.6.34.noarch/block/genhd.c 2010-08-23 12:09:03.273553977 -0400 +@@ -1009,6 +1009,7 @@ static void disk_release(struct device * + struct class block_class = { + .name = "block", + }; ++EXPORT_SYMBOL(block_class); + + static char *block_devnode(struct device *dev, mode_t *mode) + { +diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-23 12:09:03.274563927 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-23 12:09:03.274563927 -0400 +@@ -0,0 +1,211 @@ ++(c) 2007 Network Appliance Inc. ++ ++spNFS ++----- ++ ++An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS). ++ ++A file system is mounted by the clients from the MDS, and all file data ++is striped across the DSs. ++ ++Identify the machines that will be filling each of these roles. ++ ++The spnfs kernel will be installed on all machines: clients, the MDS and DSs. ++ ++ ++Building and installing the spNFS kernel ++---------------------------------------- ++ ++Get the spNFS kernel from: ++ ++ git://linux-nfs.org/~bhalevy/linux-pnfs.git ++ ++Use the pnfs-all-latest branch and add these options to your .config file ++ ++ CONFIG_NETWORK_FILESYSTEMS=y ++ CONFIG_NFS_FS=m ++ CONFIG_NFS_V4=y ++ CONFIG_NFS_V4_1=y ++ CONFIG_PNFS=y ++ CONFIG_NFSD=m ++ CONFIG_PNFSD=y ++ # CONFIG_PNFSD_LOCAL_EXPORT is not set ++ CONFIG_SPNFS=y ++ ++By default, spNFS uses whole-file layouts. Layout segments can be enabled ++by adding: ++ ++ CONFIG_SPNFS_LAYOUTSEGMENTS=y ++ ++to your .config file. ++ ++Building and installation of kernel+modules is as usual. ++This kernel should be installed and booted on the client, MDS and DSs. ++ ++Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it ++takes over the pnfs export interface. 
++ ++Building nfs-utils ++------------------ ++ ++Get the nfs-utils package containing spnfsd from: ++ ++ git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git ++ ++Follow the standard instructions for building nfs-utils. ++ ++After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd ++daemon will only be needed on the MDS. ++ ++ ++Installation ++------------ ++ ++The nfs-utils package contains a default spnfsd.conf file in ++utils/spnfsd/spnfsd.conf. Copy this file to /etc/spnfsd.conf. ++ ++By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under ++this directory, mount points must be created for each DS to ++be used for pNFS data stripes. These mount points are named by the IP address ++of the corresponding DS. In the sample spnfsd.conf, there are two ++DSs defined (172.16.28.134 and 172.16.28.141). ++ ++Following the sample spnfsd.conf, ++ ++ mkdir /spnfs ++ ++on the MDS (corresponding to DS-Mount-Directory). Then ++ ++ mkdir /spnfs/172.16.28.134 ++ mkdir /spnfs/172.16.28.141 ++ ++to create the mount points for the DSs. ++ ++On the DSs, choose a directory where data stripes will be created by the MDS. ++For the sample file, this directory is /pnfs, so on each DS execute: ++ ++ mkdir /pnfs ++ ++This directory is specified in the spnfsd.conf file by the DS*_ROOT option ++(where * is replaced by the DS number). DS_ROOT is specified relative to ++the directory being exported by the DSs. In our example, our DSs are exporting ++the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have ++the following entry in /etc/exports: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check) ++ ++N.B. If we had created a /exports directory and a /pnfs directory under ++/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs ++(not /exports/pnfs). ++ ++It may be useful to add entries to /etc/fstab on the MDS to automatically ++mount the DS_ROOT file systems.
For this example, our MDS fstab would ++contain: ++ ++ 172.16.28.134:/pnfs /spnfs/172.16.28.134 nfs defaults 1 2 ++ 172.16.28.141:/pnfs /spnfs/172.16.28.141 nfs defaults 1 2 ++ ++The DS mounts must be performed manually or via fstab at this time (automatic ++mounting, directory creation, etc. are on the todo list). To perform I/O ++through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction ++will eventually be removed). ++ ++ ++On the MDS, choose a file system to use with spNFS and export it, e.g.: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs) ++ ++Make sure nfsd and all supporting processes are running on the MDS and DSs. ++ ++ ++Running ++------- ++ ++If rpc_pipefs is not already mounted (if you're running idmapd it probably is), ++you may want to add the following line to /etc/fstab: ++ ++ rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0 ++ ++to automatically mount rpc_pipefs. ++ ++With spnfsd.conf configured for your environment and the mounts mounted as ++described above, spnfsd can now be started. ++ ++On the MDS, execute spnfsd: ++ ++ spnfsd ++ ++The executable is located in the directory where it was built, and ++may also have been installed elsewhere depending on how you built nfs-utils. ++It will run in the foreground by default, and in fact will do so despite ++any options suggesting the contrary (it's still a debugging build). ++ ++On the client, make sure the nfslayoutdriver module is loaded: ++ ++ modprobe nfslayoutdriver ++ ++Then mount the file system from the MDS: ++ ++ mount -t nfs4 -o minorversion=1 mds:/ /mnt ++ ++I/O through the MDS is now supported. To use it, do not load the ++nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1 ++(NFSv2 and v3 are not yet supported). ++ ++You may now use spNFS by performing file system activities in /mnt. ++If you create files in /mnt, you should see stripe files corresponding to ++new files being created on the DSs.
The current implementation names the ++stripe files based on the inode number of the file on the MDS. For example, ++if you create a file foo in /mnt and do an 'ls -li /mnt/foo': ++ ++ # ls -li foo ++ 1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo ++ ++You should see stripe files on each DS under /pnfs (per the sample) named ++1233. The file /pnfs/1233 on DS1 will contain the first stripe-size bytes ++of data written to foo, DS2 will contain the next stripe-size bytes, etc. ++Removing /mnt/foo will remove the corresponding stripe files on the DSs. ++Other file system operations should behave (mostly :-) as expected. ++ ++ ++Layout Segments ++--------------- ++ ++If the kernel is compiled to support layout segments, there will ++be two files created under /proc/fs/spnfs for controlling layout ++segment functionality. ++ ++To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.: ++ ++ echo 1 > /proc/fs/spnfs/layoutseg ++ ++Layout segments can be disabled (returning to whole-file layouts) by ++writing a '0' to /proc/fs/spnfs/layoutseg: ++ ++ echo 0 > /proc/fs/spnfs/layoutseg ++ ++When layout segments are enabled, the size of the layouts returned can ++be specified by writing a decimal number (ascii representation) to ++/proc/fs/spnfs/layoutsegsize: ++ ++ echo 1024 > /proc/fs/spnfs/layoutsegsize ++ ++The value '0' has a special meaning--it causes the server to return a ++layout that is exactly the size requested by the client: ++ ++ echo 0 > /proc/fs/spnfs/layoutsegsize ++ ++ ++Troubleshooting ++--------------- ++ ++If you see data being written to the files on the MDS rather than ++the stripe files, make sure the nfslayoutdriver is loaded on the client ++(see above). ++ ++If you get a "permission denied" error, make sure mountd is running on the MDS ++(it occasionally fails to start).
++ ++Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com ++ ++ +diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c +--- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-23 12:09:03.275584050 -0400 +@@ -651,6 +651,12 @@ static int dev_create(struct dm_ioctl *p + return r; + } + ++int dm_dev_create(struct dm_ioctl *param) ++{ ++ return dev_create(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_create); ++ + /* + * Always use UUID for lookups if it's present, otherwise use name or dev. + */ +@@ -745,6 +751,12 @@ static int dev_remove(struct dm_ioctl *p + return 0; + } + ++int dm_dev_remove(struct dm_ioctl *param) ++{ ++ return dev_remove(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_remove); ++ + /* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. +@@ -917,6 +929,12 @@ static int do_resume(struct dm_ioctl *pa + return r; + } + ++int dm_do_resume(struct dm_ioctl *param) ++{ ++ return do_resume(param); ++} ++EXPORT_SYMBOL(dm_do_resume); ++ + /* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. 
+@@ -1194,6 +1212,12 @@ out: + return r; + } + ++int dm_table_load(struct dm_ioctl *param, size_t param_size) ++{ ++ return table_load(param, param_size); ++} ++EXPORT_SYMBOL(dm_table_load); ++ + static int table_clear(struct dm_ioctl *param, size_t param_size) + { + int r; +diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c +--- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-23 12:09:03.276563906 -0400 +@@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct + put_device(&class_to_shost(dev)->shost_gendev); + } + +-static struct class shost_class = { ++struct class shost_class = { + .name = "scsi_host", + .dev_release = scsi_host_cls_release, + }; +diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h +--- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-23 12:09:03.277563890 -0400 +@@ -36,13 +36,9 @@ + #include + #include + #include ++#include + #include "common.h" + +-/* FIXME: Remove once pnfs hits mainline +- * #include +- */ +-#include "pnfs.h" +- + #define EXOFS_ERR(fmt, a...) 
printk(KERN_ERR "exofs: " fmt, ##a) + + #ifdef CONFIG_EXOFS_DEBUG +@@ -103,6 +99,7 @@ struct exofs_sb_info { + struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ ++ spinlock_t i_layout_lock; /* lock for layout/return/recall */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ +@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si + */ + #define OBJ_2BCREATED 0 /* object will be created soon*/ + #define OBJ_CREATED 1 /* object has been created on the osd*/ ++/* Below are not used atomic but reuse the same i_flags */ ++#define OBJ_LAYOUT_IS_GIVEN 2 /* inode has given layouts to clients*/ ++#define OBJ_IN_LAYOUT_RECALL 3 /* inode is in the middle of a layout recall*/ + + static inline int obj_2bcreated(struct exofs_i_info *oi) + { +@@ -304,4 +304,20 @@ extern const struct inode_operations exo + extern const struct inode_operations exofs_symlink_inode_operations; + extern const struct inode_operations exofs_fast_symlink_inode_operations; + ++/* export.c */ ++typedef int (exofs_recall_fn)(struct inode *inode); ++#ifdef CONFIG_PNFSD ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo); ++void exofs_init_export(struct super_block *sb); ++#else ++static inline int exofs_inode_recall_layout(struct inode *inode, ++ enum pnfs_iomode iomode, exofs_recall_fn todo) ++{ ++ return todo(inode); ++} ++ ++static inline void exofs_init_export(struct super_block *sb) {} ++#endif ++ + #endif +diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-23 12:09:03.278386746 -0400 ++++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-23 12:09:03.278386746 -0400 +@@ -0,0 +1,396 @@ ++/* ++ * export.c - Implementation of the 
pnfs_export_operations ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Boaz Harrosh ++ * ++ * This file is part of exofs. ++ * ++ * exofs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation. Since it is based on ext2, and the only ++ * valid version of GPL for the Linux kernel is version 2, the only valid ++ * version of GPL for exofs is version 2. ++ * ++ * exofs is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with exofs; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include "exofs.h" ++ ++static int exofs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_OSD2_OBJECTS; ++} ++ ++static void set_dev_id(struct pnfs_deviceid *pnfs_devid, u64 sbid, u64 devid) ++{ ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)pnfs_devid; ++ ++ dev_id->sbid = sbid; ++ dev_id->devid = devid; ++} ++ ++static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode, ++ u64 offset, u64 length, void *cookie) ++{ ++ struct nfsd4_pnfs_cb_layout cbl; ++ struct pnfsd_cb_ctl cb_ctl; ++ int status; ++ ++ memset(&cb_ctl, 0, sizeof(cb_ctl)); ++ status = pnfsd_get_cb_op(&cb_ctl); ++ if (unlikely(status)) { ++ EXOFS_ERR("%s: nfsd unloaded!! 
inode (0x%lx) status=%d\n", ++ __func__, inode->i_ino, status); ++ goto err; ++ } ++ ++ memset(&cbl, 0, sizeof(cbl)); ++ cbl.cbl_recall_type = RETURN_FILE; ++ cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS; ++ cbl.cbl_seg.iomode = iomode; ++ cbl.cbl_seg.offset = offset; ++ cbl.cbl_seg.length = length; ++ cbl.cbl_cookie = cookie; ++ ++ status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl); ++ pnfsd_put_cb_op(&cb_ctl); ++ ++err: ++ return status; ++} ++ ++static enum nfsstat4 exofs_layout_get( ++ struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; ++ struct exofs_layout *el = &sbi->layout; ++ struct pnfs_osd_object_cred *creds = NULL; ++ struct pnfs_osd_layout layout; ++ __be32 *start; ++ bool in_recall; ++ int i, err; ++ enum nfsstat4 nfserr; ++ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ res->lg_seg.iomode = IOMODE_RW; ++ res->lg_return_on_close = true; /* TODO: unused but will be soon */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL); ++ if (!creds) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto out; ++ } ++ ++ /* Fill in a pnfs_osd_layout struct */ ++ layout.olo_map = sbi->data_map; ++ ++ for (i = 0; i < el->s_numdevs; i++) { ++ struct pnfs_osd_object_cred *cred = &creds[i]; ++ osd_id id = exofs_oi_objno(oi); ++ unsigned dev = exofs_layout_od_id(el, id, i); ++ ++ set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid, ++ dev); ++ cred->oc_object_id.oid_partition_id = el->s_pid; ++ cred->oc_object_id.oid_object_id = id; ++ cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ? 
++ PNFS_OSD_VERSION_1 : ++ PNFS_OSD_VERSION_2; ++ cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE; ++ ++ cred->oc_cap_key.cred_len = 0; ++ cred->oc_cap_key.cred = NULL; ++ ++ cred->oc_cap.cred_len = OSD_CAP_LEN; ++ cred->oc_cap.cred = oi->i_cred; ++ } ++ ++ layout.olo_comps_index = 0; ++ layout.olo_num_comps = el->s_numdevs; ++ layout.olo_comps = creds; ++ ++ err = pnfs_osd_xdr_encode_layout(xdr, &layout); ++ if (err) { ++ nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */ ++ goto out; ++ } ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ if (!in_recall) { ++ __set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ nfserr = NFS4_OK; ++ } else { ++ nfserr = NFS4ERR_RECALLCONFLICT; ++ } ++ spin_unlock(&oi->i_layout_lock); ++ ++out: ++ kfree(creds); ++ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n", ++ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start)); ++ return nfserr; ++} ++ ++/* NOTE: inode mutex must NOT be held */ ++static int exofs_layout_commit( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct timespec mtime; ++ loff_t i_size; ++ int in_recall; ++ ++ /* In case of a recall we ignore the new size and mtime since they ++ * are going to be changed again by truncate, and since we cannot take ++ * the inode lock in that case. ++ */ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ if (in_recall) { ++ EXOFS_DBGMSG("(0x%lx) commit was called during recall\n", ++ inode->i_ino); ++ return 0; ++ } ++ ++ /* NOTE: I would love to call inode_setattr here ++ * but i cannot since this will cause an eventual vmtruncate, ++ * which will cause a layout_recall. So open code the i_size ++ * and mtime/atime changes under i_mutex. 
++ */ ++ mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL); ++ ++ if (args->lc_mtime.seconds) { ++ mtime.tv_sec = args->lc_mtime.seconds; ++ mtime.tv_nsec = args->lc_mtime.nseconds; ++ ++ /* layout commit may only make time bigger, since there might ++ * be reordering of the notifications and it might arrive after ++ * a local change: keep i_mtime when the client's time is older. ++ * TODO: if mtime > ctime then we know set_attr did an mtime ++ * in the future. and we can let this update through ++ */ ++ if (timespec_compare(&mtime, &inode->i_mtime) < 0) ++ mtime = inode->i_mtime; ++ } else { ++ mtime = current_fs_time(inode->i_sb); ++ } ++ ++ /* TODO: Will below work? since mark_inode_dirty has its own ++ * Time handling ++ */ ++ inode->i_atime = inode->i_mtime = mtime; ++ ++ i_size = i_size_read(inode); ++ if (args->lc_newoffset) { ++ loff_t new_size = args->lc_last_wr + 1; ++ ++ if (i_size < new_size) { ++ i_size_write(inode, i_size = new_size); ++ res->lc_size_chg = 1; ++ res->lc_newsize = new_size; ++ } ++ } ++ /* TODO: else { i_size = osd_get_object_length() } */ ++ ++/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */ ++ ++ mark_inode_dirty_sync(inode); ++ ++ mutex_unlock(&inode->i_mutex); ++ EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n", ++ inode->i_ino, i_size, args->lc_last_wr); ++ return 0; ++} ++ ++static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr) ++{ ++ EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ ioerr->oer_errno, ioerr->oer_iswrite, ++ _LLU(ioerr->oer_component.oid_object_id), ++ _LLU(ioerr->oer_comp_offset), ++ _LLU(ioerr->oer_comp_length)); ++} ++ ++static int exofs_layout_return( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ __be32 *p = args->lrf_body; ++ unsigned len = exp_xdr_qwords(args->lrf_body_len); ++ ++ EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n", ++ inode->i_ino, args->lr_cookie, len); ++ ++ while (len >= pnfs_osd_ioerr_xdr_sz()) { ++
struct pnfs_osd_ioerr ioerr; ++ ++ p = pnfs_osd_xdr_decode_ioerr(&ioerr, p); ++ len -= pnfs_osd_ioerr_xdr_sz(); ++ exofs_handle_error(&ioerr); ++ } ++ ++ if (args->lr_cookie) { ++ struct exofs_i_info *oi = exofs_i(inode); ++ bool in_recall; ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ __clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ /* TODO: how to communicate cookie with the waiter */ ++ if (in_recall) ++ wake_up(&oi->i_wq); /* wakeup any recalls */ ++ } ++ ++ return 0; ++} ++ ++int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct exofs_sb_info *sbi = sb->s_fs_info; ++ struct pnfs_osd_deviceaddr devaddr; ++ const struct osd_dev_info *odi; ++ u64 devno = devid->devid; ++ __be32 *start; ++ int err; ++ ++ memset(&devaddr, 0, sizeof(devaddr)); ++ ++ if (unlikely(devno >= sbi->layout.s_numdevs)) ++ return -ENODEV; ++ ++ odi = osduld_device_info(sbi->layout.s_ods[devno]); ++ ++ devaddr.oda_systemid.len = odi->systemid_len; ++ devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */ ++ ++ devaddr.oda_osdname.len = odi->osdname_len ; ++ devaddr.oda_osdname.data = (void *)odi->osdname;/* !const cast */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ err = -E2BIG; ++ goto err; ++ } ++ ++ err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr); ++ if (err) ++ goto err; ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n", ++ exp_xdr_qbytes(xdr->p - start), devno, odi->osdname); ++ return 0; ++ ++err: ++ EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n", ++ err, exp_xdr_qbytes(xdr->p - start)); ++ return err; ++} ++ ++struct pnfs_export_operations exofs_pnfs_ops = { ++ .layout_type = exofs_layout_type, ++ .layout_get = exofs_layout_get, ++ .layout_commit = 
exofs_layout_commit, ++ .layout_return = exofs_layout_return, ++ .get_device_info = exofs_get_device_info, ++}; ++ ++static bool is_layout_returned(struct exofs_i_info *oi) ++{ ++ bool layout_given; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ return !layout_given; ++} ++ ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ int layout_given; ++ int error = 0; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ __set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ if (!layout_given) ++ goto exec; ++ ++ for (;;) { ++ EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n", ++ inode->i_ino); ++ error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64, ++ &oi->i_wq); ++ switch (error) { ++ case 0: ++ case -EAGAIN: ++ break; ++ case -ENOENT: ++ goto exec; ++ default: ++ goto err; ++ } ++ ++ error = wait_event_interruptible(oi->i_wq, ++ is_layout_returned(oi)); ++ if (error) ++ goto err; ++ } ++ ++exec: ++ error = todo(inode); ++ ++err: ++ spin_lock(&oi->i_layout_lock); ++ __clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error); ++ return error; ++} ++ ++void exofs_init_export(struct super_block *sb) ++{ ++ sb->s_pnfs_op = &exofs_pnfs_ops; ++} +diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c +--- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-23 12:09:03.279502002 -0400 +@@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) + if (unlikely(wait_obj_created(oi))) + goto fail; + +- ret = _do_truncate(inode); ++ ret = exofs_inode_recall_layout(inode, IOMODE_ANY, _do_truncate); + if 
(ret) + goto fail; + +@@ -964,6 +964,7 @@ static void __oi_init(struct exofs_i_inf + { + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; ++ spin_lock_init(&oi->i_layout_lock); + } + /* + * Fill in an inode read from the OSD and set it up for use +diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild +--- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-23 12:09:03.279502002 -0400 +@@ -13,4 +13,5 @@ + # + + exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o ++exofs-$(CONFIG_PNFSD) += export.o + obj-$(CONFIG_EXOFS_FS) += exofs.o +diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig +--- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-23 12:09:03.280553663 -0400 +@@ -1,6 +1,7 @@ + config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD ++ select EXPORTFS_OSD_LAYOUT if PNFSD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. 
+diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c +--- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-23 12:09:03.281511951 -0400 +@@ -621,6 +621,7 @@ static int exofs_fill_super(struct super + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; ++ exofs_init_export(sb); + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); +diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c +--- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-23 12:09:03.282511528 -0400 +@@ -16,6 +16,13 @@ + #include + #include + ++#if defined(CONFIG_PNFSD) ++struct pnfsd_cb_ctl pnfsd_cb_ctl = { ++ .lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock) ++}; ++EXPORT_SYMBOL(pnfsd_cb_ctl); ++#endif /* CONFIG_PNFSD */ ++ + #define dprintk(fmt, args...) 
do{}while(0) + + +diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile +--- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-23 12:09:03.282511528 -0400 +@@ -3,4 +3,7 @@ + + obj-$(CONFIG_EXPORTFS) += exportfs.o + +-exportfs-objs := expfs.o ++exportfs-y := expfs.o ++exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o ++exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o ++exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +@@ -0,0 +1,158 @@ ++/* ++ * linux/fs/nfsd/nfs4blocklayoutxdr.c ++ * ++ * ++ * Created by Rick McNeal on 3/31/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. 
++ * ++ */ ++#include ++#include ++#include ++#include ++ ++static int ++bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 12 + 4 + bld->u.simple.bld_sig_len); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u32(p, 1); ++ p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset); ++ exp_xdr_encode_opaque(p, bld->u.simple.bld_sig, ++ bld->u.simple.bld_sig_len); ++ ++ return 0; ++} ++ ++static int ++bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_start); ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_len); ++ exp_xdr_encode_u32(p, bld->u.slice.bld_index); ++ ++ return 0; ++} ++ ++static int ++bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ return -ENOTSUPP; ++} ++ ++static int ++bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ int i; ++ __be32 *p = exp_xdr_reserve_qwords(xdr, ++ 2 + 1 + bld->u.stripe.bld_stripes); /* in qwords */ ++ if (!p) ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size); ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes); ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]); ++ return 0; ++} ++ ++int ++blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes) ++{ ++ u32 num_vols = 0, ++ *layoutlen_p = xdr->p; ++ pnfs_blocklayout_devinfo_t *bld; ++ int status = 0; ++ __be32 *p; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -ETOOSMALL; ++ p += 2; ++ ++ /* ++ * All simple volumes with their signature are required to be listed ++ * first.
++ */ ++ list_for_each_entry(bld, volumes, bld_list) { ++ num_vols++; ++ p = exp_xdr_reserve_qwords(xdr, 1); ++ if (!p) ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u32(p, bld->bld_type); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ status = bl_encode_simple(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_SLICE: ++ status = bl_encode_slice(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ status = bl_encode_concat(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ status = bl_encode_stripe(xdr, bld); ++ break; ++ default: ++ BUG(); ++ } ++ if (status) ++ goto error; ++ } ++ ++ /* ---- Fill in the overall length and number of volumes ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, num_vols); ++ ++error: ++ return status; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo); ++ ++enum nfsstat4 ++blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *bl_head) ++{ ++ struct pnfs_blocklayout_layout *b; ++ u32 *layoutlen_p = xdr->p, ++ extents = 0; ++ __be32 *p; ++ ++ /* ++ * Save spot for opaque block layout length and number of extents, ++ * fill-in later. 
++ */ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p += 2; ++ ++ list_for_each_entry(b, bl_head, bll_list) { ++ extents++; ++ p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid); ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.devid); ++ p = exp_xdr_encode_u64(p, b->bll_foff); ++ p = exp_xdr_encode_u64(p, b->bll_len); ++ p = exp_xdr_encode_u64(p, b->bll_soff); ++ p = exp_xdr_encode_u32(p, b->bll_es); ++ } ++ ++ /* ---- Fill in the overall length and number of extents ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, extents); ++ ++ return NFS4_OK; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_layout); +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +@@ -0,0 +1,218 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++/* We do our-own dprintk so filesystems are not dependent on sunrpc */ ++#ifdef dprintk ++#undef dprintk ++#endif ++#define dprintk(fmt, args, ...) do { } while (0) ++ ++/* Calculate the XDR length of the GETDEVICEINFO4resok structure ++ * excluding the gdir_notification and the gdir_device_addr da_layout_type. ++ */ ++static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev) ++{ ++ struct pnfs_filelayout_devaddr *fl_addr; ++ struct pnfs_filelayout_multipath *mp; ++ int i, j, nwords; ++ ++ /* da_addr_body length, indice length, indices, ++ * multipath_list4 length */ ++ nwords = 1 + 1 + fdev->fl_stripeindices_length + 1; ++ for (i = 0; i < fdev->fl_device_length; i++) { ++ mp = &fdev->fl_device_list[i]; ++ nwords++; /* multipath list length */ ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ fl_addr = &mp->fl_multipath_list[j]; /* size each path */ ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len); ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len); ++ } ++ } ++ dprintk("<-- %s nwords %d\n", __func__, nwords); ++ return nwords; ++} ++ ++/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13 ++ * on the response stream.
++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. ++ */ ++int ++filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ unsigned int i, j, len = 0, opaque_words; ++ u32 *p_in; ++ u32 index_count = fdev->fl_stripeindices_length; ++ u32 dev_count = fdev->fl_device_length; ++ int error = 0; ++ __be32 *p; ++ ++ opaque_words = fl_devinfo_xdr_words(fdev); ++ dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n", ++ __func__, ++ index_count, ++ dev_count, ++ opaque_words*4); ++ ++ /* check space for opaque length */ ++ p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words); ++ if (!p) { ++ error = -ETOOSMALL; ++ goto out; ++ } ++ ++ /* Fill in length later */ ++ p++; ++ ++ /* encode device list indices */ ++ p = exp_xdr_encode_u32(p, index_count); ++ for (i = 0; i < index_count; i++) ++ p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]); ++ ++ /* encode device list */ ++ p = exp_xdr_encode_u32(p, dev_count); ++ for (i = 0; i < dev_count; i++) { ++ struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i]; ++ ++ p = exp_xdr_encode_u32(p, mp->fl_multipath_length); ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ struct pnfs_filelayout_devaddr *da = ++ &mp->fl_multipath_list[j]; ++ ++ /* Encode device info */ ++ p = exp_xdr_encode_opaque(p, da->r_netid.data, ++ da->r_netid.len); ++ p = exp_xdr_encode_opaque(p, da->r_addr.data, ++ da->r_addr.len); ++ } ++ } ++ ++ /* backfill in length. Subtract 4 for da_addr_body size */ ++ len = (char *)p - (char *)p_in; ++ exp_xdr_encode_u32(p_in, len - 4); ++ ++ error = 0; ++out: ++ dprintk("%s: End err %d xdrlen %d\n", ++ __func__, error, len); ++ return error; ++} ++EXPORT_SYMBOL(filelayout_encode_devinfo); ++ ++/* Encodes the loc_body structure from draft 13 ++ * on the response stream. ++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. 
++ */ ++enum nfsstat4 ++filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp) ++{ ++ u32 len = 0, nfl_util, fhlen, i; ++ u32 *layoutlen_p; ++ enum nfsstat4 nfserr; ++ __be32 *p; ++ ++ dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n", ++ __func__, ++ flp->device_id.pnfs_fsid, ++ flp->device_id.pnfs_devid, ++ flp->lg_first_stripe_index, ++ flp->lg_fh_length); ++ ++ /* Ensure file system added at least one file handle */ ++ if (flp->lg_fh_length <= 0) { ++ dprintk("%s: File Layout has no file handles!!\n", __func__); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto out; ++ } ++ ++ /* Ensure room for len, devid, util, first_stripe_index, ++ * pattern_offset, number of filehandles */ ++ p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ /* save spot for opaque file layout length, fill-in later*/ ++ p++; ++ ++ /* encode device id */ ++ p = exp_xdr_encode_u64(p, flp->device_id.sbid); ++ p = exp_xdr_encode_u64(p, flp->device_id.devid); ++ ++ /* set and encode flags */ ++ nfl_util = flp->lg_stripe_unit; ++ if (flp->lg_commit_through_mds) ++ nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS; ++ if (flp->lg_stripe_type == STRIPE_DENSE) ++ nfl_util |= NFL4_UFLG_DENSE; ++ p = exp_xdr_encode_u32(p, nfl_util); ++ ++ /* encode first stripe index */ ++ p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index); ++ ++ /* encode striping pattern start */ ++ p = exp_xdr_encode_u64(p, flp->lg_pattern_offset); ++ ++ /* encode number of file handles */ ++ p = exp_xdr_encode_u32(p, flp->lg_fh_length); ++ ++ /* encode file handles */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ fhlen = flp->lg_fh_list[i].fh_size; ++ p = exp_xdr_reserve_space(xdr, 4 + fhlen); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen); ++ } ++ ++ /* Set number of bytes encoded = total_bytes_encoded - length var */ ++ len = (char 
*)p - (char *)layoutlen_p; ++ exp_xdr_encode_u32(layoutlen_p, len - 4); ++ ++ nfserr = NFS4_OK; ++out: ++ dprintk("%s: End err %u xdrlen %d\n", ++ __func__, nfserr, len); ++ return nfserr; ++} ++EXPORT_SYMBOL(filelayout_encode_layout); +diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c +--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-23 12:09:03.284511493 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-23 12:09:03.284511493 -0400 +@@ -0,0 +1,289 @@ ++/* ++ * pnfs_osd_xdr_enc.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_data_map( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_data_map *data_map) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1); ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, data_map->odm_num_comps); ++ p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_width); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_depth); ++ p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt); ++ p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline int pnfs_osd_xdr_encode_objid( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2); ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id; ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u64(p, dev_id->sbid); ++ p = exp_xdr_encode_u64(p, dev_id->devid); ++ p = exp_xdr_encode_u64(p, object_id->oid_partition_id); ++ p = exp_xdr_encode_u64(p, 
object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * enum pnfs_osd_cap_key_sec4 { ++ * PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ * PNFS_OSD_CAP_KEY_SEC_SSV = 1 ++ * }; ++ * ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_object_cred( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_object_cred *olo_comp) ++{ ++ __be32 *p; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version); ++ ++ /* No sec for now */ ++ p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE); ++ p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */ ++ ++ exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred, ++ olo_comp->oc_cap.cred_len); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_layout { ++ * struct pnfs_osd_data_map olo_map; ++ * u32 olo_comps_index; ++ * u32 olo_num_comps; ++ * struct pnfs_osd_object_cred *olo_comps; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *pol) ++{ ++ __be32 *p; ++ u32 i; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, pol->olo_comps_index); ++ p = exp_xdr_encode_u32(p, pol->olo_num_comps); ++ ++ for (i = 0; i < pol->olo_num_comps; i++) { ++ err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout); ++ ++static int _encode_string(struct exp_xdr_stream *xdr, ++ const struct nfs4_string *str) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, 4 
+ str->len); ++ ++ if (!p) ++ return -E2BIG; ++ exp_xdr_encode_opaque(p, str->data, str->len); ++ return 0; ++} ++ ++/* struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr) ++{ ++ __be32 *p; ++ int err; ++ ++ p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun)); ++ if (!p) ++ return -E2BIG; ++ ++ /* Empty oda_targetid */ ++ p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON); ++ ++ /* Empty oda_targetaddr for now */ ++ p = exp_xdr_encode_u32(p, 0); ++ ++ /* oda_lun */ ++ exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun)); ++ ++ err = _encode_string(xdr, &devaddr->oda_systemid); ++ if (err) ++ return err; ++ ++ err = pnfs_osd_xdr_encode_object_cred(xdr, ++ &devaddr->oda_root_obj_cred); ++ if (err) ++ return err; ++ ++ err = _encode_string(xdr, &devaddr->oda_osdname); ++ if (err) ++ return err; ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr); ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p) ++{ ++ lou->dsu_valid = be32_to_cpu(*p++); ++ if (lou->dsu_valid) ++ p = xdr_decode_hyper(p, &lou->dsu_delta); ++ lou->olu_ioerr_flag = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate); ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline __be32 * ++pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) ++{ ++ /* FIXME: p = xdr_decode_fixed(...) 
*/ ++ memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data)); ++ p += XDR_QUADLEN(sizeof(objid->oid_device_id.data)); ++ ++ p = xdr_decode_hyper(p, &objid->oid_partition_id); ++ p = xdr_decode_hyper(p, &objid->oid_object_id); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p) ++{ ++ p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_offset); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_length); ++ ioerr->oer_iswrite = be32_to_cpu(*p++); ++ ioerr->oer_errno = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); +diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c +--- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-23 12:09:03.285539075 -0400 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include "gfs2.h" + #include "incore.h" +@@ -1146,6 +1147,9 @@ static int fill_super(struct super_block + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_export_op = &gfs2_export_ops; ++#if defined(CONFIG_PNFSD) ++ sb->s_pnfs_op = &pnfs_dlm_export_ops; ++#endif /* CONFIG_PNFSD */ + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; +diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig +--- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/Kconfig 2010-08-23 12:09:03.286512316 -0400 +@@ -224,6 +224,31 @@ config LOCKD_V4 + config EXPORTFS + tristate + ++config EXPORTFS_FILE_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 
files layout type. ++ Must be automatically selected by supporting filesystems. ++ ++config EXPORTFS_OSD_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 objects layout type. ++ Must be automatically selected by supporting osd ++ filesystems. ++ ++ If unsure, say N. ++ ++config EXPORTFS_BLOCK_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 blocks layout type. ++ Must be automatically selected by supporting filesystems. ++ ++ + config NFS_ACL_SUPPORT + tristate + select FS_POSIX_ACL +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-23 12:09:03.287381619 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-23 12:09:03.287381619 -0400 +@@ -0,0 +1,66 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pipefs_list bl_device_list; ++struct dentry *bl_device_pipe; ++ ++ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len) ++{ ++ int err; ++ struct pipefs_hdr *msg; ++ ++ dprintk("Entering %s...\n", __func__); ++ ++ msg = pipefs_readmsg(filp, src, len); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: unable to read pipefs message.\n"); ++ return PTR_ERR(msg); ++ } ++ ++ /* now assign the result, which wakes the blocked thread */ ++ err = pipefs_assign_upcall_reply(msg, &bl_device_list); ++ if (err) { ++ dprintk("ERROR: failed to assign upcall with id %u\n", ++ msg->msgid); ++ kfree(msg); ++ } ++ return len; ++} ++ ++static const struct rpc_pipe_ops bl_pipe_ops = { ++ .upcall = pipefs_generic_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = pipefs_generic_destroy_msg, ++}; ++ ++int bl_pipe_init(void) ++{ ++ dprintk("%s: block_device 
pipefs registering...\n", __func__); ++ bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1); ++ if (IS_ERR(bl_device_pipe)) ++ dprintk("ERROR, unable to make block_device pipe\n"); ++ ++ if (!bl_device_pipe) ++ dprintk("bl_device_pipe is NULL!\n"); ++ else ++ dprintk("bl_device_pipe created!\n"); ++ pipefs_init_list(&bl_device_list); ++ return 0; ++} ++ ++void bl_pipe_exit(void) ++{ ++ dprintk("%s: block_device pipefs unregistering...\n", __func__); ++ if (IS_ERR(bl_device_pipe)) ++ return ; ++ pipefs_closepipe(bl_device_pipe); ++ return; ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-23 12:09:03.288501648 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-23 12:09:03.288501648 -0400 +@@ -0,0 +1,1160 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. 
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#include ++#include ++ ++#include /* various write calls */ ++#include /* struct bio */ ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Andy Adamson "); ++MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); ++ ++/* Callback operations to the pNFS client */ ++static struct pnfs_client_operations *pnfs_block_callback_ops; ++ ++static void print_page(struct page *page) ++{ ++ dprintk("PRINTPAGE page %p\n", page); ++ dprintk(" PagePrivate %d\n", PagePrivate(page)); ++ dprintk(" PageUptodate %d\n", PageUptodate(page)); ++ dprintk(" PageError %d\n", PageError(page)); ++ dprintk(" PageDirty %d\n", PageDirty(page)); ++ dprintk(" PageReferenced %d\n", PageReferenced(page)); ++ dprintk(" PageLocked %d\n", PageLocked(page)); ++ dprintk(" PageWriteback %d\n", PageWriteback(page)); ++ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); ++ dprintk("\n"); ++} ++ ++/* Given the be associated with isect, determine if page data needs to be ++ * initialized. 
++ */ ++static int is_hole(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return !is_sector_initialized(be->be_inval, isect); ++} ++ ++/* Given the be associated with isect, determine if page data can be ++ * written to disk. ++ */ ++static int is_writable(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return is_sector_initialized(be->be_inval, isect); ++} ++ ++static int ++dont_like_caller(struct nfs_page *req) ++{ ++ if (atomic_read(&req->wb_complete)) { ++ /* Called by _multi */ ++ return 1; ++ } else { ++ /* Called by _one */ ++ return 0; ++ } ++} ++ ++static enum pnfs_try_status ++bl_commit(struct nfs_write_data *nfs_data, ++ int sync) ++{ ++ dprintk("%s enter\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++/* The data we are handed might be spread across several bios. We need ++ * to track when the last one is finished. 
++ */ ++struct parallel_io { ++ struct kref refcnt; ++ struct rpc_call_ops call_ops; ++ void (*pnfs_callback) (void *data); ++ void *data; ++}; ++ ++static inline struct parallel_io *alloc_parallel(void *data) ++{ ++ struct parallel_io *rv; ++ ++ rv = kmalloc(sizeof(*rv), GFP_KERNEL); ++ if (rv) { ++ rv->data = data; ++ kref_init(&rv->refcnt); ++ } ++ return rv; ++} ++ ++static inline void get_parallel(struct parallel_io *p) ++{ ++ kref_get(&p->refcnt); ++} ++ ++static void destroy_parallel(struct kref *kref) ++{ ++ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); ++ ++ dprintk("%s enter\n", __func__); ++ p->pnfs_callback(p->data); ++ kfree(p); ++} ++ ++static inline void put_parallel(struct parallel_io *p) ++{ ++ kref_put(&p->refcnt, destroy_parallel); ++} ++ ++static struct bio * ++bl_submit_bio(int rw, struct bio *bio) ++{ ++ if (bio) { ++ get_parallel(bio->bi_private); ++ dprintk("%s submitting %s bio %u@%llu\n", __func__, ++ rw == READ ? "read" : "write", ++ bio->bi_size, (u64)bio->bi_sector); ++ submit_bio(rw, bio); ++ } ++ return NULL; ++} ++ ++static inline void ++bl_done_with_rpage(struct page *page, const int ok) ++{ ++ if (ok) { ++ ClearPagePnfsErr(page); ++ SetPageUptodate(page); ++ } else { ++ ClearPageUptodate(page); ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ } ++ /* Page is unlocked via rpc_release. Should really be done here. 
*/ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_read(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_rpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++static void bl_read_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ pnfs_block_callback_ops->nfs_readlist_complete(rdata); ++} ++ ++static void ++bl_end_par_io_read(void *data) ++{ ++ struct nfs_read_data *rdata = data; ++ ++ INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); ++ schedule_work(&rdata->task.u.tk_work); ++} ++ ++/* We don't want normal .rpc_call_done callback used, so we replace it ++ * with this stub. 
++ */ ++static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) ++{ ++ return; ++} ++ ++static enum pnfs_try_status ++bl_read_pagelist(struct nfs_read_data *rdata, ++ unsigned nr_pages) ++{ ++ int i, hole; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t f_offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct page **pages = rdata->args.pages; ++ int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, ++ nr_pages, f_offset, count); ++ ++ if (dont_like_caller(rdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ goto use_mds; ++ } ++ if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) { ++ /* We want to fall back to mds in case of read_page ++ * after error on read_pages. ++ */ ++ dprintk("%s PG_pnfserr set\n", __func__); ++ goto use_mds; ++ } ++ par = alloc_parallel(rdata); ++ if (!par) ++ goto use_mds; ++ par->call_ops = *rdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_read; ++ /* At this point, we can no longer jump to use_mds */ ++ ++ isect = (sector_t) (f_offset >> 9); ++ /* Code assumes extents are page-aligned */ ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ put_extent(cow_read); ++ bio = bl_submit_bio(READ, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg), ++ isect, &cow_read); ++ if (!be) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ if (cow_read) { ++ sector_t cow_length = cow_read->be_length - ++ (isect - cow_read->be_f_offset); ++ extent_length = min(extent_length, cow_length); ++ } ++ } ++ hole = is_hole(be, isect); ++ if (hole 
&& !cow_read) { ++ bio = bl_submit_bio(READ, bio); ++ /* Fill hole w/ zeroes w/o accessing device */ ++ dprintk("%s Zeroing page for hole\n", __func__); ++ zero_user(pages[i], 0, ++ min_t(int, PAGE_CACHE_SIZE, count)); ++ print_page(pages[i]); ++ bl_done_with_rpage(pages[i], 1); ++ } else { ++ struct pnfs_block_extent *be_read; ++ ++ be_read = (hole && cow_read) ? cow_read : be; ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - ++ be_read->be_f_offset + ++ be_read->be_v_offset; ++ bio->bi_bdev = be_read->be_mdev; ++ bio->bi_end_io = bl_end_io_read; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(READ, bio); ++ } ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ if ((isect << 9) >= rdata->inode->i_size) { ++ rdata->res.eof = 1; ++ rdata->res.count = rdata->inode->i_size - f_offset; ++ } else { ++ rdata->res.count = (isect << 9) - f_offset; ++ } ++ put_extent(be); ++ put_extent(cow_read); ++ bl_submit_bio(READ, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++ ++ use_mds: ++ dprintk("Giving up and using normal NFS\n"); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static void mark_extents_written(struct pnfs_block_layout *bl, ++ __u64 offset, __u32 count) ++{ ++ sector_t isect, end; ++ struct pnfs_block_extent *be; ++ ++ dprintk("%s(%llu, %u)\n", __func__, offset, count); ++ if (count == 0) ++ return; ++ isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9; ++ end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); ++ end >>= 9; ++ while (isect < end) { ++ sector_t len; ++ be = find_get_extent(bl, isect, NULL); ++ BUG_ON(!be); /* FIXME */ ++ len = min(end, be->be_f_offset + be->be_length) - isect; ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ mark_for_commit(be, isect, len); /* What if fails? 
*/ ++ isect += len; ++ put_extent(be); ++ } ++} ++ ++/* STUB - this needs thought */ ++static inline void ++bl_done_with_wpage(struct page *page, const int ok) ++{ ++ if (!ok) { ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ /* This is an inline copy of nfs_zap_mapping */ ++ /* This is oh so fishy, and needs deep thought */ ++ if (page->mapping->nrpages != 0) { ++ struct inode *inode = page->mapping->host; ++ spin_lock(&inode->i_lock); ++ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; ++ spin_unlock(&inode->i_lock); ++ } ++ } ++ /* end_page_writeback called in rpc_release. Should be done here. */ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_write(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_wpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++/* Function scheduled for call during bl_end_par_io_write, ++ * it marks sectors as written and extends the commitlist. 
++ */ ++static void bl_write_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ if (!wdata->task.tk_status) { ++ /* Marks for LAYOUTCOMMIT */ ++ /* BUG - this should be called after each bio, not after ++ * all finish, unless have some way of storing success/failure ++ */ ++ mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg), ++ wdata->args.offset, wdata->args.count); ++ } ++ pnfs_block_callback_ops->nfs_writelist_complete(wdata); ++} ++ ++/* Called when last of bios associated with a bl_write_pagelist call finishes */ ++static void ++bl_end_par_io_write(void *data) ++{ ++ struct nfs_write_data *wdata = data; ++ ++ /* STUB - ignoring error handling */ ++ wdata->task.tk_status = 0; ++ wdata->verf.committed = NFS_FILE_SYNC; ++ INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); ++ schedule_work(&wdata->task.u.tk_work); ++} ++ ++static enum pnfs_try_status ++bl_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int sync) ++{ ++ int i; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t offset = wdata->args.offset; ++ size_t count = wdata->args.count; ++ struct page **pages = wdata->args.pages; ++ int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); ++ if (!wdata->req->wb_lseg) { ++ dprintk("%s no lseg, falling back to MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ if (dont_like_caller(wdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ /* At this point, wdata->pages is a (sequential) list of nfs_pages. 
++ * We want to write each, and if there is an error remove it from ++ * list and call ++ * nfs_retry_request(req) to have it redone using nfs. ++ * QUEST? Do as block or per req? Think have to do per block ++ * as part of end_bio ++ */ ++ par = alloc_parallel(wdata); ++ if (!par) ++ return PNFS_NOT_ATTEMPTED; ++ par->call_ops = *wdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_write; ++ /* At this point, have to be more careful with error handling */ ++ ++ isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9); ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ bio = bl_submit_bio(WRITE, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg), ++ isect, NULL); ++ if (!be || !is_writable(be, isect)) { ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ } ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - be->be_f_offset + ++ be->be_v_offset; ++ bio->bi_bdev = be->be_mdev; ++ bio->bi_end_io = bl_end_io_write; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(WRITE, bio); ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK); ++ put_extent(be); ++ bl_submit_bio(WRITE, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++} ++ ++/* FIXME - range ignored */ ++static void ++release_extents(struct pnfs_block_layout *bl, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ int i; ++ struct pnfs_block_extent *be; ++ ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ while 
(!list_empty(&bl->bl_extents[i])) { ++ be = list_first_entry(&bl->bl_extents[i], ++ struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ put_extent(be); ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++} ++ ++static void ++release_inval_marks(struct pnfs_inval_markings *marks) ++{ ++ struct pnfs_inval_tracking *pos, *temp; ++ ++ list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { ++ list_del(&pos->it_link); ++ kfree(pos); ++ } ++ return; ++} ++ ++/* Note we are relying on caller locking to prevent nasty races. */ ++static void ++bl_free_layout(struct pnfs_layout_type *lo) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ ++ dprintk("%s enter\n", __func__); ++ release_extents(bl, NULL); ++ release_inval_marks(&bl->bl_inval); ++ kfree(bl); ++} ++ ++static struct pnfs_layout_type * ++bl_alloc_layout(struct inode *inode) ++{ ++ struct pnfs_block_layout *bl; ++ ++ dprintk("%s enter\n", __func__); ++ bl = kzalloc(sizeof(*bl), GFP_KERNEL); ++ if (!bl) ++ return NULL; ++ spin_lock_init(&bl->bl_ext_lock); ++ INIT_LIST_HEAD(&bl->bl_extents[0]); ++ INIT_LIST_HEAD(&bl->bl_extents[1]); ++ INIT_LIST_HEAD(&bl->bl_commit); ++ bl->bl_count = 0; ++ bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9; ++ INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); ++ return &bl->bl_layout; ++} ++ ++static void ++bl_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter\n", __func__); ++ kfree(lseg); ++} ++ ++/* Because the generic infrastructure does not correctly merge layouts, ++ * we pretty much ignore lseg, and store all data layout wide, so we ++ * can correctly merge. Eventually we should push some correct merge ++ * behavior up to the generic code, as the current behavior tends to ++ * cause lots of unnecessary overlapping LAYOUTGET requests. 
++ */
++static struct pnfs_layout_segment *
++bl_alloc_lseg(struct pnfs_layout_type *lo,
++	      struct nfs4_pnfs_layoutget_res *lgr)
++{
++	struct pnfs_layout_segment *seg;
++	int err;
++
++	dprintk("%s enter\n", __func__);
++	seg = kzalloc(sizeof(*seg), GFP_KERNEL);
++	if (!seg)
++		return NULL;
++
++	err = nfs4_blk_process_layoutget(lo, lgr);
++	if (!err)
++		return seg;
++
++	/* We don't want to call the full-blown bl_free_lseg,
++	 * since on error extents were not touched.
++	 */
++	/* STUB - we really want to distinguish between 2 error
++	 * conditions here.  This lseg failed, but lo data structures
++	 * are OK, or we hosed the lo data structures.  The calling
++	 * code probably needs to distinguish this too.
++	 */
++	kfree(seg);
++	return ERR_PTR(err);
++}
++
++/* Widen the commit range in arg to block-size boundaries and attach a
++ * freshly allocated, empty bl_layoutupdate_data to arg->layoutdriver_data.
++ * Returns 0, or -ENOMEM if that allocation fails (the range has already
++ * been widened by then).
++ */
++static int
++bl_setup_layoutcommit(struct pnfs_layout_type *lo,
++		      struct pnfs_layoutcommit_arg *arg)
++{
++	struct nfs_server *nfss = PNFS_NFS_SERVER(lo);
++	struct bl_layoutupdate_data *upd;
++
++	dprintk("%s enter\n", __func__);
++	/* Need to ensure commit is block-size aligned */
++	if (nfss->pnfs_blksize) {
++		u64 mask = nfss->pnfs_blksize - 1;
++		u64 head = arg->lseg.offset & mask;
++
++		/* Pull the start down to a block boundary, then round the
++		 * length up so the end lands on one as well.
++		 */
++		arg->lseg.offset -= head;
++		arg->lseg.length = (arg->lseg.length + head + mask) & ~mask;
++	}
++
++	upd = kmalloc(sizeof(*upd), GFP_KERNEL);
++	if (unlikely(!upd))
++		return -ENOMEM;
++
++	INIT_LIST_HEAD(&upd->ranges);
++	arg->layoutdriver_data = upd;
++	return 0;
++}
++
++/* Encode the block layoutupdate for this layout into the LAYOUTCOMMIT xdr
++ * stream; the work is done by encode_pnfs_block_layoutupdate().
++ */
++static void
++bl_encode_layoutcommit(struct pnfs_layout_type *lo, struct xdr_stream *xdr,
++		       const struct pnfs_layoutcommit_arg *arg)
++{
++	dprintk("%s enter\n", __func__);
++	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
++}
++
++static void
++bl_cleanup_layoutcommit(struct pnfs_layout_type *lo,
++			struct pnfs_layoutcommit_arg *arg, int status)
++{
++	dprintk("%s enter\n", __func__);
++	
clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), arg, status); ++ kfree(arg->layoutdriver_data); ++} ++ ++static void free_blk_mountid(struct block_mount_id *mid) ++{ ++ if (mid) { ++ struct pnfs_block_dev *dev; ++ spin_lock(&mid->bm_lock); ++ while (!list_empty(&mid->bm_devlist)) { ++ dev = list_first_entry(&mid->bm_devlist, ++ struct pnfs_block_dev, ++ bm_node); ++ list_del(&dev->bm_node); ++ free_block_dev(dev); ++ } ++ spin_unlock(&mid->bm_lock); ++ kfree(mid); ++ } ++} ++ ++/* This is mostly copied form the filelayout's get_device_info function. ++ * It seems much of this should be at the generic pnfs level. ++ */ ++static struct pnfs_block_dev * ++nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, ++ struct pnfs_deviceid *d_id, ++ struct list_head *sdlist) ++{ ++ struct pnfs_device *dev; ++ struct pnfs_block_dev *rv = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ int i, rc; ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s max_resp_sz %u max_pages %d\n", ++ __func__, max_resp_sz, max_pages); ++ ++ dev = kmalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) { ++ dprintk("%s kmalloc failed\n", __func__); ++ return NULL; ++ } ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(dev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set dev->area */ ++ dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!dev->area) ++ goto out_free; ++ ++ memcpy(&dev->dev_id, d_id, sizeof(*d_id)); ++ dev->layout_type = LAYOUT_BLOCK_VOLUME; ++ dev->dev_notify_types = 0; ++ dev->pages = pages; ++ dev->pgbase = 0; ++ dev->pglen = PAGE_SIZE * max_pages; ++ dev->mincount = 0; ++ ++ dprintk("%s: 
dev_id: %s\n", __func__, dev->dev_id.data); ++ rc = pnfs_block_callback_ops->nfs_getdeviceinfo(server, dev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ rv = nfs4_blk_decode_device(server, dev, sdlist); ++ out_free: ++ if (dev->area != NULL) ++ vunmap(dev->area); ++ for (i = 0; i < max_pages; i++) ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(dev); ++ return rv; ++} ++ ++ ++/* ++ * Retrieve the list of available devices for the mountpoint. ++ */ ++static int ++bl_initialize_mountpoint(struct nfs_server *server, const struct nfs_fh *fh) ++{ ++ struct block_mount_id *b_mt_id = NULL; ++ struct pnfs_mount_type *mtype = NULL; ++ struct pnfs_devicelist *dlist = NULL; ++ struct pnfs_block_dev *bdev; ++ LIST_HEAD(block_disklist); ++ int status = 0, i; ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (server->pnfs_blksize == 0) { ++ dprintk("%s Server did not return blksize\n", __func__); ++ return -EINVAL; ++ } ++ b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL); ++ if (!b_mt_id) { ++ status = -ENOMEM; ++ goto out_error; ++ } ++ /* Initialize nfs4 block layout mount id */ ++ spin_lock_init(&b_mt_id->bm_lock); ++ INIT_LIST_HEAD(&b_mt_id->bm_devlist); ++ ++ dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL); ++ if (!dlist) ++ goto out_error; ++ dlist->eof = 0; ++ while (!dlist->eof) { ++ status = pnfs_block_callback_ops->nfs_getdevicelist( ++ server, fh, dlist); ++ if (status) ++ goto out_error; ++ dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", ++ __func__, dlist->num_devs, dlist->eof); ++ /* For each device returned in dlist, call GETDEVICEINFO, and ++ * decode the opaque topology encoding to create a flat ++ * volume topology, matching VOLUME_SIMPLE disk signatures ++ * to disks in the visible block disk list. ++ * Construct an LVM meta device from the flat volume topology. 
++ */ ++ for (i = 0; i < dlist->num_devs; i++) { ++ bdev = nfs4_blk_get_deviceinfo(server, fh, ++ &dlist->dev_id[i], ++ &block_disklist); ++ if (!bdev) ++ goto out_error; ++ spin_lock(&b_mt_id->bm_lock); ++ list_add(&bdev->bm_node, &b_mt_id->bm_devlist); ++ spin_unlock(&b_mt_id->bm_lock); ++ } ++ } ++ dprintk("%s SUCCESS\n", __func__); ++ server->pnfs_ld_data = b_mt_id; ++ ++ out_return: ++ kfree(dlist); ++ return status; ++ ++ out_error: ++ free_blk_mountid(b_mt_id); ++ kfree(mtype); ++ goto out_return; ++} ++ ++static int ++bl_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ struct block_mount_id *b_mt_id = server->pnfs_ld_data; ++ ++ dprintk("%s enter\n", __func__); ++ free_blk_mountid(b_mt_id); ++ dprintk("%s RETURNS\n", __func__); ++ return 0; ++} ++ ++/* STUB - mark intersection of layout and page as bad, so is not ++ * used again. ++ */ ++static void mark_bad_read(void) ++{ ++ return; ++} ++ ++/* Copied from buffer.c */ ++static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) ++{ ++ if (uptodate) { ++ set_buffer_uptodate(bh); ++ } else { ++ /* This happens, due to failed READA attempts. 
*/ ++ clear_buffer_uptodate(bh); ++ } ++ unlock_buffer(bh); ++} ++ ++/* Copied from buffer.c */ ++static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) ++{ ++ __end_buffer_read_notouch(bh, uptodate); ++} ++ ++/* ++ * map_block: map a requested I/0 block (isect) into an offset in the LVM ++ * meta block_device ++ */ ++static void ++map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh) ++{ ++ dprintk("%s enter be=%p\n", __func__, be); ++ ++ set_buffer_mapped(bh); ++ bh->b_bdev = be->be_mdev; ++ bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> ++ (be->be_mdev->bd_inode->i_blkbits - 9); ++ ++ dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n", ++ __func__, (long)isect, ++ (long)bh->b_blocknr, ++ bh->b_size); ++ return; ++} ++ ++/* Given an unmapped page, zero it (or read in page for COW), ++ * and set appropriate flags/markings, but it is safe to not initialize ++ * the range given in [from, to). ++ */ ++/* This is loosely based on nobh_write_begin */ ++static int ++init_page_for_write(struct pnfs_block_layout *bl, struct page *page, ++ unsigned from, unsigned to, sector_t **pages_to_mark) ++{ ++ struct buffer_head *bh; ++ int inval, ret = -EIO; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect; ++ ++ dprintk("%s enter, %p\n", __func__, page); ++ bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); ++ if (!bh) { ++ ret = -ENOMEM; ++ goto cleanup; ++ } ++ ++ isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9); ++ be = find_get_extent(bl, isect, &cow_read); ++ if (!be) ++ goto cleanup; ++ inval = is_hole(be, isect); ++ dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to); ++ if (inval) { ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) { ++ dprintk("%s PANIC - got NONE_DATA extent %p\n", ++ __func__, be); ++ goto cleanup; ++ } ++ map_block(isect, be, bh); ++ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); ++ } ++ if (PageUptodate(page)) { ++ /* Do nothing */ ++ 
} else if (inval & !cow_read) { ++ zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE); ++ } else if (0 < from || PAGE_CACHE_SIZE > to) { ++ struct pnfs_block_extent *read_extent; ++ ++ read_extent = (inval && cow_read) ? cow_read : be; ++ map_block(isect, read_extent, bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_read_nobh; ++ submit_bh(READ, bh); ++ dprintk("%s: Waiting for buffer read\n", __func__); ++ /* XXX Don't really want to hold layout lock here */ ++ wait_on_buffer(bh); ++ if (!buffer_uptodate(bh)) ++ goto cleanup; ++ } ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ /* There is a BUG here if is a short copy after write_begin, ++ * but I think this is a generic fs bug. The problem is that ++ * we have marked the page as initialized, but it is possible ++ * that the section not copied may never get copied. ++ */ ++ ret = mark_initialized_sectors(be->be_inval, isect, ++ PAGE_CACHE_SECTORS, ++ pages_to_mark); ++ /* Want to preallocate mem so above can't fail */ ++ if (ret) ++ goto cleanup; ++ } ++ SetPageMappedToDisk(page); ++ ret = 0; ++ ++cleanup: ++ free_buffer_head(bh); ++ put_extent(be); ++ put_extent(cow_read); ++ if (ret) { ++ /* Need to mark layout with bad read...should now ++ * just use nfs4 for reads and writes. 
++ */ ++ mark_bad_read(); ++ } ++ return ret; ++} ++ ++static int ++bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos, ++ unsigned count, struct pnfs_fsdata *fsdata) ++{ ++ unsigned from, to; ++ int ret; ++ sector_t *pages_to_mark = NULL; ++ struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg); ++ ++ dprintk("%s enter, %u@%lld\n", __func__, count, pos); ++ print_page(page); ++ /* The following code assumes blocksize >= PAGE_CACHE_SIZE */ ++ if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) { ++ dprintk("%s Can't handle blocksize %llu\n", __func__, ++ (u64)bl->bl_blocksize); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ return 0; ++ } ++ if (PageMappedToDisk(page)) { ++ /* Basically, this is a flag that says we have ++ * successfully called write_begin already on this page. ++ */ ++ /* NOTE - there are cache consistency issues here. ++ * For example, what if the layout is recalled, then regained? ++ * If the file is closed and reopened, will the page flags ++ * be reset? If not, we'll have to use layout info instead of ++ * the page flag. ++ */ ++ return 0; ++ } ++ from = pos & (PAGE_CACHE_SIZE - 1); ++ to = from + count; ++ ret = init_page_for_write(bl, page, from, to, &pages_to_mark); ++ if (ret) { ++ dprintk("%s init page failed with %i", __func__, ret); ++ /* Revert back to plain NFS and just continue on with ++ * write. This assumes there is no request attached, which ++ * should be true if we get here. ++ */ ++ BUG_ON(PagePrivate(page)); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ kfree(pages_to_mark); ++ ret = 0; ++ } else { ++ fsdata->private = pages_to_mark; ++ } ++ return ret; ++} ++ ++/* CAREFUL - what happens if copied < count??? 
*/
++static int
++bl_write_end(struct inode *inode, struct page *page, loff_t pos,
++	     unsigned count, unsigned copied, struct pnfs_layout_segment *lseg)
++{
++	dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg);
++	print_page(page);
++	if (lseg)
++		SetPageUptodate(page);
++	return 0;
++}
++
++/* Return any memory allocated to fsdata->private, and take advantage
++ * of no page locks to mark pages noted in write_begin as needing
++ * initialization.
++ *
++ * fsdata->private is a list of sector numbers terminated by ~0.  Each
++ * listed page that is not yet MappedToDisk is initialized and pushed
++ * through the mapping's write_end with a temporary pnfs_fsdata.  Pages
++ * that do not go through write_end are unlocked and released here.
++ */
++static void
++bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata)
++{
++	struct page *page;
++	pgoff_t index;
++	sector_t *pos;
++	struct address_space *mapping = filp->f_mapping;
++	struct pnfs_fsdata *fake_data;
++	struct pnfs_layout_segment *lseg;
++
++	if (!fsdata)
++		return;
++	lseg = fsdata->lseg;
++	if (!lseg)
++		return;
++	pos = fsdata->private;
++	if (!pos)
++		return;
++	dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos));
++	for (; *pos != ~0; pos++) {
++		index = *pos >> (PAGE_CACHE_SHIFT - 9);
++		/* XXX How do we properly deal with failures here??? */
++		page = grab_cache_page_write_begin(mapping, index, 0);
++		if (!page) {
++			printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__);
++			continue;
++		}
++		dprintk("%s: Examining block page\n", __func__);
++		print_page(page);
++		if (!PageMappedToDisk(page)) {
++			/* XXX How do we properly deal with failures here??? */
++			dprintk("%s Marking block page\n", __func__);
++			init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page,
++					    PAGE_CACHE_SIZE, PAGE_CACHE_SIZE,
++					    NULL);
++			print_page(page);
++			fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL);
++			if (!fake_data) {
++				printk(KERN_ERR "%s BUG BUG BUG NoMem\n",
++				       __func__);
++				/* The page came back locked and referenced;
++				 * drop both since write_end will not run.
++				 */
++				unlock_page(page);
++				page_cache_release(page);
++				continue;
++			}
++			get_lseg(lseg);
++			fake_data->lseg = lseg;
++			fake_data->bypass_eof = 1;
++			mapping->a_ops->write_end(filp, mapping,
++						  index << PAGE_CACHE_SHIFT,
++						  PAGE_CACHE_SIZE,
++						  PAGE_CACHE_SIZE,
++						  page, fake_data);
++			/* Note fake_data is freed by nfs_write_end */
++		} else {
++			/* Already initialized: just give the page back */
++			unlock_page(page);
++			page_cache_release(page);
++		}
++	}
++	kfree(fsdata->private);
++	fsdata->private = NULL;
++}
++
++/* Policy hook: this driver reports a stripe size of 0. */
++static ssize_t
++bl_get_stripesize(struct pnfs_layout_type *lo)
++{
++	dprintk("%s enter\n", __func__);
++	return 0;
++}
++
++/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request.
++ * Should return False if there is a reason requests can not be coalesced,
++ * otherwise, should default to returning True.
++ */
++static int
++bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
++	   struct nfs_page *req)
++{
++	dprintk("%s enter\n", __func__);
++	if (pgio->pg_iswrite) /* writes coalesce only within one layout segment */
++		return prev->wb_lseg == req->wb_lseg;
++	else
++		return 1; /* reads can always be coalesced */
++}
++/* I/O entry points this driver hands to the generic pNFS client.
++ */
++static struct layoutdriver_io_operations blocklayout_io_operations = {
++	.commit				= bl_commit,
++	.read_pagelist			= bl_read_pagelist,
++	.write_pagelist			= bl_write_pagelist,
++	.write_begin			= bl_write_begin,
++	.write_end			= bl_write_end,
++	.write_end_cleanup		= bl_write_end_cleanup,
++	.alloc_layout			= bl_alloc_layout,
++	.free_layout			= bl_free_layout,
++	.alloc_lseg			= bl_alloc_lseg,
++	.free_lseg			= bl_free_lseg,
++	.setup_layoutcommit		= bl_setup_layoutcommit,
++	.encode_layoutcommit		= bl_encode_layoutcommit,
++	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
++	.initialize_mountpoint		= bl_initialize_mountpoint,
++	.uninitialize_mountpoint	= bl_uninitialize_mountpoint,
++};
++/* Policy hooks: stripe size query and request-coalescing test.
++ */
++static struct layoutdriver_policy_operations blocklayout_policy_operations = {
++	.get_stripesize			= bl_get_stripesize,
++	.pg_test			= bl_pg_test,
++};
++/* Driver descriptor registered for the LAYOUT_BLOCK_VOLUME layout type.
++ */
++static struct pnfs_layoutdriver_type blocklayout_type = {
++	.id = LAYOUT_BLOCK_VOLUME,
++	.name = "LAYOUT_BLOCK_VOLUME",
++	.ld_io_ops = &blocklayout_io_operations,
++	.ld_policy_ops = &blocklayout_policy_operations,
++};
++/* Module load: register the layout driver, then set up the upcall pipe.
++ */
++static int __init nfs4blocklayout_init(void)
++{
++	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
++
++	pnfs_block_callback_ops = pnfs_register_layoutdriver(&blocklayout_type);
++	bl_pipe_init(); /* NOTE(review): neither the registration result nor this return value is checked - confirm failures are handled elsewhere */
++	return 0;
++}
++/* Module unload: undo nfs4blocklayout_init() in the same order.
++ */
++static void __exit nfs4blocklayout_exit(void)
++{
++	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
++		__func__);
++
++	pnfs_unregister_layoutdriver(&blocklayout_type);
++	bl_pipe_exit();
++}
++
++module_init(nfs4blocklayout_init);
++module_exit(nfs4blocklayout_exit);
+diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c
+--- 
linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-23 12:09:03.289501933 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-23 12:09:03.289501933 -0400 +@@ -0,0 +1,335 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages.
++ */ ++#include ++#include /* __bread */ ++ ++#include ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes) ++{ ++ uint32_t *q = p + XDR_QUADLEN(nbytes); ++ if (unlikely(q > end || q < p)) ++ return NULL; ++ return p; ++} ++EXPORT_SYMBOL(blk_overflow); ++ ++/* Open a block_device by device number. */ ++struct block_device *nfs4_blkdev_get(dev_t dev) ++{ ++ struct block_device *bd; ++ ++ dprintk("%s enter\n", __func__); ++ bd = open_by_devnum(dev, FMODE_READ); ++ if (IS_ERR(bd)) ++ goto fail; ++ return bd; ++fail: ++ dprintk("%s failed to open device : %ld\n", ++ __func__, PTR_ERR(bd)); ++ return NULL; ++} ++ ++/* ++ * Release the block device ++ */ ++int nfs4_blkdev_put(struct block_device *bdev) ++{ ++ dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), ++ MINOR(bdev->bd_dev)); ++ bd_release(bdev); ++ return blkdev_put(bdev, FMODE_READ); ++} ++ ++/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded ++ * in dev->dev_addr_buf. 
++ */
++/* Sends a BL_DEVICE_MOUNT upcall carrying dev->area over bl_device_pipe,
++ * reads a major/minor pair back from the reply payload, opens that block
++ * device and wraps it in a newly allocated pnfs_block_dev.
++ *
++ * Returns the new pnfs_block_dev, or NULL on any failure.  On failure the
++ * block device, if it was opened, is released again.
++ */
++struct pnfs_block_dev *
++nfs4_blk_decode_device(struct nfs_server *server,
++		       struct pnfs_device *dev,
++		       struct list_head *sdlist)
++{
++	struct pnfs_block_dev *rv = NULL;
++	struct block_device *bd = NULL;
++	struct pipefs_hdr *msg = NULL, *reply = NULL;
++	uint32_t major, minor;
++
++	dprintk("%s enter\n", __func__);
++
++	if (IS_ERR(bl_device_pipe))
++		return NULL;
++	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
++	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
++		dev->mincount);
++	msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area,
++				    dev->mincount);
++	if (IS_ERR(msg)) {
++		dprintk("ERROR: couldn't make pipefs message.\n");
++		goto out_err;
++	}
++	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
++	msg->status = BL_DEVICE_REQUEST_INIT;
++
++	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
++	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
++					      &bl_device_list, 0, 0);
++
++	if (IS_ERR(reply)) {
++		dprintk("ERROR: upcall_waitreply failed\n");
++		goto out_err;
++	}
++	if (reply->status != BL_DEVICE_REQUEST_PROC) {
++		/* bd is still NULL here, so this prints 0 */
++		dprintk("%s failed to open device: %ld\n",
++			__func__, PTR_ERR(bd));
++		goto out_err;
++	}
++	memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t));
++	memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)),
++	       sizeof(uint32_t));
++	bd = nfs4_blkdev_get(MKDEV(major, minor));
++	/* nfs4_blkdev_get() reports failure as NULL, so an IS_ERR() test
++	 * alone would let a failed open through to the bd-> accesses below.
++	 */
++	if (!bd || IS_ERR(bd)) {
++		dprintk("%s failed to open device : %ld\n",
++			__func__, PTR_ERR(bd));
++		bd = NULL;
++		goto out_err;
++	}
++
++	rv = kzalloc(sizeof(*rv), GFP_KERNEL);
++	if (!rv)
++		goto out_err;
++
++	rv->bm_mdev = bd;
++	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct pnfs_deviceid));
++	dprintk("%s Created device %s with bd_block_size %u\n",
++		__func__,
++		bd->bd_disk->disk_name,
++		bd->bd_block_size);
++	kfree(reply);
++	kfree(msg);
++	return rv;
++
++out_err:
++	/* Only non-NULL when the open succeeded but a later step failed */
++	if (bd)
++		nfs4_blkdev_put(bd);
++	kfree(rv);
++	if (!IS_ERR(reply))
++		kfree(reply);
++	if (!IS_ERR(msg))
++		kfree(msg);
++	return NULL;
++}
++
++/* Map
deviceid returned by the server to constructed block_device */
++static struct block_device *translate_devid(struct pnfs_layout_type *lo,
++					    struct pnfs_deviceid *id)
++{
++	struct pnfs_block_dev *cur;
++	struct block_mount_id *mount_id;
++	struct block_device *found = NULL;
++
++	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
++	mount_id = BLK_ID(lo);
++	/* Walk the per-mount device list under its lock; first match wins */
++	spin_lock(&mount_id->bm_lock);
++	list_for_each_entry(cur, &mount_id->bm_devlist, bm_node) {
++		if (memcmp(id->data, cur->bm_mdevid.data,
++			   NFS4_PNFS_DEVICEID4_SIZE) != 0)
++			continue;
++		found = cur->bm_mdev;
++		break;
++	}
++	spin_unlock(&mount_id->bm_lock);
++	dprintk("%s returning %p\n", __func__, found);
++	return found;
++}
++
++/* Tracks info needed to ensure extents in layout obey constraints of spec */
++struct layout_verification {
++	u32 mode;	/* R or RW */
++	u64 start;	/* Expected start of next non-COW extent */
++	u64 inval;	/* Start of INVAL coverage */
++	u64 cowread;	/* End of COW read coverage */
++};
++
++/* Verify the extent meets the layout requirements of the pnfs-block draft,
++ * section 2.3.1.
++ */ ++static int verify_extent(struct pnfs_block_extent *be, ++ struct layout_verification *lv) ++{ ++ if (lv->mode == IOMODE_READ) { ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA || ++ be->be_state == PNFS_BLOCK_INVALID_DATA) ++ return -EIO; ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } ++ /* lv->mode == IOMODE_RW */ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ if (lv->cowread > lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ lv->inval = lv->start; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_READ_DATA) { ++ if (be->be_f_offset > lv->start) ++ return -EIO; ++ if (be->be_f_offset < lv->inval) ++ return -EIO; ++ if (be->be_f_offset < lv->cowread) ++ return -EIO; ++ /* It looks like you might want to min this with lv->start, ++ * but you really don't. 
++ */ ++ lv->inval = lv->inval + be->be_length; ++ lv->cowread = be->be_f_offset + be->be_length; ++ return 0; ++ } else ++ return -EIO; ++} ++ ++/* XDR decode pnfs_block_layout4 structure */ ++int ++nfs4_blk_process_layoutget(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layoutget_res *lgr) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len); ++ int i, status = -EIO; ++ uint32_t count; ++ struct pnfs_block_extent *be = NULL, *save; ++ uint64_t tmp; /* Used by READSECTOR */ ++ struct layout_verification lv = { ++ .mode = lgr->lseg.iomode, ++ .start = lgr->lseg.offset >> 9, ++ .inval = lgr->lseg.offset >> 9, ++ .cowread = lgr->lseg.offset >> 9, ++ }; ++ ++ LIST_HEAD(extents); ++ ++ BLK_READBUF(p, end, 4); ++ READ32(count); ++ ++ dprintk("%s enter, number of extents %i\n", __func__, count); ++ BLK_READBUF(p, end, (28 + NFS4_PNFS_DEVICEID4_SIZE) * count); ++ ++ /* Decode individual extents, putting them in temporary ++ * staging area until whole layout is decoded to make error ++ * recovery easier. 
++ */ ++ for (i = 0; i < count; i++) { ++ be = alloc_extent(); ++ if (!be) { ++ status = -ENOMEM; ++ goto out_err; ++ } ++ READ_DEVID(&be->be_devid); ++ be->be_mdev = translate_devid(lo, &be->be_devid); ++ if (!be->be_mdev) ++ goto out_err; ++ /* The next three values are read in as bytes, ++ * but stored as 512-byte sector lengths ++ */ ++ READ_SECTOR(be->be_f_offset); ++ READ_SECTOR(be->be_length); ++ READ_SECTOR(be->be_v_offset); ++ READ32(be->be_state); ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ be->be_inval = &bl->bl_inval; ++ if (verify_extent(be, &lv)) { ++ dprintk("%s verify failed\n", __func__); ++ goto out_err; ++ } ++ list_add_tail(&be->be_node, &extents); ++ } ++ if (p != end) { ++ dprintk("%s Undecoded cruft at end of opaque\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lgr->lseg.offset + lgr->lseg.length != lv.start << 9) { ++ dprintk("%s Final length mismatch\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lv.start < lv.cowread) { ++ dprintk("%s Final uncovered COW extent\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ /* Extents decoded properly, now try to merge them in to ++ * existing layout extents. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ list_for_each_entry_safe(be, save, &extents, be_node) { ++ list_del(&be->be_node); ++ status = add_and_merge_extent(bl, be); ++ if (status) { ++ spin_unlock(&bl->bl_ext_lock); ++ /* This is a fairly catastrophic error, as the ++ * entire layout extent lists are now corrupted. ++ * We should have some way to distinguish this. 
++ */ ++ be = NULL; ++ goto out_err; ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ status = 0; ++ out: ++ dprintk("%s returns %i\n", __func__, status); ++ return status; ++ ++ out_err: ++ put_extent(be); ++ while (!list_empty(&extents)) { ++ be = list_first_entry(&extents, struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ put_extent(be); ++ } ++ goto out; ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-23 12:09:03.290395707 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-23 12:09:03.290395707 -0400 +@@ -0,0 +1,120 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdm.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2007 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Fred Isaman ++ * Andy Adamson ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. 
the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include /* gendisk - used in a dprintk*/ ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Defines used for calculating memory usage in nfs4_blk_flatten() */ ++#define ARGSIZE 24 /* Max bytes needed for linear target arg string */ ++#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE) ++#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE) ++#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \ ++ (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE) ++#define roundup8(x) (((x)+7) & ~7) ++#define sizeof8(x) roundup8(sizeof(x)) ++ ++static int dev_remove(dev_t dev) ++{ ++ int ret = 1; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint64_t bl_dev; ++ uint32_t major = MAJOR(dev), minor = MINOR(dev); ++ ++ dprintk("Entering %s\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return ret; ++ ++ memcpy((void *)&bl_dev, &major, sizeof(uint32_t)); ++ memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t)); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev, ++ sizeof(uint64_t)); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out; ++ } ++ ++ if (reply->status == BL_DEVICE_REQUEST_PROC) ++ ret = 0; /*TODO: what to return*/ ++out: ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return ret; ++} ++ ++/* ++ * 
Release meta device ++ */ ++static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) ++{ ++ int rv; ++ ++ dprintk("%s Releasing\n", __func__); ++ /* XXX Check return? */ ++ rv = nfs4_blkdev_put(bdev->bm_mdev); ++ dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv); ++ ++ rv = dev_remove(bdev->bm_mdev->bd_dev); ++ dprintk("%s Returns %d\n", __func__, rv); ++ return rv; ++} ++ ++void free_block_dev(struct pnfs_block_dev *bdev) ++{ ++ if (bdev) { ++ if (bdev->bm_mdev) { ++ dprintk("%s Removing DM device: %d:%d\n", ++ __func__, ++ MAJOR(bdev->bm_mdev->bd_dev), ++ MINOR(bdev->bm_mdev->bd_dev)); ++ /* XXX Check status ?? */ ++ nfs4_blk_metadev_release(bdev); ++ } ++ kfree(bdev); ++ } ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-23 12:09:03.290395707 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-23 12:09:03.291501560 -0400 +@@ -0,0 +1,303 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.h ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. 
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#ifndef FS_NFS_NFS4BLOCKLAYOUT_H ++#define FS_NFS_NFS4BLOCKLAYOUT_H ++ ++#include ++#include /* Needed by nfs4_pnfs.h */ ++#include ++#include /* Needed for struct dm_ioctl*/ ++ ++#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9) ++ ++#define PG_pnfserr PG_owner_priv_1 ++#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags) ++#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags) ++#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags) ++ ++extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_do_resume(struct dm_ioctl *param); ++extern int dm_table_load(struct dm_ioctl *param, size_t param_size); ++ ++struct block_mount_id { ++ spinlock_t bm_lock; /* protects list */ ++ struct list_head bm_devlist; /* holds pnfs_block_dev */ ++}; ++ ++struct pnfs_block_dev { ++ struct list_head bm_node; ++ struct pnfs_deviceid bm_mdevid; /* associated devid */ ++ struct block_device *bm_mdev; /* meta device itself */ ++}; ++ ++/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */ ++struct visible_block_device { ++ struct list_head vi_node; ++ struct block_device *vi_bdev; ++ int vi_mapped; ++ int vi_put_done; ++}; ++ ++enum 
blk_vol_type {
++	PNFS_BLOCK_VOLUME_SIMPLE = 0,	/* maps to a single LU */
++	PNFS_BLOCK_VOLUME_SLICE = 1,	/* slice of another volume */
++	PNFS_BLOCK_VOLUME_CONCAT = 2,	/* concatenation of multiple volumes */
++	PNFS_BLOCK_VOLUME_STRIPE = 3	/* striped across multiple volumes */
++};
++
++/* All disk offset/lengths are stored in 512-byte sectors */
++struct pnfs_blk_volume {
++	uint32_t bv_type;
++	sector_t bv_size;
++	struct pnfs_blk_volume **bv_vols;
++	int bv_vol_n;
++	union {
++		dev_t bv_dev;
++		sector_t bv_stripe_unit;
++		sector_t bv_offset;
++	};
++};
++
++/* Since components need not be aligned, cannot use sector_t */
++struct pnfs_blk_sig_comp {
++	int64_t bs_offset;	/* In bytes */
++	uint32_t bs_length;	/* In bytes */
++	char *bs_string;
++};
++
++/* Maximum number of signature components in a simple volume */
++# define PNFS_BLOCK_MAX_SIG_COMP 16
++
++struct pnfs_blk_sig {
++	int si_num_comps;
++	struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP];
++};
++
++enum exstate4 {
++	PNFS_BLOCK_READWRITE_DATA = 0,
++	PNFS_BLOCK_READ_DATA = 1,
++	PNFS_BLOCK_INVALID_DATA = 2,	/* mapped, but data is invalid */
++	PNFS_BLOCK_NONE_DATA = 3	/* unmapped, it's a hole */
++};
++
++#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
++
++struct my_tree_t {
++	sector_t mtt_step_size;		/* Internal sector alignment */
++	struct list_head mtt_stub;	/* Should be a radix tree */
++};
++
++struct pnfs_inval_markings {
++	spinlock_t im_lock;
++	struct my_tree_t im_tree;	/* Sectors that need LAYOUTCOMMIT */
++	sector_t im_block_size;		/* Server blocksize in sectors */
++};
++
++struct pnfs_inval_tracking {
++	struct list_head it_link;
++	int it_sector;
++	int it_tags;
++};
++
++/* sector_t fields are all in 512-byte sectors */
++struct pnfs_block_extent {
++	struct kref be_refcnt;
++	struct list_head be_node;	/* link into lseg list */
++	struct pnfs_deviceid be_devid;	/* STUB - removable??? */
++	struct block_device *be_mdev;
++	sector_t be_f_offset;	/* the starting offset in the file */
++	sector_t be_length;	/* the size of the extent */
++	sector_t be_v_offset;	/* the starting offset in the volume */
++	enum exstate4 be_state;	/* the state of this extent */
++	struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
++};
++
++/* Shortened extent used by LAYOUTCOMMIT */
++struct pnfs_block_short_extent {
++	struct list_head bse_node;
++	struct pnfs_deviceid bse_devid;	/* STUB - removable??? */
++	struct block_device *bse_mdev;
++	sector_t bse_f_offset;	/* the starting offset in the file */
++	sector_t bse_length;	/* the size of the extent */
++};
++/* Initialize an invalid-sector tracker: empty list, given block size, and
++ * a step size of min(sectors per page, blocksize).
++ */
++static inline void
++INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
++{
++	spin_lock_init(&marks->im_lock);
++	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
++	marks->im_block_size = blocksize;
++	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
++					   blocksize);
++}
++
++enum extentclass4 {
++	RW_EXTENT = 0,		/* READWRITE and INVAL */
++	RO_EXTENT = 1,		/* READ and NONE */
++	EXTENT_LISTS = 2,
++};
++/* Pick the bl_extents[] index for an extent state: READ and NONE go to
++ * RO_EXTENT, everything else to RW_EXTENT.
++ */
++static inline int choose_list(enum exstate4 state)
++{
++	if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
++		return RO_EXTENT;
++	else
++		return RW_EXTENT;
++}
++
++struct pnfs_block_layout {
++	struct pnfs_layout_type bl_layout;
++	struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
++	spinlock_t bl_ext_lock;		/* Protects list manipulation */
++	struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
++	struct list_head bl_commit;	/* Needs layout commit */
++	unsigned int bl_count;		/* entries in bl_commit */
++	sector_t bl_blocksize;		/* Server blocksize in sectors */
++};
++
++/* this struct is communicated between:
++ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit
++ */
++struct bl_layoutupdate_data {
++	struct list_head ranges;
++};
++
++#define BLK_ID(lo) ((struct block_mount_id
*)(PNFS_NFS_SERVER(lo)->pnfs_ld_data))
++/* Convert the embedded generic layout back to its block layout container.
++ */
++static inline struct pnfs_block_layout *
++BLK_LO2EXT(struct pnfs_layout_type *lo)
++{
++	return container_of(lo, struct pnfs_block_layout, bl_layout);
++}
++/* Same conversion, starting from a layout segment's ->layout pointer.
++ */
++static inline struct pnfs_block_layout *
++BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
++{
++	return BLK_LO2EXT(lseg->layout);
++}
++
++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes);
++/* XDR decode helpers.  They operate on a cursor named p in the caller's
++ * scope; BLK_READBUF and READ_SECTOR jump to a caller-provided out_err
++ * label on failure, and READ_SECTOR also needs a caller-scope variable
++ * named tmp.
++ */
++#define BLK_READBUF(p, e, nbytes) do { \
++	p = blk_overflow(p, e, nbytes); \
++	if (!p) { \
++		printk(KERN_WARNING \
++		       "%s: reply buffer overflowed in line %d.\n", \
++		       __func__, __LINE__); \
++		goto out_err; \
++	} \
++} while (0)
++
++#define READ32(x) (x) = ntohl(*p++)
++#define READ64(x) do { \
++	(x) = (uint64_t)ntohl(*p++) << 32; \
++	(x) |= ntohl(*p++); \
++} while (0)
++#define COPYMEM(x, nbytes) do { \
++	memcpy((x), p, nbytes); \
++	p += XDR_QUADLEN(nbytes); \
++} while (0)
++#define READ_DEVID(x) COPYMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE)
++#define READ_SECTOR(x) do { \
++	READ64(tmp); \
++	if (tmp & 0x1ff) { \
++		printk(KERN_WARNING \
++		       "%s Value not 512-byte aligned at line %d\n", \
++		       __func__, __LINE__); \
++		goto out_err; \
++	} \
++	(x) = tmp >> 9; \
++} while (0)
++/* XDR encode helpers; these also advance a caller-scope cursor named p.
++ */
++#define WRITE32(n) do { \
++	*p++ = htonl(n); \
++	} while (0)
++#define WRITE64(n) do { \
++	*p++ = htonl((uint32_t)((n) >> 32)); \
++	*p++ = htonl((uint32_t)(n)); \
++} while (0)
++#define WRITEMEM(ptr, nbytes) do { \
++	p = xdr_encode_opaque_fixed(p, ptr, nbytes); \
++} while (0)
++#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE)
++
++/* blocklayoutdev.c */
++struct block_device *nfs4_blkdev_get(dev_t dev);
++int nfs4_blkdev_put(struct block_device *bdev);
++struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
++					      struct pnfs_device *dev,
++					      struct list_head *sdlist);
++int nfs4_blk_process_layoutget(struct pnfs_layout_type *lo,
++			       struct nfs4_pnfs_layoutget_res *lgr);
++int nfs4_blk_create_block_disk_list(struct list_head *);
++void nfs4_blk_destroy_disk_list(struct list_head *); ++/* blocklayoutdm.c */ ++int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *); ++void free_block_dev(struct pnfs_block_dev *bdev); ++/* extents.c */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read); ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages); ++void put_extent(struct pnfs_block_extent *be); ++struct pnfs_block_extent *alloc_extent(void); ++struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be); ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect); ++int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *arg); ++void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct pnfs_layoutcommit_arg *arg, ++ int status); ++int add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new); ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length); ++ ++#include ++ ++extern struct pipefs_list bl_device_list; ++extern struct dentry *bl_device_pipe; ++ ++int bl_pipe_init(void); ++void bl_pipe_exit(void); ++ ++#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ ++#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ ++#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ ++#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ ++#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ ++ ++#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-23 12:09:03.292511531 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-23 12:09:03.292511531 -0400 +@@ 
-0,0 +1,948 @@ ++/* ++ * linux/fs/nfs/blocklayout/extents.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include "blocklayout.h" ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Bit numbers */ ++#define EXTENT_INITIALIZED 0 ++#define EXTENT_WRITTEN 1 ++#define EXTENT_IN_COMMIT 2 ++#define INTERNAL_EXISTS MY_MAX_TAGS ++#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) ++ ++/* Returns largest t<=s s.t. 
t%base==0 */ ++static inline sector_t normalize(sector_t s, int base) ++{ ++ sector_t tmp = s; /* Work on a copy: do_div divides its first argument in place and returns the remainder */ ++ return s - do_div(tmp, base); ++} ++/* Returns smallest t>=s s.t. t%base==0 */ ++static inline sector_t normalize_up(sector_t s, int base) ++{ ++ return normalize(s + base - 1, base); ++} ++ ++/* Complete stub: the tree is a plain list (mtt_stub), searched from the tail, until the wanted API is determined */ ++ ++/* Returns the tag bits (it_tags masked with INTERNAL_MASK) of the entry whose it_sector equals s, or -ENOENT if there is none */ ++static int32_t _find_entry(struct my_tree_t *tree, u64 s) ++{ ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu) enter\n", __func__, s); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) ++ return pos->it_tags & INTERNAL_MASK; ++ else ++ break; ++ } ++ return -ENOENT; ++} ++/* Returns 1 if the entry for the step containing s exists and has bit tag set, else 0 */ ++static inline ++int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag) ++{ ++ int32_t tags; ++ ++ dprintk("%s(%llu, %i) enter\n", __func__, s, tag); ++ s = normalize(s, tree->mtt_step_size); ++ tags = _find_entry(tree, s); ++ if ((tags < 0) || !(tags & (1 << tag))) ++ return 0; ++ else ++ return 1; ++} ++ ++/* Creates entry with tag, or if entry already exists, unions tag to it. ++ * If storage is not NULL, newly created entry will use it. ++ * Returns number of entries added, or negative on error. 
++ */ ++static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag, ++ struct pnfs_inval_tracking *storage) ++{ ++ int found = 0; ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) { ++ found = 1; ++ break; ++ } else ++ break; ++ } ++ if (found) { ++ pos->it_tags |= (1 << tag); ++ return 0; ++ } else { ++ struct pnfs_inval_tracking *new; ++ if (storage) ++ new = storage; ++ else { ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ } ++ new->it_sector = s; ++ new->it_tags = (1 << tag); ++ list_add(&new->it_link, &pos->it_link); ++ return 1; ++ } ++} ++ ++/* XXXX Really want option to not create */ ++/* Over range, unions tag with existing entries, else creates entry with tag. Returns 0 or -ENOMEM. NOTE(review): any nonzero _add_entry() result, including 1 for a newly created entry, is reported as -ENOMEM; presumably callers are expected to _preload_range() first so that no entry is created here - confirm */ ++static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length) ++{ ++ u64 i; ++ ++ dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); ++ for (i = normalize(s, tree->mtt_step_size); i < s + length; ++ i += tree->mtt_step_size) ++ if (_add_entry(tree, i, tag, NULL)) ++ return -ENOMEM; ++ return 0; ++} ++ ++/* Ensure that future operations on given range of tree will not malloc: an INTERNAL_EXISTS entry is created up front for every step of the range that has none. Returns 0 or -ENOMEM. */ ++static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length) ++{ ++ u64 start, end, s; ++ int count, i, used = 0, status = -ENOMEM; ++ struct pnfs_inval_tracking **storage; ++ ++ dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); ++ start = normalize(offset, tree->mtt_step_size); ++ end = normalize_up(offset + length, tree->mtt_step_size); ++ count = (int)(end - start) / (int)tree->mtt_step_size; ++ ++ /* Pre-malloc what memory we might need: one entry per step in [start, end) */ ++ storage = kmalloc(sizeof(*storage) * count, GFP_KERNEL); ++ if (!storage) ++ return -ENOMEM; ++ for (i = 0; i < count; i++) { ++ storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), ++ 
GFP_KERNEL); ++ if (!storage[i]) ++ goto out_cleanup; ++ } ++ ++ /* Now need lock - HOW??? */ ++ ++ for (s = start; s < end; s += tree->mtt_step_size) ++ used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); ++ ++ /* Unlock - HOW??? */ ++ status = 0; ++ /* Free the pre-allocated entries not handed to the tree; stops at the NULL slot left by a failed kmalloc */ ++ out_cleanup: ++ for (i = used; i < count; i++) { ++ if (!storage[i]) ++ break; ++ kfree(storage[i]); ++ } ++ kfree(storage); ++ return status; ++} ++/* Inserts offset into the ascending, ~0-terminated array unless it is already there; does nothing if array is NULL. NOTE(review): no capacity check - the array must have been allocated large enough by the caller */ ++static void set_needs_init(sector_t *array, sector_t offset) ++{ ++ sector_t *p = array; ++ ++ dprintk("%s enter\n", __func__); ++ if (!p) ++ return; ++ while (*p < offset) ++ p++; ++ if (*p == offset) ++ return; ++ else if (*p == ~0) { ++ *p++ = offset; ++ *p = ~0; ++ return; ++ } else { ++ sector_t *save = p; ++ dprintk("%s Adding %llu\n", __func__, (u64)offset); ++ while (*p != ~0) ++ p++; ++ p++; ++ memmove(save + 1, save, (char *)p - (char *)save); ++ *save = offset; ++ return; ++ } ++} ++ ++/* We are relying on page lock to serialize this */ ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Assume start, end already sector aligned. Returns 1 when the list holds a contiguous run of entries, each with tag set, from end - step down to start; otherwise 0. NOTE(review): expect is unsigned and 0 also means not started, so a range with start == 0 looks like it can never return 1 - confirm */ ++static int ++_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag) ++{ ++ struct pnfs_inval_tracking *pos; ++ u64 expect = 0; ++ ++ dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector >= end) ++ continue; ++ if (!expect) { ++ if ((pos->it_sector == end - tree->mtt_step_size) && ++ (pos->it_tags & (1 << tag))) { ++ expect = pos->it_sector - tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ continue; ++ } else { ++ return 0; ++ } ++ } ++ if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) ++ return 0; ++ expect -= tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ } ++ return 0; ++} 
++/* Returns _range_has_tag() for EXTENT_WRITTEN over [start, end), evaluated under im_lock */ ++static int is_range_written(struct pnfs_inval_markings *marks, ++ sector_t start, sector_t end) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Marks sectors in [offset, offset+length) as having been initialized. ++ * All lengths are step-aligned, where step is min(pagesize, blocksize). ++ * Notes where partial block is initialized, and helps prepare it for ++ * complete initialization later. On success returns 0 and, if pages is non-NULL, stores there a kmalloc'ed ~0-terminated array of page sectors still needing initialization (NULL if there are none); returns -ENOMEM on failure. ++ */ ++/* Currently assumes offset is page-aligned */ ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages) ++{ ++ sector_t s, start, end; ++ sector_t *array = NULL; /* Pages to mark: ~0-terminated, filled by set_needs_init() */ ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", ++ __func__, (u64)offset, (u64)length); ++ s = max((sector_t) 3, ++ 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); ++ dprintk("%s set max=%llu\n", __func__, (u64)s); ++ if (pages) { ++ array = kmalloc(s * sizeof(sector_t), GFP_KERNEL); ++ if (!array) ++ goto outerr; ++ array[0] = ~0; ++ } ++ ++ start = normalize(offset, marks->im_block_size); ++ end = normalize_up(offset + length, marks->im_block_size); ++ if (_preload_range(&marks->im_tree, start, end - start)) ++ goto outerr; ++ ++ spin_lock(&marks->im_lock); ++ ++ for (s = normalize_up(start, PAGE_CACHE_SECTORS); ++ s < offset; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s pre-area pages\n", __func__); ++ /* Portion of used block is not initialized */ ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) ++ goto out_unlock; ++ for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); ++ s < end; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s post-area pages\n", __func__); ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ ++ 
spin_unlock(&marks->im_lock); ++ ++ if (pages) { ++ if (array[0] == ~0) { ++ kfree(array); ++ *pages = NULL; ++ } else ++ *pages = array; ++ } ++ return 0; ++ ++ out_unlock: ++ spin_unlock(&marks->im_lock); ++ outerr: ++ if (pages) { ++ kfree(array); ++ *pages = NULL; ++ } ++ return -ENOMEM; ++} ++ ++/* Marks sectors in [offset, offset+length) as having been written to disk. ++ * All lengths should be block aligned. Returns the result of _set_range(), evaluated under im_lock. ++ */ ++int mark_written_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length) ++{ ++ int status; ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, ++ (u64)offset, (u64)length); ++ spin_lock(&marks->im_lock); ++ status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); ++ spin_unlock(&marks->im_lock); ++ return status; ++} ++/* Debug helper: dprintk the file offset and length of a short extent; be may be NULL */ ++static void print_short_extent(struct pnfs_block_short_extent *be) ++{ ++ dprintk("PRINT SHORT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->bse_length); ++ } ++} ++/* Debug helper: dprintk every short extent on list and report when the number found differs from count */ ++void print_clist(struct list_head *list, unsigned int count) ++{ ++ struct pnfs_block_short_extent *be; ++ unsigned int i = 0; ++ ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, bse_node) { ++ i++; ++ print_short_extent(be); ++ } ++ if (i != count) ++ dprintk("\n\nExpected %u entries\n\n\n", count); ++ dprintk("****************\n"); ++} ++ ++/* Note: In theory, we should do more checking that devid's match between ++ * old and new, but if they don't, the lists are too corrupt to salvage anyway. 
++ */ ++/* Note this is very similar to add_and_merge_extent */ ++static void add_to_commitlist(struct pnfs_block_layout *bl, ++ struct pnfs_block_short_extent *new) ++{ ++ struct list_head *clist = &bl->bl_commit; ++ struct pnfs_block_short_extent *old, *save; ++ sector_t end = new->bse_f_offset + new->bse_length; ++ ++ dprintk("%s enter\n", __func__); ++ print_short_extent(new); ++ print_clist(clist, bl->bl_count); ++ bl->bl_count++; ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. ++ */ ++ list_for_each_entry_safe(old, save, clist, bse_node) { ++ if (new->bse_f_offset < old->bse_f_offset) ++ break; ++ if (end <= old->bse_f_offset + old->bse_length) { ++ /* Range is already in list */ ++ bl->bl_count--; ++ kfree(new); ++ return; ++ } else if (new->bse_f_offset <= ++ old->bse_f_offset + old->bse_length) { ++ /* new overlaps or abuts existing be */ ++ if (new->bse_mdev == old->bse_mdev) { ++ /* extend new to fully replace old */ ++ new->bse_length += new->bse_f_offset - ++ old->bse_f_offset; ++ new->bse_f_offset = old->bse_f_offset; ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ } ++ /* Note that if we never hit the above break, old will not point to a ++ * valid extent. However, in that case &old->bse_node==list. ++ */ ++ list_add_tail(&new->bse_node, &old->bse_node); ++ /* Scan forward for overlaps. If we find any, extend new and ++ * remove the overlapped extent. 
++ */ ++ old = list_prepare_entry(new, clist, bse_node); ++ list_for_each_entry_safe_continue(old, save, clist, bse_node) { ++ if (end < old->bse_f_offset) ++ break; ++ /* new overlaps or abuts old */ ++ if (new->bse_mdev == old->bse_mdev) { ++ if (end < old->bse_f_offset + old->bse_length) { ++ /* extend new to fully cover old */ ++ end = old->bse_f_offset + old->bse_length; ++ new->bse_length = end - new->bse_f_offset; ++ } ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ dprintk("%s: after merging\n", __func__); ++ print_clist(clist, bl->bl_count); ++} ++ ++/* Note the range described by offset, length is guaranteed to be contained ++ * within be. ++ */ ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length) ++{ ++ sector_t new_end, end = offset + length; ++ struct pnfs_block_short_extent *new; ++ struct pnfs_block_layout *bl = container_of(be->be_inval, ++ struct pnfs_block_layout, ++ bl_inval); ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ mark_written_sectors(be->be_inval, offset, length); ++ /* We want to add the range to commit list, but it must be ++ * block-normalized, and verified that the normalized range has ++ * been entirely written to disk. 
++ */ ++ new->bse_f_offset = offset; ++ offset = normalize(offset, bl->bl_blocksize); ++ if (offset < new->bse_f_offset) { ++ if (is_range_written(be->be_inval, offset, new->bse_f_offset)) ++ new->bse_f_offset = offset; ++ else ++ new->bse_f_offset = offset + bl->bl_blocksize; ++ } ++ new_end = normalize_up(end, bl->bl_blocksize); ++ if (end < new_end) { ++ if (is_range_written(be->be_inval, end, new_end)) ++ end = new_end; ++ else ++ end = new_end - bl->bl_blocksize; ++ } ++ if (end <= new->bse_f_offset) { ++ kfree(new); ++ return 0; ++ } ++ new->bse_length = end - new->bse_f_offset; ++ new->bse_devid = be->be_devid; ++ new->bse_mdev = be->be_mdev; ++ ++ spin_lock(&bl->bl_ext_lock); ++ /* new will be freed, either by add_to_commitlist if it decides not ++ * to use it, or after LAYOUTCOMMIT uses it in the commitlist. ++ */ ++ add_to_commitlist(bl, new); ++ spin_unlock(&bl->bl_ext_lock); ++ return 0; ++} ++ ++static void print_bl_extent(struct pnfs_block_extent *be) ++{ ++ dprintk("PRINT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->be_length); ++ dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); ++ dprintk(" be_state %d\n", be->be_state); ++ } ++} ++ ++static void ++destroy_extent(struct kref *kref) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = container_of(kref, struct pnfs_block_extent, be_refcnt); ++ dprintk("%s be=%p\n", __func__, be); ++ kfree(be); ++} ++ ++void ++put_extent(struct pnfs_block_extent *be) ++{ ++ if (be) { ++ dprintk("%s enter %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_put(&be->be_refcnt, destroy_extent); ++ } ++} ++ ++struct pnfs_block_extent *alloc_extent(void) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL); ++ if (!be) ++ return NULL; ++ INIT_LIST_HEAD(&be->be_node); ++ kref_init(&be->be_refcnt); ++ be->be_inval = NULL; ++ return be; ++} ++ ++struct 
pnfs_block_extent * ++get_extent(struct pnfs_block_extent *be) ++{ ++ if (be) ++ kref_get(&be->be_refcnt); ++ return be; ++} ++ ++void print_elist(struct list_head *list) ++{ ++ struct pnfs_block_extent *be; ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, be_node) { ++ print_bl_extent(be); ++ } ++ dprintk("****************\n"); ++} ++ ++static inline int ++extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) ++{ ++ /* Note this assumes new->be_f_offset >= old->be_f_offset */ ++ return (new->be_state == old->be_state) && ++ ((new->be_state == PNFS_BLOCK_NONE_DATA) || ++ ((new->be_v_offset - old->be_v_offset == ++ new->be_f_offset - old->be_f_offset) && ++ new->be_mdev == old->be_mdev)); ++} ++ ++/* Adds new to appropriate list in bl, modifying new and removing existing ++ * extents as appropriate to deal with overlaps. ++ * ++ * See find_get_extent for list constraints. ++ * ++ * Refcount on new is already set. If end up not using it, or error out, ++ * need to put the reference. ++ * ++ * Lock is held by caller. ++ */ ++int ++add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new) ++{ ++ struct pnfs_block_extent *be, *tmp; ++ sector_t end = new->be_f_offset + new->be_length; ++ struct list_head *list; ++ ++ dprintk("%s enter with be=%p\n", __func__, new); ++ print_bl_extent(new); ++ list = &bl->bl_extents[choose_list(new->be_state)]; ++ print_elist(list); ++ ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. 
++ */ ++ list_for_each_entry_safe_reverse(be, tmp, list, be_node) { ++ if (new->be_f_offset >= be->be_f_offset + be->be_length) ++ break; ++ if (new->be_f_offset >= be->be_f_offset) { ++ if (end <= be->be_f_offset + be->be_length) { ++ /* new is a subset of existing be*/ ++ if (extents_consistent(be, new)) { ++ dprintk("%s: new is subset, ignoring\n", ++ __func__); ++ put_extent(new); ++ return 0; ++ } else { ++ goto out_err; ++ } ++ } else { ++ /* |<-- be -->| ++ * |<-- new -->| */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ new->be_length += new->be_f_offset - ++ be->be_f_offset; ++ new->be_f_offset = be->be_f_offset; ++ new->be_v_offset = be->be_v_offset; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } else if (end >= be->be_f_offset + be->be_length) { ++ /* new extent overlap existing be */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } else if (end > be->be_f_offset) { ++ /* |<-- be -->| ++ *|<-- new -->| */ ++ if (extents_consistent(new, be)) { ++ /* extend new to fully replace be */ ++ new->be_length += be->be_f_offset + be->be_length - ++ new->be_f_offset - new->be_length; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } ++ /* Note that if we never hit the above break, be will not point to a ++ * valid extent. However, in that case &be->be_node==list. ++ */ ++ list_add(&new->be_node, &be->be_node); ++ dprintk("%s: inserting new\n", __func__); ++ print_elist(list); ++ /* STUB - The per-list consistency checks have all been done, ++ * should now check cross-list consistency. ++ */ ++ return 0; ++ ++ out_err: ++ put_extent(new); ++ return -EIO; ++} ++ ++/* Returns extent, or NULL. 
If a second READ extent exists, it is returned ++ * in cow_read, if given. ++ * ++ * The extents are kept in two separate ordered lists, one for READ and NONE, ++ * one for READWRITE and INVALID. Within each list, we assume: ++ * 1. Extents are ordered by file offset. ++ * 2. For any given isect, there is at most one extent that matches. ++ */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read) ++{ ++ struct pnfs_block_extent *be, *cow, *ret; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ cow = ret = NULL; ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret && ++ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ if (!ret) ++ ret = be; ++ else if (be->be_state != PNFS_BLOCK_READ_DATA) ++ put_extent(be); ++ else ++ cow = be; ++ break; ++ } ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ if (cow_read) ++ *cow_read = cow; ++ print_bl_extent(ret); ++ return ret; ++} ++ ++/* Similar to find_get_extent, but called with lock held, and ignores cow */ ++static struct pnfs_block_extent * ++find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) ++{ ++ struct pnfs_block_extent *be, *ret = NULL; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ 
atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ ret = be; ++ break; ++ } ++ } ++ } ++ print_bl_extent(ret); ++ return ret; ++} ++/* XDR-encodes the short extents on bl->bl_commit (stopping early if xdr space runs out) and moves each encoded one onto bld->ranges, all under bl_ext_lock. Always returns 0. */ ++int ++encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *arg) ++{ ++ sector_t start, end; ++ struct pnfs_block_short_extent *lce, *save; ++ unsigned int count = 0; ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct list_head *ranges = &bld->ranges; ++ __be32 *p, *xdr_start; ++ ++ dprintk("%s enter\n", __func__); ++ start = arg->lseg.offset >> 9; ++ end = start + (arg->lseg.length >> 9); ++ dprintk("%s set start=%llu, end=%llu\n", ++ __func__, (u64)start, (u64)end); ++ ++ /* BUG - creation of bl_commit is buggy - need to wait for ++ * entire block to be marked WRITTEN before it can be added. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ /* Want to adjust for possible truncate */ ++ /* We now want to adjust argument range */ ++ ++ /* XDR encode the ranges found: xdr_start[0] is back-filled with the byte length that follows it, xdr_start[1] with the extent count */ ++ xdr_start = xdr_reserve_space(xdr, 8); ++ if (!xdr_start) ++ goto out; ++ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { ++ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); ++ if (!p) ++ break; ++ WRITE_DEVID(&lce->bse_devid); ++ WRITE64(lce->bse_f_offset << 9); ++ WRITE64(lce->bse_length << 9); ++ WRITE64(0LL); ++ WRITE32(PNFS_BLOCK_READWRITE_DATA); ++ list_del(&lce->bse_node); ++ list_add_tail(&lce->bse_node, ranges); ++ bl->bl_count--; ++ count++; ++ } ++ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); ++ xdr_start[1] = cpu_to_be32(count); ++out: ++ spin_unlock(&bl->bl_ext_lock); ++ dprintk("%s found %i ranges\n", __func__, count); ++ return 0; ++} ++ ++/* Helper function for set_to_rw that initializes a new extent from orig, covering [offset, offset+length) in the given state */ ++static void ++_prep_new_extent(struct pnfs_block_extent *new, ++ struct pnfs_block_extent *orig, ++ sector_t offset, sector_t length, int state) ++{ ++ kref_init(&new->be_refcnt); ++ /* don't need to 
INIT_LIST_HEAD(&new->be_node) */ ++ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct pnfs_deviceid)); ++ new->be_mdev = orig->be_mdev; ++ new->be_f_offset = offset; ++ new->be_length = length; ++ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; ++ new->be_state = state; ++ new->be_inval = orig->be_inval; ++} ++ ++/* Tries to merge be with extent in front of it in list. ++ * Frees storage if not used. Returns storage when a merge happened, otherwise be. ++ */ ++static struct pnfs_block_extent * ++_front_merge(struct pnfs_block_extent *be, struct list_head *head, ++ struct pnfs_block_extent *storage) ++{ ++ struct pnfs_block_extent *prev; ++ ++ if (!storage) ++ goto no_merge; ++ if (&be->be_node == head || be->be_node.prev == head) ++ goto no_merge; ++ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); ++ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || ++ !extents_consistent(prev, be)) ++ goto no_merge; ++ _prep_new_extent(storage, prev, prev->be_f_offset, ++ prev->be_length + be->be_length, prev->be_state); ++ list_replace(&prev->be_node, &storage->be_node); ++ put_extent(prev); ++ list_del(&be->be_node); ++ put_extent(be); ++ return storage; ++ ++ no_merge: ++ kfree(storage); ++ return be; ++} ++/* Splits the INVALID_DATA extent covering offset so that the requested part becomes READWRITE_DATA. Returns the end of the extent found at offset, or offset + length if allocation failed. */ ++static u64 ++set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) ++{ ++ u64 rv = offset + length; ++ struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; ++ struct pnfs_block_extent *children[3]; ++ struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; ++ int i = 0, j; ++ ++ dprintk("%s(%llu, %llu)\n", __func__, offset, length); ++ /* Create storage for up to three new extents e1, e2, e3 */ ++ e1 = kmalloc(sizeof(*e1), GFP_KERNEL); ++ e2 = kmalloc(sizeof(*e2), GFP_KERNEL); ++ e3 = kmalloc(sizeof(*e3), GFP_KERNEL); ++ /* BUG - we are ignoring any failure */ ++ if (!e1 || !e2 || !e3) ++ goto out_nosplit; ++ /* NOTE(review): find_get_extent_locked() returns NULL when no extent covers offset, yet be is dereferenced unchecked; the non-INVALID early exit also appears to keep the reference taken here - confirm */ ++ spin_lock(&bl->bl_ext_lock); ++ be = find_get_extent_locked(bl, offset); ++ rv = be->be_f_offset + be->be_length; ++ if (be->be_state != 
PNFS_BLOCK_INVALID_DATA) { ++ spin_unlock(&bl->bl_ext_lock); ++ goto out_nosplit; ++ } ++ /* Add e* to children, bumping e*'s krefs */ ++ if (be->be_f_offset != offset) { ++ _prep_new_extent(e1, be, be->be_f_offset, ++ offset - be->be_f_offset, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e1; ++ print_bl_extent(e1); ++ } else ++ merge1 = e1; ++ _prep_new_extent(e2, be, offset, ++ min(length, be->be_f_offset + be->be_length - offset), ++ PNFS_BLOCK_READWRITE_DATA); ++ children[i++] = e2; ++ print_bl_extent(e2); ++ if (offset + length < be->be_f_offset + be->be_length) { ++ _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, ++ be->be_f_offset + be->be_length - ++ offset - length, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e3; ++ print_bl_extent(e3); ++ } else ++ merge2 = e3; ++ ++ /* Remove be from list, and insert the e* */ ++ /* We don't get refs on e*, since this list is the base reference ++ * set when init'ed. ++ */ ++ if (i < 3) ++ children[i] = NULL; ++ new = children[0]; ++ list_replace(&be->be_node, &new->be_node); ++ put_extent(be); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); ++ for (j = 1; j < i; j++) { ++ old = new; ++ new = children[j]; ++ list_add(&new->be_node, &old->be_node); ++ } ++ if (merge2) { ++ /* This is a HACK, should just create a _back_merge function */ ++ new = list_entry(new->be_node.next, ++ struct pnfs_block_extent, be_node); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ ++ /* Since we removed the base reference above, be is now scheduled for ++ * destruction. 
++ */ ++ put_extent(be); ++ dprintk("%s returns %llu after split\n", __func__, rv); ++ return rv; ++ ++ out_nosplit: ++ kfree(e1); ++ kfree(e2); ++ kfree(e3); ++ dprintk("%s returns %llu without splitting\n", __func__, rv); ++ return rv; ++} ++ ++void ++clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct pnfs_layoutcommit_arg *arg, ++ int status) ++{ ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct pnfs_block_short_extent *lce, *save; ++ ++ dprintk("%s status %d\n", __func__, status); ++ list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) { ++ if (likely(!status)) { ++ u64 offset = lce->bse_f_offset; ++ u64 end = offset + lce->bse_length; ++ ++ do { ++ offset = set_to_rw(bl, offset, end - offset); ++ } while (offset < end); ++ ++ kfree(lce); ++ } else { ++ spin_lock(&bl->bl_ext_lock); ++ add_to_commitlist(bl, lce); ++ spin_unlock(&bl->bl_ext_lock); ++ } ++ } ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-23 12:09:03.292511531 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-23 12:09:03.293491476 -0400 +@@ -0,0 +1,6 @@ ++# ++# Makefile for the pNFS block layout driver kernel module ++# ++obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o ++blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \ ++ extents.o block-device-discovery-pipe.o +diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h +--- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-23 12:09:03.293491476 -0400 +@@ -8,6 +8,8 @@ + #ifndef __LINUX_FS_NFS_CALLBACK_H + #define __LINUX_FS_NFS_CALLBACK_H + ++#include ++ + #define NFS4_CALLBACK 0x40000000 + #define NFS4_CALLBACK_XDRSIZE 2048 + #define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE) +@@ -72,6 +74,8 @@ 
struct cb_recallargs { + + #if defined(CONFIG_NFS_V4_1) + ++#include ++ + struct referring_call { + uint32_t rc_sequenceid; + uint32_t rc_slotid; +@@ -111,6 +115,13 @@ extern int nfs41_validate_delegation_sta + + #define RCA4_TYPE_MASK_RDATA_DLG 0 + #define RCA4_TYPE_MASK_WDATA_DLG 1 ++#define RCA4_TYPE_MASK_DIR_DLG 2 ++#define RCA4_TYPE_MASK_FILE_LAYOUT 3 ++#define RCA4_TYPE_MASK_BLK_LAYOUT 4 ++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 ++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 + + struct cb_recallanyargs { + struct sockaddr *craa_addr; +@@ -127,6 +138,37 @@ struct cb_recallslotargs { + extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy); + ++struct cb_pnfs_layoutrecallargs { ++ struct sockaddr *cbl_addr; ++ struct nfs_fh cbl_fh; ++ struct nfs4_pnfs_layout_segment cbl_seg; ++ struct nfs_fsid cbl_fsid; ++ uint32_t cbl_recall_type; ++ uint32_t cbl_layout_type; ++ uint32_t cbl_layoutchanged; ++ nfs4_stateid cbl_stateid; ++}; ++ ++extern unsigned pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args, ++ void *dummy); ++ ++struct cb_pnfs_devicenotifyitem { ++ uint32_t cbd_notify_type; ++ uint32_t cbd_layout_type; ++ struct pnfs_deviceid cbd_dev_id; ++ uint32_t cbd_immediate; ++}; ++ ++/* XXX: Should be dynamic up to max compound size */ ++#define NFS4_DEV_NOTIFY_MAXENTRIES 10 ++struct cb_pnfs_devicenotifyargs { ++ struct sockaddr *addr; ++ int ndevs; ++ struct cb_pnfs_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES]; ++}; ++ ++extern unsigned pnfs_cb_devicenotify(struct cb_pnfs_devicenotifyargs *args, ++ void *dummy); + #endif /* CONFIG_NFS_V4_1 */ + + extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); +diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c +--- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ 
linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-23 12:09:03.294522414 -0400 +@@ -8,10 +8,15 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + #include "nfs4_fs.h" + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include "pnfs.h" + + #ifdef NFS_DEBUG + #define NFSDBG_FACILITY NFSDBG_CALLBACK +@@ -62,16 +67,6 @@ out: + return res->status; + } + +-static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) +-{ +-#if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion > 0) +- return nfs41_validate_delegation_stateid; +-#endif +- return nfs4_validate_delegation_stateid; +-} +- +- + __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) + { + struct nfs_client *clp; +@@ -92,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_re + inode = nfs_delegation_find_inode(clp, &args->fh); + if (inode != NULL) { + /* Set up a helper thread to actually return the delegation */ +- switch (nfs_async_inode_return_delegation(inode, &args->stateid, +- nfs_validate_delegation_stateid(clp))) { ++ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; +@@ -116,24 +110,364 @@ out: + + int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { +- if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (delegation == NULL || memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data))) + return 0; + return 1; + } + + #if defined(CONFIG_NFS_V4_1) + ++static bool ++pnfs_is_next_layout_stateid(const struct pnfs_layout_type *lo, ++ const nfs4_stateid stateid) ++{ ++ int seqlock; ++ bool res; ++ u32 oldseqid, newseqid; ++ ++ do { ++ seqlock = read_seqbegin(&lo->seqlock); ++ oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid); ++ newseqid = be32_to_cpu(stateid.u.stateid.seqid); ++ 
res = !memcmp(lo->stateid.u.stateid.other, ++ stateid.u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE); ++ if (res) { /* comparing layout stateids */ ++ if (oldseqid == ~0) ++ res = (newseqid == 1); ++ else ++ res = (newseqid == oldseqid + 1); ++ } else { /* open stateid */ ++ res = !memcmp(lo->stateid.u.data, ++ &zero_stateid, ++ NFS4_STATEID_SIZE); ++ if (res) ++ res = (newseqid == 1); ++ } ++ } while (read_seqretry(&lo->seqlock, seqlock)); ++ ++ return res; ++} ++ ++/* ++ * Retrieve an inode based on layout recall parameters ++ * ++ * Note: caller must iput(inode) to dereference the inode. ++ */ ++static struct inode * ++nfs_layoutrecall_find_inode(struct nfs_client *clp, ++ const struct cb_pnfs_layoutrecallargs *args) ++{ ++ struct nfs_inode *nfsi; ++ struct pnfs_layout_type *layout; ++ struct nfs_server *server; ++ struct inode *ino = NULL; ++ ++ dprintk("%s: Begin recall_type=%d clp %p\n", ++ __func__, args->cbl_recall_type, clp); ++ ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry(layout, &clp->cl_layouts, lo_layouts) { ++ nfsi = PNFS_NFS_INODE(layout); ++ if (!nfsi) ++ continue; ++ ++ dprintk("%s: Searching inode=%lu\n", ++ __func__, nfsi->vfs_inode.i_ino); ++ ++ if (args->cbl_recall_type == RETURN_FILE) { ++ if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh)) ++ continue; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ server = NFS_SERVER(&nfsi->vfs_inode); ++ if (server->fsid.major != args->cbl_fsid.major || ++ server->fsid.minor != args->cbl_fsid.minor) ++ continue; ++ } ++ ++ /* Make sure client didn't clean up layout without ++ * telling the server */ ++ if (!has_layout(nfsi)) ++ continue; ++ ++ ino = igrab(&nfsi->vfs_inode); ++ dprintk("%s: Found inode=%p\n", __func__, ino); ++ break; ++ } ++ spin_unlock(&clp->cl_lock); ++ return ino; ++} ++ ++struct recall_layout_threadargs { ++ struct inode *inode; ++ struct nfs_client *clp; ++ struct completion started; ++ struct cb_pnfs_layoutrecallargs *rl; ++ int result; ++}; ++ ++static int 
pnfs_recall_layout(void *data) ++{ ++ struct inode *inode, *ino; ++ struct nfs_client *clp; ++ struct cb_pnfs_layoutrecallargs rl; ++ struct nfs4_pnfs_layoutreturn *lrp; ++ struct recall_layout_threadargs *args = ++ (struct recall_layout_threadargs *)data; ++ int status = 0; ++ ++ daemonize("nfsv4-layoutreturn"); ++ ++ dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n", ++ __func__, args->rl->cbl_recall_type, ++ args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor); ++ ++ clp = args->clp; ++ inode = args->inode; ++ rl = *args->rl; ++ ++ /* support whole file layouts only */ ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ ++ if (rl.cbl_recall_type == RETURN_FILE) { ++ if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout, ++ rl.cbl_stateid)) ++ status = pnfs_return_layout(inode, &rl.cbl_seg, ++ &rl.cbl_stateid, RETURN_FILE, ++ false); ++ else ++ status = cpu_to_be32(NFS4ERR_DELAY); ++ if (status) ++ dprintk("%s RETURN_FILE error: %d\n", __func__, status); ++ else ++ status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ args->result = status; ++ complete(&args->started); ++ goto out; ++ } ++ ++ status = cpu_to_be32(NFS4_OK); ++ args->result = status; ++ complete(&args->started); ++ args = NULL; ++ ++ /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */ ++ while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) { ++ /* FIXME: need to check status on pnfs_return_layout */ ++ pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false); ++ iput(ino); ++ } ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (!lrp) { ++ dprintk("%s: allocation failed. 
Cannot send last LAYOUTRETURN\n", ++ __func__); ++ goto out; ++ } ++ ++ /* send final layoutreturn */ ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = rl.cbl_layout_type; ++ lrp->args.return_type = rl.cbl_recall_type; ++ lrp->args.lseg = rl.cbl_seg; ++ lrp->args.inode = inode; ++ pnfs4_proc_layoutreturn(lrp, true); ++ ++out: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ nfs_put_client(clp); ++ module_put_and_exit(0); ++ dprintk("%s: exit status %d\n", __func__, 0); ++ return 0; ++} ++ ++/* ++ * Asynchronous layout recall! ++ */ ++static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, ++ struct cb_pnfs_layoutrecallargs *rl) ++{ ++ struct recall_layout_threadargs data = { ++ .clp = clp, ++ .inode = inode, ++ .rl = rl, ++ }; ++ struct task_struct *t; ++ int status = -EAGAIN; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* FIXME: do not allow two concurrent layout recalls */ ++ if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) ++ return status; ++ ++ init_completion(&data.started); ++ __module_get(THIS_MODULE); ++ if (!atomic_inc_not_zero(&clp->cl_count)) ++ goto out_put_no_client; ++ ++ t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout"); ++ if (IS_ERR(t)) { ++ printk(KERN_INFO "NFS: Layout recall callback thread failed " ++ "for client (clientid %08x/%08x)\n", ++ (unsigned)(clp->cl_clientid >> 32), ++ (unsigned)(clp->cl_clientid)); ++ status = PTR_ERR(t); ++ goto out_module_put; ++ } ++ wait_for_completion(&data.started); ++ return data.result; ++out_module_put: ++ nfs_put_client(clp); ++out_put_no_client: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ module_put(THIS_MODULE); ++ return status; ++} ++ ++static int pnfs_recall_all_layouts(struct nfs_client *clp) ++{ ++ struct cb_pnfs_layoutrecallargs rl; ++ struct inode *inode; ++ int status = 0; ++ ++ rl.cbl_recall_type = RETURN_ALL; ++ rl.cbl_seg.iomode = IOMODE_ANY; ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ 
++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, &rl); ++ if (!inode) ++ return status; ++ status = pnfs_async_return_layout(clp, inode, &rl); ++ iput(inode); ++ ++ return status; ++} ++ ++__be32 pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ struct inode *inode = NULL; ++ __be32 res; ++ int status; ++ unsigned int num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); ++ clp = nfs_find_client(args->cbl_addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->cbl_addr)); ++ goto out; ++ } ++ ++ res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ /* the callback must come from the MDS personality */ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) ++ goto loop; ++ if (args->cbl_recall_type == RETURN_FILE) { ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (inode != NULL) { ++ status = pnfs_async_return_layout(clp, inode, ++ args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++ } else { /* _ALL or _FSID */ ++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (!inode) ++ goto loop; ++ status = pnfs_async_return_layout(clp, inode, args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++loop: ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ ++/* Remove the deviceid(s) from the nfs_client deviceid cache */ ++static __be32 pnfs_devicenotify_client(struct nfs_client *clp, ++ struct cb_pnfs_devicenotifyargs *args) ++{ ++ uint32_t type; ++ int i; ++ ++ dprintk("%s: --> clp %p\n", __func__, clp); ++ ++ for (i = 
0; i < args->ndevs; i++) { ++ struct cb_pnfs_devicenotifyitem *dev = &args->devs[i]; ++ type = dev->cbd_notify_type; ++ if (type == NOTIFY_DEVICEID4_DELETE && clp->cl_devid_cache) ++ nfs4_delete_device(clp->cl_devid_cache, ++ &dev->cbd_dev_id); ++ else if (type == NOTIFY_DEVICEID4_CHANGE) ++ printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE " ++ "not supported\n", __func__); ++ } ++ return 0; ++} ++ ++__be32 pnfs_cb_devicenotify(struct cb_pnfs_devicenotifyargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ __be32 res = 0; ++ unsigned int num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = __constant_htonl(NFS4ERR_INVAL); ++ clp = nfs_find_client(args->addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->addr)); ++ goto out; ++ } ++ ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ res = pnfs_devicenotify_client(clp, args); ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { + if (delegation == NULL) + return 0; + +- /* seqid is 4-bytes long */ +- if (((u32 *) &stateid->data)[0] != 0) ++ if (stateid->u.stateid.seqid != 0) + return 0; +- if (memcmp(&delegation->stateid.data[4], &stateid->data[4], +- sizeof(stateid->data)-4)) ++ if (memcmp(&delegation->stateid.u.stateid.other, ++ &stateid->u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE)) + return 0; + + return 1; +@@ -335,13 +669,37 @@ out: + return status; + } + ++static inline bool ++validate_bitmap_values(const unsigned long *mask) ++{ ++ int i; ++ ++ if (*mask == 0) ++ return true; ++ if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_WDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_DIR_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, mask) 
|| ++ test_bit(RCA4_TYPE_MASK_BLK_LAYOUT, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OBJ_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OBJ_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OTHER_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OTHER_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ return false; ++} ++ + __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) + { + struct nfs_client *clp; + __be32 status; + fmode_t flags = 0; + +- status = htonl(NFS4ERR_OP_NOT_IN_SESSION); ++ status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->craa_addr, 4); + if (clp == NULL) + goto out; +@@ -349,16 +707,25 @@ __be32 nfs4_callback_recallany(struct cb + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + ++ status = cpu_to_be32(NFS4ERR_INVAL); ++ if (!validate_bitmap_values((const unsigned long *) ++ &args->craa_type_mask)) ++ return status; ++ ++ status = cpu_to_be32(NFS4_OK); + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; ++ if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) ++ &args->craa_type_mask)) ++ if (pnfs_recall_all_layouts(clp) == -EAGAIN) ++ status = cpu_to_be32(NFS4ERR_DELAY); + + if (flags) + nfs_expire_all_delegation_types(clp, flags); +- status = htonl(NFS4_OK); + out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c +--- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-23 12:09:03.295502055 -0400 +@@ -22,6 +22,8 @@ + #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + + #if defined(CONFIG_NFS_V4_1) 
++#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) ++#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) + #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +@@ -136,7 +138,7 @@ static __be32 decode_stateid(struct xdr_ + p = read_buf(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); +- memcpy(stateid->data, p, 16); ++ memcpy(stateid->u.data, p, 16); + return 0; + } + +@@ -220,6 +222,148 @@ out: + + #if defined(CONFIG_NFS_V4_1) + ++static __be32 decode_pnfs_layoutrecall_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_pnfs_layoutrecallargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ ++ args->cbl_addr = svc_addr(rqstp); ++ p = read_buf(xdr, 4 * sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ ++ args->cbl_layout_type = ntohl(*p++); ++ args->cbl_seg.iomode = ntohl(*p++); ++ args->cbl_layoutchanged = ntohl(*p++); ++ args->cbl_recall_type = ntohl(*p++); ++ ++ if (likely(args->cbl_recall_type == RETURN_FILE)) { ++ status = decode_fh(xdr, &args->cbl_fh); ++ if (unlikely(status != 0)) ++ goto out; ++ ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_seg.offset); ++ p = xdr_decode_hyper(p, &args->cbl_seg.length); ++ status = decode_stateid(xdr, &args->cbl_stateid); ++ if (unlikely(status != 0)) ++ goto out; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_fsid.major); ++ p = xdr_decode_hyper(p, &args->cbl_fsid.minor); ++ } ++ dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d " ++ "fsid %llx-%llx fhsize %d\n", __func__, ++ args->cbl_layout_type, args->cbl_seg.iomode, ++ 
args->cbl_layoutchanged, args->cbl_recall_type, ++ args->cbl_fsid.major, args->cbl_fsid.minor, ++ args->cbl_fh.size); ++out: ++ dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); ++ return status; ++} ++ ++static ++__be32 decode_pnfs_devicenotify_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_pnfs_devicenotifyargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ u32 tmp; ++ int n, i; ++ args->ndevs = 0; ++ ++ args->addr = svc_addr(rqstp); ++ ++ /* Num of device notifications */ ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ n = ntohl(*p++); ++ if (n <= 0) ++ goto out; ++ ++ /* XXX: need to possibly return error in this case */ ++ if (n > NFS4_DEV_NOTIFY_MAXENTRIES) { ++ dprintk("%s: Processing (%d) notifications out of (%d)\n", ++ __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n); ++ n = NFS4_DEV_NOTIFY_MAXENTRIES; ++ } ++ ++ /* Decode each dev notification */ ++ for (i = 0; i < n; i++) { ++ struct cb_pnfs_devicenotifyitem *dev = &args->devs[i]; ++ ++ p = read_buf(xdr, (4 * sizeof(uint32_t)) ++ + NFS4_PNFS_DEVICEID4_SIZE); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* bitmap size */ ++ if (tmp != 1) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_notify_type = ntohl(*p++); ++ if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* opaque size */ ++ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 8)) || ++ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 4))) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_layout_type = ntohl(*p++); ++ memcpy(dev->cbd_dev_id.data, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += 
XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ ++ if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_DELAY); ++ goto out; ++ } ++ dev->cbd_immediate = ntohl(*p++); ++ } else { ++ dev->cbd_immediate = 0; ++ } ++ ++ args->ndevs++; ++ ++ dprintk("%s: type %d layout 0x%x immediate %d\n", ++ __func__, dev->cbd_notify_type, dev->cbd_layout_type, ++ dev->cbd_immediate); ++ } ++out: ++ dprintk("%s: status %d ndevs %d\n", ++ __func__, ntohl(status), args->ndevs); ++ return status; ++} ++ + static __be32 decode_sessionid(struct xdr_stream *xdr, + struct nfs4_sessionid *sid) + { +@@ -574,11 +718,11 @@ preprocess_nfs41_op(int nop, unsigned in + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: ++ case OP_CB_LAYOUTRECALL: ++ case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + +- case OP_CB_LAYOUTRECALL: +- case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: +@@ -739,6 +883,18 @@ static struct callback_op callback_ops[] + .res_maxsize = CB_OP_RECALL_RES_MAXSZ, + }, + #if defined(CONFIG_NFS_V4_1) ++ [OP_CB_LAYOUTRECALL] = { ++ .process_op = (callback_process_op_t)pnfs_cb_layoutrecall, ++ .decode_args = ++ (callback_decode_arg_t)decode_pnfs_layoutrecall_args, ++ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, ++ }, ++ [OP_CB_NOTIFY_DEVICEID] = { ++ .process_op = (callback_process_op_t)pnfs_cb_devicenotify, ++ .decode_args = ++ (callback_decode_arg_t)decode_pnfs_devicenotify_args, ++ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, ++ }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, +diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-23 12:08:29.037481540 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 
2010-08-23 12:09:03.297501650 -0400 +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + + #include + +@@ -48,6 +49,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_CLIENT + +@@ -150,11 +152,14 @@ static struct nfs_client *nfs_alloc_clie + clp->cl_boot_time = CURRENT_TIME; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; ++ clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + #endif + cred = rpc_lookup_machine_cred(); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; +- ++#if defined(CONFIG_NFS_V4_1) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++#endif + nfs_fscache_get_client_cookie(clp); + + return clp; +@@ -178,7 +183,7 @@ static void nfs4_clear_client_minor_vers + clp->cl_session = NULL; + } + +- clp->cl_call_sync = _nfs4_call_sync; ++ clp->cl_mvops = nfs_v4_minor_ops[0]; + #endif /* CONFIG_NFS_V4_1 */ + } + +@@ -188,7 +193,7 @@ static void nfs4_clear_client_minor_vers + static void nfs4_destroy_callback(struct nfs_client *clp) + { + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) +- nfs_callback_down(clp->cl_minorversion); ++ nfs_callback_down(clp->cl_mvops->minor_version); + } + + static void nfs4_shutdown_client(struct nfs_client *clp) +@@ -251,6 +256,7 @@ void nfs_put_client(struct nfs_client *c + nfs_free_client(clp); + } + } ++EXPORT_SYMBOL(nfs_put_client); + + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + /* +@@ -343,7 +349,7 @@ static int nfs_sockaddr_match_ipaddr(con + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. 
+ */ +-static int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) + { + if (sa1->sa_family != sa2->sa_family) +@@ -357,6 +363,7 @@ static int nfs_sockaddr_cmp(const struct + } + return 0; + } ++EXPORT_SYMBOL(nfs_sockaddr_cmp); + + /* + * Find a client by IP address and protocol version +@@ -548,6 +555,7 @@ int nfs4_check_client_ready(struct nfs_c + return -EPROTONOSUPPORT; + return 0; + } ++EXPORT_SYMBOL(nfs4_check_client_ready); + + /* + * Initialise the timeout values for a connection +@@ -865,9 +873,34 @@ error: + } + + /* ++ * Initialize the pNFS layout driver and setup pNFS related parameters ++ */ ++static void nfs4_init_pnfs(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ struct nfs_client *clp = server->nfs_client; ++ ++ if (nfs4_has_session(clp) && ++ (clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) { ++ server->pnfs_blksize = fsinfo->blksize; ++ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); ++ pnfs_set_ds_iosize(server); ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++static void nfs4_uninit_pnfs(struct nfs_server *server) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ if (server->nfs_client && nfs4_has_session(server->nfs_client)) ++ unmount_pnfs_layoutdriver(server); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++/* + * Load up the server record from information gained in an fsinfo record + */ +-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) ++static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) + { + unsigned long max_rpc_payload; + +@@ -897,6 +930,8 @@ static void nfs_server_set_fsinfo(struct + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; ++ nfs4_init_pnfs(server, mntfh, fsinfo); ++ + server->wtmult = 
nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); +@@ -938,7 +973,7 @@ static int nfs_probe_fsinfo(struct nfs_s + if (error < 0) + goto out_error; + +- nfs_server_set_fsinfo(server, &fsinfo); ++ nfs_server_set_fsinfo(server, mntfh, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { +@@ -1016,6 +1051,7 @@ void nfs_free_server(struct nfs_server * + { + dprintk("--> nfs_free_server()\n"); + ++ nfs4_uninit_pnfs(server); + spin_lock(&nfs_client_lock); + list_del(&server->client_link); + list_del(&server->master_link); +@@ -1126,7 +1162,7 @@ static int nfs4_init_callback(struct nfs + return error; + } + +- error = nfs_callback_up(clp->cl_minorversion, ++ error = nfs_callback_up(clp->cl_mvops->minor_version, + clp->cl_rpcclient->cl_xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", +@@ -1143,10 +1179,8 @@ static int nfs4_init_callback(struct nfs + */ + static int nfs4_init_client_minor_version(struct nfs_client *clp) + { +- clp->cl_call_sync = _nfs4_call_sync; +- + #if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion) { ++ if (clp->cl_mvops->minor_version) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. +@@ -1158,7 +1192,13 @@ static int nfs4_init_client_minor_versio + return -ENOMEM; + + clp->cl_session = session; +- clp->cl_call_sync = _nfs4_call_sync_session; ++ /* ++ * The create session reply races with the server back ++ * channel probe. 
Mark the client NFS_CS_SESSION_INITING ++ * so that the client back channel can find the ++ * nfs_client struct ++ */ ++ clp->cl_cons_state = NFS_CS_SESSION_INITING; + } + #endif /* CONFIG_NFS_V4_1 */ + +@@ -1216,7 +1256,7 @@ error: + /* + * Set up an NFS4 client + */ +-static int nfs4_set_client(struct nfs_server *server, ++int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, +@@ -1259,6 +1299,7 @@ error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; + } ++EXPORT_SYMBOL(nfs4_set_client); + + + /* +@@ -1448,7 +1489,7 @@ struct nfs_server *nfs4_create_referral_ + data->authflavor, + parent_server->client->cl_xprt->prot, + parent_server->client->cl_timeout, +- parent_client->cl_minorversion); ++ parent_client->cl_mvops->minor_version); + if (error < 0) + goto error; + +diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c +--- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-23 12:09:03.297501650 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-23 12:09:03.298501447 -0400 +@@ -0,0 +1,292 @@ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); ++static void bl_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops bl_upcall_ops = { ++ .upcall = bl_pipe_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = bl_pipe_destroy_msg, ++}; ++ ++bl_comm_t *bl_comm_global; ++ ++int ++nfsd_bl_start(void) ++{ ++ bl_comm_t *bl_comm = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ 
dprintk("%s: starting pipe\n", __func__); ++ if (bl_comm_global) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL); ++ if (!bl_comm) { ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ /* FIXME: rename to "spnfs_block" */ ++ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm, ++ &bl_upcall_ops, 0); ++ if (IS_ERR(bl_comm->pipe_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ mutex_init(&bl_comm->lock); ++ mutex_init(&bl_comm->pipe_lock); ++ init_waitqueue_head(&bl_comm->pipe_wq); ++ ++ bl_comm_global = bl_comm; ++ return 0; ++err: ++ rpc_put_mount(); ++ kfree(bl_comm); ++ return rc; ++} ++ ++void ++nfsd_bl_stop(void) ++{ ++ bl_comm_t *c = bl_comm_global; ++ ++ dprintk("%s: stopping pipe\n", __func__); ++ if (!c) ++ return; ++ rpc_unlink(c->pipe_dentry); ++ rpc_put_mount(); ++ bl_comm_global = NULL; ++ kfree(c); ++} ++ ++static ssize_t ++bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst, ++ size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied, ++ left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ ++ return mlen; ++} ++ ++static ssize_t ++bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ bl_comm_t *bc = (bl_comm_t *)rpci->private; ++ bl_comm_msg_t *im = &bc->msg; ++ int ret; ++ bl_comm_res_t *res; ++ ++ ++ if (mlen == 0) { ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ im->msg_res = NULL; ++ wake_up(&bc->pipe_wq); ++ return -EFAULT; ++ } ++ ++ if ((res = kmalloc(mlen, 
GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(res, src, mlen)) { ++ kfree(res); ++ return -EFAULT; ++ } ++ ++ mutex_lock(&bc->pipe_lock); ++ ++ ret = mlen; ++ im->msg_status = res->res_status; ++ im->msg_res = res; ++ ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++ return ret; ++} ++ ++static void ++bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ bl_comm_msg_t *im = msg->data; ++ bl_comm_t *bc = container_of(im, struct bl_comm, msg); ++ ++ if (msg->errno >= 0) ++ return; ++ ++ mutex_lock(&bc->pipe_lock); ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++} ++ ++int ++bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res) ++{ ++ struct rpc_pipe_msg msg; ++ DECLARE_WAITQUEUE(wq, current); ++ int rval = 1; ++ bl_comm_msg_t *m = &bc->msg; ++ ++ if (bc == NULL) { ++ dprintk("%s: No pNFS block daemon available\n", __func__); ++ return 1; ++ } ++ ++ mutex_lock(&bc->lock); ++ mutex_lock(&bc->pipe_lock); ++ ++ memcpy(m, upmsg, sizeof (*m)); ++ ++ memset(&msg, 0, sizeof (msg)); ++ msg.data = m; ++ msg.len = sizeof (*m); ++ ++ add_wait_queue(&bc->pipe_wq, &wq); ++ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&bc->pipe_lock); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ mutex_lock(&bc->pipe_lock); ++ ++ if (m->msg_status == PNFS_BLOCK_SUCCESS) { ++ *res = m->msg_res; ++ rval = 0; ++ } else ++ rval = 1; ++ ++out: ++ mutex_unlock(&bc->pipe_lock); ++ mutex_unlock(&bc->lock); ++ return rval; ++} ++ ++static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len, ++ loff_t *offset) ++{ ++ int cmd, ++ rc; ++ bl_comm_t *bc = bl_comm_global; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int))) ++ return 
-EFAULT; ++ switch (cmd) { ++ case PNFS_BLOCK_CTL_STOP: ++ msg.msg_type = PNFS_UPCALL_MSG_STOP; ++ (void) bl_upcall(bc, &msg, &res); ++ kfree(res); ++ nfsd_bl_stop(); ++ break; ++ ++ case PNFS_BLOCK_CTL_START: ++ rc = nfsd_bl_start(); ++ if (rc != 0) ++ return rc; ++ break; ++ ++ case PNFS_BLOCK_CTL_VERS: ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bc, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ kfree(res); ++ break; ++ ++ default: ++ dprintk("%s: unknown ctl command %d\n", __func__, cmd); ++ break; ++ } ++ return len; ++} ++ ++static struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++ ++/* ++ * bl_init_proc -- set up proc interfaces ++ * ++ * Creating a pnfs_block directory isn't really required at this point ++ * since we've only got a single node in that directory. If the need for ++ * more nodes doesn't present itself shortly this code should revert ++ * to a single top level node. McNeal 11-Aug-2008. ++ */ ++int ++bl_init_proc(void) ++{ ++ struct proc_dir_entry *e; ++ ++ e = proc_mkdir("fs/pnfs_block", NULL); ++ if (!e) ++ return -ENOMEM; ++ ++ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL); ++ if (!e) ++ return -ENOMEM; ++ e->proc_fops = &ctl_ops; ++ ++ return 0; ++} ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-23 12:09:03.299501445 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-23 12:09:03.299501445 -0400 +@@ -0,0 +1,1672 @@ ++/* ++ * bl_ops.c ++ * spNFS ++ * ++ * Created by Rick McNeal on 4/1/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. ++ * ++ */ ++ ++/* ++ * Block layout operations. ++ * ++ * These functions, with the exception of pnfs_block_enabled, are assigned to ++ * the super block s_export_op structure. 
++ */ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) ++ ++#define BL_LAYOUT_HASH_BITS 4 ++#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS) ++#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1) ++#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256) ++ ++#define bl_layout_hashval(id) \ ++ ((id) & BL_LAYOUT_HASH_MASK) ++ ++#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len) ++#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len) ++#define _2SECTS(v) ((v) >> 9) ++ ++#ifndef READ32 ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++(x) = (u64)ntohl(*p++) << 32; \ ++(x) |= ntohl(*p++); \ ++} while (0) ++#endif ++ ++ ++typedef enum {True, False} boolean_t; ++/* ---- block layoutget and commit structure ---- */ ++typedef struct bl_layout_rec { ++ struct list_head blr_hash, ++ blr_layouts; ++ dev_t blr_rdev; ++ struct inode *blr_inode; ++ int blr_recalled; // debug ++ u64 blr_orig_size, ++ blr_commit_size, ++ blr_ext_size; ++ spinlock_t blr_lock; // Protects blr_layouts ++} bl_layout_rec_t; ++ ++static struct list_head layout_hash; ++static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE]; ++static spinlock_t layout_hashtbl_lock; ++ ++/* ---- prototypes ---- */ ++static boolean_t device_slice(dev_t devid); ++static boolean_t device_dm(dev_t devid); ++static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **); ++static bl_layout_rec_t *layout_inode_find(struct inode *i); ++static void layout_inode_del(struct inode *i); ++static char *map_state2name(enum pnfs_block_extent_state4 s); ++static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type); ++static void bld_free(pnfs_blocklayout_devinfo_t *bld); ++static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head 
*volumes, ++ dev_t devid, int local_index); ++static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes, ++ dev_t devid, int my_loc, int idx); ++static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg); ++struct list_head *layout_cache_iter(bl_layout_rec_t *r, ++ struct list_head *bl_possible, struct nfsd4_layout_seg *seg); ++static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h); ++static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h); ++static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg); ++static void print_bll(pnfs_blocklayout_layout_t *b, char *); ++static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r, ++ struct list_head *h, struct nfsd4_layout_seg *seg); ++static inline void bll_collapse(bl_layout_rec_t *r, ++ pnfs_blocklayout_layout_t *c); ++static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len, ++ enum bl_cache_state state, struct list_head *h); ++static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b, ++ enum bl_cache_state c, struct list_head *h); ++static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s); ++static void extents_setup(struct fiemap_extent_info *fei); ++static void extents_count(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_process(struct fiemap_extent_info *fei, ++ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev, ++ pnfs_blocklayout_layout_t *b); ++static void extents_cleanup(struct fiemap_extent_info *fei); ++ ++void ++nfsd_bl_init(void) ++{ ++ int i; ++ dprintk("%s loaded\n", __func__); ++ ++ spin_lock_init(&layout_hashtbl_lock); ++ INIT_LIST_HEAD(&layout_hash); ++ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++) ++ 
INIT_LIST_HEAD(&layout_hashtbl[i]); ++ bl_init_proc(); ++} ++ ++/* ++ * pnfs_block_enabled -- check to see if this file system should be export as ++ * block pnfs ++ */ ++int ++pnfs_block_enabled(struct inode *inode, int ex_flags) ++{ ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ static int bl_comm_once = 0; ++ ++ dprintk("--> %s\n", __func__); ++ /* ++ * FIXME: Figure out method to determine if this file system should ++ * be exported. The following areas need to be checked. ++ * (1) Validate that this file system was exported as a pNFS ++ * block-layout ++ * (2) Has there been successful communication with the ++ * volume daemon? ++ */ ++ /* Check #1 */ ++#ifdef notyet ++ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) { ++ dprintk("%s: pnfs_block not set in export\n", __func__); ++ return 0; ++ } ++#endif ++ ++ /* Check #1 */ ++ if (!bl_comm_once) { ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ if (msg.u.msg_vers != res->u.vers) { ++ dprintk("%s: vers mismatch, kernel != daemon\n", ++ __func__); ++ kfree(res); ++ return 0; ++ } ++ } ++ bl_comm_once = 1; ++ ++ kfree(res); ++ ++ dprintk("<-- %s okay\n", __func__); ++ return 1; ++} ++ ++int ++bl_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_BLOCK_VOLUME; ++} ++ ++int ++bl_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_devid = sb->s_dev; ++ res->gd_verf = 1; ++ res->gd_cookie = 1; ++ return 0; ++} ++ ++static int ++bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld_slice_p, ++ *bld_simple_p, ++ *bld; ++ int status = -EIO, ++ location = 0; ++ struct list_head volumes; ++ ++ dprintk("--> %s\n", __func__); ++ 
INIT_LIST_HEAD(&volumes); ++ ++ bld_simple_p = bld_simple(&volumes, devid->devid, ++ location++); ++ if (!bld_simple_p) ++ goto out; ++ bld_slice_p = bld_slice(&volumes, devid->devid, location++, ++ bld_simple_p->bld_index_loc); ++ ++ if (!bld_slice_p) ++ goto out; ++ ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE) ++ kfree(bld->u.simple.bld_sig); ++ bld_free(bld); ++ } ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++static int ++bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ int status = -EIO, // default to error ++ i, ++ location = 0; ++ struct list_head volumes; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ INIT_LIST_HEAD(&volumes); ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMGET; ++ msg.u.msg_dev = devid->devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: upcall for DMGET failed\n", __func__); ++ goto out; ++ } ++ ++ /* ++ * Don't use bld_alloc() here. If used this will be the first volume ++ * type added to the list whereas the protocol requires it to be the ++ * last. 
++ */ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ goto out; ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE; ++ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes; ++ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL; ++ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__, ++ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL); ++ ++ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes * ++ sizeof (int), GFP_KERNEL); ++ if (!bld->u.stripe.bld_stripe_indexs) ++ goto out; ++ ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) { ++ dev_t dev; ++ pnfs_blocklayout_devinfo_t *bldp; ++ ++ dev = MKDEV(res->u.stripe.devs[i].major, ++ res->u.stripe.devs[i].minor); ++ if (dev == 0) ++ goto out; ++ ++ bldp = bld_simple(&volumes, dev, location++); ++ if (!bldp) { ++ dprintk("%s: bld_simple failed\n", __func__); ++ goto out; ++ } ++ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc); ++ ++ if (!bldp) { ++ dprintk("%s: bld_slice failed\n", __func__); ++ goto out; ++ } ++ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc; ++ ++ } ++ list_add_tail(&bld->bld_list, &volumes); ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SLICE: ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ // No memory to release for these ++ break; ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ kfree(bld->u.simple.bld_sig); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ kfree(bld->u.stripe.bld_stripe_indexs); ++ break; ++ } ++ bld_free(bld); ++ } ++ kfree(res); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++/* ++ * bl_getdeviceinfo -- determine device tree for requested devid ++ */ ++int ++bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct 
nfsd4_pnfs_deviceid *devid) ++{ ++ if (device_slice(devid->devid) == True) ++ return bl_getdeviceinfo_slice(sb, xdr, devid); ++ else if (device_dm(devid->devid) == True) ++ return bl_getdeviceinfo_dm(sb, xdr, devid); ++ return -EINVAL; ++} ++ ++enum nfsstat4 ++bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ pnfs_blocklayout_layout_t *b; ++ bl_layout_rec_t *r; ++ struct list_head bl_possible, ++ *bl_candidates = NULL; ++ boolean_t del_on_error = False; ++ int adj; ++ enum nfsstat4 nfserr = NFS4_OK; ++ ++ dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n", ++ __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset), ++ _2SECTS(res->lg_seg.length), res->lg_seg.iomode); ++ ++ if (res->lg_seg.length == 0) { ++ printk("%s: request length of 0, error condition\n", __func__); ++ return NFS4ERR_BADLAYOUT; ++ } ++ ++ /* ++ * Adjust the length as required per spec. ++ * - First case is were the length is set to (u64)-1. Cheap means to ++ * define the end of the file. ++ * - Second case is were the I/O mode is read-only, but the request is ++ * past the end of the file so the request needs to be trimed. ++ */ ++ if ((res->lg_seg.length == NFS4_MAX_UINT64) || ++ (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) && ++ (res->lg_seg.iomode == IOMODE_READ))) ++ res->lg_seg.length = i->i_size - res->lg_seg.offset; ++ ++ adj = (res->lg_seg.offset & 511) ? 
res->lg_seg.offset & 511 : 0; ++ res->lg_seg.offset -= adj; ++ res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511; ++ ++ if (res->lg_seg.iomode != IOMODE_READ) ++ if (i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE, ++ res->lg_seg.offset, res->lg_seg.length)) ++ return NFS4ERR_IO; ++ ++ INIT_LIST_HEAD(&bl_possible); ++ ++ if ((r = layout_inode_find(i)) == NULL) { ++ if (layout_inode_add(i, &r) == False) { ++ printk("%s: layout_inode_add failed\n", __func__); ++ return NFS4ERR_IO; ++ } ++ del_on_error = True; ++ } ++ BUG_ON(!r); ++ ++ spin_lock(&r->blr_lock); ++ ++ if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) { ++ /* ++ * This will send LAYOUTTRYAGAIN error to the client. ++ */ ++ dprintk("%s: layout_cache_fill_from() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res->lg_return_on_close = 1; ++ res->lg_seg.length = 0; ++ ++ bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg); ++ if (!bl_candidates) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ layout_cache_merge(r, bl_candidates); ++ if (layout_cache_update(r, bl_candidates)) { ++ /* ---- Failed to allocate memory. 
---- */ ++ dprintk("%s: layout_cache_update() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ nfserr = blocklayout_encode_layout(xdr, bl_candidates); ++ if (nfserr) ++ dprintk("%s: layoutget xdr routine failed\n", __func__); ++ ++layoutget_cleanup: ++ if (bl_candidates) { ++ while (!list_empty(bl_candidates)) { ++ b = list_entry(bl_candidates->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ } ++ ++ spin_unlock(&r->blr_lock); ++ if (unlikely(nfserr)) { ++ if (del_on_error == True) ++ layout_inode_del(i); ++ res->lg_seg.length = 0; ++ res->lg_seg.offset = 0; ++ } ++ ++ dprintk("<-- %s (rval %u)\n", __func__, nfserr); ++ return nfserr; ++} ++ ++/* ++ * bl_layoutcommit -- commit changes, especially size, to file systemj ++ * ++ * Currently this routine isn't called and everything is handled within ++ * nfsd4_layoutcommit(). By not calling this routine the server doesn't ++ * handle a partial return, a set of extents, of the layout. The extents ++ * are decoded here, but nothing is done with them. If this routine is ++ * be called the interface must change to pass the 'dentry' pointer such ++ * that notify_change() can be called. 
++ */ ++int ++bl_layoutcommit(struct inode *i, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ bl_layout_rec_t *r; ++ int status = 0; ++ u64 lw_plus; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ r = layout_inode_find(i); ++ if (r) { ++ lw_plus = args->lc_last_wr + 1; ++ if (args->lc_newoffset) { ++ dprintk(" lc_last_wr %Lu\n", lw_plus); ++ if (r->blr_orig_size < lw_plus) { ++ r->blr_orig_size = lw_plus; ++ res->lc_size_chg = 1; ++ res->lc_newsize = lw_plus; ++ } ++ } ++ ++ if (args->lc_up_len) { ++ int extents, ++ i; ++ struct pnfs_blocklayout_layout *b; ++ __be32 *p = args->lc_up_layout; ++ ++ /* ++ * Client is returning a set of extents which ++ * should/could be used to update the file system. ++ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08 ++ */ ++ READ32(extents); ++ dprintk(" Client returning %d extents: data size %d\n", ++ extents, args->lc_up_len); ++ b = kmalloc(sizeof (struct pnfs_blocklayout_layout) * ++ extents, GFP_KERNEL); ++ if (b) { ++ for (i = 0; i < extents; i++) { ++ READ64(b[i].bll_vol_id.sbid); ++ READ64(b[i].bll_vol_id.devid); ++ READ64(b[i].bll_foff); ++ READ64(b[i].bll_len); ++ READ64(b[i].bll_soff); ++ READ32(b[i].bll_es); ++ dprintk(" %d: foff %Lu, len %Lu, soff %Lu " ++ "state %s\n", ++ i, _2SECTS(b[i].bll_foff), ++ _2SECTS(b[i].bll_len), ++ _2SECTS(b[i].bll_soff), ++ map_state2name(b[i].bll_es)); ++ } ++ kfree(b); ++ } else { ++ status = -ENOMEM; ++ } ++ } ++ } else ++ dprintk("%s: Unexpected commit to inode %p\n", __func__, i); ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutreturn(struct inode *i, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ int status = 0; ++ bl_layout_rec_t *r; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ ++ r = layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); ++ layout_cache_del(r, &args->lr_seg); ++ 
spin_unlock(&r->blr_lock); ++ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n", ++ r->blr_ext_size, i->i_size, r->blr_orig_size); ++ } ++ ++ layout_inode_del(i); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ bl_layout_rec_t *r; ++ pnfs_blocklayout_layout_t *b; ++ u64 adj; ++ ++ dprintk("--> %s\n", __func__); ++ BUG_ON(!len); ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n", ++ inode->i_sb->s_dev, inode->i_ino, ++ _2SECTS(offset), _2SECTS(len)); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* ++ * XXX figure out how to get a sb since there's no ++ * inode ptr ++ */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++restart: ++ r = layout_inode_find(inode); ++ if (r && len && !r->blr_recalled) { ++ spin_lock(&r->blr_lock); ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (!r->blr_recalled && !b->bll_recalled && ++ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) { ++ b->bll_recalled = 1; ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = 0; ++ lr.cbl_seg.length = NFS4_MAX_UINT64; ++ r->blr_recalled = 1; ++ dprintk(" FULL LAYOUTRECALL\n"); ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ ++ /* ++ * Currently there are only two cases where the ++ * layout is being returned. ++ * (1) Someone is issuing a NFS_WRITE operation ++ * to this layout. ++ * (2) The file has been truncated which means ++ * the layout is immediately made invalid. ++ * In both cases the client must write any ++ * uncommitted modifications to the server via ++ * NFS_WRITE. 
++ */ ++ lr.cbl_layoutchanged = 1; ++ ++ /* ++ * Need to drop the lock because we'll get a ++ * layoutreturn which will block waiting for ++ * the lock. The request will come in on the ++ * same thread which will cause a deadlock. ++ */ ++ spin_unlock(&r->blr_lock); ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ adj = MIN(b->bll_len - (offset - b->bll_foff), ++ len); ++ offset += adj; ++ len -= adj; ++ if (!len) { ++ spin_lock(&r->blr_lock); ++ break; ++ } ++ /* ++ * Since layoutreturn will have been called we ++ * can't assume blr_layouts is still valid, ++ * so restart. ++ */ ++ goto restart; ++ } ++ } ++ spin_unlock(&r->blr_lock); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++/* ++ * []------------------------------------------------------------------[] ++ * | Support functions from here on down. | ++ * []------------------------------------------------------------------[] ++ */ ++ ++/* ++ * bld_simple -- given a dev_t build a simple volume structure ++ * ++ * Simple volume contains the device signature and offset to that data in ++ * the storage volume. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_simple(struct list_head *volumes, dev_t devid, int local_index) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSIG; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to get signature information\n", __func__); ++ goto error; ++ } ++ ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE); ++ if (!bld) ++ return NULL; ++ ++ bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset; ++ bld->u.simple.bld_sig_len = res->u.sig.len; ++ bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL); ++ if (!bld->u.simple.bld_sig) ++ goto error; ++ ++ memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len); ++ kfree(res); ++ return bld; ++ ++error: ++ if (bld) ++ bld_free(bld); ++ if (res) ++ kfree(res); ++ dprintk("%s: error in bld_simple\n", __func__); ++ return NULL; ++} ++ ++/* ++ * bld_slice -- given a dev_t build a slice volume structure ++ * ++ * A slice volume contains the length of the slice/partition and its offset ++ * from the beginning of the storage volume. There's also a reference to ++ * the "simple" volume which contains this slice. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE); ++ if (!bld) ++ return NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Upcall to get slice info failed\n"); ++ bld_free(bld); ++ return NULL; ++ } ++ ++ bld->bld_devid.devid = devid; ++ bld->bld_index_loc = my_loc; ++ bld->u.slice.bld_start = res->u.slice.start * 512LL; ++ bld->u.slice.bld_len = res->u.slice.length * 512LL; ++ bld->u.slice.bld_index = simple_loc; ++ ++ dprintk("%s: start %Lu, len %Lu\n", __func__, ++ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL); ++ ++ kfree(res); ++ dprintk("<-- %s (rval %p)\n", __func__, bld); ++ return bld; ++} ++ ++static int ++layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!list_empty(&r->blr_layouts)) ++ if (layout_cache_fill_from_list(r, h, seg) == False) ++ return -EIO; ++ ++ /* ++ * This deals with two conditions. ++ * (1) When blr_layouts is empty we need to create the first entry ++ * (2) When the range requested falls past the end of any current ++ * layout the residual must be taken care of. 
++ */ ++ if (seg->length) { ++ n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h); ++ if (!n) ++ return -ENOMEM; ++ dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len)); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++struct list_head * ++layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n = NULL; ++ struct list_head *bl_candidates = NULL; ++ struct fiemap_extent_info fei; ++ struct inode *i; ++ dev_t dev; ++ ++ dev = r->blr_rdev; ++ i = r->blr_inode; ++ ++ dprintk("--> %s\n", __func__); ++ bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL); ++ if (!bl_candidates) ++ return NULL; ++ INIT_LIST_HEAD(bl_candidates); ++ extents_setup(&fei); ++ ++ list_for_each_entry(b, bl_possible, bll_list) { ++ if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ extents_count(&fei, i, b->bll_foff, b->bll_len); ++ if (fei.fi_extents_mapped) { ++ ++ /* ++ * Common case here. Got a range which has ++ * extents. Now get those extents and process ++ * them into pNFS extents. ++ */ ++ if (extents_get(&fei, i, b->bll_foff, ++ b->bll_len) == False) ++ goto cleanup; ++ if (extents_process(&fei, bl_candidates, ++ seg, dev, b) == False) ++ goto cleanup; ++ extents_cleanup(&fei); ++ ++ } else if (seg->iomode == IOMODE_READ) { ++ ++ /* ++ * Found a hole in a file while reading. No ++ * problem, just create a pNFS extent for the ++ * range and let the client know there's no ++ * backing store. ++ */ ++ n = bll_alloc(b->bll_foff, b->bll_len, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += b->bll_len; ++ } else { ++ ++ /* ++ * There's a problem here. Since the iomode ++ * is read/write fallocate should have allocated ++ * any necessary storage for the given range. 
++ */ ++ dprintk(" Extent count for RW is 0\n"); ++ goto cleanup; ++ } ++ ++ } else { ++ n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates); ++ seg->length += n->bll_len; ++ } ++ ++ if (r->blr_ext_size < (b->bll_foff + b->bll_len)) ++ r->blr_ext_size = b->bll_foff + b->bll_len; ++ } ++ ++ while (!list_empty(bl_possible)) { ++ b = list_entry(bl_possible->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ ++ b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout, ++ bll_list); ++ seg->offset = b->bll_foff; ++ dprintk("<-- %s okay\n", __func__); ++ return bl_candidates; ++ ++cleanup: ++ extents_cleanup(&fei); ++ if (bl_candidates) ++ kfree(bl_candidates); ++ dprintk("<-- %s, error occurred\n", __func__); ++ return NULL; ++} ++ ++/* ++ * layout_cache_merge -- collapse layouts which make up a contiguous range. ++ */ ++static void ++layout_cache_merge(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *p; ++ ++ dprintk("--> %s\n", __func__); ++restart: ++ p = NULL; ++ list_for_each_entry(b, h, bll_list) { ++ if (p && (BLL_S_END(p) == b->bll_soff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ /* ++ * We've got a condidate. 
++ */ ++#ifdef too_verbose ++ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), ++ _2SECTS(p->bll_foff), _2SECTS(p->bll_len), ++ _2SECTS(b->bll_soff)); ++#endif ++ ++ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE) ++ p->bll_cache_state = BLOCK_LAYOUT_UPDATE; ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else if (p && (BLL_F_END(p) == b->bll_foff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es == PNFS_BLOCK_NONE_DATA)) { ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else ++ p = b; ++ } ++ dprintk("<-- %s\n", __func__); ++} ++ ++static int ++layout_cache_update(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *c, ++ *n; ++ boolean_t status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ if (list_empty(&r->blr_layouts)) { ++ /* ---- Just add entries and return ---- */ ++ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev, ++ r->blr_inode->i_ino); ++ list_for_each_entry(b, h, bll_list) { ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, ++ &r->blr_layouts); ++ if (!c) { ++ status = -ENOMEM; ++ break; ++ } ++ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), c->bll_es); ++ } ++ return status; ++ } ++ ++ list_for_each_entry(b, h, bll_list) { ++ BUG_ON(!b->bll_vol_id.devid); ++ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) { ++ boolean_t found = False; ++ list_for_each_entry(c, &r->blr_layouts, bll_list) { ++ if ((b->bll_soff >= c->bll_soff) && ++ (b->bll_soff < BLL_S_END(c)) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ u64 u; ++ ++ if ((b->bll_foff < c->bll_foff) || ++ (b->bll_foff > BLL_F_END(c))) ++ BUG(); ++ ++ u = BLL_S_END(b) - BLL_S_END(c); ++ /* ++ * The updated cache entry has to be ++ * different than the current. ++ * Otherwise the cache state for 'b' ++ * should be BLOCK_LAYOUT_CACHE. 
++ */ ++ BUG_ON(BLL_S_END(b) < BLL_S_END(c)); ++ ++ dprintk(" " ++ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n", ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), ++ _2SECTS(c->bll_len + u)); ++ c->bll_len += u; ++ bll_collapse(r, c); ++ found = True; ++ break; ++ } ++ } ++ ++ if (found == False) { ++ dprintk(" ERROR Expected to find" ++ " %Lu(f):%Lu(l):%Lu(s), but didn't\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_for_each_entry(c, &r->blr_layouts, bll_list) ++ print_bll(c, "Cached"); ++ BUG(); ++ } ++ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ c = list_first_entry(&r->blr_layouts, ++ struct pnfs_blocklayout_layout, bll_list); ++ if (b->bll_foff < c->bll_foff) { ++ /* ++ * Special case where new entry is before ++ * first cached entry. ++ */ ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL); ++ list_add(&c->bll_list, &r->blr_layouts); ++ dprintk(" new entry at head of list at %Lu, " ++ "len %Lu\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len)); ++ } else { ++ list_for_each_entry(c, &r->blr_layouts, ++ bll_list) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, ++ bll_list); ++ /* ++ * This is ugly, but can't think of ++ * another way to examine this case. ++ * Consider the following. Need to ++ * add an entry which starts at 40 ++ * and the cache has the following ++ * entries: ++ * Start Length ++ * 10 5 ++ * 30 5 ++ * 50 5 ++ * So, need to look and see if the new ++ * entry starts after the current ++ * cache, but before the next one. ++ * There's a catch in that the next ++ * entry might not be valid as it's ++ * really just a pointer to the list ++ * head. 
++ */ ++ if (((b->bll_foff >= ++ BLL_F_END(c)) && ++ (c->bll_list.next == &r->blr_layouts)) || ++ ((b->bll_foff >= ++ BLL_F_END(c)) && ++ (b->bll_foff < n->bll_foff))) { ++ ++ n = bll_alloc_dup(b, ++ BLOCK_LAYOUT_CACHE, NULL); ++ dprintk(" adding new %Lu:%Lu" ++ " after %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len), ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len)); ++ list_add(&n->bll_list, ++ &c->bll_list); ++ break; ++ } ++ } ++ } ++ } ++ } ++ dprintk("<-- %s\n", __func__); ++ return status; ++} ++ ++static void ++layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in) ++{ ++ struct pnfs_blocklayout_layout *b, ++ *n; ++ u64 len; ++ struct nfsd4_layout_seg seg = *seg_in; ++ ++ dprintk("--> %s\n", __func__); ++ if (seg.length == NFS4_MAX_UINT64) { ++ r->blr_recalled = 0; ++ dprintk(" Fast return of all layouts\n"); ++ while (!list_empty(&r->blr_layouts)) { ++ b = list_entry(r->blr_layouts.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ dprintk(" foff %Lu, len %Lu, soff %Lu\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ dprintk("<-- %s\n", __func__); ++ return; ++ } ++ ++restart: ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg.offset == b->bll_foff) { ++ /* ++ * This handle the following three cases: ++ * (1) return layout matches entire cache layout ++ * (2) return layout matches beginning portion of cache ++ * (3) return layout matches entire cache layout and ++ * into next entry. Varies from #1 in end case. 
++ */ ++ dprintk(" match on offsets, %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length)); ++ len = MIN(seg.length, b->bll_len); ++ b->bll_foff += len; ++ b->bll_soff += len; ++ b->bll_len -= len; ++ seg.length -= len; ++ seg.offset += len; ++ if (!b->bll_len) { ++ list_del(&b->bll_list); ++ kfree(b); ++ dprintk(" removing cache line\n"); ++ if (!seg.length) { ++ dprintk(" also finished\n"); ++ goto complete; ++ } ++ /* ++ * Since 'b' was freed we can't continue at the ++ * next entry which is referenced as ++ * b->bll_list.next by the list_for_each_entry ++ * macro. Need to restart the loop. ++ * TODO: Think about creating a dummy 'b' which ++ * would keep list_for_each_entry() happy. ++ */ ++ goto restart; ++ } ++ if (!seg.length) { ++ dprintk(" finished, but cache line not" ++ "empty\n"); ++ goto complete; ++ } ++ } else if ((seg.offset >= b->bll_foff) && ++ (seg.offset < BLL_F_END(b))) { ++ /* ++ * layout being returned is within this cache line. ++ */ ++ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ BUG_ON(!seg.length); ++ if ((seg.offset + seg.length) >= BLL_F_END(b)) { ++ /* ++ * Layout returned starts in the middle of ++ * cache entry and just need to trim back ++ * cache to shorter length. ++ */ ++ dprintk(" trim back cache line\n"); ++ len = seg.offset - b->bll_foff; ++ seg.offset += b->bll_len - len; ++ seg.length -= b->bll_len - len; ++ b->bll_len = len; ++ if (!seg.length) ++ return; ++ } else { ++ /* ++ * Need to split current cache layout because ++ * chunk is being removed from the middle. 
++ */ ++ dprintk(" split cache line\n"); ++ len = seg.offset + seg.length; ++ n = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL); ++ if (!n) { ++ /* Out of memory: leave the cache line whole. */ ++ dprintk(" split failed, cache line left intact\n"); ++ return; ++ } ++ /* Tail of 'b' past the returned range; state and device come from 'b'. */ ++ n->bll_foff = len; ++ n->bll_len = (b->bll_foff + b->bll_len) - len; ++ /* 'len' is an absolute file offset, so rebase it on b->bll_foff. */ ++ n->bll_soff = b->bll_soff + (len - b->bll_foff); ++ list_add(&n->bll_list, &b->bll_list); ++ b->bll_len = seg.offset - b->bll_foff; ++ return; ++ } ++ } ++ } ++complete: ++ if (list_empty(&r->blr_layouts)) ++ r->blr_recalled = 0; ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++ * layout_cache_fill_from_list -- fills from cache list ++ * ++ * Returns False on allocation failure or when a cached layout conflicts ++ * with the requested iomode, True otherwise. ++ * ++ * NOTE: This routine was only separated out from layout_cache_file_from() ++ * to reduce the indentation level which makes the code easier to read. ++ */ ++static inline boolean_t ++layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n; ++ enum pnfs_block_extent_state4 s; ++ ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg->offset < b->bll_foff) { ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, b->bll_foff - seg->offset), ++ BLOCK_LAYOUT_NEW, NULL); ++ if (!n) ++ return False; ++ ++ list_add(&n->bll_list, h->prev); ++ dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ ++ if ((seg->offset >= b->bll_foff) && ++ (seg->offset < BLL_F_END(b))) { ++ if (layout_conflict(b, seg->iomode, &s) == False) { ++ dprintk(" CONFLICT FOUND: " ++ "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), b->bll_es, ++ seg->iomode); ++ return False; ++ } ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, BLL_F_END(b) - seg->offset), ++ BLOCK_LAYOUT_CACHE, h); ++ if (!n) /* check before the trace below dereferences n */ ++ return False; ++ ++ dprintk(" CACHE hit: Found %Lu(f):%Lu(l): " ++ "in %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++
_2SECTS(b->bll_soff), b->bll_es); ++ n->bll_soff = b->bll_soff + seg->offset - b->bll_foff; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ n->bll_es = s; ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ } ++ return True; ++} ++ ++static u64 ++bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length, ++ dev_t dev) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates); ++ if (!n) ++ return 0; ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ ++ return n->bll_len; ++} ++ ++static void ++extents_setup(struct fiemap_extent_info *fei) ++{ ++ fei->fi_extents_start = NULL; ++} ++ ++/* ++ * extents_count -- Determine the number of extents for a given range. ++ * ++ * No need to call set_fs() here because the function ++ * doesn't use copy_to_user() if it's only counting ++ * the number of extents needed. ++ */ ++static void ++extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len)); ++ fei->fi_flags = FIEMAP_FLAG_SYNC; ++ fei->fi_extents_max = 0; ++ fei->fi_extents_start = NULL; ++ fei->fi_extents_mapped = 0; ++ i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1); ++} ++ ++/* ++ * extents_get -- Get list of extents for range ++ * ++ * extents_count() must have been called before this routine such that ++ * fi_extents_mapped is known. ++ */ ++static boolean_t ++extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ int m_space, ++ rval; ++ struct fiemap_extent *fe; ++ mm_segment_t old_fs = get_fs(); ++ ++ /* ++ * Now malloc the correct amount of space ++ * needed. It's possible for the file to have changed ++ * between calls which would require more space for ++ * the extents.
If that occurs the last extent will ++ * not have FIEMAP_EXTENT_LAST set and the error will ++ * be caught in extents_process(). ++ */ ++ m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent); ++ fe = kmalloc(m_space, GFP_KERNEL); ++ if (!fe) ++ return False; ++ memset(fe, 0, m_space); ++ ++ fei->fi_extents_max = fei->fi_extents_mapped; ++ fei->fi_extents_mapped = 0; ++ fei->fi_extents_start = fe; ++ ++ set_fs(KERNEL_DS); ++ rval = i->i_op->fiemap(i, fei, foff, len + ++ (1 << i->i_sb->s_blocksize_bits) - 1); ++ set_fs(old_fs); ++ ++ if (rval || !fei->fi_extents_mapped) { ++ dprintk(" No extents. Wanted %d, got %d\n", ++ fei->fi_extents_max, fei->fi_extents_mapped); ++ kfree(fe); ++ fei->fi_extents_start = NULL; ++ return False; ++ } else ++ return True; ++} ++ ++/* ++ * extents_process -- runs through the extent returned from the file system and ++ * creates block layout entries. ++ */ ++static boolean_t ++extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates, ++ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b) ++{ ++ struct fiemap_extent *fep, ++ *fep_last = NULL; ++ int i; ++ pnfs_blocklayout_layout_t *n; ++ u64 last_end, ++ rval; ++ ++ dprintk("--> %s\n", __func__); ++ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped; ++ i++, fep++) { ++ ++ BUG_ON(!fep->fe_physical); ++ /* ++ * Deal with corner cases of hoel-y files. ++ */ ++ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) != ++ fep->fe_logical)) { ++ ++ /* ++ * If the last extent doesn't end logically ++ * at the beginning of the current we've got ++ * hole and need to create a pNFS extent. 
++ */ ++ dprintk(" Got a hole at %Ld:%Ld \n", ++ _2SECTS(fep_last->fe_logical), ++ _2SECTS(fep_last->fe_length)); ++ last_end = fep_last->fe_logical + fep_last->fe_length; ++ rval = bll_alloc_holey(bl_candidates, last_end, ++ fep->fe_logical - last_end, dev); ++ if (!rval) ++ return False; ++ seg->length += rval; ++ } ++ ++ n = bll_alloc(fep->fe_logical, fep->fe_length, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ if (unlikely(n == NULL)) { ++ dprintk("%s: bll_alloc failed\n", __func__); ++ return False; ++ } ++ ++ n->bll_soff = fep->fe_physical; ++ n->bll_es = seg->iomode == IOMODE_READ ? ++ PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += fep->fe_length; ++ print_bll(n, "New extent"); ++ fep_last = fep; ++ } ++ dprintk("<-- %s (i=%d)\n", __func__, i); ++ ++ return True; ++} ++ ++static void ++extents_cleanup(struct fiemap_extent_info *fei) ++{ ++ if (fei->fi_extents_start) { ++ kfree(fei->fi_extents_start); ++ fei->fi_extents_start = NULL; ++ } ++} ++ ++/* ++ * device_slice -- True if devid opens and its disk has more than one minor ++ */ ++static boolean_t ++device_slice(dev_t devid) ++{ ++ struct block_device *bd = open_by_devnum(devid, FMODE_READ); ++ boolean_t rval = False; ++ ++ if (bd && !IS_ERR(bd)) { /* open failure comes back as ERR_PTR(), not NULL */ ++ if (bd->bd_disk->minors > 1) ++ rval = True; ++ blkdev_put(bd, FMODE_READ); ++ } ++ return rval; ++} ++ ++/* ++ * device_dm -- check to see if device is a Device Mapper volume.
++ * ++ * Returns True for DM, False if not or if the upcall fails ++ */ ++static boolean_t ++device_dm(dev_t devid) ++{ ++ boolean_t rval = False; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; /* keeps kfree() below safe if a failed upcall leaves it unset */ ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMCHK; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Failed upcall to check on DM status\n"); ++ } else if (res->u.dm_vol) { ++ rval = True; ++ dprintk("Device is DM volume\n"); ++ } else ++ dprintk("Device is not DM volume\n"); ++ kfree(res); ++ ++ return rval; ++} ++ ++static boolean_t ++layout_inode_add(struct inode *i, bl_layout_rec_t **p) ++{ ++ bl_layout_rec_t *r = NULL; ++ ++ if (!i->i_op->fiemap || !i->i_op->fallocate) { ++ printk("pNFS: file system doesn't support required fiemap or" ++ " fallocate methods\n"); ++ return False; ++ } ++ ++ r = kmalloc(sizeof (*r), GFP_KERNEL); ++ if (!r) ++ goto error; ++ ++ r->blr_rdev = i->i_sb->s_dev; ++ r->blr_inode = i; ++ r->blr_orig_size = i->i_size; ++ r->blr_ext_size = 0; ++ r->blr_recalled = 0; ++ INIT_LIST_HEAD(&r->blr_layouts); ++ spin_lock_init(&r->blr_lock); ++ spin_lock(&layout_hashtbl_lock); ++ list_add_tail(&r->blr_hash, &layout_hash); ++ spin_unlock(&layout_hashtbl_lock); ++ *p = r; ++ return True; ++ ++error: ++ if (r) ++ kfree(r); ++ return False; ++} ++ ++static bl_layout_rec_t * ++__layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ if (!list_empty(&layout_hash)) { ++ list_for_each_entry(r, &layout_hash, blr_hash) { ++ if ((r->blr_inode->i_ino == i->i_ino) && ++ (r->blr_rdev == i->i_sb->s_dev)) { ++ return r; ++ } ++ } ++ } ++ return NULL; ++} ++ ++static bl_layout_rec_t * ++layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ spin_unlock(&layout_hashtbl_lock); ++ ++ return r; ++} ++ ++static void ++layout_inode_del(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock);
++ if (list_empty(&r->blr_layouts)) { ++ list_del(&r->blr_hash); ++ spin_unlock(&r->blr_lock); ++ kfree(r); ++ } else { ++ spin_unlock(&r->blr_lock); ++ } ++ } else { ++ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n", ++ __func__, i->i_sb->s_dev, i->i_ino); ++ } ++ spin_unlock(&layout_hashtbl_lock); ++} ++ ++/* ++ * map_state2name -- converts state in ascii string. ++ * ++ * Used for debug messages only. ++ */ ++static char * ++map_state2name(enum pnfs_block_extent_state4 s) ++{ ++ switch (s) { ++ case PNFS_BLOCK_READWRITE_DATA: return " RW"; ++ case PNFS_BLOCK_READ_DATA: return " RO"; ++ case PNFS_BLOCK_INVALID_DATA: return "INVALID"; ++ case PNFS_BLOCK_NONE_DATA: return " NONE"; ++ default: ++ BUG(); ++ } ++} ++ ++static pnfs_blocklayout_devinfo_t * ++bld_alloc(struct list_head *volumes, int type) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ return NULL; ++ ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = type; ++ list_add_tail(&bld->bld_list, volumes); ++ ++ return bld; ++} ++ ++static void ++bld_free(pnfs_blocklayout_devinfo_t *bld) ++{ ++ list_del(&bld->bld_list); ++ kfree(bld); ++} ++ ++static void ++print_bll(pnfs_blocklayout_layout_t *b, char *text) ++{ ++ dprintk(" BLL: %s\n", text); ++ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len), ++ map_state2name(b->bll_es)); ++} ++ ++static inline void ++bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c) ++{ ++ pnfs_blocklayout_layout_t *n; ++ int dbg_count = 0; ++ u64 endpoint; ++ ++ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA); ++ while (c->bll_list.next != &r->blr_layouts) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ endpoint = BLL_S_END(c); ++ if ((n->bll_soff >= c->bll_soff) && ++ (n->bll_soff < endpoint)) { ++ if (endpoint < BLL_S_END(n)) { ++ /* ++ * The following is possible. 
++ * ++ * ++ * Existing: +---+ +---+ ++ * New: +-----------------------+ ++ * The client request merge entries together ++ * but didn't require picking up all of the ++ * last entry. So, we still need to delete ++ * the last entry and add the remaining space ++ * to the new entry. ++ */ ++ c->bll_len += BLL_S_END(n) - endpoint; ++ } ++ dbg_count++; ++ list_del(&n->bll_list); ++ kfree(n); ++ } else { ++ break; ++ } ++ } ++ /* ---- Debug only, remove before integration ---- */ ++ if (dbg_count) ++ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n", ++ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c))); ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = kmalloc(sizeof (*n), GFP_KERNEL); ++ if (n) { ++ memset(n, 0, sizeof (*n)); ++ n->bll_foff = offset; ++ n->bll_len = len; ++ n->bll_cache_state = state; ++ if (h) ++ list_add_tail(&n->bll_list, h); ++ } ++ return n; ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c, ++ struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = bll_alloc(b->bll_foff, b->bll_len, c, h); ++ if (n) { ++ n->bll_es = b->bll_es; ++ n->bll_soff = b->bll_soff; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ } ++ return n; ++} ++ ++static inline boolean_t ++layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s) ++{ ++ /* ---- Normal case ---- */ ++ *s = b->bll_es; ++ ++ switch (b->bll_es) { ++ case PNFS_BLOCK_READWRITE_DATA: ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_READ_DATA; ++ /* ---- Any use is permitted. ---- */ ++ break; ++ case PNFS_BLOCK_READ_DATA: ++ /* ---- Committed as read only data. 
---- */ ++ if (iomode == IOMODE_RW) ++ return False; ++ break; ++ case PNFS_BLOCK_INVALID_DATA: ++ /* ---- Blocks have been allocated, but not initialized ---- */ ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_NONE_DATA; ++ break; ++ case PNFS_BLOCK_NONE_DATA: ++ /* ---- Hole-y file. No backing store avail. ---- */ ++ if (iomode != IOMODE_READ) ++ return False; ++ break; ++ default: ++ BUG(); ++ } ++ return True; ++} ++ ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-23 12:08:29.037481540 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-23 12:09:03.300491952 -0400 +@@ -104,7 +104,8 @@ again: + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; +- if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) ++ if (memcmp(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)) != 0) + continue; + get_nfs_open_context(ctx); + spin_unlock(&inode->i_lock); +@@ -133,8 +134,8 @@ void nfs_inode_reclaim_delegation(struct + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; +@@ -187,8 +188,9 @@ static struct nfs_delegation *nfs_detach + if (delegation == NULL) + goto nomatch; + spin_lock(&delegation->lock); +- if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (stateid != NULL && memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data)) != 0) + goto nomatch_unlock; + list_del_rcu(&delegation->super_list); + 
delegation->inode = NULL; +@@ -216,8 +218,8 @@ int nfs_inode_set_delegation(struct inod + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + delegation->change_attr = nfsi->change_attr; +@@ -471,9 +473,7 @@ void nfs_expire_unreferenced_delegations + /* + * Asynchronous delegation recall! + */ +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)) ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) + { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; +@@ -481,7 +481,7 @@ int nfs_async_inode_return_delegation(st + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + +- if (!validate_stateid(delegation, stateid)) { ++ if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { + rcu_read_unlock(); + return -ENOENT; + } +@@ -562,7 +562,8 @@ int nfs4_copy_delegation_stateid(nfs4_st + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) { +- memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); ++ memcpy(dst->u.data, delegation->stateid.u.data, ++ sizeof(dst->u.data)); + ret = 1; + } + rcu_read_unlock(); +diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h +--- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-23 12:09:03.301431797 -0400 +@@ -34,9 +34,7 @@ enum { + int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct 
nfs_openres *res); + void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); + int nfs_inode_return_delegation(struct inode *inode); +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)); ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); + void nfs_inode_return_delegation_noreclaim(struct inode *inode); + + struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); +diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-23 12:08:29.089481525 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 12:09:03.302511603 -0400 +@@ -17,11 +17,19 @@ + #include + #include + ++#include ++#if defined(CONFIG_SPNFS) ++#include ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif ++#endif + #include + #include + + #include "nfsd.h" + #include "nfsfh.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_EXPORT + +@@ -352,6 +360,40 @@ static int svc_export_upcall(struct cach + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); + } + ++#if defined(CONFIG_PNFSD) ++static struct pnfsd_cb_operations pnfsd_cb_op = { ++ .cb_layout_recall = nfsd_layout_recall_cb, ++ .cb_device_notify = nfsd_device_notify_cb, ++ ++ .cb_get_state = nfs4_pnfs_cb_get_state, ++ .cb_change_state = nfs4_pnfs_cb_change_state, ++}; ++ ++#if defined(CONFIG_SPNFS) ++static struct pnfs_export_operations spnfs_export_ops = { ++ .layout_type = spnfs_layout_type, ++ .get_device_info = spnfs_getdeviceinfo, ++ .get_device_iter = spnfs_getdeviceiter, ++ .layout_get = spnfs_layoutget, ++ .layout_return = spnfs_layoutreturn, ++}; ++ ++static struct pnfs_export_operations spnfs_ds_export_ops = { ++ .get_state = spnfs_get_state, ++}; ++ ++#if 
defined(CONFIG_SPNFS_BLOCK) ++static struct pnfs_export_operations bl_export_ops = { ++ .layout_type = bl_layout_type, ++ .get_device_info = bl_getdeviceinfo, ++ .get_device_iter = bl_getdeviceiter, ++ .layout_get = bl_layoutget, ++ .layout_return = bl_layoutreturn, ++}; ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_SPNFS */ ++#endif /* CONFIG_PNFSD */ ++ + static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); + static struct svc_export *svc_export_lookup(struct svc_export *); +@@ -395,6 +437,47 @@ static int check_export(struct inode *in + return -EINVAL; + } + ++#if !defined(CONFIG_SPNFS) ++ if (inode->i_sb->s_pnfs_op && ++ (!inode->i_sb->s_pnfs_op->layout_type || ++ !inode->i_sb->s_pnfs_op->get_device_info || ++ !inode->i_sb->s_pnfs_op->layout_get)) { ++ dprintk("exp_export: export of invalid fs pnfs export ops.\n"); ++ return -EINVAL; ++ } ++#endif /* CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ if (!inode->i_sb->s_pnfs_op) ++ pnfsd_lexp_init(inode); ++ return 0; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, *flags)) { ++ dprintk("set pnfs block export structure... \n"); ++ inode->i_sb->s_pnfs_op = &bl_export_ops; ++ } else ++#endif /* CONFIG_SPNFS_BLOCK */ ++ /* ++ * spnfs_enabled() indicates we're an MDS. ++ * XXX Better to check an export time option as well. ++ */ ++ if (spnfs_enabled()) { ++ dprintk("set spnfs export structure...\n"); ++ inode->i_sb->s_pnfs_op = &spnfs_export_ops; ++ } else { ++ dprintk("%s spnfs not in use\n", __func__); ++ ++ /* ++ * get_state is needed if we're a DS using spnfs. ++ * XXX Better to check an export time option instead. 
++ */ ++ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops; ++ } ++#endif /* CONFIG_SPNFS */ ++ + return 0; + + } +@@ -586,6 +669,8 @@ static int svc_export_parse(struct cache + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } ++ } else if (strcmp(buf, "pnfs") == 0) { ++ exp.ex_pnfs = 1; + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else +@@ -660,6 +745,8 @@ static int svc_export_show(struct seq_fi + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } ++ if (exp->ex_pnfs) ++ seq_puts(m, ",pnfs"); + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); +@@ -687,6 +774,7 @@ static void svc_export_init(struct cache + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; ++ new->ex_pnfs = 0; + } + + static void export_update(struct cache_head *cnew, struct cache_head *citem) +@@ -699,6 +787,7 @@ static void export_update(struct cache_h + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; ++ new->ex_pnfs = item->ex_pnfs; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_pathname = item->ex_pathname; +@@ -1635,8 +1724,17 @@ nfsd_export_init(void) + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); +- if (rv) ++ if (rv) { + cache_unregister(&svc_export_cache); ++ goto out; ++ } ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = THIS_MODULE; ++ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ ++out: + return rv; + + } +@@ -1664,6 +1762,12 @@ nfsd_export_shutdown(void) + + exp_writelock(); + ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = NULL; ++ pnfsd_cb_ctl.cb_op = NULL; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); + svcauth_unix_purge(); +diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig 
linux-2.6.34.noarch/fs/nfs/direct.c +--- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-23 12:09:03.303491500 -0400 +@@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea + .rpc_release = nfs_direct_read_release, + }; + ++static long nfs_direct_read_execute(struct nfs_read_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ nfs_fattr_init(&data->fattr); ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ NFS_PROTO(inode)->read_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct read call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. 
If nfs_readdata_alloc() or get_user_pages() fails, +@@ -283,7 +315,6 @@ static ssize_t nfs_direct_read_schedule_ + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -343,26 +374,9 @@ static ssize_t nfs_direct_read_schedule_ + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; +- nfs_fattr_init(&data->fattr); +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct read call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_read_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +@@ -448,12 +462,15 @@ static void nfs_direct_free_writedata(st + } + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg); ++ + static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) + { + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; +@@ -487,25 +504,7 @@ static void nfs_direct_write_reschedule( + * Reuse data->task; data->args should not have changed + * since the original request was sent. 
+ */ +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- /* +- * We're called via an RPC callback, so BKL is already held. +- */ +- task = rpc_run_task(&task_setup_data); +- if (!IS_ERR(task)) +- rpc_put_task(task); +- +- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- data->args.count, +- (unsigned long long)data->args.offset); ++ nfs_direct_write_execute(data, &task_setup_data, &msg); + } + + if (put_dreq(dreq)) +@@ -548,10 +547,31 @@ static const struct rpc_call_ops nfs_com + .rpc_release = nfs_direct_commit_release, + }; + ++static long nfs_direct_commit_execute(struct nfs_direct_req *dreq, ++ struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct rpc_task *task; ++ ++ NFS_PROTO(data->inode)->commit_setup(data, msg); ++ ++ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ ++ dreq->commit_data = NULL; ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ return 0; ++} ++ + static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) + { + struct nfs_write_data *data = dreq->commit_data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +@@ -579,16 +599,7 @@ static void nfs_direct_commit_schedule(s + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- NFS_PROTO(data->inode)->commit_setup(data, &msg); +- +- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ +- dreq->commit_data = NULL; +- +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +- +- task = rpc_run_task(&task_setup_data); +- if 
(!IS_ERR(task)) +- rpc_put_task(task); ++ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg); + } + + static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +@@ -690,6 +701,36 @@ static const struct rpc_call_ops nfs_wri + .rpc_release = nfs_direct_write_release, + }; + ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ NFS_PROTO(inode)->write_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. 
If nfs_writedata_alloc() or get_user_pages() fails, +@@ -705,7 +746,6 @@ static ssize_t nfs_direct_write_schedule + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -771,24 +811,8 @@ static ssize_t nfs_direct_write_schedule + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct write call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_write_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig +--- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-23 12:09:03.304505472 -0400 +@@ -79,3 +79,52 @@ config NFSD_V4 + available from http://linux-nfs.org/. + + If unsure, say N. ++ ++config PNFSD ++ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)" ++ depends on NFSD_V4 && EXPERIMENTAL ++ select EXPORTFS_FILE_LAYOUT ++ help ++ This option enables support for the parallel NFS features of the ++ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) ++ in the kernel's NFS server. ++ ++ Unless you're an NFS developer, say N. 
++ ++config PNFSD_LOCAL_EXPORT ++ bool "Enable pNFS support for exporting local filesystems for debugging purposes" ++ depends on PNFSD ++ help ++ Say Y here if you want your pNFS server to export local file systems ++ over the files layout type. With this option the MDS (metadata ++ server) functions also as a single DS (data server). This is mostly ++ useful for development and debugging purposes. ++ ++ If unsure, say N. ++ ++config SPNFS ++ bool "Provide spNFS server support (EXPERIMENTAL)" ++ depends on PNFSD ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS server support. ++ ++ If unsure, say N. ++ ++config SPNFS_LAYOUTSEGMENTS ++ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)" ++ depends on SPNFS ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS to be able to return layout segments. ++ ++ If unsure, say N. ++ ++config SPNFS_BLOCK ++ bool "Provide Block Layout server support (EXPERIMENTAL)" ++ depends on SPNFS ++ select EXPORTFS_BLOCK_LAYOUT ++ help ++ Say Y here if you want spNFS block layout support ++ ++ If unsure, say N. 
+diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile +--- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-23 12:09:03.304505472 -0400 +@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs + nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o + nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o ++nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o ++nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o ++nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o ++nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-23 12:08:29.090501507 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 12:09:03.306491345 -0400 +@@ -40,7 +40,6 @@ + + #define NFSPROC4_CB_NULL 0 + #define NFSPROC4_CB_COMPOUND 1 +-#define NFS4_STATEID_SIZE 16 + + /* Index of predefined Linux callback client operations */ + +@@ -48,11 +47,17 @@ enum { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, ++#if defined(CONFIG_PNFSD) ++ NFSPROC4_CLNT_CB_LAYOUT, ++ NFSPROC4_CLNT_CB_DEVICE, ++#endif + }; + + enum nfs_cb_opnum4 { + OP_CB_RECALL = 4, ++ OP_CB_LAYOUT = 5, + OP_CB_SEQUENCE = 11, ++ OP_CB_DEVICE = 14, + }; + + #define NFS4_MAXTAGLEN 20 +@@ -78,6 +83,19 @@ enum nfs_cb_opnum4 { + #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) ++#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 3 + \ ++ enc_nfs4_fh_sz + 4) ++#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) ++#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 6) ++#define NFS4_dec_cb_device_sz 
(cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) + + /* + * Generic encode routines from fs/nfs/nfs4xdr.c +@@ -94,6 +112,10 @@ xdr_writemem(__be32 *p, const void *ptr, + } + + #define WRITE32(n) *p++ = htonl(n) ++#define WRITE64(n) do { \ ++ *p++ = htonl((u32)((n) >> 32)); \ ++ *p++ = htonl((u32)(n)); \ ++} while (0) + #define WRITEMEM(ptr,nbytes) do { \ + p = xdr_writemem(p, ptr, nbytes); \ + } while (0) +@@ -204,6 +226,16 @@ nfs_cb_stat_to_errno(int stat) + */ + + static void ++encode_stateid(struct xdr_stream *xdr, stateid_t *sid) ++{ ++ __be32 *p; ++ ++ RESERVE_SPACE(sizeof(stateid_t)); ++ WRITE32(sid->si_generation); ++ WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); ++} ++ ++static void + encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) + { + __be32 * p; +@@ -228,10 +260,10 @@ encode_cb_recall(struct xdr_stream *xdr, + __be32 *p; + int len = dp->dl_fh.fh_size; + +- RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); ++ RESERVE_SPACE(4); + WRITE32(OP_CB_RECALL); +- WRITE32(dp->dl_stateid.si_generation); +- WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); ++ encode_stateid(xdr, &dp->dl_stateid); ++ RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); + WRITE32(0); /* truncate optimization not implemented */ + WRITE32(len); + WRITEMEM(&dp->dl_fh.fh_base, len); +@@ -259,6 +291,111 @@ encode_cb_sequence(struct xdr_stream *xd + hdr->nops++; + } + ++#if defined(CONFIG_PNFSD) ++ ++#include "pnfsd.h" ++ ++static void ++encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(20); ++ WRITE32(OP_CB_LAYOUT); ++ WRITE32(clr->cb.cbl_seg.layout_type); ++ WRITE32(clr->cb.cbl_seg.iomode); ++ WRITE32(clr->cb.cbl_layoutchanged); ++ WRITE32(clr->cb.cbl_recall_type); ++ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) { ++ struct nfs4_fsid fsid = clr->cb.cbl_fsid; ++ ++ RESERVE_SPACE(16); ++ 
WRITE64(fsid.major); ++ WRITE64(fsid.minor); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "fsid 0x%llx-0x%llx\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, fsid.major, fsid.minor); ++ } else if (clr->cb.cbl_recall_type == RETURN_FILE) { ++ int len = clr->clr_file->fi_fhlen; ++ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid; ++ ++ RESERVE_SPACE(20 + len); ++ WRITE32(len); ++ WRITEMEM(clr->clr_file->fi_fhval, len); ++ WRITE64(clr->cb.cbl_seg.offset); ++ WRITE64(clr->cb.cbl_seg.length); ++ encode_stateid(xdr, cbl_sid); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "offset %lld length %lld stateid " STATEID_FMT "\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, ++ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length, ++ STATEID_VAL(cbl_sid)); ++ } else { ++ dprintk("%s: type %x iomode %d changed %d recall_type %d\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type); ++ } ++ hdr->nops++; ++} ++ ++static void ++encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ int i; ++ int len = nd->nd_list->cbd_len; ++ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list; ++ ++ dprintk("NFSD %s: --> num %d\n", __func__, len); ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(8); ++ WRITE32(OP_CB_DEVICE); ++ ++ /* notify4 cnda_changes<>; */ ++ WRITE32(len); ++ for (i = 0; i < len; i++) { ++ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n", ++ __func__, cbd[i].cbd_notify_type, ++ cbd[i].cbd_layout_type, ++ cbd[i].cbd_devid.sbid, ++ cbd[i].cbd_devid.devid, ++ cbd[i].cbd_immediate, i); ++ ++ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE); ++ 
RESERVE_SPACE(32); ++ /* bitmap4 notify_mask; */ ++ WRITE32(1); ++ WRITE32(cbd[i].cbd_notify_type); ++ /* opaque notify_vals<>; */ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) ++ WRITE32(24); ++ else ++ WRITE32(20); ++ WRITE32(cbd[i].cbd_layout_type); ++ WRITE64(cbd[i].cbd_devid.sbid); ++ WRITE64(cbd[i].cbd_devid.devid); ++ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { ++ RESERVE_SPACE(4); ++ WRITE32(cbd[i].cbd_immediate); ++ } ++ } ++ hdr->nops++; ++} ++#endif /* CONFIG_PNFSD */ ++ + static int + nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) + { +@@ -288,6 +425,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst * + return 0; + } + ++#if defined(CONFIG_PNFSD) ++static int ++nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_layoutrecall *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_layout(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++ ++static int ++nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_notify_device *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_device(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++#endif /* CONFIG_PNFSD */ + + static int + decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ +@@ -403,6 +579,48 @@ out: + return status; + } + ++#if defined(CONFIG_PNFSD) ++static int 
++nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT); ++out: ++ return status; ++} ++ ++static int ++nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE); ++out: ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * RPC procedure tables + */ +@@ -420,6 +638,10 @@ out: + static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), + PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), ++#if defined(CONFIG_PNFSD) ++ PROC(CB_LAYOUT, COMPOUND, enc_cb_layout, dec_cb_layout), ++ PROC(CB_DEVICE, COMPOUND, enc_cb_device, dec_cb_device), ++#endif + }; + + static struct rpc_version nfs_cb_version4 = { +@@ -606,10 +828,9 @@ out: + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. 
+ */ +-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_prepare_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) + { +- struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; + struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; + u32 minorversion = clp->cl_cb_conn.cb_minorversion; + int status = 0; +@@ -629,11 +850,15 @@ static void nfsd4_cb_prepare(struct rpc_ + rpc_call_start(task); + } + +-static void nfsd4_cb_done(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata) + { + struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; ++ nfsd4_cb_prepare_sequence(task, dp->dl_client); ++} + ++static void nfsd4_cb_done_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) ++{ + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_cb_conn.cb_minorversion); + +@@ -657,7 +882,7 @@ static void nfsd4_cb_recall_done(struct + struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + +- nfsd4_cb_done(task, calldata); ++ nfsd4_cb_done_sequence(task, clp); + + if (current_rpc_client == NULL) { + /* We're shutting down; give up. 
*/ +@@ -688,7 +913,7 @@ static void nfsd4_cb_recall_done(struct + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); + task->tk_status = 0; +- rpc_restart_call(task); ++ rpc_restart_call_prepare(task); + return; + } else { + atomic_set(&clp->cl_cb_set, 0); +@@ -704,7 +929,7 @@ static void nfsd4_cb_recall_release(void + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +- .rpc_call_prepare = nfsd4_cb_prepare, ++ .rpc_call_prepare = nfsd4_cb_recall_prepare, + .rpc_call_done = nfsd4_cb_recall_done, + .rpc_release = nfsd4_cb_recall_release, + }; +@@ -781,3 +1006,173 @@ void nfsd4_cb_recall(struct nfs4_delegat + { + queue_work(callback_wq, &dp->dl_recall.cb_work); + } ++ ++#if defined(CONFIG_PNFSD) ++static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ nfsd4_cb_prepare_sequence(task, clr->clr_client); ++} ++ ++static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ struct nfs4_client *clp = clr->clr_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ if (!task->tk_status) ++ return; ++ ++ printk("%s: clp %p cb_client %p fp %p failed with status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ clr->clr_file, ++ task->tk_status); ++ ++ switch (task->tk_status) { ++ case -EIO: ++ /* Network partition? 
*/ ++ atomic_set(&clp->cl_cb_set, 0); ++ warn_no_callback_path(clp, task->tk_status); ++ /* FIXME: ++ * The pNFS standard states that we need to only expire ++ * the client after at least "lease time", e.g. lease-time * 2, ++ * when failing to communicate a recall ++ */ ++ break; ++ case -NFS4ERR_DELAY: ++ /* Poll the client until it's done with the layout */ ++ rpc_delay(task, HZ/100); /* 10 milliseconds */ ++ task->tk_status = 0; ++ rpc_restart_call_prepare(task); ++ break; ++ case -NFS4ERR_NOMATCHING_LAYOUT: ++ task->tk_status = 0; ++ nomatching_layout(clr); ++ } ++} ++ ++static void nfsd4_cb_layout_release(void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ kfree(clr->clr_args); ++ clr->clr_args = NULL; ++ put_layoutrecall(clr); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_layout_ops = { ++ .rpc_call_prepare = nfsd4_cb_layout_prepare, ++ .rpc_call_done = nfsd4_cb_layout_done, ++ .rpc_release = nfsd4_cb_layout_release, ++}; ++ ++/* ++ * Called with state lock; starts the async CB_LAYOUT callback for clr. ++ */ ++int ++nfsd4_cb_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfs4_client *clp = clr->clr_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT], ++ .rpc_cred = callback_cred ++ }; ++ int status; ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ clr->clr_args = args; ++ args->args_op = clr; ++ msg.rpc_argp = args; ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_layout_ops, clr); ++out: ++ if (status) { ++ kfree(args); ++ put_layoutrecall(clr); ++ } ++ dprintk("NFSD: nfsd4_cb_layout: status %d\n", status); ++ return status; ++} ++ ++static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ nfsd4_cb_prepare_sequence(task, cbnd->nd_client); ++} ++ ++static void nfsd4_cb_device_done(struct rpc_task *task, void 
*calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ struct nfs4_client *clp = cbnd->nd_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ dprintk("%s: clp %p cb_client %p: status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ task->tk_status); ++ ++ if (task->tk_status == -EIO) { ++ /* Network partition? */ ++ atomic_set(&clp->cl_cb_set, 0); ++ warn_no_callback_path(clp, task->tk_status); ++ } ++} ++ ++static void nfsd4_cb_device_release(void *calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ kfree(cbnd->nd_args); ++ cbnd->nd_args = NULL; ++ kfree(cbnd); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_device_ops = { ++ .rpc_call_prepare = nfsd4_cb_device_prepare, ++ .rpc_call_done = nfsd4_cb_device_done, ++ .rpc_release = nfsd4_cb_device_release, ++}; ++ ++/* ++ * Called with state lock. ++ */ ++int ++nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd) ++{ ++ struct nfs4_client *clp = cbnd->nd_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE], ++ .rpc_cred = callback_cred ++ }; ++ int status = -EIO; ++ ++ dprintk("%s: clp %p\n", __func__, clp); ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ args->args_op = cbnd; ++ msg.rpc_argp = args; ++ ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_device_ops, cbnd); ++out: ++ if (status) ++ kfree(args); ++ dprintk("%s: status %d\n", __func__, status); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-23 12:09:03.307491492 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-23 12:09:03.308491262 -0400 +@@ -0,0 +1,1679 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network 
Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ *****************************************************************************/ ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Globals */ ++static u32 current_layoutid = 1; ++ ++/* ++ * Currently used for manipulating the layout state. 
++ */ ++static DEFINE_SPINLOCK(layout_lock); ++ ++#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP) ++# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock)) ++#else ++# define BUG_ON_UNLOCKED_LAYOUT() ++#endif ++ ++/* ++ * Layout state - NFSv4.1 pNFS ++ */ ++static struct kmem_cache *pnfs_layout_slab; ++static struct kmem_cache *pnfs_layoutrecall_slab; ++ ++/* hash table for nfsd4_pnfs_deviceid.sbid */ ++#define SBID_HASH_BITS 8 ++#define SBID_HASH_SIZE (1 << SBID_HASH_BITS) ++#define SBID_HASH_MASK (SBID_HASH_SIZE - 1) ++ ++struct sbid_tracker { ++ u64 id; ++ struct super_block *sb; ++ struct list_head hash; ++}; ++ ++static u64 current_sbid; ++static struct list_head sbid_hashtbl[SBID_HASH_SIZE]; ++ ++static inline unsigned long ++sbid_hashval(struct super_block *sb) ++{ ++ return hash_ptr(sb, SBID_HASH_BITS); ++} ++ ++static inline struct sbid_tracker * ++alloc_sbid(void) ++{ ++ return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL); ++} ++ ++static void ++destroy_sbid(struct sbid_tracker *sbid) ++{ ++ spin_lock(&layout_lock); ++ list_del(&sbid->hash); ++ spin_unlock(&layout_lock); ++ kfree(sbid); ++} ++ ++void ++nfsd4_free_pnfs_slabs(void) ++{ ++ int i; ++ struct sbid_tracker *sbid; ++ ++ nfsd4_free_slab(&pnfs_layout_slab); ++ nfsd4_free_slab(&pnfs_layoutrecall_slab); ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ while (!list_empty(&sbid_hashtbl[i])) { ++ sbid = list_first_entry(&sbid_hashtbl[i], ++ struct sbid_tracker, ++ hash); ++ destroy_sbid(sbid); ++ } ++ } ++} ++ ++int ++nfsd4_init_pnfs_slabs(void) ++{ ++ int i; ++ ++ pnfs_layout_slab = kmem_cache_create("pnfs_layouts", ++ sizeof(struct nfs4_layout), 0, 0, NULL); ++ if (pnfs_layout_slab == NULL) ++ return -ENOMEM; ++ pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls", ++ sizeof(struct nfs4_layoutrecall), 0, 0, NULL); ++ if (pnfs_layoutrecall_slab == NULL) ++ return -ENOMEM; ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ INIT_LIST_HEAD(&sbid_hashtbl[i]); ++ } ++ ++ 
return 0; ++} ++ ++/* XXX: Need to implement the notify types and track which ++ * clients have which devices. */ ++void pnfs_set_device_notify(clientid_t *clid, unsigned int types) ++{ ++ struct nfs4_client *clp; ++ dprintk("%s: -->\n", __func__); ++ ++ nfs4_lock_state(); ++ /* Indicate that client has a device so we can only notify ++ * the correct clients */ ++ clp = find_confirmed_client(clid); ++ if (clp) { ++ atomic_inc(&clp->cl_deviceref); ++ dprintk("%s: Incr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++ } ++ nfs4_unlock_state(); ++} ++ ++/* Clear notifications for this client ++ * XXX: Do we need to loop through a clean up all ++ * krefs when nfsd cleans up the client? */ ++void pnfs_clear_device_notify(struct nfs4_client *clp) ++{ ++ atomic_dec(&clp->cl_deviceref); ++ dprintk("%s: Decr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++} ++ ++static struct nfs4_layout_state * ++alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid) ++{ ++ struct nfs4_layout_state *new; ++ ++ /* FIXME: use a kmem_cache */ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return new; ++ get_nfs4_file(fp); ++ INIT_LIST_HEAD(&new->ls_perfile); ++ INIT_LIST_HEAD(&new->ls_layouts); ++ kref_init(&new->ls_ref); ++ new->ls_client = clp; ++ new->ls_file = fp; ++ new->ls_stateid.si_boot = stateid->si_boot; ++ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */ ++ new->ls_stateid.si_generation = 1; ++ spin_lock(&layout_lock); ++ new->ls_stateid.si_fileid = current_layoutid++; ++ list_add(&new->ls_perfile, &fp->fi_layout_states); ++ spin_unlock(&layout_lock); ++ return new; ++} ++ ++static inline void ++get_layout_state(struct nfs4_layout_state *ls) ++{ ++ kref_get(&ls->ls_ref); ++} ++ ++static void ++destroy_layout_state_common(struct nfs4_layout_state *ls) ++{ ++ struct nfs4_file *fp = ls->ls_file; ++ ++ dprintk("pNFS %s: ls %p fp %p clp 
%p\n", __func__, ls, fp, ++ ls->ls_client); ++ BUG_ON(!list_empty(&ls->ls_layouts)); ++ kfree(ls); ++ put_nfs4_file(fp); ++} ++ ++static void ++destroy_layout_state(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ spin_lock(&layout_lock); ++ list_del(&ls->ls_perfile); ++ spin_unlock(&layout_lock); ++ destroy_layout_state_common(ls); ++} ++ ++static void ++destroy_layout_state_locked(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ list_del(&ls->ls_perfile); ++ destroy_layout_state_common(ls); ++} ++ ++static inline void ++put_layout_state(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state); ++} ++ ++static inline void ++put_layout_state_locked(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state_locked); ++} ++ ++/* ++ * Search the fp->fi_layout_state list for a layout state with the clientid. ++ * If not found, then this is a 'first open/delegation/lock stateid' from ++ * the client for this file. ++ * Called under the layout_lock. 
*/ ++static struct nfs4_layout_state * ++find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp) ++{ ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) { ++ if (ls->ls_client == clp) { ++ dprintk("pNFS %s: before GET ls %p ls_ref %d\n", ++ __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ get_layout_state(ls); ++ return ls; ++ } ++ } ++ return NULL; ++} ++ ++static __be32 ++verify_stateid(struct nfs4_file *fp, stateid_t *stateid) ++{ ++ struct nfs4_stateid *local = NULL; ++ struct nfs4_delegation *temp = NULL; ++ ++ /* check if open or lock stateid, else try a delegation stateid */ ++ local = find_stateid(stateid, RD_STATE); ++ if (local) ++ return 0; ++ temp = find_delegation_stateid(fp->fi_inode, stateid); ++ if (temp) ++ return 0; ++ return nfserr_bad_stateid; ++} ++ ++/* ++ * nfs4_process_layout_stateid() ++ * ++ * We have looked up the nfs4_file corresponding to the current_fh, and ++ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op() ++ * that make sense with a layout stateid. ++ * ++ * Called with the state_lock held ++ * Returns zero and stateid is updated, or error. ++ * ++ * Note: the struct nfs4_layout_state pointer is only set by layoutget. ++ */ ++static __be32 ++nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid, struct nfs4_layout_state **lsp) ++{ ++ struct nfs4_layout_state *ls = NULL; ++ __be32 status = 0; ++ ++ dprintk("--> %s clp %p fp %p \n", __func__, clp, fp); ++ ++ dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ goto out; ++ ++ /* Is this the first use of this layout ? */ ++ spin_lock(&layout_lock); ++ ls = find_get_layout_state(clp, fp); ++ spin_unlock(&layout_lock); ++ if (!ls) { ++ /* Only alloc layout state on layoutget (which sets lsp). 
*/ ++ if (!lsp) { ++ dprintk("%s ERROR: Not layoutget & no layout stateid\n", ++ __func__); ++ status = nfserr_bad_stateid; ++ goto out; ++ } ++ dprintk("%s Initial stateid for layout: file %p client %p\n", ++ __func__, fp, clp); ++ ++ /* verify input stateid */ ++ status = verify_stateid(fp, stateid); ++ if (status) { ++ dprintk("%s ERROR: invalid open/deleg/lock stateid\n", ++ __func__); ++ goto out; ++ } ++ ls = alloc_init_layout_state(clp, fp, stateid); ++ if (!ls) { ++ dprintk("%s pNFS ERROR: no memory for layout state\n", ++ __func__); ++ status = nfserr_resource; ++ goto out; ++ } ++ } else { ++ dprintk("%s Not initial stateid. Layout state %p file %p\n", ++ __func__, ls, fp); ++ ++ /* BAD STATEID */ ++ status = nfserr_bad_stateid; ++ if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque, ++ sizeof(stateid_opaque_t)) != 0) { ++ ++ /* if a LAYOUTGET operation and stateid is a valid ++ * open/deleg/lock stateid, accept it as a parallel ++ * initial layout stateid ++ */ ++ if (lsp && ((verify_stateid(fp, stateid)) == 0)) { ++ dprintk("%s parallel initial layout state\n", ++ __func__); ++ goto update; ++ } ++ ++ dprintk("%s ERROR bad opaque in stateid 1\n", __func__); ++ goto out_put; ++ } ++ ++ /* stateid is a valid layout stateid for this file. 
*/ ++ if (stateid->si_generation > ls->ls_stateid.si_generation) { ++ dprintk("%s bad stateid 1\n", __func__); ++ goto out_put; ++ } ++update: ++ update_stateid(&ls->ls_stateid); ++ dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", ++ __func__, ls->ls_stateid.si_generation, ls); ++ } ++ status = 0; ++ /* Set the stateid to be encoded */ ++ memcpy(stateid, &ls->ls_stateid, sizeof(stateid_t)); ++ ++ /* Return the layout state if requested */ ++ if (lsp) { ++ get_layout_state(ls); ++ *lsp = ls; ++ } ++ dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(&ls->ls_stateid)); ++out_put: ++ dprintk("%s PUT LO STATE:\n", __func__); ++ put_layout_state(ls); ++out: ++ dprintk("<-- %s status %d\n", __func__, htonl(status)); ++ ++ return status; ++} ++ ++static inline struct nfs4_layout * ++alloc_layout(void) ++{ ++ return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL); ++} ++ ++static inline void ++free_layout(struct nfs4_layout *lp) ++{ ++ kmem_cache_free(pnfs_layout_slab, lp); ++} ++ ++static void ++init_layout(struct nfs4_layout_state *ls, ++ struct nfs4_layout *lp, ++ struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__, ++ ls, lp, clp, fp, fp->fi_inode); ++ ++ get_nfs4_file(fp); ++ lp->lo_client = clp; ++ lp->lo_file = fp; ++ get_layout_state(ls); ++ lp->lo_state = ls; ++ memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg)); ++ spin_lock(&layout_lock); ++ list_add_tail(&lp->lo_perstate, &ls->ls_layouts); ++ list_add_tail(&lp->lo_perclnt, &clp->cl_layouts); ++ list_add_tail(&lp->lo_perfile, &fp->fi_layouts); ++ spin_unlock(&layout_lock); ++ dprintk("pNFS %s end\n", __func__); ++} ++ ++static void ++dequeue_layout(struct nfs4_layout *lp) ++{ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del(&lp->lo_perclnt); ++ list_del(&lp->lo_perfile); ++ list_del(&lp->lo_perstate); ++} ++ ++static void ++destroy_layout(struct nfs4_layout *lp) ++{ 
++ struct nfs4_client *clp; ++ struct nfs4_file *fp; ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ clp = lp->lo_client; ++ fp = lp->lo_file; ++ ls = lp->lo_state; ++ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n", ++ __func__, lp, clp, fp, fp->fi_inode, ++ list_empty(&ls->ls_layouts)); ++ ++ kmem_cache_free(pnfs_layout_slab, lp); ++ /* release references taken by init_layout */ ++ put_layout_state_locked(ls); ++ put_nfs4_file(fp); ++} ++ ++void fs_layout_return(struct super_block *sb, struct inode *ino, ++ struct nfsd4_pnfs_layoutreturn *lrp, int flags, ++ void *recall_cookie) ++{ ++ int ret; ++ ++ if (unlikely(!sb->s_pnfs_op->layout_return)) ++ return; ++ ++ lrp->lr_flags = flags; ++ lrp->args.lr_cookie = recall_cookie; ++ ++ if (!ino) /* FSID or ALL */ ++ ino = sb->s_root->d_inode; ++ ++ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args); ++ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx " ++ "cookie = %p flags 0x%x status=%d\n", ++ __func__, ino->i_ino, lrp->args.lr_seg.iomode, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, ++ recall_cookie, flags, ret); ++} ++ ++static u64 ++alloc_init_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ struct sbid_tracker *new = alloc_sbid(); ++ unsigned long hash_idx = sbid_hashval(sb); ++ u64 id = 0; ++ ++ if (likely(new)) { ++ spin_lock(&layout_lock); ++ id = ++current_sbid; ++ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK); ++ id = new->id; ++ BUG_ON(id == 0); ++ new->sb = sb; ++ ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) ++ if (sbid->sb == sb) { ++ kfree(new); ++ id = sbid->id; ++ spin_unlock(&layout_lock); ++ return id; ++ } ++ list_add(&new->hash, &sbid_hashtbl[hash_idx]); ++ spin_unlock(&layout_lock); ++ } ++ return id; ++} ++ ++struct super_block * ++find_sbid_id(u64 id) ++{ ++ struct sbid_tracker *sbid; ++ struct super_block *sb = NULL; ++ unsigned long hash_idx = id & SBID_HASH_MASK; ++ int pos = 0; ++ ++ 
spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->id != id) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ sb = sbid->sb; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return sb; ++} ++ ++u64 ++find_create_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ unsigned long hash_idx = sbid_hashval(sb); ++ int pos = 0; ++ u64 id = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->sb != sb) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ id = sbid->id; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ if (!id) ++ id = alloc_init_sbid(sb); ++ ++ return id; ++} ++ ++/* ++ * Create a layoutrecall structure ++ * An optional layoutrecall can be cloned (except for the layoutrecall lists) ++ */ ++static struct nfs4_layoutrecall * ++alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_client *clp, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ dprintk("NFSD %s\n", __func__); ++ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL); ++ if (clr == NULL) ++ return clr; ++ ++ dprintk("NFSD %s -->\n", __func__); ++ ++ memset(clr, 0, sizeof(*clr)); ++ if (lrfile) ++ get_nfs4_file(lrfile); ++ clr->clr_client = clp; ++ clr->clr_file = lrfile; ++ clr->cb = *cbl; ++ ++ kref_init(&clr->clr_ref); ++ INIT_LIST_HEAD(&clr->clr_perclnt); ++ ++ dprintk("NFSD %s return %p\n", __func__, clr); ++ return clr; ++} ++ ++static void ++get_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ kref_get(&clr->clr_ref); ++} ++ ++static void ++destroy_layoutrecall(struct kref *kref) ++{ ++ struct nfs4_layoutrecall *clr = ++ container_of(kref, struct nfs4_layoutrecall, clr_ref); ++ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr, ++ 
clr->clr_file, clr->clr_client); ++ BUG_ON(!list_empty(&clr->clr_perclnt)); ++ if (clr->clr_file) ++ put_nfs4_file(clr->clr_file); ++ kmem_cache_free(pnfs_layoutrecall_slab, clr); ++} ++ ++int ++put_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ return kref_put(&clr->clr_ref, destroy_layoutrecall); ++} ++ ++void * ++layoutrecall_done(struct nfs4_layoutrecall *clr) ++{ ++ void *recall_cookie = clr->cb.cbl_cookie; ++ struct nfs4_layoutrecall *parent = clr->parent; ++ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del_init(&clr->clr_perclnt); ++ put_layoutrecall(clr); ++ ++ if (parent && !put_layoutrecall(parent)) ++ recall_cookie = NULL; ++ ++ return recall_cookie; ++} ++ ++/* ++ * get_state() and cb_get_state() are ++ */ ++void ++release_pnfs_ds_dev_list(struct nfs4_stateid *stp) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ while (!list_empty(&stp->st_pnfs_ds_id)) { ++ ddp = list_entry(stp->st_pnfs_ds_id.next, ++ struct pnfs_ds_dev_entry, dd_dev_entry); ++ list_del(&ddp->dd_dev_entry); ++ kfree(ddp); ++ } ++} ++ ++static int ++nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL); ++ if (!ddp) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&ddp->dd_dev_entry); ++ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id); ++ ddp->dd_dsid = dsid; ++ return 0; ++} ++ ++/* ++ * are two octet ranges overlapping? 
++ * start1 last1 ++ * |-----------------| ++ * start2 last2 ++ * |----------------| ++ */ ++static inline int ++lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 last1 = last_byte_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 last2 = last_byte_offset(start2, l2->length); ++ int ret; ++ ++ /* if last1 == start2 there's a single byte overlap */ ++ ret = (last2 >= start1) && (last1 >= start2); ++ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__, ++ l1->offset, l1->length, l2->offset, l2->length, ret); ++ return ret; ++} ++ ++static inline int ++same_fsid_major(struct nfs4_fsid *fsid, u64 major) ++{ ++ return fsid->major == major; ++} ++ ++static inline int ++same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh) ++{ ++ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid); ++} ++ ++/* ++ * find a layout recall conflicting with the specified layoutget ++ */ ++static int ++is_layout_recalled(struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != seg->layout_type) ++ continue; ++ if (clr->cb.cbl_recall_type == RETURN_ALL) ++ goto found; ++ if (clr->cb.cbl_recall_type == RETURN_FSID) { ++ if (same_fsid(&clr->cb.cbl_fsid, current_fh)) ++ goto found; ++ else ++ continue; ++ } ++ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE); ++ if (clr->cb.cbl_seg.clientid == seg->clientid && ++ lo_seg_overlapping(&clr->cb.cbl_seg, seg)) ++ goto found; ++ } ++ spin_unlock(&layout_lock); ++ return 0; ++found: ++ spin_unlock(&layout_lock); ++ return 1; ++} ++ ++/* ++ * are two octet ranges overlapping or adjacent? 
++ */ ++static inline int ++lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ /* is end1 == start2 ranges are adjacent */ ++ return (end2 >= start1) && (end1 >= start2); ++} ++ ++static void ++extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lg_start = lg->offset; ++ u64 lg_end = end_offset(lg_start, lg->length); ++ ++ /* lo already covers lg? */ ++ if (lo_start <= lg_start && lg_end <= lo_end) ++ return; ++ ++ /* extend start offset */ ++ if (lo_start > lg_start) ++ lo_start = lg_start; ++ ++ /* extend end offset */ ++ if (lo_end < lg_end) ++ lo_end = lg_end; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? ++ lo_end : lo_end - lo_start; ++} ++ ++static struct nfs4_layout * ++merge_layout(struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layout *lp = NULL; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (lp, &fp->fi_layouts, lo_perfile) ++ if (lp->lo_seg.layout_type == seg->layout_type && ++ lp->lo_seg.clientid == seg->clientid && ++ lp->lo_seg.iomode == seg->iomode && ++ lo_seg_mergeable(&lp->lo_seg, seg)) { ++ extend_layout(&lp->lo_seg, seg); ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ return lp; ++} ++ ++__be32 ++nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp, ++ struct exp_xdr_stream *xdr) ++{ ++ u32 status; ++ __be32 nfserr; ++ struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode; ++ struct super_block *sb = ino->i_sb; ++ int can_merge; ++ struct nfs4_file *fp; ++ struct nfs4_client *clp; ++ struct nfs4_layout *lp = NULL; ++ struct nfs4_layout_state *ls = NULL; ++ struct nfsd4_pnfs_layoutget_arg args = { ++ .lg_minlength = lgp->lg_minlength, ++ .lg_fh = 
&lgp->lg_fhp->fh_handle, ++ }; ++ struct nfsd4_pnfs_layoutget_res res = { ++ .lg_seg = lgp->lg_seg, ++ }; ++ ++ dprintk("NFSD: %s Begin\n", __func__); ++ ++ args.lg_sbid = find_create_sbid(sb); ++ if (!args.lg_sbid) { ++ nfserr = nfserr_layouttrylater; ++ goto out; ++ } ++ ++ can_merge = sb->s_pnfs_op->can_merge_layouts != NULL && ++ sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type); ++ ++ nfs4_lock_state(); ++ fp = find_alloc_file(ino, lgp->lg_fhp); ++ clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid); ++ dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp); ++ if (!fp || !clp) { ++ nfserr = nfserr_inval; ++ goto out_unlock; ++ } ++ ++ /* Check decoded layout stateid */ ++ nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls); ++ if (nfserr) ++ goto out_unlock; ++ ++ if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) { ++ nfserr = nfserr_recallconflict; ++ goto out; ++ } ++ ++ /* pre-alloc layout in case we can't merge after we call ++ * the file system ++ */ ++ lp = alloc_layout(); ++ if (!lp) { ++ nfserr = nfserr_layouttrylater; ++ goto out_unlock; ++ } ++ ++ dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd " ++ "iomode %u offset %llu length %llu\n", ++ __func__, lgp->lg_seg.layout_type, ++ exp_xdr_qbytes(xdr->end - xdr->p), ++ lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length); ++ ++ /* FIXME: need to eliminate the use of the state lock */ ++ nfs4_unlock_state(); ++ status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res); ++ nfs4_lock_state(); ++ ++ dprintk("pNFS %s: post-export status %u " ++ "iomode %u offset %llu length %llu\n", ++ __func__, status, res.lg_seg.iomode, ++ res.lg_seg.offset, res.lg_seg.length); ++ ++ /* ++ * The allowable error codes for the layout_get pNFS export ++ * operations vector function (from the file system) can be ++ * expanded as needed to include other errors defined for ++ * the RFC 5561 LAYOUTGET operation. 
++ */ ++ switch (status) { ++ case 0: ++ nfserr = NFS4_OK; ++ break; ++ case NFS4ERR_ACCESS: ++ case NFS4ERR_BADIOMODE: ++ /* No support for LAYOUTIOMODE4_RW layouts */ ++ case NFS4ERR_BADLAYOUT: ++ /* No layout matching loga_minlength rules */ ++ case NFS4ERR_INVAL: ++ case NFS4ERR_IO: ++ case NFS4ERR_LAYOUTTRYLATER: ++ case NFS4ERR_LAYOUTUNAVAILABLE: ++ case NFS4ERR_LOCKED: ++ case NFS4ERR_NOSPC: ++ case NFS4ERR_RECALLCONFLICT: ++ case NFS4ERR_SERVERFAULT: ++ case NFS4ERR_TOOSMALL: ++ /* Requested layout too big for loga_maxcount */ ++ case NFS4ERR_WRONG_TYPE: ++ /* Not a regular file */ ++ nfserr = cpu_to_be32(status); ++ goto out_freelayout; ++ default: ++ BUG(); ++ nfserr = nfserr_serverfault; ++ } ++ ++ lgp->lg_seg = res.lg_seg; ++ lgp->lg_roc = res.lg_return_on_close; ++ ++ /* SUCCESS! ++ * Can the new layout be merged into an existing one? ++ * If so, free unused layout struct ++ */ ++ if (can_merge && merge_layout(fp, clp, &res.lg_seg)) ++ goto out_freelayout; ++ ++ /* Can't merge, so let's initialize this new layout */ ++ init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg); ++out_unlock: ++ if (ls) ++ put_layout_state(ls); ++ if (fp) ++ put_nfs4_file(fp); ++ nfs4_unlock_state(); ++out: ++ dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp, ++ be32_to_cpu(nfserr)); ++ return nfserr; ++out_freelayout: ++ free_layout(lp); ++ goto out_unlock; ++} ++ ++static void ++trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lr_start = lr->offset; ++ u64 lr_end = end_offset(lr_start, lr->length); ++ ++ dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__, ++ lo->offset, lo->length, lr->offset, lr->length); ++ ++ /* lr fully covers lo? */ ++ if (lr_start <= lo_start && lo_end <= lr_end) { ++ lo->length = 0; ++ goto out; ++ } ++ ++ /* ++ * split not supported yet. retain layout segment. 
++ * remains must be returned by the client ++ * on the final layout return. ++ */ ++ if (lo_start < lr_start && lr_end < lo_end) { ++ dprintk("%s: split not supported\n", __func__); ++ goto out; ++ } ++ ++ if (lo_start < lr_start) ++ lo_end = lr_start - 1; ++ else /* lr_end < lo_end */ ++ lo_start = lr_end + 1; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start; ++out: ++ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length); ++} ++ ++static int ++pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ dprintk("%s: clp %p fp %p\n", __func__, clp, fp); ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) { ++ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n", ++ __func__, lp, ++ lp->lo_client, clp, ++ lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type, ++ lp->lo_seg.iomode, lrp->args.lr_seg.iomode); ++ if (lp->lo_client != clp || ++ lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type || ++ (lp->lo_seg.iomode != lrp->args.lr_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) || ++ !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg)) ++ continue; ++ layouts_found++; ++ trim_layout(&lp->lo_seg, &lrp->args.lr_seg); ++ if (!lp->lo_seg.length) { ++ lrp->lrs_present = 0; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++pnfs_return_client_layouts(struct nfs4_client *clp, ++ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) { ++ if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type || ++ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode && ++ lrp->args.lr_seg.iomode != 
IOMODE_ANY)) ++ continue; ++ ++ if (lrp->args.lr_return_type == RETURN_FSID && ++ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid)) ++ continue; ++ ++ layouts_found++; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++recall_return_perfect_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode || ++ clr->cb.cbl_recall_type != lrp->args.lr_return_type) ++ return 0; ++ ++ return (clr->cb.cbl_recall_type == RETURN_FILE && ++ clr->clr_file == fp && ++ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset && ++ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) || ++ ++ (clr->cb.cbl_recall_type == RETURN_FSID && ++ same_fsid(&clr->cb.cbl_fsid, current_fh)) || ++ ++ clr->cb.cbl_recall_type == RETURN_ALL; ++} ++ ++static int ++recall_return_partial_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ /* iomode matching? */ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode && ++ clr->cb.cbl_seg.iomode != IOMODE_ANY && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) ++ return 0; ++ ++ if (clr->cb.cbl_recall_type == RETURN_ALL || ++ lrp->args.lr_return_type == RETURN_ALL) ++ return 1; ++ ++ /* fsid matches? */ ++ if (clr->cb.cbl_recall_type == RETURN_FSID || ++ lrp->args.lr_return_type == RETURN_FSID) ++ return same_fsid(&clr->cb.cbl_fsid, current_fh); ++ ++ /* file matches, range overlapping? 
*/ ++ return clr->clr_file == fp && ++ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg); ++} ++ ++int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status = 0; ++ int layouts_found = 0; ++ struct inode *ino = current_fh->fh_dentry->d_inode; ++ struct nfs4_file *fp = NULL; ++ struct nfs4_client *clp; ++ struct nfs4_layoutrecall *clr, *nextclr; ++ u64 ex_fsid = current_fh->fh_export->ex_fsid; ++ void *recall_cookie = NULL; ++ ++ dprintk("NFSD: %s\n", __func__); ++ ++ nfs4_lock_state(); ++ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid); ++ if (!clp) ++ goto out; ++ ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ fp = find_file(ino); ++ if (!fp) { ++ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for " ++ "ino %p:%lu\n", ++ __func__, ino, ino ? ino->i_ino : 0L); ++ goto out; ++ } ++ ++ /* Check the stateid */ ++ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino); ++ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, ++ NULL); ++ if (status) ++ goto out_put_file; ++ ++ /* update layouts */ ++ layouts_found = pnfs_return_file_layouts(clp, fp, lrp); ++ /* optimize for the all-empty case */ ++ if (list_empty(&fp->fi_layouts)) ++ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ } else { ++ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid); ++ } ++ ++ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d " ++ "return_type %d fsid 0x%llx offset %llu length %llu: " ++ "layouts_found %d\n", ++ __func__, clp, fp, lrp->args.lr_seg.layout_type, ++ lrp->args.lr_seg.iomode, lrp->args.lr_return_type, ++ ex_fsid, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found); ++ ++ /* update layoutrecalls ++ * note: for RETURN_{FSID,ALL}, fp may be NULL ++ */ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls, ++ clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != 
lrp->args.lr_seg.layout_type) ++ continue; ++ ++ if (recall_return_perfect_match(clr, lrp, fp, current_fh)) ++ recall_cookie = layoutrecall_done(clr); ++ else if (layouts_found && ++ recall_return_partial_match(clr, lrp, fp, current_fh)) ++ clr->clr_time = CURRENT_TIME; ++ } ++ spin_unlock(&layout_lock); ++ ++out_put_file: ++ if (fp) ++ put_nfs4_file(fp); ++out: ++ nfs4_unlock_state(); ++ ++ /* call exported filesystem layout_return (ignore return-code) */ ++ fs_layout_return(sb, ino, lrp, 0, recall_cookie); ++ ++ dprintk("pNFS %s: exit status %d \n", __func__, status); ++ return status; ++} ++ ++/* ++ * PNFS Metadata server export operations callback for get_state ++ * ++ * called by the cluster fs when it receives a get_state() from a data ++ * server. ++ * returns status, or pnfs_get_state* with pnfs_get_state->status set. ++ * ++ */ ++int ++nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg) ++{ ++ struct nfs4_stateid *stp; ++ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */ ++ int status = -EINVAL; ++ struct inode *ino; ++ struct nfs4_delegation *dl; ++ stateid_t *stid = (stateid_t *)&arg->stid; ++ ++ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__, ++ STATEID_VAL(stid), arg->ino); ++ ++ nfs4_lock_state(); ++ stp = find_stateid(stid, flags); ++ if (!stp) { ++ ino = iget_locked(sb, arg->ino); ++ if (!ino) ++ goto out; ++ ++ if (ino->i_state & I_NEW) { ++ iget_failed(ino); ++ goto out; ++ } ++ ++ dl = find_delegation_stateid(ino, stid); ++ if (dl) ++ status = 0; ++ ++ iput(ino); ++ } else { ++ /* XXX ANDROS: marc removed nfs4_check_fh - how come? 
*/ ++ ++ /* arg->devid is the Data server id, set by the cluster fs */ ++ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid); ++ if (status) ++ goto out; ++ ++ arg->access = stp->st_access_bmap; ++ *(clientid_t *)&arg->clid = ++ stp->st_stateowner->so_client->cl_clientid; ++ } ++out: ++ nfs4_unlock_state(); ++ return status; ++} ++ ++static int ++cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile, ++ stateid_t *lsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ struct nfs4_layout_state *ls; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) { ++ if (lp->lo_file != lrfile) ++ continue; ++ ++ ls = find_get_layout_state(clp, lrfile); ++ if (!ls) { ++ /* This shouldn't happen as the file should have a ++ * layout stateid if it has a layout. ++ */ ++ printk(KERN_ERR "%s: file %p has no layout stateid\n", ++ __func__, lrfile); ++ WARN_ON(1); ++ break; ++ } ++ update_stateid(&ls->ls_stateid); ++ memcpy(lsid, &ls->ls_stateid, sizeof(stateid_t)); ++ put_layout_state_locked(ls); ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ return found; ++} ++ ++static int ++cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ ++ /* note: minor version unused */ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) ++ if (lp->lo_file->fi_fsid.major == fsid->major) { ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return found; ++} ++ ++static int ++cl_has_any_layout(struct nfs4_client *clp) ++{ ++ return !list_empty(&clp->cl_layouts); ++} ++ ++static int ++cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile, stateid_t *lsid) ++{ ++ switch (cbl->cbl_recall_type) { ++ case RETURN_FILE: ++ return cl_has_file_layout(clp, lrfile, lsid); ++ case RETURN_FSID: ++ return cl_has_fsid_layout(clp, &cbl->cbl_fsid); ++ default: ++ return cl_has_any_layout(clp); ++ } ++} 
++ ++/* ++ * Called without the layout_lock. ++ */ ++void ++nomatching_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfsd4_pnfs_layoutreturn lr = { ++ .args.lr_return_type = clr->cb.cbl_recall_type, ++ .args.lr_seg = clr->cb.cbl_seg, ++ }; ++ struct inode *inode; ++ void *recall_cookie; ++ ++ if (clr->clr_file) { ++ inode = igrab(clr->clr_file->fi_inode); ++ if (WARN_ON(!inode)) ++ return; ++ } else { ++ inode = NULL; ++ } ++ ++ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__, ++ clr->clr_client, clr->clr_file); ++ ++ if (clr->cb.cbl_recall_type == RETURN_FILE) ++ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr); ++ else ++ pnfs_return_client_layouts(clr->clr_client, &lr, ++ clr->cb.cbl_fsid.major); ++ ++ spin_lock(&layout_lock); ++ recall_cookie = layoutrecall_done(clr); ++ spin_unlock(&layout_lock); ++ ++ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN, ++ recall_cookie); ++ iput(inode); ++} ++ ++void pnfs_expire_client(struct nfs4_client *clp) ++{ ++ for (;;) { ++ struct nfs4_layoutrecall *lrp = NULL; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layoutrecalls)) { ++ lrp = list_entry(clp->cl_layoutrecalls.next, ++ struct nfs4_layoutrecall, clr_perclnt); ++ get_layoutrecall(lrp); ++ } ++ spin_unlock(&layout_lock); ++ if (!lrp) ++ break; ++ ++ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file); ++ BUG_ON(lrp->clr_client != clp); ++ nomatching_layout(lrp); ++ put_layoutrecall(lrp); ++ } ++ ++ for (;;) { ++ struct nfs4_layout *lp = NULL; ++ struct inode *inode = NULL; ++ struct nfsd4_pnfs_layoutreturn lr; ++ bool empty = false; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layouts)) { ++ lp = list_entry(clp->cl_layouts.next, ++ struct nfs4_layout, lo_perclnt); ++ inode = igrab(lp->lo_file->fi_inode); ++ memset(&lr, 0, sizeof(lr)); ++ lr.args.lr_return_type = RETURN_FILE; ++ lr.args.lr_seg = lp->lo_seg; ++ empty = list_empty(&lp->lo_file->fi_layouts); ++ BUG_ON(lp->lo_client != clp); ++ 
dequeue_layout(lp); ++ destroy_layout(lp); /* do not access lp after this */ ++ } ++ spin_unlock(&layout_lock); ++ if (!lp) ++ break; ++ ++ if (WARN_ON(!inode)) ++ break; ++ ++ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino, ++ lp, clp); ++ ++ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE, ++ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL); ++ iput(inode); ++ } ++} ++ ++struct create_recall_list_arg { ++ struct nfsd4_pnfs_cb_layout *cbl; ++ struct nfs4_file *lrfile; ++ struct list_head *todolist; ++ unsigned todo_count; ++}; ++ ++/* ++ * look for matching layout for the given client ++ * and add a pending layout recall to the todo list ++ * if found any. ++ * returns: ++ * 0 if layouts found or negative error. ++ */ ++static int ++lo_recall_per_client(struct nfs4_client *clp, void *p) ++{ ++ stateid_t lsid; ++ struct nfs4_layoutrecall *pending; ++ struct create_recall_list_arg *arg = p; ++ ++ memset(&lsid, 0, sizeof(lsid)); ++ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid)) ++ return 0; ++ ++ /* Matching put done by layoutreturn */ ++ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile); ++ /* out of memory, drain todo queue */ ++ if (!pending) ++ return -ENOMEM; ++ ++ *(stateid_t *)&pending->cb.cbl_sid = lsid; ++ list_add(&pending->clr_perclnt, arg->todolist); ++ arg->todo_count++; ++ return 0; ++} ++ ++/* Create a layoutrecall structure for each client based on the ++ * original structure. 
*/ ++int ++create_layout_recall_list(struct list_head *todolist, unsigned *todo_len, ++ struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_client *clp; ++ struct create_recall_list_arg arg = { ++ .cbl = cbl, ++ .lrfile = lrfile, ++ .todolist = todolist, ++ }; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* If client given by fs, just do single client */ ++ if (cbl->cbl_seg.clientid) { ++ clp = find_confirmed_client( ++ (clientid_t *)&cbl->cbl_seg.clientid); ++ if (!clp) { ++ status = -ENOENT; ++ dprintk("%s: clientid %llx not found\n", __func__, ++ (unsigned long long)cbl->cbl_seg.clientid); ++ goto out; ++ } ++ ++ status = lo_recall_per_client(clp, &arg); ++ } else { ++ /* Check all clients for layout matches */ ++ status = filter_confirmed_clients(lo_recall_per_client, &arg); ++ } ++ ++out: ++ *todo_len = arg.todo_count; ++ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status); ++ return status; ++} ++ ++/* ++ * Recall layouts asynchronously ++ * Called with state lock. ++ */ ++static int ++spawn_layout_recall(struct super_block *sb, struct list_head *todolist, ++ unsigned todo_len) ++{ ++ struct nfs4_layoutrecall *pending; ++ struct nfs4_layoutrecall *parent = NULL; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ if (todo_len > 1) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ ++ parent = alloc_init_layoutrecall(&pending->cb, NULL, ++ pending->clr_file); ++ if (unlikely(!parent)) { ++ /* We want forward progress. If parent cannot be ++ * allocated take the first one as parent but don't ++ * execute it. Caller must check for -EAGAIN, if so ++ * When the partial recalls return, ++ * nfsd_layout_recall_cb should be called again. 
++ */ ++ list_del_init(&pending->clr_perclnt); ++ if (todo_len > 2) { ++ parent = pending; ++ } else { ++ parent = NULL; ++ put_layoutrecall(pending); ++ } ++ --todo_len; ++ status = -ENOMEM; ++ } ++ } ++ ++ while (!list_empty(todolist)) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ list_del_init(&pending->clr_perclnt); ++ dprintk("%s: clp %p cb_client %p fp %p\n", __func__, ++ pending->clr_client, ++ pending->clr_client->cl_cb_client, ++ pending->clr_file); ++ if (unlikely(!pending->clr_client->cl_cb_client)) { ++ printk(KERN_INFO ++ "%s: clientid %08x/%08x has no callback path\n", ++ __func__, ++ pending->clr_client->cl_clientid.cl_boot, ++ pending->clr_client->cl_clientid.cl_id); ++ put_layoutrecall(pending); ++ continue; ++ } ++ ++ pending->clr_time = CURRENT_TIME; ++ pending->clr_sb = sb; ++ if (parent) { ++ /* If we created a parent its initial ref count is 1. ++ * We will need to de-ref it eventually. So we just ++ * don't increment on behalf of the last one. 
++ */ ++ if (todo_len != 1) ++ get_layoutrecall(parent); ++ } ++ pending->parent = parent; ++ get_layoutrecall(pending); ++ /* Add to list so corresponding layoutreturn can find req */ ++ list_add(&pending->clr_perclnt, ++ &pending->clr_client->cl_layoutrecalls); ++ ++ nfsd4_cb_layout(pending); ++ --todo_len; ++ } ++ ++ return status; ++} ++ ++/* ++ * Spawn a thread to perform a recall layout ++ * ++ */ ++int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode, ++ struct nfsd4_pnfs_cb_layout *cbl) ++{ ++ int status; ++ struct nfs4_file *lrfile = NULL; ++ struct list_head todolist; ++ unsigned todo_len = 0; ++ ++ dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl); ++ BUG_ON(!cbl); ++ BUG_ON(cbl->cbl_recall_type != RETURN_FILE && ++ cbl->cbl_recall_type != RETURN_FSID && ++ cbl->cbl_recall_type != RETURN_ALL); ++ BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode); ++ BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ && ++ cbl->cbl_seg.iomode != IOMODE_RW && ++ cbl->cbl_seg.iomode != IOMODE_ANY); ++ ++ if (nfsd_serv == NULL) { ++ dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n"); ++ return -ENOENT; ++ } ++ ++ nfs4_lock_state(); ++ status = -ENOENT; ++ if (inode) { ++ lrfile = find_file(inode); ++ if (!lrfile) { ++ dprintk("NFSD nfsd_layout_recall_cb: " ++ "nfs4_file not found\n"); ++ goto err; ++ } ++ if (cbl->cbl_recall_type == RETURN_FSID) ++ cbl->cbl_fsid = lrfile->fi_fsid; ++ } ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ /* If no cookie provided by FS, return a default one */ ++ if (!cbl->cbl_cookie) ++ cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ ++ status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile); ++ if (list_empty(&todolist)) { ++ status = -ENOENT; ++ } else { ++ /* process todolist even if create_layout_recall_list ++ * returned an error */ ++ int status2 = spawn_layout_recall(sb, &todolist, todo_len); ++ if (status2) ++ status = status2; ++ } ++ ++err: ++ nfs4_unlock_state(); ++ if (lrfile) ++ 
put_nfs4_file(lrfile); ++ return (todo_len && status) ? -EAGAIN : status; ++} ++ ++struct create_device_notify_list_arg { ++ struct list_head *todolist; ++ struct nfsd4_pnfs_cb_dev_list *ndl; ++}; ++ ++static int ++create_device_notify_per_cl(struct nfs4_client *clp, void *p) ++{ ++ struct nfs4_notify_device *cbnd; ++ struct create_device_notify_list_arg *arg = p; ++ ++ if (atomic_read(&clp->cl_deviceref) <= 0) ++ return 0; ++ ++ cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL); ++ if (!cbnd) ++ return -ENOMEM; ++ ++ cbnd->nd_list = arg->ndl; ++ cbnd->nd_client = clp; ++ list_add(&cbnd->nd_perclnt, arg->todolist); ++ return 0; ++} ++ ++/* Create a list of clients to send device notifications. */ ++int ++create_device_notify_list(struct list_head *todolist, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ int status; ++ struct create_device_notify_list_arg arg = { ++ .todolist = todolist, ++ .ndl = ndl, ++ }; ++ ++ nfs4_lock_state(); ++ status = filter_confirmed_clients(create_device_notify_per_cl, &arg); ++ nfs4_unlock_state(); ++ ++ return status; ++} ++ ++/* ++ * For each client that a device, send a device notification. ++ * XXX: Need to track which clients have which devices. 
++ */ ++int nfsd_device_notify_cb(struct super_block *sb, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ struct nfs4_notify_device *cbnd; ++ unsigned int notify_num = 0; ++ int status2, status = 0; ++ struct list_head todolist; ++ ++ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list); ++ ++ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len); ++ ++ if (nfsd_serv == NULL) ++ return -ENOENT; ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ status = create_device_notify_list(&todolist, ndl); ++ ++ while (!list_empty(&todolist)) { ++ cbnd = list_entry(todolist.next, struct nfs4_notify_device, ++ nd_perclnt); ++ list_del_init(&cbnd->nd_perclnt); ++ status2 = nfsd4_cb_notify_device(cbnd); ++ pnfs_clear_device_notify(cbnd->nd_client); ++ if (status2) { ++ kfree(cbnd); ++ status = status2; ++ } ++ notify_num++; ++ } ++ ++ dprintk("NFSD %s: status %d clients %u\n", ++ __func__, status, notify_num); ++ return status; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-23 12:09:03.309501439 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-23 12:09:03.309501439 -0400 +@@ -0,0 +1,461 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsfh.h" ++#include "nfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Just use a linked list. Do not expect more than 32 dlm_device_entries ++ * the first implementation will just use one device per cluster file system ++ */ ++ ++static LIST_HEAD(dlm_device_list); ++static DEFINE_SPINLOCK(dlm_device_list_lock); ++ ++struct dlm_device_entry { ++ struct list_head dlm_dev_list; ++ char disk_name[DISK_NAME_LEN]; ++ int num_ds; ++ char ds_list[NFSD_DLM_DS_LIST_MAX]; ++}; ++ ++static struct dlm_device_entry * ++_nfsd4_find_pnfs_dlm_device(char *disk_name) ++{ ++ struct dlm_device_entry *dlm_pdev; ++ ++ dprintk("--> %s disk name %s\n", __func__, disk_name); ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) { ++ dprintk("%s Look for dlm_pdev %s\n", __func__, ++ dlm_pdev->disk_name); ++ if (!memcmp(dlm_pdev->disk_name, disk_name, strlen(disk_name))) { ++ spin_unlock(&dlm_device_list_lock); ++ return dlm_pdev; ++ } ++ } ++ spin_unlock(&dlm_device_list_lock); ++ return NULL; ++} ++ ++static struct dlm_device_entry * ++nfsd4_find_pnfs_dlm_device(struct super_block *sb) { ++ char dname[BDEVNAME_SIZE]; ++ ++ bdevname(sb->s_bdev, dname); ++ return _nfsd4_find_pnfs_dlm_device(dname); ++} ++ ++ssize_t 
++nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen) ++{ ++ char *pos = buf; ++ ssize_t size = 0; ++ struct dlm_device_entry *dlm_pdev; ++ int ret = -EINVAL; ++ ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) ++ { ++ int advanced; ++ advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list); ++ if (advanced >= buflen - size) ++ goto out; ++ size += advanced; ++ pos += advanced; ++ } ++ ret = size; ++ ++out: ++ spin_unlock(&dlm_device_list_lock); ++ return ret; ++} ++ ++bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds) ++{ ++ char *start = ds_list; ++ ++ *num_ds = 0; ++ ++ while (*start) { ++ struct sockaddr_storage tempAddr; ++ int ipLen = strcspn(start, ","); ++ ++ if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr))) ++ return false; ++ (*num_ds)++; ++ start += ipLen + 1; ++ } ++ return true; ++} ++ ++/* ++ * pnfs_dlm_device string format: ++ * block-device-path:, ++ * ++ * Examples ++ * /dev/sda:192.168.1.96,192.168.1.97' creates a data server list with ++ * two data servers for the dlm cluster file system mounted on /dev/sda. ++ * ++ * /dev/sda:192.168.1.96,192.168.1.100' ++ * replaces the data server list for /dev/sda ++ * ++ * Only the deviceid == 1 is supported. Can add device id to ++ * pnfs_dlm_device string when needed. ++ * ++ * Only the round robin each data server once stripe index is supported. ++ */ ++int ++nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len) ++ ++{ ++ struct dlm_device_entry *new, *found; ++ char *bufp = pnfs_dlm_device; ++ char *endp = bufp + strlen(bufp); ++ int err = -ENOMEM; ++ ++ dprintk("--> %s len %d\n", __func__, len); ++ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return err; ++ ++ err = -EINVAL; ++ /* disk_name */ ++ /* FIXME: need to check for valid disk_name. search superblocks? ++ * check for slash dev slash ? 
++ */ ++ len = strcspn(bufp, ":"); ++ if (len > DISK_NAME_LEN) ++ goto out_free; ++ memcpy(new->disk_name, bufp, len); ++ ++ err = -EINVAL; ++ bufp += len + 1; ++ if (bufp >= endp) ++ goto out_free; ++ ++ /* data server list */ ++ /* FIXME: need to check for comma separated valid ip format */ ++ len = strcspn(bufp, ":"); ++ if (len > NFSD_DLM_DS_LIST_MAX) ++ goto out_free; ++ memcpy(new->ds_list, bufp, len); ++ ++ ++ /* validate the ips */ ++ if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds))) ++ goto out_free; ++ ++ dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__, ++ new->disk_name, new->num_ds, new->ds_list); ++ ++ found = _nfsd4_find_pnfs_dlm_device(new->disk_name); ++ if (found) { ++ /* FIXME: should compare found->ds_list with new->ds_list ++ * and if it is different, kick off a CB_NOTIFY change ++ * deviceid. ++ */ ++ dprintk("%s pnfs_dlm_device %s:%s already in cache " ++ " replace ds_list with new ds_list %s\n", __func__, ++ found->disk_name, found->ds_list, new->ds_list); ++ memset(found->ds_list, 0, DISK_NAME_LEN); ++ memcpy(found->ds_list, new->ds_list, strlen(new->ds_list)); ++ found->num_ds = new->num_ds; ++ kfree(new); ++ } else { ++ dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__, ++ new->disk_name, new->ds_list); ++ spin_lock(&dlm_device_list_lock); ++ list_add(&new->dlm_dev_list, &dlm_device_list); ++ spin_unlock(&dlm_device_list_lock); ++ } ++ dprintk("<-- %s Success\n", __func__); ++ return 0; ++ ++out_free: ++ kfree(new); ++ dprintk("<-- %s returns %d\n", __func__, err); ++ return err; ++} ++ ++void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ struct dlm_device_entry *dlm_pdev, *next; ++ ++ dprintk("--> %s\n", __func__); ++ ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list, ++ dlm_dev_list) { ++ list_del(&dlm_pdev->dlm_dev_list); ++ kfree(dlm_pdev); ++ } ++ spin_unlock(&dlm_device_list_lock); ++} ++ ++static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb, ++ u32 
layout_type,
++				     struct nfsd4_pnfs_dev_iter_res *res)
++{
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		printk(KERN_ERR "%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return -ENOTSUPP;
++	}
++
++	res->gd_eof = 1;
++	if (res->gd_cookie)
++		return -ENOENT;
++
++	res->gd_cookie = 1;
++	res->gd_verf = 1;
++	res->gd_devid = 1;
++	return 0;
++}
++
++/*
++ * Encode the GETDEVICEINFO reply for the single device (devid 1) of a
++ * DLM exported file system: one multipath entry per configured data
++ * server, striped round robin.
++ *
++ * Returns -ENOTSUPP for a non-file layout type, -EINVAL for an unknown
++ * deviceid or when no data server list is configured for sb, -ENOMEM on
++ * allocation failure, otherwise the result of filelayout_encode_devinfo().
++ */
++static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb,
++				     struct exp_xdr_stream *xdr,
++				     u32 layout_type,
++				     const struct nfsd4_pnfs_deviceid *devid)
++{
++	int err, len, i = 0;
++	struct pnfs_filelayout_device fdev;
++	struct pnfs_filelayout_devaddr *daddr;
++	struct dlm_device_entry *dlm_pdev;
++	char *bufp;
++
++	err = -ENOTSUPP;
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		dprintk("%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return err;
++	}
++
++	/* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO
++	 * with a gdia_device_id != 1 is invalid.
++	 */
++	err = -EINVAL;
++	if (devid->devid != 1) {
++		dprintk("%s: WARNING: didn't receive a deviceid of "
++			"1 (got: 0x%llx)\n", __func__, devid->devid);
++		return err;
++	}
++
++	/*
++	 * If the DS list has not been established, return -EINVAL
++	 */
++	dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
++	if (!dlm_pdev) {
++		dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
++			sb->s_bdev->bd_disk->disk_name);
++		return err;
++	}
++
++	dprintk("%s: Found disk %s with DS list |%s|\n",
++		__func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
++
++	memset(&fdev, '\0', sizeof(fdev));
++	fdev.fl_device_length = dlm_pdev->num_ds;
++
++	err = -ENOMEM;
++	len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
++	fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
++	if (!fdev.fl_device_list) {
++		/* report the number of DSes asked for (i is still 0 here) */
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list "
++			"buffer for %d DSes.\n", __func__, dlm_pdev->num_ds);
++		fdev.fl_device_length = 0;
++		goto out;
++	}
++
++	/* Set a simple stripe indicie */
++	fdev.fl_stripeindices_length = fdev.fl_device_length;
++	fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
++				fdev.fl_stripeindices_length, GFP_KERNEL);
++
++	if (!fdev.fl_stripeindices_list) {
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices "
++			"list buffer for %d DSes.\n", __func__,
++			dlm_pdev->num_ds);
++		goto out;
++	}
++	for (i = 0; i < fdev.fl_stripeindices_length; i++)
++		fdev.fl_stripeindices_list[i] = i;
++
++	/* Transfer the data server list with a single multipath entry */
++	bufp = dlm_pdev->ds_list;
++	for (i = 0; i < fdev.fl_device_length; i++) {
++		len = strcspn(bufp, ",");
++
++		/*
++		 * The devaddr and the bytes of its address string live in one
++		 * allocation: a single NULL check covers both, and the
++		 * kfree() of fl_multipath_list at "out" releases the address
++		 * as well, so nothing is leaked per data server.
++		 */
++		daddr = kmalloc(sizeof(*daddr) + len + 4, GFP_KERNEL);
++		if (!daddr) {
++			printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
++				"addr buffer.\n", __func__);
++			goto out;
++		}
++
++		daddr->r_netid.data = "tcp";
++		daddr->r_netid.len = 3;
++
++		/* address bytes start right behind the structure */
++		daddr->r_addr.data = (void *)(daddr + 1);
++		memcpy(daddr->r_addr.data, bufp, len);
++		/*
++		 * append the port number.  interpreted as two more bytes
++		 * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
++		 */
++		memcpy(daddr->r_addr.data + len, ".8.1", 4);
++		daddr->r_addr.len = len + 4;
++
++		fdev.fl_device_list[i].fl_multipath_length = 1;
++		fdev.fl_device_list[i].fl_multipath_list = daddr;
++
++		dprintk("%s: encoding DS |%s|\n", __func__, bufp);
++
++		bufp += len + 1;
++	}
++
++	/* have nfsd encode the device info */
++	err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++	/* entries that were never filled in are NULL (kzalloc), kfree copes */
++	for (i = 0; i < fdev.fl_device_length; i++)
++		kfree(fdev.fl_device_list[i].fl_multipath_list);
++	kfree(fdev.fl_device_list);
++	kfree(fdev.fl_stripeindices_list);
++	dprintk("<-- %s returns %d\n", __func__, err);
++	return err;
++}
++
++static int get_stripe_unit(int blocksize)
++{
++	if (blocksize >= NFSSVC_MAXBLKSIZE)
++		return blocksize;
++	return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++}
++
++/*
++ * Look up inode block device in pnfs_dlm_device list.
++ * Hash on the inode->i_ino and number of data servers.
++ */ ++static int dlm_ino_hash(struct inode *ino) ++{ ++ struct dlm_device_entry *de; ++ u32 hash_mask = 0; ++ ++ /* If can't find the inode block device in the pnfs_dlm_deivce list ++ * then don't hand out a layout ++ */ ++ de = nfsd4_find_pnfs_dlm_device(ino->i_sb); ++ if (!de) ++ return -1; ++ hash_mask = de->num_ds - 1; ++ return ino->i_ino & hash_mask; ++} ++ ++static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct pnfs_filelayout_layout *layout = NULL; ++ struct knfsd_fh *fhp = NULL; ++ int index; ++ enum nfsstat4 rc = NFS4_OK; ++ ++ dprintk("%s: LAYOUT_GET\n", __func__); ++ ++ /* DLM exported file systems only support layouts for READ */ ++ if (res->lg_seg.iomode == IOMODE_RW) ++ return NFS4ERR_BADIOMODE; ++ ++ index = dlm_ino_hash(inode); ++ dprintk("%s first stripe index %d i_ino %lu\n", __func__, index, ++ inode->i_ino); ++ if (index < 0) ++ return NFS4ERR_LAYOUTUNAVAILABLE; ++ ++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ /* Always give out whole file layouts */ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ /* Always give out READ ONLY layouts */ ++ res->lg_seg.iomode = IOMODE_READ; ++ ++ layout = kzalloc(sizeof(*layout), GFP_KERNEL); ++ if (layout == NULL) { ++ rc = NFS4ERR_LAYOUTTRYLATER; ++ goto error; ++ } ++ ++ /* Set file layout response args */ ++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; ++ layout->lg_stripe_type = STRIPE_SPARSE; ++ layout->lg_commit_through_mds = false; ++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); ++ layout->lg_fh_length = 1; ++ layout->device_id.sbid = args->lg_sbid; ++ layout->device_id.devid = 1; /*FSFTEMP*/ ++ layout->lg_first_stripe_index = index; /*FSFTEMP*/ ++ layout->lg_pattern_offset = 0; ++ ++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); ++ if (fhp == NULL) { ++ rc = NFS4ERR_LAYOUTTRYLATER; ++ goto error; ++ } ++ ++ 
memcpy(fhp, args->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc = filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++nfsd4_pnfs_dlm_layouttype(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++const struct pnfs_export_operations pnfs_dlm_export_ops = { ++ .layout_type = nfsd4_pnfs_dlm_layouttype, ++ .get_device_info = nfsd4_pnfs_dlm_getdevinfo, ++ .get_device_iter = nfsd4_pnfs_dlm_getdeviter, ++ .layout_get = nfsd4_pnfs_dlm_layoutget, ++}; ++EXPORT_SYMBOL(pnfs_dlm_export_ops); +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-23 12:09:03.310501390 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-23 12:09:03.310501390 -0400 +@@ -0,0 +1,620 @@ ++/* ++* linux/fs/nfsd/nfs4pnfsds.c ++* ++* Copyright (c) 2005 The Regents of the University of Michigan. ++* All rights reserved. ++* ++* Andy Adamson ++* ++* Redistribution and use in source and binary forms, with or without ++* modification, are permitted provided that the following conditions ++* are met: ++* ++* 1. Redistributions of source code must retain the above copyright ++* notice, this list of conditions and the following disclaimer. ++* 2. Redistributions in binary form must reproduce the above copyright ++* notice, this list of conditions and the following disclaimer in the ++* documentation and/or other materials provided with the distribution. ++* 3. Neither the name of the University nor the names of its ++* contributors may be used to endorse or promote products derived ++* from this software without specific prior written permission. 
++* ++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++* ++*/ ++#if defined(CONFIG_PNFSD) ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsd.h" ++#include "pnfsd.h" ++#include "state.h" ++ ++/* ++ ******************* ++ * PNFS ++ ******************* ++ */ ++/* ++ * Hash tables for pNFS Data Server state ++ * ++ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using ++ * this data server (DS). ++ * ++ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained ++ * from any MDS. ++ * ++ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained ++ * from any MDS. 
++ * ++ */ ++/* Hash tables for clientid state */ ++#define CLIENT_HASH_BITS 4 ++#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) ++#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) ++ ++#define clientid_hashval(id) \ ++ ((id) & CLIENT_HASH_MASK) ++ ++/* hash table for pnfs_ds_stateid */ ++#define STATEID_HASH_BITS 10 ++#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) ++#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) ++ ++#define stateid_hashval(owner_id, file_id) \ ++ (((owner_id) + (file_id)) & STATEID_HASH_MASK) ++ ++static struct list_head mds_id_tbl; ++static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE]; ++static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE]; ++ ++static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp); ++static inline void put_ds_mdsid(struct pnfs_mds_id *mdp); ++ ++/* Mutex for data server state. Needs to be separate from ++ * mds state mutex since a node can be both mds and ds */ ++static DEFINE_MUTEX(ds_mutex); ++static struct thread_info *ds_mutex_owner; ++ ++static void ++ds_lock_state(void) ++{ ++ mutex_lock(&ds_mutex); ++ ds_mutex_owner = current_thread_info(); ++} ++ ++static void ++ds_unlock_state(void) ++{ ++ BUG_ON(ds_mutex_owner != current_thread_info()); ++ ds_mutex_owner = NULL; ++ mutex_unlock(&ds_mutex); ++} ++ ++static int ++cmp_clid(const clientid_t *cl1, const clientid_t *cl2) ++{ ++ return (cl1->cl_boot == cl2->cl_boot) && ++ (cl1->cl_id == cl2->cl_id); ++} ++ ++void ++nfs4_pnfs_state_init(void) ++{ ++ int i; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&mds_clid_hashtbl[i]); ++ ++ for (i = 0; i < STATEID_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ds_stid_hashtbl[i]); ++ ++ INIT_LIST_HEAD(&mds_id_tbl); ++} ++ ++static struct pnfs_mds_id * ++find_pnfs_mds_id(u32 mdsid) ++{ ++ struct pnfs_mds_id *local = NULL; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ list_for_each_entry(local, &mds_id_tbl, di_hash) { ++ if (local->di_mdsid == mdsid) ++ return local; ++ } ++ return NULL; ++} ++ ++static 
struct pnfs_ds_clientid * ++find_pnfs_ds_clientid(const clientid_t *clid) ++{ ++ struct pnfs_ds_clientid *local = NULL; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = clientid_hashval(clid->cl_id); ++ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) { ++ if (cmp_clid(&local->dc_mdsclid, clid)) ++ return local; ++ } ++ return NULL; ++} ++ ++static struct pnfs_ds_stateid * ++find_pnfs_ds_stateid(stateid_t *stid) ++{ ++ struct pnfs_ds_stateid *local = NULL; ++ u32 st_id = stid->si_stateownerid; ++ u32 f_id = stid->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash) ++ if ((local->ds_stid.si_stateownerid == st_id) && ++ (local->ds_stid.si_fileid == f_id) && ++ (local->ds_stid.si_boot == stid->si_boot)) { ++ stateid_t *sid = &local->ds_stid; ++ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, local, local->ds_flags, ++ STATEID_VAL(sid)); ++ return local; ++ } ++ return NULL; ++} ++ ++static void ++release_ds_mdsid(struct kref *kref) ++{ ++ struct pnfs_mds_id *mdp = ++ container_of(kref, struct pnfs_mds_id, di_ref); ++ dprintk("pNFSD: %s\n", __func__); ++ ++ list_del(&mdp->di_hash); ++ list_del(&mdp->di_mdsclid); ++ kfree(mdp); ++} ++ ++static void ++release_ds_clientid(struct kref *kref) ++{ ++ struct pnfs_ds_clientid *dcp = ++ container_of(kref, struct pnfs_ds_clientid, dc_ref); ++ struct pnfs_mds_id *mdp; ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(dcp->dc_mdsid); ++ if (mdp) ++ put_ds_mdsid(mdp); ++ ++ list_del(&dcp->dc_hash); ++ list_del(&dcp->dc_stateid); ++ list_del(&dcp->dc_permdsid); ++ kfree(dcp); ++} ++ ++static void ++release_ds_stateid(struct kref *kref) ++{ ++ struct pnfs_ds_stateid *dsp = ++ container_of(kref, struct pnfs_ds_stateid, ds_ref); ++ struct pnfs_ds_clientid *dcp; ++ dprintk("pNFS %s: dsp %p\n", __func__, dsp); ++ ++ dcp 
= find_pnfs_ds_clientid(&dsp->ds_mdsclid); ++ if (dcp) ++ put_ds_clientid(dcp); ++ ++ list_del(&dsp->ds_hash); ++ list_del(&dsp->ds_perclid); ++ kfree(dsp); ++} ++ ++static inline void ++put_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_put(&dcp->dc_ref, release_ds_clientid); ++} ++ ++static inline void ++get_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_get(&dcp->dc_ref); ++} ++ ++static inline void ++put_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_put(&mdp->di_ref, release_ds_mdsid); ++} ++ ++static inline void ++get_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_get(&mdp->di_ref); ++} ++ ++static inline void ++put_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_put(&dsp->ds_ref, release_ds_stateid); ++} ++ ++static inline void ++get_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_get(&dsp->ds_ref); ++} ++ ++void ++nfs4_pnfs_state_shutdown(void) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int i; ++ ++ dprintk("pNFSD %s: -->\n", __func__); ++ ++ ds_lock_state(); ++ for (i = 0; i < STATEID_HASH_SIZE; i++) { ++ while (!list_empty(&ds_stid_hashtbl[i])) { ++ dsp = list_entry(ds_stid_hashtbl[i].next, ++ struct pnfs_ds_stateid, ds_hash); ++ put_ds_stateid(dsp); ++ } ++ } ++ ds_unlock_state(); ++} ++ ++static struct pnfs_mds_id * ++alloc_init_mds_id(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL); ++ 
if (!mdp)
++		return NULL;
++	INIT_LIST_HEAD(&mdp->di_hash);
++	INIT_LIST_HEAD(&mdp->di_mdsclid);
++	list_add(&mdp->di_hash, &mds_id_tbl);
++	mdp->di_mdsid = gsp->dsid;
++	mdp->di_mdsboot = 0;
++	kref_init(&mdp->di_ref);
++	return mdp;
++}
++
++/*
++ * Allocate a data server clientid for gsp->clid, hash it and link it to
++ * the pnfs_mds_id of gsp->dsid (created here if it does not exist yet).
++ * The new clientid holds one reference on that mds id.
++ *
++ * Returns the clientid with its refcount initialised by kref_init(), or
++ * NULL on allocation failure, in which case no reference is left behind.
++ */
++static struct pnfs_ds_clientid *
++alloc_init_ds_clientid(struct pnfs_get_state *gsp)
++{
++	struct pnfs_mds_id *mdp;
++	struct pnfs_ds_clientid *dcp;
++	clientid_t *clid = (clientid_t *)&gsp->clid;
++	unsigned int hashval = clientid_hashval(clid->cl_id);
++
++	dprintk("pNFSD: %s\n", __func__);
++
++	mdp = find_pnfs_mds_id(gsp->dsid);
++	if (!mdp) {
++		mdp = alloc_init_mds_id(gsp);
++		if (!mdp)
++			return NULL;
++	} else {
++		get_ds_mdsid(mdp);
++	}
++
++	dcp = kmalloc(sizeof(*dcp), GFP_KERNEL);
++	if (!dcp) {
++		/*
++		 * Give back the mds id reference obtained above; for an id
++		 * created by this call that is the only reference, so the
++		 * put also unhashes and frees it again.
++		 */
++		put_ds_mdsid(mdp);
++		return NULL;
++	}
++
++	INIT_LIST_HEAD(&dcp->dc_hash);
++	INIT_LIST_HEAD(&dcp->dc_stateid);
++	INIT_LIST_HEAD(&dcp->dc_permdsid);
++	list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]);
++	list_add(&dcp->dc_permdsid, &mdp->di_mdsclid);
++	dcp->dc_mdsclid = *clid;
++	kref_init(&dcp->dc_ref);
++	dcp->dc_mdsid = gsp->dsid;
++	return dcp;
++}
++
++static struct pnfs_ds_stateid *
++alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp)
++{
++	struct pnfs_ds_stateid *dsp;
++	u32 st_id = stidp->si_stateownerid;
++	u32 f_id = stidp->si_fileid;
++	unsigned int hashval;
++
++	dprintk("pNFSD: %s\n", __func__);
++
++	dsp = kmalloc(sizeof(*dsp), GFP_KERNEL);
++	if (!dsp)
++		return dsp;
++
++	INIT_LIST_HEAD(&dsp->ds_hash);
++	INIT_LIST_HEAD(&dsp->ds_perclid);
++	memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t));
++	fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle);
++	dsp->ds_access = 0;
++	dsp->ds_status = 0;
++	dsp->ds_flags = 0L;
++	kref_init(&dsp->ds_ref);
++	set_bit(DS_STATEID_NEW, &dsp->ds_flags);
++	clear_bit(DS_STATEID_VALID, &dsp->ds_flags);
++	clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++	init_waitqueue_head(&dsp->ds_waitq);
++
++	hashval = stateid_hashval(st_id, f_id);
++	list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]);
++	
dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return dsp; ++} ++ ++static int ++update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh, ++ struct pnfs_get_state *gsp) ++{ ++ struct pnfs_ds_clientid *dcp; ++ int new = 0; ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid); ++ if (!dcp) { ++ dcp = alloc_init_ds_clientid(gsp); ++ if (!dcp) ++ return 1; ++ new = 1; ++ } ++ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) { ++ list_add(&dsp->ds_perclid, &dcp->dc_stateid); ++ if (!new) ++ get_ds_clientid(dcp); ++ } ++ ++ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t)); ++ dsp->ds_access = gsp->access; ++ dsp->ds_status = 0; ++ dsp->ds_verifier[0] = gsp->verifier[0]; ++ dsp->ds_verifier[1] = gsp->verifier[1]; ++ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t)); ++ set_bit(DS_STATEID_VALID, &dsp->ds_flags); ++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ clear_bit(DS_STATEID_NEW, &dsp->ds_flags); ++ return 0; ++} ++ ++int ++nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs) ++{ ++ stateid_t *stid = (stateid_t *)&gs->stid; ++ struct pnfs_ds_stateid *dsp; ++ ++ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stid)); ++ ++ ds_lock_state(); ++ dsp = find_pnfs_ds_stateid(stid); ++ if (dsp) ++ put_ds_stateid(dsp); ++ ds_unlock_state(); ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ if (dsp) ++ return 0; ++ return -ENOENT; ++} ++ ++/* Retrieves and validates stateid. ++ * If stateid exists and its fields match, return it. ++ * If stateid exists but either the generation or ++ * ownerids don't match, check with mds to see if it is valid. ++ * If the stateid doesn't exist, the first thread creates a ++ * invalid *marker* stateid, then checks to see if the ++ * stateid exists on the mds. If so, it validates the *marker* ++ * stateid and updates its fields. Subsequent threads that ++ * find the *marker* stateid wait until it is valid or an error ++ * occurs. 
++ * Called with ds_state_lock.
++ *
++ * Returns the stateid, or NULL if it could not be allocated or the MDS
++ * check failed.  The reference taken for the duration of the check is
++ * dropped again before returning, so the caller gets no extra reference.
++ */
++static struct pnfs_ds_stateid *
++nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp)
++{
++	struct inode *ino = cfh->fh_dentry->d_inode;
++	struct super_block *sb;
++	struct pnfs_ds_stateid *dsp = NULL;
++	struct pnfs_get_state gs = {
++		.access = 0,
++	};
++	int status = 0, waiter = 0, failed = 0;
++
++	dprintk("pNFSD: %s -->\n", __func__);
++
++	dsp = find_pnfs_ds_stateid(stidp);
++	if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) &&
++	    (stidp->si_generation == dsp->ds_stid.si_generation))
++		goto out_noput;
++
++	sb = ino->i_sb;
++	/* a file system without pNFS methods cannot be asked about state */
++	if (!sb || !sb->s_pnfs_op || !sb->s_pnfs_op->get_state)
++		goto out_noput;
++
++	/* Uninitialize current state if it exists yet it doesn't match.
++	 * If it is already invalid, another thread is checking state */
++	if (dsp) {
++		if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags))
++			waiter = 1;
++	} else {
++		dsp = alloc_init_ds_stateid(cfh, stidp);
++		if (!dsp)
++			goto out_noput;
++	}
++
++	dprintk("pNFSD: %s Starting loop\n", __func__);
++	/* pin dsp while ds_mutex is dropped inside the loop */
++	get_ds_stateid(dsp);
++	while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
++		ds_unlock_state();
++
++		/* Another thread is checking the state */
++		if (waiter) {
++			dprintk("pNFSD: %s waiting\n", __func__);
++			wait_event_interruptible_timeout(dsp->ds_waitq,
++				(test_bit(DS_STATEID_VALID, &dsp->ds_flags) ||
++				 test_bit(DS_STATEID_ERROR, &dsp->ds_flags)),
++				 msecs_to_jiffies(1024));
++			dprintk("pNFSD: %s awake\n", __func__);
++			ds_lock_state();
++			if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
++				goto out;
++
++			continue;
++		}
++
++		/* Validate stateid on mds */
++		dprintk("pNFSD: %s Checking state on MDS\n", __func__);
++		memcpy(&gs.stid, stidp, sizeof(stateid_t));
++		status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs);
++		dprintk("pNFSD: %s from MDS status %d\n", __func__, status);
++		ds_lock_state();
++		/* if !status and stateid is valid, update id and mark valid */
++		if (status || update_ds_stateid(dsp, cfh, &gs)) {
++			set_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++			/* remove invalid stateid from list */
++			put_ds_stateid(dsp);
++			/* still pinned by the reference taken before the loop */
++			wake_up(&dsp->ds_waitq);
++			goto out;
++		}
++
++		wake_up(&dsp->ds_waitq);
++	}
++out:
++	if (dsp) {
++		/*
++		 * Sample the error bit before dropping our reference: on the
++		 * error path this put can be the last one and free dsp, so
++		 * dsp must not be looked at (or returned) afterwards.
++		 */
++		failed = test_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++		put_ds_stateid(dsp);
++		if (failed)
++			dsp = NULL;
++	}
++out_noput:
++	if (dsp)
++		dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n",
++			__func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid));
++	/* If error, return null */
++	if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
++		dsp = NULL;
++	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return dsp;
++}
++
++int
++nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid)
++{
++	struct pnfs_ds_stateid *dsp;
++	int status = 0;
++
++	dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__,
++		STATEID_VAL(stateid));
++
++	/* Must release state lock while verifying stateid on mds */
++	nfs4_unlock_state();
++	ds_lock_state();
++	dsp = nfsv4_ds_get_state(cfh, stateid);
++	if (dsp) {
++		get_ds_stateid(dsp);
++		dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__,
++			STATEID_VAL(&dsp->ds_stid));
++
++		dprintk("NFSD: %s: dsp %p fh_size %u:%u "
++			"fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] "
++			"gen %x:%x\n",
++			__func__, dsp,
++			cfh->fh_handle.fh_size, dsp->ds_fh.fh_size,
++			((unsigned *)&cfh->fh_handle.fh_base)[0],
++			((unsigned *)&cfh->fh_handle.fh_base)[1],
++			((unsigned *)&cfh->fh_handle.fh_base)[2],
++			((unsigned *)&cfh->fh_handle.fh_base)[3],
++			((unsigned *)&dsp->ds_fh.fh_base)[0],
++			((unsigned *)&dsp->ds_fh.fh_base)[1],
++			((unsigned *)&dsp->ds_fh.fh_base)[2],
++			((unsigned *)&dsp->ds_fh.fh_base)[3],
++			stateid->si_generation, dsp->ds_stid.si_generation);
++	}
++
++	if (!dsp ||
++	    (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) ||
++	    (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base,
++		    dsp->ds_fh.fh_size) != 0) ||
++	    (stateid->si_generation > dsp->ds_stid.si_generation))
++		status = nfserr_bad_stateid;
++	else if (stateid->si_generation < dsp->ds_stid.si_generation)
++		status = nfserr_old_stateid;
++
++	if (dsp)
++		
put_ds_stateid(dsp);
++	ds_unlock_state();
++	nfs4_lock_state();
++	dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status));
++	return status;
++}
++
++/*
++ * Store the two word write verifier in p[0] and p[1].
++ *
++ * When stateid names a data server stateid that is currently valid, its
++ * cached verifier is used.  Otherwise (no stateid given, not found, or not
++ * valid) the file system's get_verifier method is called, which the
++ * caller must have checked to be present.
++ */
++void
++nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p)
++{
++	struct pnfs_ds_stateid *dsp = NULL;
++	int cached = 0;
++
++	dprintk("pNFSD: %s --> stid %p\n", __func__, stateid);
++
++	ds_lock_state();
++	if (stateid != NULL) {
++		dsp = find_pnfs_ds_stateid(stateid);
++		if (dsp)
++			get_ds_stateid(dsp);
++	}
++
++	/* XXX: Should we fetch the stateid or wait if some other
++	 * thread is currently retrieving the stateid ? */
++	if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
++		p[0] = dsp->ds_verifier[0];
++		p[1] = dsp->ds_verifier[1];
++		cached = 1;
++	}
++
++	/*
++	 * Drop the lookup reference in every case; a stateid that was found
++	 * but is not valid must not keep it either.
++	 */
++	if (dsp)
++		put_ds_stateid(dsp);
++	ds_unlock_state();
++
++	/* must be on MDS: ask the file system, without ds_mutex held */
++	if (!cached)
++		sb->s_pnfs_op->get_verifier(sb, p);
++
++	/* dsp is only printed here, never dereferenced after the put */
++	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return;
++}
++
++#endif /* CONFIG_PNFSD */
+diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c
+--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig	2010-08-23 12:08:29.091491685 -0400
++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c	2010-08-23 12:09:03.311501496 -0400
+@@ -34,10 +34,14 @@
+  */
+ #include
+ #include
++#include
++#include
++#include
+ 
+ #include "cache.h"
+ #include "xdr4.h"
+ #include "vfs.h"
++#include "pnfsd.h"
+ 
+ #define NFSDDBG_FACILITY		NFSDDBG_PROC
+ 
+@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc
+ 	 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
+ */ + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); ++#if defined(CONFIG_SPNFS) ++ if (!status && spnfs_enabled()) { ++ struct inode *inode = cstate->current_fh.fh_dentry->d_inode; ++ ++ status = spnfs_open(inode, open); ++ if (status) { ++ dprintk( ++ "nfsd: pNFS could not be enabled for inode: %lu\n", ++ inode->i_ino); ++ /* ++ * XXX When there's a failure then need to indicate to ++ * future ops that no pNFS is available. Should I save ++ * the status in the inode? It's kind of a big hammer. ++ * But there may be no stripes available? ++ */ ++ } ++ } ++#endif /* CONFIG_SPNFS */ + out: + if (open->op_stateowner) { + nfs4_get_stateowner(open->op_stateowner); +@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str + &access->ac_supported); + } + ++static void ++nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf) ++{ ++ u32 *p = (u32 *)verf->data; ++ ++#if defined(CONFIG_PNFSD) ++ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) { ++ nfs4_ds_get_verifier(NULL, sb, p); ++ return; ++ } ++#endif /* CONFIG_PNFSD */ ++ ++ *p++ = nfssvc_boot.tv_sec; ++ *p++ = nfssvc_boot.tv_usec; ++} ++ + static __be32 + nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) + { + __be32 status; + +- u32 *p = (u32 *)commit->co_verf.data; +- *p++ = nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; +- ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &commit->co_verf); + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); + if (status == nfserr_symlink) +@@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru + { + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; +- u32 *p; + __be32 status = nfs_ok; + unsigned long cnt; + +@@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; +- p = (u32 *)write->wr_verifier.data; +- *p++ = 
nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; + ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &write->wr_verifier); ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) { ++ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode, ++ RETURN_FILE, write->wr_offset, write->wr_buflen); ++ if (!status) { ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++ } ++ } else ++#endif ++ ++ if (spnfs_enabled()) { ++ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode, ++ write->wr_offset, write->wr_buflen, write->wr_vlen, ++ rqstp); ++ if (status == nfs_ok) { ++ /* DMXXX: HACK to get filesize set */ ++ /* write one byte at offset+length-1 */ ++ struct kvec k[1]; ++ char zero = 0; ++ unsigned long cnt = 1; ++ ++ k[0].iov_base = (void *)&zero; ++ k[0].iov_len = 1; ++ nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset+write->wr_buflen-1, k, 1, ++ &cnt, &write->wr_how_written); ++ } ++ } else /* we're not an MDS */ ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++#else + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, &write->wr_how_written); ++#endif /* CONFIG_SPNFS */ ++ + if (filp) + fput(filp); + +@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str + return status == nfserr_same ? nfs_ok : status; + } + ++#if defined(CONFIG_PNFSD) ++ ++static __be32 ++nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp, ++ unsigned int layout_type) ++{ ++ int status, type; ++ ++ /* check to see if pNFS is supported. 
*/ ++ status = nfserr_layoutunavailable; ++ if (exp && exp->ex_pnfs == 0) { ++ dprintk("%s: Underlying file system " ++ "is not exported over pNFS\n", __func__); ++ goto out; ++ } ++ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) { ++ dprintk("%s: Underlying file system " ++ "does not support pNFS\n", __func__); ++ goto out; ++ } ++ ++ type = sb->s_pnfs_op->layout_type(sb); ++ ++ /* check to see if requested layout type is supported. */ ++ status = nfserr_unknown_layouttype; ++ if (!type) ++ dprintk("BUG: %s: layout_type 0 is reserved and must not be " ++ "used by filesystem\n", __func__); ++ else if (type != layout_type) ++ dprintk("%s: requested layout type %d " ++ "does not match supported type %d\n", ++ __func__, layout_type, type); ++ else ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevlist(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevlist *gdlp) ++{ ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ int status; ++ ++ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n", ++ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices, ++ gdlp->gd_cookie, gdlp->gd_verf); ++ ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* We must be able to encode at list one device */ ++ if (!gdlp->gd_maxdevices) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ gdlp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Do nothing if underlying file system does not support ++ * getdevicelist */ ++ if (!sb->s_pnfs_op->get_device_iter) { ++ status = nfserr_notsupp; ++ goto out; ++ } ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdlp->gd_fhp = &cstate->current_fh; 
++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevinfo(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevinfo *gdp) ++{ ++ struct super_block *sb; ++ int status; ++ clientid_t clid; ++ ++ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n", ++ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid, ++ gdp->gd_devid.devid, gdp->gd_maxcount); ++ ++ status = nfserr_inval; ++ sb = find_sbid_id(gdp->gd_devid.sbid); ++ dprintk("%s: sb %p\n", __func__, sb); ++ if (!sb) { ++ status = nfserr_noent; ++ goto out; ++ } ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdp->gd_sb = sb; ++ ++ /* Update notifications */ ++ copy_clientid(&clid, cstate->session); ++ pnfs_set_device_notify(&clid, gdp->gd_notify_types); ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutget(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lgp->lg_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_badiomode; ++ if (lgp->lg_seg.iomode != IOMODE_READ && ++ lgp->lg_seg.iomode != IOMODE_RW) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lgp->lg_seg.iomode); ++ goto out; ++ } ++ ++ /* Set up arguments so layout can be retrieved at encode time */ ++ lgp->lg_fhp = current_fh; ++ copy_clientid((clientid_t 
*)&lgp->lg_seg.clientid, cstate->session); ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutcommit(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ int status; ++ struct inode *ino = NULL; ++ struct iattr ia; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ dprintk("NFSD: nfsd4_layoutcommit \n"); ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ ino = current_fh->fh_dentry->d_inode; ++ if (!ino) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = ino->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lcp->args.lc_seg.layout_type); ++ if (status) ++ goto out; ++ ++ /* This will only extend the file length. Do a quick ++ * check to see if there is any point in waiting for the update ++ * locks. ++ * TODO: Is this correct for all back ends? 
++ */ ++ dprintk("%s:new offset: %d new size: %llu old size: %lld\n", ++ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1, ++ ino->i_size); ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session); ++ lcp->res.lc_size_chg = 0; ++ if (sb->s_pnfs_op->layout_commit) { ++ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res); ++ dprintk("%s:layout_commit result %d\n", __func__, status); ++ } else { ++ fh_lock(current_fh); ++ if ((lcp->args.lc_newoffset == 0) || ++ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) { ++ status = 0; ++ lcp->res.lc_size_chg = 0; ++ fh_unlock(current_fh); ++ goto out; ++ } ++ ++ /* Try our best to update the file size */ ++ dprintk("%s: Modifying file size\n", __func__); ++ ia.ia_valid = ATTR_SIZE; ++ ia.ia_size = lcp->args.lc_last_wr + 1; ++ status = notify_change(current_fh->fh_dentry, &ia); ++ fh_unlock(current_fh); ++ dprintk("%s:notify_change result %d\n", __func__, status); ++ } ++ ++ if (!status && lcp->res.lc_size_chg && ++ EX_ISSYNC(current_fh->fh_export)) { ++ dprintk("%s: Synchronously writing inode size %llu\n", ++ __func__, ino->i_size); ++ write_inode_now(ino, 1); ++ lcp->res.lc_newsize = i_size_read(ino); ++ } ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutreturn(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lrp->args.lr_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_return_type != 
RETURN_FILE && ++ lrp->args.lr_return_type != RETURN_FSID && ++ lrp->args.lr_return_type != RETURN_ALL) { ++ dprintk("pNFS %s: invalid return_type %d\n", __func__, ++ lrp->args.lr_return_type); ++ goto out; ++ } ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_seg.iomode != IOMODE_READ && ++ lrp->args.lr_seg.iomode != IOMODE_RW && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lrp->args.lr_seg.iomode); ++ goto out; ++ } ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session); ++ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE); ++ status = nfs4_pnfs_return_layout(sb, current_fh, lrp); ++out: ++ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n", ++ __func__, status, lrp->args.lr_return_type, lrp->lrs_present); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * NULL call. + */ +@@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[ + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_RECLAIM_COMPLETE", + }, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICELIST] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevlist, ++ .op_name = "OP_GETDEVICELIST", ++ }, ++ [OP_GETDEVICEINFO] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevinfo, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_GETDEVICEINFO", ++ }, ++ [OP_LAYOUTGET] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutget, ++ .op_name = "OP_LAYOUTGET", ++ }, ++ [OP_LAYOUTCOMMIT] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutcommit, ++ .op_name = "OP_LAYOUTCOMMIT", ++ }, ++ [OP_LAYOUTRETURN] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutreturn, ++ .op_name = "OP_LAYOUTRETURN", ++ }, ++#endif /* CONFIG_PNFSD */ + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-23 12:08:29.093491375 -0400 ++++ 
linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 12:09:03.313491310 -0400 +@@ -42,6 +42,8 @@ + #include "xdr4.h" + #include "vfs.h" + ++#include "pnfsd.h" ++ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +@@ -60,8 +62,6 @@ static u64 current_sessionid = 1; + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + + /* forward declarations */ +-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); + static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; + static void nfs4_set_recdir(char *recdir); + +@@ -69,6 +69,7 @@ static void nfs4_set_recdir(char *recdir + + /* Currently used for almost all code touching nfsv4 state: */ + static DEFINE_MUTEX(client_mutex); ++struct task_struct *client_mutex_owner; + + /* + * Currently used for the del_recall_lru and file hash table. In an +@@ -86,11 +87,21 @@ void + nfs4_lock_state(void) + { + mutex_lock(&client_mutex); ++ client_mutex_owner = current; ++} ++ ++#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current) ++ ++void ++nfs4_bug_on_unlocked_state(void) ++{ ++ BUG_ON(client_mutex_owner != current); + } + + void + nfs4_unlock_state(void) + { ++ client_mutex_owner = NULL; + mutex_unlock(&client_mutex); + } + +@@ -109,7 +120,7 @@ opaque_hashval(const void *ptr, int nbyt + + static struct list_head del_recall_lru; + +-static inline void ++inline void + put_nfs4_file(struct nfs4_file *fi) + { + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { +@@ -120,7 +131,7 @@ put_nfs4_file(struct nfs4_file *fi) + } + } + +-static inline void ++inline void + get_nfs4_file(struct nfs4_file *fi) + { + atomic_inc(&fi->fi_ref); +@@ -230,7 +241,10 @@ nfs4_close_delegation(struct nfs4_delega + * but we want to remove the lease in any case. 
*/ + if (dp->dl_flock) + vfs_setlease(filp, F_UNLCK, &dp->dl_flock); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(filp); ++ nfs4_lock_state(); + } + + /* Called under the state lock. */ +@@ -266,8 +280,8 @@ static DEFINE_SPINLOCK(client_lock); + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * +- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed +- * setclientid_confirmed info. ++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold ++ * confirmed setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * setclientid info. +@@ -292,6 +306,7 @@ static void unhash_generic_stateid(struc + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); ++ release_pnfs_ds_dev_list(stp); + } + + static void free_generic_stateid(struct nfs4_stateid *stp) +@@ -345,7 +360,10 @@ static void release_open_stateid(struct + { + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(stp->st_vfs_file); ++ nfs4_lock_state(); + free_generic_stateid(stp); + } + +@@ -739,6 +757,8 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + ++ BUG_ON_UNLOCKED_STATE(); ++ + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -758,6 +778,7 @@ expire_client(struct nfs4_client *clp) + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } ++ pnfs_expire_client(clp); + nfsd4_set_callback_client(clp, NULL); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); +@@ -770,6 +791,13 @@ expire_client(struct nfs4_client *clp) + spin_unlock(&client_lock); + } + ++void expire_client_lock(struct nfs4_client 
*clp) ++{ ++ nfs4_lock_state(); ++ expire_client(clp); ++ nfs4_unlock_state(); ++} ++ + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) + { + memcpy(target->cl_verifier.data, source->data, +@@ -859,6 +887,11 @@ static struct nfs4_client *create_client + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++ INIT_LIST_HEAD(&clp->cl_layoutrecalls); ++ atomic_set(&clp->cl_deviceref, 0); ++#endif /* CONFIG_PNFSD */ + INIT_LIST_HEAD(&clp->cl_sessions); + INIT_LIST_HEAD(&clp->cl_lru); + clp->cl_time = get_seconds(); +@@ -908,7 +941,7 @@ move_to_confirmed(struct nfs4_client *cl + renew_client(clp); + } + +-static struct nfs4_client * ++struct nfs4_client * + find_confirmed_client(clientid_t *clid) + { + struct nfs4_client *clp; +@@ -978,6 +1011,24 @@ find_unconfirmed_client_by_str(const cha + return NULL; + } + ++int ++filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), ++ void *arg) ++{ ++ struct nfs4_client *clp, *next; ++ int i, status = 0; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i], ++ cl_strhash) { ++ status = func(clp, arg); ++ if (status) ++ return status; /* end the whole scan, not just this bucket */ ++ } ++ ++ return status; ++} ++ + static void + gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) + { +@@ -1110,8 +1161,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co + static void + nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) + { +- /* pNFS is not supported */ ++#if defined(CONFIG_PNFSD) ++ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS | ++ EXCHGID4_FLAG_USE_PNFS_DS; ++#else /* CONFIG_PNFSD */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; ++#endif /* CONFIG_PNFSD */ + + /* Referrals are supported, Migration is not.
*/ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; +@@ -1301,6 +1356,13 @@ nfsd4_create_session(struct svc_rqst *rq + struct nfsd4_clid_slot *cs_slot = NULL; + int status = 0; + ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ /* XXX hack to get local ip address */ ++ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local, ++ sizeof(pnfsd_lexp_addr)); ++ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); +@@ -1340,25 +1402,26 @@ nfsd4_create_session(struct svc_rqst *rq + cs_slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + +- if (cr_ses->flags & SESSION4_BACK_CHAN) { +- unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; +- svc_xprt_get(rqstp->rq_xprt); +- rpc_copy_addr( +- (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, +- sa); +- unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); +- unconf->cl_cb_conn.cb_minorversion = +- cstate->minorversion; +- unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; +- unconf->cl_cb_seq_nr = 1; +- nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); +- } ++ if (is_ds_only_session(unconf->cl_exchange_flags)) ++ cr_ses->flags &= ~SESSION4_BACK_CHAN; ++ + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + ++ if (cr_ses->flags & SESSION4_BACK_CHAN) { ++ conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; ++ svc_xprt_get(rqstp->rq_xprt); ++ rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa); ++ conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); ++ conf->cl_cb_conn.cb_minorversion = cstate->minorversion; ++ conf->cl_cb_conn.cb_prog = cr_ses->callback_prog; ++ conf->cl_cb_seq_nr = 1; ++ nfsd4_probe_callback(conf, &conf->cl_cb_conn); ++ } ++ + /* + * We do not support RDMA or persistent sessions + */ +@@ -1746,7 +1809,7 @@ out: + + /* OPEN Share state helper functions */ + static inline struct nfs4_file * +-alloc_init_file(struct 
inode *ino) ++alloc_init_file(struct inode *ino, struct svc_fh *current_fh) + { + struct nfs4_file *fp; + unsigned int hashval = file_hashval(ino); +@@ -1760,6 +1823,16 @@ alloc_init_file(struct inode *ino) + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&fp->fi_layouts); ++ INIT_LIST_HEAD(&fp->fi_layout_states); ++ fp->fi_fsid.major = current_fh->fh_export->ex_fsid; ++ fp->fi_fsid.minor = 0; ++ fp->fi_fhlen = current_fh->fh_handle.fh_size; ++ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval)); ++ memcpy(fp->fi_fhval, &current_fh->fh_handle.fh_base, ++ fp->fi_fhlen); ++#endif /* CONFIG_PNFSD */ + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); +@@ -1768,7 +1841,7 @@ alloc_init_file(struct inode *ino) + return NULL; + } + +-static void ++void + nfsd4_free_slab(struct kmem_cache **slab) + { + if (*slab == NULL) +@@ -1784,6 +1857,7 @@ nfsd4_free_slabs(void) + nfsd4_free_slab(&file_slab); + nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); ++ nfsd4_free_pnfs_slabs(); + } + + static int +@@ -1805,6 +1879,8 @@ nfsd4_init_slabs(void) + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_nomem; ++ if (nfsd4_init_pnfs_slabs()) ++ goto out_nomem; + return 0; + out_nomem: + nfsd4_free_slabs(); +@@ -1878,6 +1954,9 @@ init_stateid(struct nfs4_stateid *stp, s + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); + INIT_LIST_HEAD(&stp->st_perfile); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); +@@ -1919,6 +1998,7 @@ find_openstateowner_str(unsigned int has + { + struct nfs4_stateowner *so = NULL; + ++ BUG_ON_UNLOCKED_STATE(); + list_for_each_entry(so,
&ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) + return so; +@@ -1927,7 +2007,7 @@ find_openstateowner_str(unsigned int has + } + + /* search file_hashtbl[] for file */ +-static struct nfs4_file * ++struct nfs4_file * + find_file(struct inode *ino) + { + unsigned int hashval = file_hashval(ino); +@@ -1945,6 +2025,18 @@ find_file(struct inode *ino) + return NULL; + } + ++struct nfs4_file * ++find_alloc_file(struct inode *ino, struct svc_fh *current_fh) ++{ ++ struct nfs4_file *fp; ++ ++ fp = find_file(ino); ++ if (fp) ++ return fp; ++ ++ return alloc_init_file(ino, current_fh); ++} ++ + static inline int access_valid(u32 x, u32 minorversion) + { + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) +@@ -2503,7 +2595,7 @@ nfsd4_process_open2(struct svc_rqst *rqs + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; + status = nfserr_resource; +- fp = alloc_init_file(ino); ++ fp = alloc_init_file(ino, current_fh); + if (fp == NULL) + goto out; + } +@@ -2730,7 +2822,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct + return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; + } + +-static int ++int + STALE_STATEID(stateid_t *stateid) + { + if (stateid->si_boot == boot_time) +@@ -2740,6 +2832,16 @@ STALE_STATEID(stateid_t *stateid) + return 1; + } + ++__be32 ++nfs4_check_stateid(stateid_t *stateid) ++{ ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ return nfserr_bad_stateid; ++ if (STALE_STATEID(stateid)) ++ return nfserr_stale_stateid; ++ return 0; ++} ++ + static inline int + access_permit_read(unsigned long access_bmap) + { +@@ -2848,6 +2950,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + if (grace_disallows_io(ino)) + return nfserr_grace; + ++#if defined(CONFIG_PNFSD) ++ if (pnfs_fh_is_ds(&current_fh->fh_handle)) { ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ status = nfserr_bad_stateid; ++ else ++#ifdef CONFIG_GFS2_FS_LOCKING_DLM ++ { ++ dprintk("%s
Don't check DS stateid\n", __func__); ++ return 0; ++ } ++#else /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ status = nfs4_preprocess_pnfs_ds_stateid(current_fh, ++ stateid); ++#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ goto out; ++ } ++#endif /* CONFIG_PNFSD */ ++ + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + +@@ -2924,13 +3044,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + *stpp = NULL; + *sopp = NULL; + +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { +- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); +- return nfserr_bad_stateid; +- } +- +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ return status; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; +@@ -3205,11 +3321,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + nfs4_lock_state(); +- status = nfserr_bad_stateid; +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) +- goto out; +- status = nfserr_stale_stateid; +- if (STALE_STATEID(stateid)) ++ status = nfs4_check_stateid(stateid); ++ if (status) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) +@@ -3238,26 +3351,6 @@ out: + #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) + #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +-static inline u64 +-end_offset(u64 start, u64 len) +-{ +- u64 end; +- +- end = start + len; +- return end >= start ? end: NFS4_MAX_UINT64; +-} +- +-/* last octet in a range */ +-static inline u64 +-last_byte_offset(u64 start, u64 len) +-{ +- u64 end; +- +- BUG_ON(!len); +- end = start + len; +- return end > start ? 
end - 1: NFS4_MAX_UINT64; +-} +- + #define lockownerid_hashval(id) \ + ((id) & LOCK_HASH_MASK) + +@@ -3274,7 +3367,7 @@ static struct list_head lock_ownerid_has + static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; + static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; + +-static struct nfs4_stateid * ++struct nfs4_stateid * + find_stateid(stateid_t *stid, int flags) + { + struct nfs4_stateid *local; +@@ -3303,7 +3396,7 @@ find_stateid(stateid_t *stid, int flags) + return NULL; + } + +-static struct nfs4_delegation * ++struct nfs4_delegation * + find_delegation_stateid(struct inode *ino, stateid_t *stid) + { + struct nfs4_file *fp; +@@ -3436,6 +3529,9 @@ alloc_init_lock_stateid(struct nfs4_stat + INIT_LIST_HEAD(&stp->st_perfile); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); +@@ -3998,6 +4094,9 @@ nfs4_state_init(void) + INIT_LIST_HEAD(&client_lru); + INIT_LIST_HEAD(&del_recall_lru); + reclaim_str_hashtbl_size = 0; ++#if defined(CONFIG_PNFSD) ++ nfs4_pnfs_state_init(); ++#endif /* CONFIG_PNFSD */ + return 0; + } + +@@ -4110,6 +4209,7 @@ __nfs4_state_shutdown(void) + } + + nfsd4_shutdown_recdir(); ++ nfs4_pnfs_state_shutdown(); + nfs4_init = 0; + } + +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 12:09:03.315491356 -0400 +@@ -47,9 +47,14 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +@@ -1234,6 +1239,138 @@ 
nfsd4_decode_sequence(struct nfsd4_compo + DECODE_TAIL; + } + ++#if defined(CONFIG_PNFSD) ++static __be32 ++nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16 + sizeof(nfs4_verifier)); ++ READ32(gdevl->gd_layout_type); ++ READ32(gdevl->gd_maxdevices); ++ READ64(gdevl->gd_cookie); ++ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ u32 num; ++ DECODE_HEAD; ++ ++ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid)); ++ READ64(gdev->gd_devid.sbid); ++ READ64(gdev->gd_devid.devid); ++ READ32(gdev->gd_layout_type); ++ READ32(gdev->gd_maxcount); ++ READ32(num); ++ if (num) { ++ READ_BUF(4); ++ READ32(gdev->gd_notify_types); ++ } else { ++ gdev->gd_notify_types = 0; ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(36); ++ READ32(lgp->lg_signal); ++ READ32(lgp->lg_seg.layout_type); ++ READ32(lgp->lg_seg.iomode); ++ READ64(lgp->lg_seg.offset); ++ READ64(lgp->lg_seg.length); ++ READ64(lgp->lg_minlength); ++ nfsd4_decode_stateid(argp, &lgp->lg_sid); ++ READ_BUF(4); ++ READ32(lgp->lg_maxcount); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ DECODE_HEAD; ++ u32 timechange; ++ ++ READ_BUF(20); ++ READ64(lcp->args.lc_seg.offset); ++ READ64(lcp->args.lc_seg.length); ++ READ32(lcp->args.lc_reclaim); ++ nfsd4_decode_stateid(argp, &lcp->lc_sid); ++ READ_BUF(4); ++ READ32(lcp->args.lc_newoffset); ++ if (lcp->args.lc_newoffset) { ++ READ_BUF(8); ++ READ64(lcp->args.lc_last_wr); ++ } else ++ lcp->args.lc_last_wr = 0; ++ READ_BUF(4); ++ READ32(timechange); ++ if (timechange) { ++ READ_BUF(12); ++ READ64(lcp->args.lc_mtime.seconds); ++ 
READ32(lcp->args.lc_mtime.nseconds); ++ } else { ++ lcp->args.lc_mtime.seconds = 0; ++ lcp->args.lc_mtime.nseconds = 0; ++ } ++ READ_BUF(8); ++ READ32(lcp->args.lc_seg.layout_type); ++ /* XXX: saving XDR'ed layout update. Since we don't have the ++ * current_fh yet, and therefore no export_ops, we can't call ++ * the layout specific decode routines. File and pVFS2 ++ * do not use the layout update.... ++ */ ++ READ32(lcp->args.lc_up_len); ++ if (lcp->args.lc_up_len > 0) { ++ READ_BUF(lcp->args.lc_up_len); ++ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len); ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16); ++ READ32(lrp->args.lr_reclaim); ++ READ32(lrp->args.lr_seg.layout_type); ++ READ32(lrp->args.lr_seg.iomode); ++ READ32(lrp->args.lr_return_type); ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ READ_BUF(16); ++ READ64(lrp->args.lr_seg.offset); ++ READ64(lrp->args.lr_seg.length); ++ nfsd4_decode_stateid(argp, &lrp->lr_sid); ++ READ_BUF(4); ++ READ32(lrp->args.lrf_body_len); ++ if (lrp->args.lrf_body_len > 0) { ++ READ_BUF(lrp->args.lrf_body_len); ++ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len); ++ } ++ } ++ ++ DECODE_TAIL; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) + { +@@ -1335,11 +1472,19 @@ static nfsd4_dec nfsd41_dec_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, ++#else /* 
CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, +@@ -2136,6 +2281,36 @@ out_acl: + } + WRITE64(stat.ino); + } ++#if defined(CONFIG_PNFSD) ++ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { ++ struct super_block *sb = dentry->d_inode->i_sb; ++ int type = 0; ++ ++ /* Query the filesystem for supported pNFS layout types. ++ * Currently, we only support one layout type per file system. ++ * The export_ops->layout_type() returns the pnfs_layouttype4. ++ */ ++ buflen -= 4; ++ if (buflen < 0) /* length */ ++ goto out_resource; ++ ++ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type) ++ type = sb->s_pnfs_op->layout_type(sb); ++ if (type) { ++ if ((buflen -= 4) < 0) /* type */ ++ goto out_resource; ++ WRITE32(1); /* length */ ++ WRITE32(type); /* type */ ++ } else ++ WRITE32(0); /* length */ ++ } ++ ++ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ if ((buflen -= 4) < 0) ++ goto out_resource; ++ WRITE32(stat.blksize); ++ } ++#endif /* CONFIG_PNFSD */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); +@@ -2366,6 +2541,10 @@ nfsd4_encode_commit(struct nfsd4_compoun + if (!nfserr) { + RESERVE_SPACE(8); + WRITEMEM(commit->co_verf.data, 8); ++ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n", ++ ((u32 *)(&commit->co_verf.data))[0], ++ ((u32 *)(&commit->co_verf.data))[1]); ++ + ADJUST_ARGS(); + } + return nfserr; +@@ -2620,9 +2799,20 @@ nfsd4_encode_read(struct nfsd4_compoundr + } + read->rd_vlen = v; + ++#if defined(CONFIG_SPNFS) ++ if (spnfs_enabled()) ++ nfserr = 
spnfs_read(read->rd_fhp->fh_dentry->d_inode, ++ read->rd_offset, &maxcount, read->rd_vlen, ++ resp->rqstp); ++ else /* we're not an MDS */ ++ nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, ++ read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, ++ &maxcount); ++#else + nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); ++#endif /* CONFIG_SPNFS */ + + if (nfserr == nfserr_symlink) + nfserr = nfserr_inval; +@@ -2926,6 +3116,9 @@ nfsd4_encode_write(struct nfsd4_compound + WRITE32(write->wr_bytes_written); + WRITE32(write->wr_how_written); + WRITEMEM(write->wr_verifier.data, 8); ++ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n", ++ ((u32 *)(&write->wr_verifier.data))[0], ++ ((u32 *)(&write->wr_verifier.data))[1]); + ADJUST_ARGS(); + } + return nfserr; +@@ -3069,6 +3262,343 @@ nfsd4_encode_sequence(struct nfsd4_compo + return 0; + } + ++#if defined(CONFIG_PNFSD) ++ ++/* Uses the export interface to iterate through the available devices ++ * and encodes them on the response stream. 
++ */ ++static __be32 ++nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp, ++ struct nfsd4_pnfs_getdevlist *gdevl, ++ unsigned int *dev_count) ++{ ++ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb; ++ __be32 nfserr; ++ int status; ++ __be32 *p; ++ struct nfsd4_pnfs_dev_iter_res res = { ++ .gd_cookie = gdevl->gd_cookie, ++ .gd_verf = gdevl->gd_verf, ++ .gd_eof = 0 ++ }; ++ u64 sbid; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ sbid = find_create_sbid(sb); ++ *dev_count = 0; ++ do { ++ status = sb->s_pnfs_op->get_device_iter(sb, ++ gdevl->gd_layout_type, ++ &res); ++ if (status) { ++ if (status == -ENOENT) { ++ res.gd_eof = 1; ++ /* return success */ ++ break; ++ } ++ nfserr = nfserrno(status); ++ goto out_err; ++ } ++ ++ /* Encode device id and layout type */ ++ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid)); ++ WRITE64((__be64)sbid); ++ WRITE64(res.gd_devid); /* devid minor */ ++ ADJUST_ARGS(); ++ (*dev_count)++; ++ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof); ++ gdevl->gd_cookie = res.gd_cookie; ++ gdevl->gd_verf = res.gd_verf; ++ gdevl->gd_eof = res.gd_eof; ++ nfserr = nfs_ok; ++out_err: ++ dprintk("%s: Encoded %u devices\n", __func__, *dev_count); ++ return nfserr; ++} ++ ++/* Encodes the response of get device list. 
++*/ ++static __be32 ++nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ unsigned int dev_count = 0, lead_count; ++ u32 *p_in = resp->p; ++ __be32 *p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ /* Ensure we have room for cookie, verifier, and devlist len, ++ * which we will backfill in after we encode as many devices as possible ++ */ ++ lead_count = 8 + sizeof(nfs4_verifier) + 4; ++ RESERVE_SPACE(lead_count); ++ /* skip past these values */ ++ p += XDR_QUADLEN(lead_count); ++ ADJUST_ARGS(); ++ ++ /* Iterate over as many device ids as possible on the xdr stream */ ++ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count); ++ if (nfserr) ++ goto out_err; ++ ++ /* Backfill in cookie, verf and number of devices encoded */ ++ p = p_in; ++ WRITE64(gdevl->gd_cookie); ++ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ WRITE32(dev_count); ++ ++ /* Skip over devices */ ++ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid)); ++ ADJUST_ARGS(); ++ ++ /* are we at the end of devices? */ ++ RESERVE_SPACE(4); ++ WRITE32(gdevl->gd_eof); ++ ADJUST_ARGS(); ++ ++ dprintk("%s: done.\n", __func__); ++ ++ nfserr = nfs_ok; ++out: ++ return nfserr; ++out_err: ++ p = p_in; ++ ADJUST_ARGS(); ++ goto out; ++} ++ ++/* For a given device id, have the file system retrieve and encode the ++ * associated device. For file layout, the encoding function is ++ * passed down to the file system. The file system then has the option ++ * of using this encoding function or one of its own. ++ * ++ * Note: the file system must return the XDR size of struct device_addr4 ++ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the ++ * gdir_mincount calculation. 
++ */ ++static __be32 ++nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ struct super_block *sb; ++ int maxcount = 0, type_notify_len = 12; ++ __be32 *p, *p_save = NULL, *p_in = resp->p; ++ struct exp_xdr_stream xdr; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = gdev->gd_sb; ++ ++ if (gdev->gd_maxcount != 0) { ++ /* FIXME: this will be bound by the session max response */ ++ maxcount = svc_max_payload(resp->rqstp); ++ if (maxcount > gdev->gd_maxcount) ++ maxcount = gdev->gd_maxcount; ++ ++ /* Ensure have room for type and notify field */ ++ maxcount -= type_notify_len; ++ if (maxcount < 0) { ++ nfserr = -ETOOSMALL; ++ goto toosmall; ++ } ++ } ++ ++ RESERVE_SPACE(4); ++ WRITE32(gdev->gd_layout_type); ++ ADJUST_ARGS(); ++ ++ /* If maxcount is 0 then just update notifications */ ++ if (gdev->gd_maxcount == 0) ++ goto handle_notifications; ++ ++ xdr.p = p_save = resp->p; ++ xdr.end = resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type, ++ &gdev->gd_devid); ++ if (nfserr) ++ goto err; ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ BUG_ON(xdr.p > xdr.end); ++ ++ /* Update the xdr stream with the number of bytes encoded ++ * by the file system. 
++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++handle_notifications: ++ /* Encode supported device notifications */ ++ RESERVE_SPACE(4); ++ if (sb->s_pnfs_op->set_device_notify) { ++ struct pnfs_devnotify_arg dn_args; ++ ++ dn_args.dn_layout_type = gdev->gd_layout_type; ++ dn_args.dn_devid = gdev->gd_devid; ++ dn_args.dn_notify_types = gdev->gd_notify_types; ++ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args); ++ if (nfserr) ++ goto err; ++ WRITE32(dn_args.dn_notify_types); ++ } else { ++ WRITE32(0); ++ } ++ ADJUST_ARGS(); ++ ++out: ++ return nfserrno(nfserr); ++toosmall: ++ dprintk("%s: maxcount too small\n", __func__); ++ RESERVE_SPACE(4); ++ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len); ++ ADJUST_ARGS(); ++ goto out; ++err: ++ /* Rewind to the beginning */ ++ p = p_in; ++ ADJUST_ARGS(); ++ if (nfserr == -ETOOSMALL) ++ goto toosmall; ++ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr); ++ goto out; ++} ++ ++static __be32 ++nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, ++ __be32 nfserr, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int maxcount, leadcount; ++ struct super_block *sb; ++ struct exp_xdr_stream xdr; ++ __be32 *p, *p_save, *p_start = resp->p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb; ++ maxcount = PAGE_SIZE; ++ if (maxcount > lgp->lg_maxcount) ++ maxcount = lgp->lg_maxcount; ++ ++ /* Check for space on xdr stream */ ++ leadcount = 36 + sizeof(stateid_opaque_t); ++ RESERVE_SPACE(leadcount); ++ /* encode layout metadata after file system encodes layout */ ++ p += XDR_QUADLEN(leadcount); ++ ADJUST_ARGS(); ++ ++ /* Ensure have room for ret_on_close, off, len, iomode, type */ ++ maxcount -= leadcount; ++ if (maxcount < 0) { ++ printk(KERN_ERR "%s: buffer too small\n", __func__); ++ nfserr = nfserr_toosmall; ++ goto err; ++ } ++ ++ /* Set xdr info so file system can encode layout */ ++ xdr.p = p_save = resp->p; ++ xdr.end = 
resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ /* Retrieve, encode, and merge layout; process stateid */ ++ nfserr = nfs4_pnfs_get_layout(lgp, &xdr); ++ if (nfserr) ++ goto err; ++ ++ /* Ensure file system returned enough bytes for the client ++ * to access. ++ */ ++ if (lgp->lg_seg.length < lgp->lg_minlength) { ++ nfserr = nfserr_badlayout; ++ goto err; ++ } ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ ++ /* Rewind to beginning and encode attrs */ ++ resp->p = p_start; ++ RESERVE_SPACE(4); ++ WRITE32(lgp->lg_roc); /* return on close */ ++ ADJUST_ARGS(); ++ nfsd4_encode_stateid(resp, &lgp->lg_sid); ++ RESERVE_SPACE(28); ++ /* Note: response logr_layout array count, always one for now */ ++ WRITE32(1); ++ WRITE64(lgp->lg_seg.offset); ++ WRITE64(lgp->lg_seg.length); ++ WRITE32(lgp->lg_seg.iomode); ++ WRITE32(lgp->lg_seg.layout_type); ++ ++ /* Update the xdr stream with the number of bytes written ++ * by the file system ++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++ return nfs_ok; ++err: ++ resp->p = p_start; ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lcp->res.lc_size_chg); ++ ADJUST_ARGS(); ++ if (lcp->res.lc_size_chg) { ++ RESERVE_SPACE(8); ++ WRITE64(lcp->res.lc_newsize); ++ ADJUST_ARGS(); ++ } ++out: ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lrp->lrs_present != 0); /* got stateid? 
*/ ++ ADJUST_ARGS(); ++ if (lrp->lrs_present) ++ nfsd4_encode_stateid(resp, &lrp->lr_sid); ++out: ++ return nfserr; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) + { +@@ -3129,11 +3659,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, +diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-23 12:08:29.094491943 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 12:09:03.317501495 -0400 +@@ -13,10 +13,15 @@ + #include + #include + #include ++#include + + #include "nfsd.h" + #include "cache.h" + ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++#include ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + /* + * We have a single directory with 9 nodes in it. 
+ */ +@@ -49,6 +54,9 @@ enum { + NFSD_Gracetime, + NFSD_RecoveryDir, + #endif ++#ifdef CONFIG_PNFSD ++ NFSD_pnfs_dlm_device, ++#endif + }; + + /* +@@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct fi + static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif ++#ifdef CONFIG_PNFSD ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size); ++#endif + + static ssize_t (*write_op[])(struct file *, char *, size_t) = { + [NFSD_Svc] = write_svc, +@@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device, ++#endif + }; + + static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +@@ -1349,6 +1363,68 @@ static ssize_t write_recoverydir(struct + + #endif + ++#ifdef CONFIG_PNFSD ++ ++static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf, ++ size_t size) ++{ ++ char *mesg = buf; ++ char *pnfs_dlm_device; ++ int max_size = NFSD_PNFS_DLM_DEVICE_MAX; ++ int len, ret = 0; ++ ++ if (size > 0) { ++ ret = -EINVAL; ++ if (size > max_size || buf[size-1] != '\n') ++ return ret; ++ buf[size-1] = 0; ++ ++ pnfs_dlm_device = mesg; ++ len = qword_get(&mesg, pnfs_dlm_device, size); ++ if (len <= 0) ++ return ret; ++ ++ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len); ++ } else ++ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT); ++ ++ return ret <= 0 ? 
ret : strlen(buf); ++} ++ ++/** ++ * write_pnfs_dlm_device - Set or report the current pNFS data server list ++ * ++ * Input: ++ * buf: ignored ++ * size: zero ++ * ++ * OR ++ * ++ * Input: ++ * buf: C string containing a block device name, ++ * a colon, and then a comma separated ++ * list of pNFS data server IPv4 addresses ++ * size: non-zero length of C string in @buf ++ * Output: ++ * On success: passed-in buffer filled with '\n'-terminated C ++ * string containing a block device name, a colon, and ++ * then a comma separated list of pNFS ++ * data server IPv4 addresses. ++ * return code is the size in bytes of the string ++ * On error: return code is a negative errno value ++ */ ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __write_pnfs_dlm_device(file, buf, size); ++ mutex_unlock(&nfsd_mutex); ++ return rv; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ + /*----------------------------------------------------------------------------*/ + /* + * populating the filesystem. 
+@@ -1383,6 +1459,10 @@ static int nfsd_fill_super(struct super_ + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops, ++ S_IWUSR|S_IRUSR}, ++#endif + /* last one */ {""} + }; + return simple_fill_super(sb, 0x6e667364, nfsd_files); +@@ -1421,6 +1501,9 @@ static int create_proc_exports_entry(voi + } + #endif + ++#if defined(CONFIG_SPNFS_BLOCK) ++int nfsd_bl_init(void); ++#endif + static int __init init_nfsd(void) + { + int retval; +@@ -1443,6 +1526,15 @@ static int __init init_nfsd(void) + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ retval = spnfs_init_proc(); ++ if (retval != 0) ++ goto out_free_idmap; ++#if defined(CONFIG_SPNFS_BLOCK) ++ nfsd_bl_init(); ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_all; +@@ -1465,7 +1557,22 @@ out_free_stat: + + static void __exit exit_nfsd(void) + { ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ remove_proc_entry("fs/nfs/spnfs/recall", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/getfh", NULL); ++ remove_proc_entry("fs/nfs/spnfs/config", NULL); ++ remove_proc_entry("fs/nfs/spnfs/ctl", NULL); ++ remove_proc_entry("fs/nfs/spnfs", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ + nfsd_export_shutdown(); ++ nfsd4_pnfs_dlm_shutdown(); + nfsd_reply_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); + 
remove_proc_entry("fs/nfs", NULL); +diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-23 12:08:29.095491390 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 12:09:03.318355741 -0400 +@@ -285,11 +285,17 @@ extern time_t nfsd4_grace; + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + ++#if defined(CONFIG_PNFSD) ++#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ ++ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES) ++#else /* CONFIG_PNFSD */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 ++#endif /* CONFIG_PNFSD */ + + #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ +- (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) ++ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \ ++ FATTR4_WORD2_LAYOUT_BLKSIZE) + + static inline u32 nfsd_suppattrs0(u32 minorversion) + { +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-23 12:09:03.319511586 -0400 +@@ -10,6 +10,7 @@ + #include + + #include ++#include + #include "nfsd.h" + #include "vfs.h" + #include "auth.h" +@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s + static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) + { + struct knfsd_fh *fh = &fhp->fh_handle; ++ int fsid_type; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; +@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct + return error; + if (fh->fh_auth_type != 0) + return error; +- len = key_len(fh->fh_fsid_type) / 4; ++ fsid_type = pnfs_fh_fsid_type(fh); ++ len = key_len(fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { +@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct + data_left -= len; + if (data_left < 0) + 
return error; +- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); ++ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth); + fid = (struct fid *)(fh->fh_auth + len); + } else { + __u32 tfh[2]; +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-23 12:09:03.319511586 -0400 +@@ -14,6 +14,7 @@ enum nfsd_fsid { + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, ++ FSID_MAX + }; + + enum fsid_source { +@@ -205,4 +206,42 @@ fh_unlock(struct svc_fh *fhp) + } + } + ++#if defined(CONFIG_PNFSD) ++ ++/* ++ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied ++ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how ++ * to handle a given stateid. ++ */ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return fh->fh_fsid_type >= FSID_MAX; ++} ++ ++static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh) ++{ ++ BUG_ON(fh->fh_version != 1); ++ BUG_ON(pnfs_fh_is_ds(fh)); ++ fh->fh_fsid_type += FSID_MAX; ++} ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ ++/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). 
*/ ++static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh) ++{ ++ int fsid_type = fh->fh_fsid_type; ++ ++ if (pnfs_fh_is_ds(fh)) ++ return fsid_type - FSID_MAX; ++ return fsid_type; ++} ++ + #endif /* _LINUX_NFSD_FH_INT_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-23 12:08:27.631563969 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-23 12:09:03.320416974 -0400 +@@ -115,7 +115,7 @@ struct svc_program nfsd_program = { + + }; + +-u32 nfsd_supported_minorversion; ++u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION; + + int nfsd_vers(int vers, enum vers_op change) + { +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-23 12:09:03.321376171 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-23 12:09:03.321376171 -0400 +@@ -0,0 +1,143 @@ ++/* ++ * Copyright (c) 2005 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#ifndef LINUX_NFSD_PNFSD_H ++#define LINUX_NFSD_PNFSD_H ++ ++#include ++#include ++ ++#include "state.h" ++#include "xdr4.h" ++ ++/* outstanding layout stateid */ ++struct nfs4_layout_state { ++ struct list_head ls_perfile; ++ struct list_head ls_layouts; /* list of nfs4_layouts */ ++ struct kref ls_ref; ++ struct nfs4_client *ls_client; ++ struct nfs4_file *ls_file; ++ stateid_t ls_stateid; ++}; ++ ++/* outstanding layout */ ++struct nfs4_layout { ++ struct list_head lo_perfile; /* hash by f_id */ ++ struct list_head lo_perclnt; /* hash by clientid */ ++ struct list_head lo_perstate; ++ struct nfs4_file *lo_file; /* backpointer */ ++ struct nfs4_client *lo_client; ++ struct nfs4_layout_state *lo_state; ++ struct nfsd4_layout_seg lo_seg; ++}; ++ ++struct pnfs_inval_state { ++ struct knfsd_fh mdsfh; /* needed only by invalidate all */ ++ stateid_t stid; ++ clientid_t clid; ++ u32 status; ++}; ++ ++/* pNFS Data Server state */ ++#define DS_STATEID_VALID 0 ++#define DS_STATEID_ERROR 1 ++#define DS_STATEID_NEW 2 ++ ++struct pnfs_ds_stateid { ++ struct list_head ds_hash; /* ds_stateid hash entry */ ++ struct list_head ds_perclid; /* per client hash entry */ ++ stateid_t ds_stid; ++ struct knfsd_fh 
ds_fh; ++ unsigned long ds_access; ++ u32 ds_status; /* from MDS */ ++ u32 ds_verifier[2]; /* from MDS */ ++ wait_queue_head_t ds_waitq; ++ unsigned long ds_flags; ++ struct kref ds_ref; ++ clientid_t ds_mdsclid; ++}; ++ ++struct pnfs_ds_clientid { ++ struct list_head dc_hash; /* mds_clid_hashtbl entry */ ++ struct list_head dc_stateid; /* ds_stateid head */ ++ struct list_head dc_permdsid; /* per mdsid hash entry */ ++ clientid_t dc_mdsclid; ++ struct kref dc_ref; ++ uint32_t dc_mdsid; ++}; ++ ++struct pnfs_mds_id { ++ struct list_head di_hash; /* mds_nodeid list entry */ ++ struct list_head di_mdsclid; /* mds_clientid head */ ++ uint32_t di_mdsid; ++ time_t di_mdsboot; /* mds boot time */ ++ struct kref di_ref; ++}; ++ ++/* notify device request (from exported filesystem) */ ++struct nfs4_notify_device { ++ struct nfsd4_pnfs_cb_dev_list *nd_list; ++ struct nfs4_client *nd_client; ++ struct list_head nd_perclnt; ++ ++ void *nd_args; /* nfsd internal */ ++}; ++ ++u64 find_create_sbid(struct super_block *); ++struct super_block *find_sbid_id(u64); ++__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *); ++int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *, ++ struct nfsd4_pnfs_layoutreturn *); ++int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *); ++int nfs4_pnfs_cb_change_state(struct pnfs_get_state *); ++void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++int put_layoutrecall(struct nfs4_layoutrecall *); ++void nomatching_layout(struct nfs4_layoutrecall *); ++void *layoutrecall_done(struct nfs4_layoutrecall *); ++int nfsd4_cb_layout(struct nfs4_layoutrecall *); ++int nfsd_layout_recall_cb(struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++int nfsd_device_notify_cb(struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++int nfsd4_cb_notify_device(struct nfs4_notify_device *); ++void pnfs_set_device_notify(clientid_t *, unsigned int types); ++void 
pnfs_clear_device_notify(struct nfs4_client *); ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++extern struct sockaddr pnfsd_lexp_addr; ++extern size_t pnfs_lexp_addr_len; ++ ++extern void pnfsd_lexp_init(struct inode *); ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#endif /* LINUX_NFSD_PNFSD_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-23 12:09:03.321376171 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-23 12:09:03.322501672 -0400 +@@ -0,0 +1,225 @@ ++/* ++ * linux/fs/nfsd/pnfs_lexp.c ++ * ++ * pNFS export of local filesystems. ++ * ++ * Export local file systems over the files layout type. ++ * The MDS (metadata server) functions also as a single DS (data server). ++ * This is mostly useful for development and debugging purposes. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * Copyright (C) 2008 Benny Halevy, ++ * ++ * Initial implementation was based on the pnfs-gfs2 patches done ++ * by David M. 
Richter ++ */ ++ ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++struct sockaddr pnfsd_lexp_addr; ++size_t pnfs_lexp_addr_len; ++ ++static int ++pnfsd_lexp_layout_type(struct super_block *sb) ++{ ++ int ret = LAYOUT_NFSV4_1_FILES; ++ dprintk("<-- %s: return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++pnfsd_lexp_get_device_iter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ ++ dprintk("<-- %s: return 0\n", __func__); ++ return 0; ++} ++ ++static int ++pnfsd_lexp_get_device_info(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_multipath fl_devices[1]; ++ u32 fl_stripe_indices[1] = { 0 }; ++ struct pnfs_filelayout_devaddr daddr; ++ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */ ++ char daddr_buf[8*4 + 2*3 + 10]; ++ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ memset(&fdev, '\0', sizeof(fdev)); ++ ++ if (devid->devid != 1) { ++ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 " ++ "(got: 0x%llx)\n", __func__, devid->devid); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ /* count the number of comma-delimited DS IPs */ ++ fdev.fl_device_length = 1; ++ fdev.fl_device_list = fl_devices; ++ ++ fdev.fl_stripeindices_length = fdev.fl_device_length; ++ fdev.fl_stripeindices_list = fl_stripe_indices; ++ ++ daddr.r_addr.data = daddr_buf; ++ daddr.r_addr.len = sizeof(daddr_buf); ++ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr); ++ if (err < 0) ++ goto out; ++ daddr.r_addr.len = err; ++ switch (pnfsd_lexp_addr.sa_family) 
{ ++ case AF_INET: ++ daddr.r_netid.data = "tcp"; ++ daddr.r_netid.len = 3; ++ break; ++ case AF_INET6: ++ daddr.r_netid.data = "tcp6"; ++ daddr.r_netid.len = 4; ++ break; ++ default: ++ BUG(); ++ } ++ fdev.fl_device_list[0].fl_multipath_length = 1; ++ fdev.fl_device_list[0].fl_multipath_list = &daddr; ++ ++ /* have nfsd encode the device info */ ++ err = filelayout_encode_devinfo(xdr, &fdev); ++out: ++ dprintk("<-- %s: return %d\n", __func__, err); ++ return err; ++} ++ ++static int get_stripe_unit(int blocksize) ++{ ++ if (blocksize < NFSSVC_MAXBLKSIZE) ++ blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize); ++ dprintk("%s: return %d\n", __func__, blocksize); ++ return blocksize; ++} ++ ++static enum nfsstat4 ++pnfsd_lexp_layout_get(struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ enum nfsstat4 rc = NFS4_OK; ++ struct pnfs_filelayout_layout *layout = NULL; ++ struct knfsd_fh *fhp = NULL; ++ ++ dprintk("--> %s: inode=%p\n", __func__, inode); ++ ++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ ++ layout = kzalloc(sizeof(*layout), GFP_KERNEL); ++ if (layout == NULL) { ++ rc = -ENOMEM; ++ goto error; ++ } ++ ++ /* Set file layout response args */ ++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; ++ layout->lg_stripe_type = STRIPE_SPARSE; ++ layout->lg_commit_through_mds = true; ++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); ++ layout->lg_fh_length = 1; ++ layout->device_id.sbid = arg->lg_sbid; ++ layout->device_id.devid = 1; /*FSFTEMP*/ ++ layout->lg_first_stripe_index = 0; /*FSFTEMP*/ ++ layout->lg_pattern_offset = 0; ++ ++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); ++ if (fhp == NULL) { ++ rc = -ENOMEM; ++ goto error; ++ } ++ ++ memcpy(fhp, arg->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc 
= filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ dprintk("<-- %s: return %d\n", __func__, rc); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++pnfsd_lexp_layout_commit(struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int ++pnfsd_lexp_layout_return(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh, ++ struct pnfs_get_state *p) ++{ ++ return 0; /* just use the current stateid */ ++} ++ ++static struct pnfs_export_operations pnfsd_lexp_ops = { ++ .layout_type = pnfsd_lexp_layout_type, ++ .get_device_info = pnfsd_lexp_get_device_info, ++ .get_device_iter = pnfsd_lexp_get_device_iter, ++ .layout_get = pnfsd_lexp_layout_get, ++ .layout_commit = pnfsd_lexp_layout_commit, ++ .layout_return = pnfsd_lexp_layout_return, ++ .get_state = pnfsd_lexp_get_state, ++}; ++ ++void ++pnfsd_lexp_init(struct inode *inode) ++{ ++ dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops); ++ inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c +--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-23 12:09:03.322501672 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-23 12:09:03.323511608 -0400 +@@ -0,0 +1,535 @@ ++/* ++ * fs/nfsd/spnfs_com.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * Based heavily on idmap.c ++ * ++ */ ++ ++/* ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Marius Aamodt Eriksen ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++
++#define NFSDDBG_FACILITY	NFSDDBG_PROC
++
++static ssize_t   spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *,
++		char __user *, size_t);
++static ssize_t   spnfs_pipe_downcall(struct file *, const char __user *,
++		size_t);
++static void      spnfs_pipe_destroy_msg(struct rpc_pipe_msg *);
++
++static struct rpc_pipe_ops spnfs_upcall_ops = {
++	.upcall		= spnfs_pipe_upcall,
++	.downcall	= spnfs_pipe_downcall,
++	.destroy_msg	= spnfs_pipe_destroy_msg,
++};
++
++/* evil global variable */
++struct spnfs *global_spnfs;
++struct spnfs_config *spnfs_config;
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++int spnfs_use_layoutsegments;
++uint64_t layoutsegment_size;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++
++/*
++ * Used by spnfs_enabled()
++ * Tracks if the subsystem has been initialized at some point.  It doesn't
++ * matter if it's not currently initialized.
++ */
++static int spnfs_enabled_at_some_point;
++
++/* call this to start the ball rolling: creates the rpc_pipefs "spnfs" pipe */
++/* code it like we're going to avoid the global variable in the future */
++int
++nfsd_spnfs_new(void)
++{
++	struct spnfs *spnfs = NULL;
++	struct path path;
++	struct nameidata nd;
++	int rc;
++
++	if (global_spnfs != NULL)
++		return -EEXIST;
++
++	path.mnt = rpc_get_mount();
++	if (IS_ERR(path.mnt))
++		return PTR_ERR(path.mnt);
++
++	spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL);
++	if (spnfs == NULL) {
++		rc = -ENOMEM;
++		goto err;
++	}
++	/* locks must be live before the pipe exists: a downcall may race in */
++	mutex_init(&spnfs->spnfs_lock);
++	mutex_init(&spnfs->spnfs_plock);
++	init_waitqueue_head(&spnfs->spnfs_wq);
++
++	/* FIXME: do not abuse rpc_pipefs/nfs */
++	rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
++	if (rc)
++		goto err;
++	spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs,
++					 &spnfs_upcall_ops, 0);
++	path_put(&nd.path);	/* lookup reference is no longer needed */
++	if (IS_ERR(spnfs->spnfs_dentry)) {
++		rc = -EPIPE;
++		goto err;
++	}
++
++	global_spnfs = spnfs;
++	spnfs_enabled_at_some_point = 1;
++
++	return 0;
++err:
++	rpc_put_mount();
++	kfree(spnfs);
++	return rc;
++}
++
++/* again, code it like we're going to remove the global variable */
++void
++nfsd_spnfs_delete(void)
++{
++	struct spnfs *spnfs = global_spnfs;
++
++	if (!spnfs)
++		return;
++	rpc_unlink(spnfs->spnfs_dentry);
++	rpc_put_mount();
++	global_spnfs = NULL;
++	kfree(spnfs);
++}
++
++/* RPC pipefs upcall/downcall routines */
++/* looks like this code is invoked by the rpc_pipe code */
++/* to handle upcalls on things we've queued elsewhere */
++/* See nfs_idmap_id for an example of enqueueing */
++static ssize_t
++spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
++		  char __user *dst, size_t buflen)
++{
++	char *data = (char *)msg->data + msg->copied;
++	ssize_t mlen = msg->len - msg->copied;
++	ssize_t left;
++
++	if (mlen > buflen)
++		mlen = buflen;
++
++	left = copy_to_user(dst, data, mlen);
++	if (mlen > 0 && left == mlen) {	/* nothing copied: report the fault */
++		msg->errno = -EFAULT;
++		return -EFAULT;
++	}
++	mlen -= left;
++	msg->copied += mlen;
++	msg->errno = 0;
++	return mlen;
++}
++
++/* Userspace reply to the pending upcall: status/result land in spnfs_im. */
++/* Returns mlen when consumed, -EINVAL on a type mismatch, else a -errno. */
++static ssize_t
++spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
++{
++	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
++	struct spnfs *spnfs = (struct spnfs *)rpci->private;
++	struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im;
++	int ret;
++
++	if (mlen != sizeof(struct spnfs_msg))
++		return -ENOSPC;
++
++	/* memdup_user() frees its buffer itself if the copy faults: no leak */
++	im_in = memdup_user(src, mlen);
++	if (IS_ERR(im_in))
++		return PTR_ERR(im_in);
++
++	mutex_lock(&spnfs->spnfs_plock);
++
++	ret = mlen;
++	im->im_status = im_in->im_status;
++	/* If we got an error, terminate now, and wake up pending upcalls */
++	if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) {
++		wake_up(&spnfs->spnfs_wq);
++		goto out;
++	}
++
++	ret = -EINVAL;
++	/* Did we match the current upcall? */
++	/* DMXXX: do not understand the comment above, from original code */
++	/* DMXXX: when do we _not_ match the current upcall? */
++	/* DMXXX: anyway, let's to a simplistic check */
++	if (im_in->im_type == im->im_type) {
++		/* copy the response into the spnfs struct */
++		memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res));
++		ret = mlen;
++	} else
++		dprintk("spnfs: downcall type != upcall type\n");
++
++
++	wake_up(&spnfs->spnfs_wq);
++/* DMXXX handle rval processing */
++out:
++	mutex_unlock(&spnfs->spnfs_plock);
++	kfree(im_in);
++	return ret;
++}
++
++static void
++spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg)
++{
++	struct spnfs_msg *im = msg->data;
++	struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im);
++
++	if (msg->errno >= 0)
++		return;
++	mutex_lock(&spnfs->spnfs_plock);
++	im->im_status = SPNFS_STATUS_FAIL;  /* DMXXX */
++	wake_up(&spnfs->spnfs_wq);
++	mutex_unlock(&spnfs->spnfs_plock);
++}
++
++/* generic upcall.  called by functions in spnfs_ops.c */
++int
++spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg,
++		union spnfs_msg_res *res)
++{
++	struct rpc_pipe_msg msg;
++	struct spnfs_msg *im;
++	DECLARE_WAITQUEUE(wq, current);
++	int ret = -EIO;
++	int rval;
++
++	im = &spnfs->spnfs_im;
++
++	mutex_lock(&spnfs->spnfs_lock);
++	mutex_lock(&spnfs->spnfs_plock);
++
++	memset(im, 0, sizeof(*im));
++	memcpy(im, upmsg, sizeof(*upmsg));
++
++	memset(&msg, 0, sizeof(msg));
++	msg.data = im;
++	msg.len = sizeof(*im);
++
++	add_wait_queue(&spnfs->spnfs_wq, &wq);
++	rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg);
++	if (rval < 0) {
++		remove_wait_queue(&spnfs->spnfs_wq, &wq);
++		goto out;
++	}
++
++	set_current_state(TASK_UNINTERRUPTIBLE);
++	mutex_unlock(&spnfs->spnfs_plock);
++	schedule();
++	current->state = TASK_RUNNING;
++	remove_wait_queue(&spnfs->spnfs_wq, &wq);
++	mutex_lock(&spnfs->spnfs_plock);
++
++	if (im->im_status & SPNFS_STATUS_SUCCESS) {
++		/* copy our result from the upcall */
++		memcpy(res,
&im->im_res, sizeof(*res)); ++ ret = 0; ++ } ++ ++out: ++ memset(im, 0, sizeof(*im)); ++ mutex_unlock(&spnfs->spnfs_plock); ++ mutex_unlock(&spnfs->spnfs_lock); ++ return(ret); ++} ++ ++/* ++ * This is used to determine if the spnfsd daemon has been started at ++ * least once since the system came up. This is used to by the export ++ * mechanism to decide if spnfs is in use. ++ * ++ * Returns non-zero if the spnfsd has initialized the communication pipe ++ * at least once. ++ */ ++int spnfs_enabled(void) ++{ ++ return spnfs_enabled_at_some_point; ++} ++ ++#ifdef CONFIG_PROC_FS ++ ++/* ++ * procfs virtual files for user/kernel space communication: ++ * ++ * ctl - currently just an on/off switch...can be expanded ++ * getfh - fd to fh conversion ++ * recall - recall a layout from the command line, for example: ++ * echo > /proc/fs/spnfs/recall ++ * config - configuration info, e.g., stripe size, num ds, etc. ++ */ ++ ++/*-------------- start ctl -------------------------*/ ++static ssize_t ctl_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int cmd, rc; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (cmd) { ++ rc = nfsd_spnfs_new(); ++ if (rc != 0) ++ return rc; ++ } else ++ nfsd_spnfs_delete(); ++ ++ return count; ++} ++ ++static const struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++/*-------------- end ctl ---------------------------*/ ++ ++/*-------------- start config -------------------------*/ ++static ssize_t config_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ static struct spnfs_config cfg; ++ ++ if (copy_from_user(&cfg, buf, count)) ++ return -EFAULT; ++ ++ spnfs_config = &cfg; ++ return 0; ++} ++ ++static const struct file_operations config_ops = { ++ .write = config_write, ++}; ++/*-------------- end config ---------------------------*/ ++ ++/*-------------- start getfh -----------------------*/ ++static int 
getfh_open(struct inode *inode, struct file *file)
++{
++	/*
++	 * Per-open buffer for the converted filehandle.  Zeroed so that a
++	 * read issued before any write cannot expose stale heap contents.
++	 */
++	file->private_data = kzalloc(sizeof(struct nfs_fh), GFP_KERNEL);
++	if (file->private_data == NULL)
++		return -ENOMEM;
++
++	return 0;
++}
++
++/*
++ * Copy the stored struct nfs_fh to userspace.  The caller's buffer must hold
++ * at least sizeof(struct nfs_fh) bytes, otherwise -EINVAL is returned.
++ * Returns count on success (unchanged from the original behaviour).
++ */
++static ssize_t getfh_read(struct file *file, char __user *buf, size_t count,
++			  loff_t *offset)
++{
++	/* a whole nfs_fh is always copied, so a shorter buffer would overflow */
++	if (count < sizeof(struct nfs_fh))
++		return -EINVAL;
++	if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh)))
++		return -EFAULT;
++
++	return count;
++}
++
++/*
++ * The write carries one int, a file descriptor of the calling process; its
++ * filehandle is stored in the per-open buffer via spnfs_getfh().
++ * Returns count, -EINVAL for a short write, -EFAULT, or -EIO.
++ */
++static ssize_t getfh_write(struct file *file, const char __user *buf,
++			   size_t count, loff_t *offset)
++{
++	int fd;
++
++	if (count < sizeof(fd))
++		return -EINVAL;
++	if (copy_from_user(&fd, buf, sizeof(fd)))
++		return -EFAULT;
++	if (spnfs_getfh(fd, file->private_data) != 0)
++		return -EIO;
++
++	return count;
++}
++
++/* free the per-open filehandle buffer allocated by getfh_open() */
++static int getfh_release(struct inode *inode, struct file *file)
++{
++	kfree(file->private_data);
++	return 0;
++}
++
++static const struct file_operations getfh_ops = {
++	.open		= getfh_open,
++	.read		= getfh_read,
++	.write		= getfh_write,
++	.release	= getfh_release,
++};
++/*-------------- end getfh ------------------------*/
++
++
++/*-------------- start recall layout --------------*/
++/*
++ * Parse "<path>[ <offset>[ <length>]]\n" (at most sizeof(input) bytes) and
++ * hand it to spnfs_test_layoutrecall().
++ * Returns count on success, -EINVAL on malformed input, -EFAULT, or the
++ * error returned by spnfs_test_layoutrecall().
++ */
++static ssize_t recall_write(struct file *file, const char __user *buf,
++			    size_t count, loff_t *offset)
++{
++	char input[128];
++	char *path, *str, *p;
++	int rc;
++	u64 off = 0, len = 0;
++
++	if (count > sizeof(input))
++		return -EINVAL;
++
++	if (copy_from_user(input, buf, count))
++		return -EFAULT;
++
++	/* assumes newline-terminated path */
++	p = memchr(input, '\n', count);
++	if (p == NULL)
++		return -EINVAL;
++	*p = '\0';
++
++	/*
++	 * Scan for path and, optionally, an offset and length
++	 * of a layout segment to be recalled; if there are two
++	 * fields, they're assumed to be path and offset.
++	 */
++	p = input;
++	path = strsep(&p, " ");
++	if (path == NULL)
++		return -EINVAL;
++
++	str = strsep(&p, " ");
++	if (str != NULL) {
++		rc = strict_strtoull(str, 10, &off);
++		if (rc != 0)
++			return -EINVAL;
++
++		str = strsep(&p, " ");
++		if (str != NULL) {
++			rc = strict_strtoull(str, 10, &len);
++			if (rc != 0)
++				return -EINVAL;
++		}
++	}
++
++	rc = spnfs_test_layoutrecall(path, off, len);
++	if (rc != 0)
++		return rc;
++
++	return count;
++}
++
++static const struct file_operations recall_ops = {
++	.write		= recall_write,
++};
++/*-------------- end recall layout --------------*/
++
++
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++/*-------------- start layoutseg -------------------------*/
++/*
++ * Switch layout segments off when the first byte written is '0', on for any
++ * other byte.  An empty write changes nothing.
++ */
++static ssize_t layoutseg_write(struct file *file, const char __user *buf,
++			       size_t count, loff_t *offset)
++{
++	char cmd;
++
++	if (count == 0)
++		return 0;
++	if (copy_from_user(&cmd, buf, 1))
++		return -EFAULT;
++	if (cmd == '0')
++		spnfs_use_layoutsegments = 0;
++	else
++		spnfs_use_layoutsegments = 1;
++
++	return count;
++}
++
++static const struct file_operations layoutseg_ops = {
++	.write		= layoutseg_write,
++};
++/*-------------- end layoutseg ---------------------------*/
++
++/*-------------- start layoutsegsize -------------------------*/
++/*
++ * Parse a decimal number from the start of the write (only the first
++ * sizeof(cmd) - 1 bytes are looked at) into layoutsegment_size.
++ */
++static ssize_t layoutsegsize_write(struct file *file, const char __user *buf,
++				   size_t count, loff_t *offset)
++{
++	char cmd[50];
++	size_t len = min_t(size_t, count, sizeof(cmd) - 1);
++
++	/* copy only what was supplied and terminate it before parsing */
++	if (copy_from_user(cmd, buf, len))
++		return -EFAULT;
++	cmd[len] = '\0';
++	layoutsegment_size = simple_strtoull(cmd, NULL, 10);
++
++	return count;
++}
++
++static const struct file_operations layoutsegsize_ops = {
++	.write		= layoutsegsize_write,
++};
++/*-------------- end layoutsegsize ---------------------------*/
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++
++/*
++ * Create /proc/fs/spnfs and its control files.
++ * Returns 0 on success or -ENOMEM; on failure everything created so far is
++ * removed again.
++ */
++int
++spnfs_init_proc(void)
++{
++	static const struct {
++		const char *name;
++		const struct file_operations *fops;
++	} files[] = {
++		{ "fs/spnfs/ctl",		&ctl_ops },
++		{ "fs/spnfs/config",		&config_ops },
++		{ "fs/spnfs/getfh",		&getfh_ops },
++		{ "fs/spnfs/recall",		&recall_ops },
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++		{ "fs/spnfs/layoutseg",		&layoutseg_ops },
++		{ "fs/spnfs/layoutsegsize",	&layoutsegsize_ops },
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++	};
++	int i;
++
++	if (!proc_mkdir("fs/spnfs", NULL))
++		return -ENOMEM;
++
++	for (i = 0; i < ARRAY_SIZE(files); i++) {
++		/* proc_create() installs the fops before the entry is visible */
++		if (!proc_create(files[i].name, 0, NULL, files[i].fops))
++			goto err;
++	}
++
++	return 0;
++
++err:
++	while (--i >= 0)
++		remove_proc_entry(files[i].name, NULL);
++	remove_proc_entry("fs/spnfs", NULL);
++	return -ENOMEM;
++}
++#endif /* CONFIG_PROC_FS */
+diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c
+--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig	2010-08-23 12:09:03.324501390 -0400
++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c	2010-08-23 12:09:03.324501390 -0400
+@@ -0,0 +1,878 @@
++/*
++ * fs/nfsd/spnfs_ops.c
++ *
++ * Communication layer between spNFS kernel and userspace
++ *
++ */
++/******************************************************************************
++
++(c) 2007 Network Appliance, Inc.  All Rights Reserved.
++
++Network Appliance provides this source code under the GPL v2 License.
++The GPL v2 license is available at
++http://opensource.org/licenses/gpl-license.php.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++******************************************************************************/
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "pnfsd.h"
++
++/* comment out CONFIG_SPNFS_TEST for non-test behaviour */
++/* #define CONFIG_SPNFS_TEST 1 */
++
++#define	NFSDDBG_FACILITY		NFSDDBG_PNFS
++
++/*
++ * The functions that are called from elsewhere in the kernel
++ * to perform tasks in userspace
++ *
++ */
++
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++extern int spnfs_use_layoutsegments;
++extern uint64_t layoutsegment_size;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++extern struct spnfs *global_spnfs;
++
++/* spnfs only hands out NFSv4.1 file layouts, whatever the superblock */
++int
++spnfs_layout_type(struct super_block *sb)
++{
++	return LAYOUT_NFSV4_1_FILES;
++}
++
++/*
++ * Ask the daemon for the layout of @inode and encode it into @xdr.
++ *
++ * Returns the result of filelayout_encode_layout() on success, otherwise
++ * an NFS4ERR_* value: LAYOUTTRYLATER on allocation failure,
++ * LAYOUTUNAVAILABLE when the upcall fails or the reply is malformed, or the
++ * translation of the daemon's status.
++ */
++enum nfsstat4
++spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr,
++		const struct nfsd4_pnfs_layoutget_arg *lg_arg,
++		struct nfsd4_pnfs_layoutget_res *lg_res)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	struct pnfs_filelayout_layout *flp = NULL;
++	int status, i;
++	enum nfsstat4 nfserr;
++
++	/* zeroed: the whole message is handed to userspace by the upcall */
++	im = kzalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++
++	im->im_type = SPNFS_TYPE_LAYOUTGET;
++	im->im_args.layoutget_args.inode = inode->i_ino;
++	im->im_args.layoutget_args.generation = inode->i_generation;
++
++	/* call function to queue the msg for upcall */
++	if (spnfs_upcall(spnfs, im, res) != 0) {
++		dprintk("failed spnfs upcall: layoutget\n");
++		nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++		goto layoutget_cleanup;
++	}
++	status = res->layoutget_res.status;
++	if (status != 0) {
++		/* FIXME? until user mode is fixed, translate system error */
++		switch (status) {
++		case -E2BIG:
++		case -ETOOSMALL:
++			nfserr = NFS4ERR_TOOSMALL;
++			break;
++		case -ENOMEM:
++		case -EAGAIN:
++		case -EINTR:
++			nfserr = NFS4ERR_LAYOUTTRYLATER;
++			break;
++		case -ENOENT:
++			nfserr = NFS4ERR_BADLAYOUT;
++			break;
++		default:
++			nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++			break;
++		}
++		dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n",
++			status, nfserr);
++		goto layoutget_cleanup;
++	}
++
++	/*
++	 * stripe_count is supplied by the daemon and is used below to index
++	 * flist[]; refuse a count that would walk off the end of the reply.
++	 * (ARRAY_SIZE assumes flist is an embedded array -- it will not
++	 * compile otherwise.)
++	 */
++	if (res->layoutget_res.stripe_count >
++	    ARRAY_SIZE(res->layoutget_res.flist)) {
++		dprintk("spnfs layout_get upcall: bad stripe count\n");
++		nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++		goto layoutget_cleanup;
++	}
++
++	lg_res->lg_return_on_close = 0;
++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
++	/* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */
++	/* the amount requested by the client. */
++	if (spnfs_use_layoutsegments) {
++		if (layoutsegment_size != 0)
++			lg_res->lg_seg.length = layoutsegment_size;
++	} else
++		lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#else
++	lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++
++	flp = kzalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL);
++	if (flp == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	flp->device_id.sbid = lg_arg->lg_sbid;
++	flp->device_id.devid = res->layoutget_res.devid;
++	flp->lg_layout_type = 1; /* XXX */
++	flp->lg_stripe_type = res->layoutget_res.stripe_type;
++	flp->lg_commit_through_mds = 0;
++	flp->lg_stripe_unit =  res->layoutget_res.stripe_size;
++	flp->lg_first_stripe_index = 0;
++	flp->lg_pattern_offset = 0;
++	flp->lg_fh_length = res->layoutget_res.stripe_count;
++
++	flp->lg_fh_list = kcalloc(flp->lg_fh_length, sizeof(struct knfsd_fh),
++				  GFP_KERNEL);
++	if (flp->lg_fh_list == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	/*
++	 * FIX: Doing an extra copy here.  Should group res.flist's fh_len
++	 * and fh_val into a knfsd_fh structure.
++	 */
++	for (i = 0; i < flp->lg_fh_length; i++) {
++		size_t fh_len = res->layoutget_res.flist[i].fh_len;
++
++		/* the length is daemon-supplied; keep the copy inside fh_base */
++		if (fh_len > sizeof(flp->lg_fh_list[i].fh_base)) {
++			dprintk("spnfs layout_get upcall: bad fh length\n");
++			nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++			goto layoutget_cleanup;
++		}
++		flp->lg_fh_list[i].fh_size = fh_len;
++		memcpy(&flp->lg_fh_list[i].fh_base,
++		       res->layoutget_res.flist[i].fh_val, fh_len);
++	}
++
++	/* encode the layoutget body */
++	nfserr = filelayout_encode_layout(xdr, flp);
++
++layoutget_cleanup:
++	if (flp) {
++		kfree(flp->lg_fh_list);
++		kfree(flp);
++	}
++	kfree(im);
++	kfree(res);
++
++	return nfserr;
++}
++
++/* nothing to do for spnfs; always succeeds */
++int
++spnfs_layoutcommit(void)
++{
++	return 0;
++}
++
++/* nothing to do for spnfs; always succeeds */
++int
++spnfs_layoutreturn(struct inode *inode,
++		   const struct nfsd4_pnfs_layoutreturn_arg *args)
++{
++	return 0;
++}
++
++/*
++ * Issue a layout recall callback for @inode.  Only RETURN_FILE is
++ * implemented; RETURN_FSID and RETURN_ALL log a message and return 0.
++ * Returns 0, or -EINVAL for an unknown recall type.
++ */
++int
++spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
++{
++	struct super_block *sb;
++	struct nfsd4_pnfs_cb_layout lr;
++
++	switch (type) {
++	case RETURN_FILE:
++		sb = inode->i_sb;
++		dprintk("%s: recalling layout for ino = %lu\n",
++			__func__, inode->i_ino);
++		break;
++	case RETURN_FSID:
++		sb = inode->i_sb;
++		dprintk("%s: recalling layout for fsid x (unimplemented)\n",
++			__func__);
++		return 0;
++	case RETURN_ALL:
++		/* XXX figure out how to get a sb since there's no inode ptr */
++		dprintk("%s: recalling all layouts (unimplemented)\n",
++			__func__);
++		return 0;
++	default:
++		return -EINVAL;
++	}
++
++	lr.cbl_recall_type = type;
++	lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	lr.cbl_seg.clientid = 0;
++	lr.cbl_seg.offset = offset;
++	lr.cbl_seg.length = len;
++	lr.cbl_seg.iomode = IOMODE_ANY;
++	lr.cbl_layoutchanged = 0;
++
++	nfsd_layout_recall_cb(sb, inode, &lr);
++
++	return 0;
++}
++
++
++/*
++ * Test hook behind /proc/fs/spnfs/recall: recall the layout of the file at
++ * @path, or all layouts when @path is "all".  A @len of 0 means "to the end
++ * of the file" (NFS4_MAX_UINT64).
++ * Returns -ENOENT if the path lookup fails, otherwise the result of
++ * spnfs_layoutrecall().
++ */
++int
++spnfs_test_layoutrecall(char *path, u64 offset, u64 len)
++{
++	struct nameidata nd;
++	struct inode *inode;
++	int type, rc;
++
++	dprintk("%s: path=%s, offset=%llu, len=%llu\n",
++		__func__, path, offset, len);
++
++	if (strcmp(path, "all") == 0) {
++		inode = NULL;
++		type = RETURN_ALL;
++	} else {
++		rc = path_lookup(path, 0, &nd);
++		if (rc != 0)
++			return -ENOENT;
++
++		/*
++		 * XXX todo: add a RETURN_FSID scenario here...maybe if
++		 * inode is a dir...
++		 */
++
++		inode = nd.path.dentry->d_inode;
++		type = RETURN_FILE;
++	}
++
++	if (len == 0)
++		len = NFS4_MAX_UINT64;
++
++	rc = spnfs_layoutrecall(inode, type, offset, len);
++
++	/* nd is only valid when a lookup was actually done */
++	if (type != RETURN_ALL)
++		path_put(&nd.path);
++	return rc;
++}
++
++/*
++ * Ask the daemon for the next device id after gd_res->gd_cookie and store
++ * it (or the eof flag) in @gd_res.
++ * Returns -ENOMEM, -EIO when the upcall fails, otherwise the status
++ * reported by the daemon.
++ */
++int
++spnfs_getdeviceiter(struct super_block *sb,
++		    u32 layout_type,
++		    struct nfsd4_pnfs_dev_iter_res *gd_res)
++{
++	struct spnfs *spnfs = global_spnfs;   /* XXX keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
++
++	/* zeroed: the whole message is handed to userspace by the upcall */
++	im = kzalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto getdeviceiter_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto getdeviceiter_out;
++	}
++
++	im->im_type = SPNFS_TYPE_GETDEVICEITER;
++	im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie;
++	im->im_args.getdeviceiter_args.verf = gd_res->gd_verf;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto getdeviceiter_out;
++	}
++	status = res->getdeviceiter_res.status;
++
++	if (res->getdeviceiter_res.eof)
++		gd_res->gd_eof = 1;
++	else {
++		gd_res->gd_devid = res->getdeviceiter_res.devid;
++		gd_res->gd_cookie = res->getdeviceiter_res.cookie;
++		gd_res->gd_verf = res->getdeviceiter_res.verf;
++		gd_res->gd_eof = 0;
++	}
++
++getdeviceiter_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++#ifdef CONFIG_SPNFS_TEST
++/*
++ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the
++ * 1024 encoded stripe indices.
++ *
++ * Skip the devaddr4 length and encode the indices count (1024) in the
++ * rq_res.head and set the rq_res.head length.
++ *
++ * Set the rq_res page_len to 4096 (for the 1024 stripe indices).
++ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the
++ * rq_res head to hold the rest of the getdeviceinfo return.
++ *
++ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and
++ * rq_respages[rq_resused] contains the rq_res.pages.
++ *
++ * Always returns 0.
++ * NOTE(review): the pointer returned by nfsd4_xdr_reserve_space() is used
++ * without a check -- confirm it cannot fail here.
++ */
++static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info,
++				   const struct pnfs_filelayout_device *fdev)
++{
++	struct nfsd4_compoundres *resp = info->resp;
++	struct svc_rqst *rqstp = resp->rqstp;
++	struct xdr_buf *xb = &resp->rqstp->rq_res;
++	__be32 *p;
++
++	/* two words: a length slot that is skipped, then the indices count */
++	p = nfsd4_xdr_reserve_space(resp, 8);
++	p++;			/* Fill in length later */
++	*p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */
++	resp->p = p;
++
++	/* the head ends here; the indices follow as one page of page data */
++	xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base;
++	xb->pages = &rqstp->rq_respages[rqstp->rq_resused];
++	xb->page_base = 0;
++	xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */
++	/* the tail takes whatever is left of the head's page */
++	xb->tail[0].iov_base = resp->p;
++	resp->end = xb->head[0].iov_base + PAGE_SIZE;
++	xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p;
++	return 0;
++}
++/*
++ * Return a stripeindices of length 1024 to test
++ * the pNFS client multipage getdeviceinfo implementation.
++ *
++ * Encode a page of stripe indices.
++ */
++static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev,
++				   struct spnfs_device *dev,
++				   struct pnfs_devinfo_arg *info)
++{
++	struct svc_rqst *rqstp = info->xdr.resp->rqstp;
++	__be32 *p;
++	int i, j = 0;
++
++	p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]);
++	fldev->fl_stripeindices_length = 1024;
++	/* round-robin the data servers device index into the stripe indices */
++	for (i = 0; i < 1024; i++) {
++		*p++ = cpu_to_be32(j);
++		if (j < dev->dscount - 1)
++			j++;
++		else
++			j = 0;
++	}
++	fldev->fl_stripeindices_list = NULL;
++}
++#endif /* CONFIG_SPNFS_TEST */
++
++/*
++ * Ask the daemon for the data servers behind @devid and encode them as a
++ * file-layout device into @xdr.
++ *
++ * Returns -ENOMEM, -EIO when the upcall fails or the reply carries an
++ * impossible data server count, the daemon's non-zero status, or the result
++ * of filelayout_encode_devinfo().
++ */
++int
++spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
++		    u32 layout_type,
++		    const struct nfsd4_pnfs_deviceid *devid)
++{
++	struct spnfs *spnfs = global_spnfs;
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	struct spnfs_device *dev;
++	struct pnfs_filelayout_device *fldev = NULL;
++	struct pnfs_filelayout_multipath *mp = NULL;
++	struct pnfs_filelayout_devaddr *fldap = NULL;
++	int status = 0, i, len;
++
++	/* zeroed: the whole message is handed to userspace by the upcall */
++	im = kzalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++
++	im->im_type = SPNFS_TYPE_GETDEVICEINFO;
++	/* XXX FIX: figure out what to do about fsid */
++	im->im_args.getdeviceinfo_args.devid = devid->devid;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto getdeviceinfo_out;
++	}
++	status = res->getdeviceinfo_res.status;
++	if (status != 0)
++		goto getdeviceinfo_out;
++
++	dev = &res->getdeviceinfo_res.devinfo;
++
++	/*
++	 * dscount is supplied by the daemon and is used below to index
++	 * dslist[]; refuse a count that would walk off the end of the reply.
++	 * (ARRAY_SIZE/sizeof below assume dslist, netid and addr are
++	 * embedded arrays.)
++	 */
++	if ((size_t)dev->dscount > ARRAY_SIZE(dev->dslist)) {
++		dprintk("%s bad data server count\n", __func__);
++		status = -EIO;
++		goto getdeviceinfo_out;
++	}
++
++	/* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */
++	fldev = kzalloc(sizeof(struct pnfs_filelayout_device), GFP_KERNEL);
++	if (fldev == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++
++	/*
++	 * Stripe count is the same as data server count for our purposes
++	 */
++	fldev->fl_stripeindices_length = dev->dscount;
++	fldev->fl_device_length = dev->dscount;
++
++	/* Set stripe indices */
++#ifdef CONFIG_SPNFS_TEST
++	spnfs_set_test_indices(fldev, dev, info);
++	fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr;
++#else /* CONFIG_SPNFS_TEST */
++	fldev->fl_stripeindices_list =
++		kcalloc(fldev->fl_stripeindices_length, sizeof(u32),
++			GFP_KERNEL);
++	if (fldev->fl_stripeindices_list == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++	for (i = 0; i < fldev->fl_stripeindices_length; i++)
++		fldev->fl_stripeindices_list[i] = i;
++#endif /* CONFIG_SPNFS_TEST */
++
++	/*
++	 * Set the device's data server addresses  No multipath for spnfs,
++	 * so mp length is always 1.
++	 *
++	 * Everything is allocated zeroed so that the cleanup code below can
++	 * tell which entries were filled in when it runs after a failure
++	 * part-way through the loop.
++	 */
++	fldev->fl_device_list =
++		kcalloc(fldev->fl_device_length,
++			sizeof(struct pnfs_filelayout_multipath),
++			GFP_KERNEL);
++	if (fldev->fl_device_list == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++	for (i = 0; i < fldev->fl_device_length; i++) {
++		mp = &fldev->fl_device_list[i];
++		mp->fl_multipath_length = 1;
++		mp->fl_multipath_list =
++			kzalloc(sizeof(struct pnfs_filelayout_devaddr),
++				GFP_KERNEL);
++		if (mp->fl_multipath_list == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		fldap = mp->fl_multipath_list;
++
++		/*
++		 * Copy the netid into the device address, for example: "tcp"
++		 * (bounded: the string comes from the daemon)
++		 */
++		len = strnlen(dev->dslist[i].netid,
++			      sizeof(dev->dslist[i].netid));
++		fldap->r_netid.data = kmalloc(len, GFP_KERNEL);
++		if (fldap->r_netid.data == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		memcpy(fldap->r_netid.data, dev->dslist[i].netid, len);
++		fldap->r_netid.len = len;
++
++		/*
++		 * Copy the network address into the device address,
++		 * for example: "10.35.9.16.08.01"
++		 */
++		len = strnlen(dev->dslist[i].addr,
++			      sizeof(dev->dslist[i].addr));
++		fldap->r_addr.data = kmalloc(len, GFP_KERNEL);
++		if (fldap->r_addr.data == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		memcpy(fldap->r_addr.data, dev->dslist[i].addr, len);
++		fldap->r_addr.len = len;
++	}
++
++	/* encode the device data */
++	status = filelayout_encode_devinfo(xdr, fldev);
++
++getdeviceinfo_out:
++	if (fldev) {
++		kfree(fldev->fl_stripeindices_list);
++		if (fldev->fl_device_list) {
++			for (i = 0; i < fldev->fl_device_length; i++) {
++				fldap =
++				    fldev->fl_device_list[i].fl_multipath_list;
++				/* entries past a failed allocation are NULL */
++				if (fldap == NULL)
++					continue;
++				kfree(fldap->r_netid.data);
++				kfree(fldap->r_addr.data);
++				kfree(fldap);
++			}
++			kfree(fldev->fl_device_list);
++		}
++		kfree(fldev);
++	}
++
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++/* nothing to do for spnfs; always succeeds */
++int
++spnfs_setattr(void)
++{
++	return 0;
++}
++
++/*
++ * Tell the daemon that @inode is being opened, passing on the create,
++ * createmode and truncate fields of @open.
++ * Returns -ENOMEM, -EIO when the upcall fails, otherwise the status
++ * reported by the daemon.
++ */
++int
++spnfs_open(struct inode *inode, struct nfsd4_open *open)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
++
++	/* zeroed: the whole message is handed to userspace by the upcall */
++	im = kzalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto open_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto open_out;
++	}
++
++	im->im_type = SPNFS_TYPE_OPEN;
++	im->im_args.open_args.inode = inode->i_ino;
++	im->im_args.open_args.generation = inode->i_generation;
++	im->im_args.open_args.create = open->op_create;
++	im->im_args.open_args.createmode = open->op_createmode;
++	im->im_args.open_args.truncate = open->op_truncate;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto open_out;
++	}
++	status = res->open_res.status;
++
++open_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++/* nothing to do for spnfs; always succeeds */
++int
++spnfs_create(void)
++{
++	return 0;
++}
++
++/*
++ * Invokes the spnfsd with the inode number of the object to
remove.
++ * The file has already been removed on the MDS, so all the spnfsd
++ * daemon does is remove the stripes.
++ * Returns 0 on success otherwise error code
++ */
++int
++spnfs_remove(unsigned long ino, unsigned long generation)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
++
++	/* zeroed: the whole message is handed to userspace by the upcall */
++	im = kzalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto remove_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto remove_out;
++	}
++
++	im->im_type = SPNFS_TYPE_REMOVE;
++	im->im_args.remove_args.inode = ino;
++	im->im_args.remove_args.generation = generation;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto remove_out;
++	}
++	status = res->remove_res.status;
++
++remove_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++/*
++ * The striping parameters are written by userspace (see config_write());
++ * check them before they are used as divisors and as the bound of the
++ * on-stack file table.
++ */
++static int
++spnfs_config_usable(void)
++{
++	return spnfs_config->stripe_size >= 1 &&
++	       spnfs_config->num_ds >= 1 &&
++	       spnfs_config->num_ds <= SPNFS_MAX_DATA_SERVERS;
++}
++
++/*
++ * Open the stripe file "<ds_dir[i]>/<ino>.<generation>" of @inode on every
++ * configured data server.  @filp must have SPNFS_MAX_DATA_SERVERS slots;
++ * slots that were not opened are left NULL, so spnfs_close_stripes() can
++ * always be called afterwards.
++ * Returns nfs_ok or nfserr_io.
++ */
++static __be32
++spnfs_open_stripes(struct inode *inode, int flags, struct file **filp)
++{
++	char path[128];
++	int i, n;
++
++	memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		n = snprintf(path, sizeof(path), "%s/%ld.%u",
++			     spnfs_config->ds_dir[i],
++			     (long)inode->i_ino, inode->i_generation);
++		if (n >= (int)sizeof(path))
++			return nfserr_io;	/* name would be truncated */
++		filp[i] = filp_open(path, flags, 0);
++		/* filp_open() reports failure with ERR_PTR(), never NULL */
++		if (IS_ERR(filp[i])) {
++			filp[i] = NULL;
++			return nfserr_io;
++		}
++		get_file(filp[i]);
++	}
++	return nfs_ok;
++}
++
++/* close every stripe file opened by spnfs_open_stripes() */
++static void
++spnfs_close_stripes(struct file **filp)
++{
++	int i;
++
++	for (i = 0; i < SPNFS_MAX_DATA_SERVERS; i++) {
++		if (filp[i]) {
++			filp_close(filp[i], current->files);
++			fput(filp[i]);
++		}
++	}
++}
++
++/*
++ * Read up to @len bytes at file offset @offset into @buf, one stripe unit
++ * at a time, from the stripe files in @filp.
++ * Returns the number of bytes read (short on EOF) or -EIO.
++ */
++static int
++read_one(struct inode *inode, loff_t offset, size_t len, char *buf,
++	 struct file **filp)
++{
++	loff_t bufoffset = 0, soffset, pos, snum, soff;
++	u64 tmp;		/* do_div() wants an unsigned 64-bit dividend */
++	size_t iolen;
++	int completed = 0, ds, err;
++
++	while (len > 0) {
++		/* soff: offset inside the stripe unit, snum: stripe unit no. */
++		tmp = offset;
++		soff = do_div(tmp, spnfs_config->stripe_size);
++		snum = tmp;
++		/* ds: data server holding this stripe unit */
++		ds = do_div(tmp, spnfs_config->num_ds);
++		if (spnfs_config->dense_striping == 0)
++			soffset = offset;
++		else {
++			tmp = snum;
++			do_div(tmp, spnfs_config->num_ds);
++			soffset = tmp * spnfs_config->stripe_size + soff;
++		}
++		/* never cross a stripe unit boundary in one I/O */
++		if (len < spnfs_config->stripe_size - soff)
++			iolen = len;
++		else
++			iolen = spnfs_config->stripe_size - soff;
++
++		pos = soffset;
++		err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos);
++		if (err < 0)
++			return -EIO;
++		if (err == 0)
++			break;
++		filp[ds]->f_pos = pos;
++		iolen = err;
++		completed += iolen;
++		len -= iolen;
++		offset += iolen;
++		bufoffset += iolen;
++	}
++
++	return completed;
++}
++
++/*
++ * Fill the first @vlen entries of rqstp->rq_vec from the stripe files of
++ * @inode, starting at @offset.  *@lenp is set to the number of bytes read.
++ * Returns nfs_ok or nfserr_io.
++ */
++static __be32
++read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++     struct svc_rqst *rqstp)
++{
++	int vnum, err, bytecount = 0;
++	struct file *filp[SPNFS_MAX_DATA_SERVERS];
++	size_t iolen;
++	__be32 status;
++
++	if (!spnfs_config_usable()) {
++		*lenp = 0;
++		return nfserr_io;
++	}
++
++	/*
++	 * XXX We should just be doing this at open time, but it gets
++	 * kind of messy storing this info in nfsd's state structures
++	 * and piggybacking its path through the various state handling
++	 * functions.  Revisit this.
++	 */
++	status = spnfs_open_stripes(inode, O_RDONLY | O_LARGEFILE, filp);
++	if (status != nfs_ok)
++		goto read_out;
++
++	for (vnum = 0 ; vnum < vlen ; vnum++) {
++		iolen = rqstp->rq_vec[vnum].iov_len;
++		err = read_one(inode, offset + bytecount, iolen,
++			       (char *)rqstp->rq_vec[vnum].iov_base, filp);
++		if (err < 0) {
++			status = nfserr_io;
++			goto read_out;
++		}
++		if (err < iolen) {
++			bytecount += err;
++			goto read_out;
++		}
++		bytecount += rqstp->rq_vec[vnum].iov_len;
++	}
++
++read_out:
++	*lenp = bytecount;
++	spnfs_close_stripes(filp);
++	return status;
++}
++
++/*
++ * Read through the MDS.  Needs the configuration written by a current
++ * spnfsd; returns nfserr_notsupp without it.
++ */
++__be32
++spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++	   struct svc_rqst *rqstp)
++{
++	if (spnfs_config)
++		return read(inode, offset, lenp, vlen, rqstp);
++	else {
++		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++		return nfserr_notsupp;
++	}
++}
++
++/*
++ * Write @len bytes from @buf at file offset @offset, one stripe unit at a
++ * time, to the stripe files in @filp.
++ * Returns the number of bytes written (short if a write made no progress)
++ * or -EIO.
++ */
++static int
++write_one(struct inode *inode, loff_t offset, size_t len, char *buf,
++	  struct file **filp)
++{
++	loff_t bufoffset = 0, soffset, pos, snum, soff;
++	u64 tmp;		/* do_div() wants an unsigned 64-bit dividend */
++	size_t iolen;
++	int completed = 0, ds, err;
++
++	while (len > 0) {
++		/* soff: offset inside the stripe unit, snum: stripe unit no. */
++		tmp = offset;
++		soff = do_div(tmp, spnfs_config->stripe_size);
++		snum = tmp;
++		/* ds: data server holding this stripe unit */
++		ds = do_div(tmp, spnfs_config->num_ds);
++		if (spnfs_config->dense_striping == 0)
++			soffset = offset;
++		else {
++			tmp = snum;
++			do_div(tmp, spnfs_config->num_ds);
++			soffset = tmp * spnfs_config->stripe_size + soff;
++		}
++		/* never cross a stripe unit boundary in one I/O */
++		if (len < spnfs_config->stripe_size - soff)
++			iolen = len;
++		else
++			iolen = spnfs_config->stripe_size - soff;
++
++		pos = soffset;
++		err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos);
++		if (err < 0)
++			return -EIO;
++		/* no progress: stop instead of retrying forever */
++		if (err == 0)
++			break;
++		filp[ds]->f_pos = pos;
++		iolen = err;
++		completed += iolen;
++		len -= iolen;
++		offset += iolen;
++		bufoffset += iolen;
++	}
++
++	return completed;
++}
++
++/*
++ * Write the first @vlen entries of rqstp->rq_vec to the stripe files of
++ * @inode, starting at @offset.  A short write of any entry is an error.
++ * Returns nfs_ok or nfserr_io.
++ */
++static __be32
++write(struct inode *inode, loff_t offset, size_t len, int vlen,
++      struct svc_rqst *rqstp)
++{
++	int vnum, err, bytecount = 0;
++	struct file *filp[SPNFS_MAX_DATA_SERVERS];
++	size_t iolen;
++	__be32 status;
++
++	if (!spnfs_config_usable())
++		return nfserr_io;
++
++	/*
++	 * XXX We should just be doing this at open time, but it gets
++	 * kind of messy storing this info in nfsd's state structures
++	 * and piggybacking its path through the various state handling
++	 * functions.  Revisit this.
++	 */
++	status = spnfs_open_stripes(inode, O_RDWR | O_LARGEFILE, filp);
++	if (status != nfs_ok)
++		goto write_out;
++
++	for (vnum = 0; vnum < vlen; vnum++) {
++		iolen = rqstp->rq_vec[vnum].iov_len;
++		err = write_one(inode, offset + bytecount, iolen,
++				(char *)rqstp->rq_vec[vnum].iov_base, filp);
++		if (err != iolen) {
++			dprintk("spnfs_write: err=%d expected %Zd\n", err, iolen);
++			status = nfserr_io;
++			goto write_out;
++		}
++		bytecount += rqstp->rq_vec[vnum].iov_len;
++	}
++
++write_out:
++	spnfs_close_stripes(filp);
++
++	return status;
++}
++
++/*
++ * Write through the MDS.  Needs the configuration written by a current
++ * spnfsd; returns nfserr_notsupp without it.
++ */
++__be32
++spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen,
++	    struct svc_rqst *rqstp)
++{
++	if (spnfs_config)
++		return write(inode, offset, len, vlen, rqstp);
++	else {
++		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++		return nfserr_notsupp;
++	}
++}
++
++/* nothing to do for spnfs; always succeeds */
++int
++spnfs_commit(void)
++{
++	return 0;
++}
++
++/*
++ * Return the state for this object.
++ * At this time simply return 0 to indicate success and use the existing state
++ * (none of the arguments are looked at).
++ */
++int
++spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg)
++{
++	return 0;
++}
++
++/*
++ * Return the filehandle for the specified file descriptor
++ *
++ * @fd is looked up in the calling process' file table; a full struct nfs_fh
++ * is copied into @fh.  Returns 0 on success, -EIO if @fd is not open.
++ *
++ * NOTE(review): NFS_FH() is applied to whatever inode @fd refers to; this
++ * presumably requires the file to live on an NFS client mount -- confirm
++ * that callers guarantee it, nothing here checks.
++ */
++int
++spnfs_getfh(int fd, struct nfs_fh *fh)
++{
++	struct file *file;
++
++	file = fget(fd);
++	if (file == NULL)
++		return -EIO;
++
++	memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh));
++	/* drop the reference taken by fget() */
++	fput(file);
++	return 0;
++}
+diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h
+--- linux-2.6.34.noarch/fs/nfsd/state.h.orig	2010-08-23 12:08:29.096512142 -0400
++++ linux-2.6.34.noarch/fs/nfsd/state.h	2010-08-23 12:09:03.325501424 -0400
+@@ -242,6 +242,12 @@ struct nfs4_client {
+ 	u32 cl_cb_seq_nr;
+ 	struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
+ 	/* wait here for slots */
++#if defined(CONFIG_PNFSD)
++	struct list_head cl_layouts; /* outstanding layouts */
++	struct list_head cl_layoutrecalls; /* outstanding layoutrecall
++	callbacks */
++	atomic_t cl_deviceref; /* Num outstanding devs */
++#endif /* CONFIG_PNFSD */
+ };
+ 
+ static inline void
+@@ -342,12 +348,31 @@ struct nfs4_file {
+ 	struct list_head fi_hash; /* hash by "struct inode *" */
+ 	struct list_head fi_stateids;
+ 	struct list_head fi_delegations;
++#if defined(CONFIG_PNFSD)
++	struct list_head fi_layouts;
++	struct list_head fi_layout_states;
++#endif /* CONFIG_PNFSD */
+ 	struct inode *fi_inode;
+ 	u32 fi_id; /* used with stateowner->so_id
+ 	* for stateid_hashtbl hash */
+ 	bool fi_had_conflict;
++#if defined(CONFIG_PNFSD)
++	/* used by layoutget / layoutrecall */
++	struct nfs4_fsid fi_fsid;
++	u32 fi_fhlen;
++	u8 fi_fhval[NFS4_FHSIZE];
++#endif /* CONFIG_PNFSD */
+ };
+ 
++#if defined(CONFIG_PNFSD)
++/* pNFS Metadata server state */
++
++struct pnfs_ds_dev_entry {
++	struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */
++	u32 dd_dsid;
++};
++#endif /* CONFIG_PNFSD
*/ ++ + /* + * nfs4_stateid can either be an open stateid or (eventually) a lock stateid + * +@@ -370,6 +395,9 @@ struct nfs4_stateid { + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; ++#if defined(CONFIG_PNFSD) ++ struct list_head st_pnfs_ds_id; ++#endif /* CONFIG_PNFSD */ + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; +@@ -421,6 +449,34 @@ extern void nfsd4_recdir_purge_old(void) + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + extern void release_session_client(struct nfsd4_session *); ++extern void nfsd4_free_slab(struct kmem_cache **); ++extern struct nfs4_file *find_file(struct inode *); ++extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *); ++extern void put_nfs4_file(struct nfs4_file *); ++extern void get_nfs4_file(struct nfs4_file *); ++extern struct nfs4_client *find_confirmed_client(clientid_t *); ++extern struct nfs4_stateid *find_stateid(stateid_t *, int flags); ++extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *); ++extern __be32 nfs4_check_stateid(stateid_t *); ++extern void expire_client_lock(struct nfs4_client *); ++extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *); ++ ++#if defined(CONFIG_PNFSD) ++extern int nfsd4_init_pnfs_slabs(void); ++extern void nfsd4_free_pnfs_slabs(void); ++extern void pnfs_expire_client(struct nfs4_client *); ++extern void release_pnfs_ds_dev_list(struct nfs4_stateid *); ++extern void nfs4_pnfs_state_init(void); ++extern void nfs4_pnfs_state_shutdown(void); ++extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *); ++#else /* CONFIG_PNFSD */ ++static inline void nfsd4_free_pnfs_slabs(void) {} ++static inline int nfsd4_init_pnfs_slabs(void) { return 0; } ++static 
inline void pnfs_expire_client(struct nfs4_client *clp) {} ++static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {} ++static inline void nfs4_pnfs_state_shutdown(void) {} ++#endif /* CONFIG_PNFSD */ + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +@@ -434,4 +490,24 @@ nfs4_get_stateowner(struct nfs4_stateown + kref_get(&so->so_ref); + } + ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1 : NFS4_MAX_UINT64; ++} ++ + #endif /* NFSD4_STATE_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-23 12:08:27.632564132 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-23 12:09:03.326501490 -0400 +@@ -37,7 +37,12 @@ + #ifdef CONFIG_NFSD_V4 + #include + #include ++#include ++#include + #endif /* CONFIG_NFSD_V4 */ ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif + + #include "nfsd.h" + #include "vfs.h" +@@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, str + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, 0)) { ++ err = bl_layoutrecall(inode, RETURN_FILE, ++ iap->ia_size, inode->i_size - iap->ia_size); ++ } ++#endif /* CONFIG_SPNFS_BLOCK */ + } + + /* +@@ -1703,6 +1714,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru + struct inode *fdir, *tdir; + __be32 err; + int host_err; ++#ifdef CONFIG_SPNFS ++ unsigned long ino = 0; ++ unsigned long generation = 0; ++ unsigned int nlink = 0; ++#endif /* CONFIG_SPNFS */ + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) +@@ -1766,7 +1782,26 @@ nfsd_rename(struct svc_rqst *rqstp, stru + if (host_err) + goto out_dput_new; + 
++#ifdef CONFIG_SPNFS ++ /* ++ * if the target is a preexisting regular file, remember the ++ * inode number and generation so we can delete the stripes; ++ * save the link count as well so that the stripes only get ++ * deleted when the last link is deleted ++ */ ++ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) { ++ ino = ndentry->d_inode->i_ino; ++ generation = ndentry->d_inode->i_generation; ++ nlink = ndentry->d_inode->i_nlink; ++ } ++#endif /* CONFIG_SPNFS */ ++ + host_err = vfs_rename(fdir, odentry, tdir, ndentry); ++#ifdef CONFIG_SPNFS ++ if (spnfs_enabled() && (!host_err && ino && nlink == 1)) ++ spnfs_remove(ino, generation); ++#endif /* CONFIG_SPNFS */ ++ + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) +@@ -1807,6 +1842,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + struct inode *dirp; + __be32 err; + int host_err; ++#if defined(CONFIG_SPNFS) ++ unsigned long ino; ++ unsigned long generation; ++ unsigned int nlink; ++#endif /* defined(CONFIG_SPNFS) */ + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) +@@ -1830,6 +1870,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + goto out; + } + ++#if defined(CONFIG_SPNFS) ++ /* ++ * Remember the inode number to communicate to the spnfsd ++ * for removal of stripes; save the link count as well so that ++ * the stripes only get deleted when the last link is deleted ++ */ ++ ino = rdentry->d_inode->i_ino; ++ generation = rdentry->d_inode->i_generation; ++ nlink = rdentry->d_inode->i_nlink; ++#endif /* defined(CONFIG_SPNFS) */ ++ + if (!type) + type = rdentry->d_inode->i_mode & S_IFMT; + +@@ -1854,6 +1905,29 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + if (!host_err) + host_err = commit_metadata(fhp); + ++#if defined(CONFIG_SPNFS) ++ /* ++ * spnfs: notify spnfsd of removal to destroy stripes ++ */ ++/* ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (sb->s_export_op->spnfs_remove) { ++*/ ++ dprintk("%s check if spnfs_enabled\n", __FUNCTION__); ++ 
if (spnfs_enabled() && nlink == 1) { ++ BUG_ON(ino == 0); ++ dprintk("%s calling spnfs_remove inumber=%ld\n", ++ __FUNCTION__, ino); ++ if (spnfs_remove(ino, generation) == 0) { ++ dprintk("%s spnfs_remove success\n", __FUNCTION__); ++ } else { ++ /* XXX How do we make this atomic? */ ++ printk(KERN_WARNING "nfsd: pNFS could not " ++ "remove stripes for inode: %ld\n", ino); ++ } ++ } ++#endif /* defined(CONFIG_SPNFS) */ ++ + mnt_drop_write(fhp->fh_export->ex_path.mnt); + out_nfserr: + err = nfserrno(host_err); +diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-23 12:08:29.097425997 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 12:09:03.327451643 -0400 +@@ -37,6 +37,8 @@ + #ifndef _LINUX_NFSD_XDR4_H + #define _LINUX_NFSD_XDR4_H + ++#include ++ + #include "state.h" + #include "nfsd.h" + +@@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete { + u32 rca_one_fs; + }; + ++struct nfsd4_pnfs_getdevinfo { ++ struct nfsd4_pnfs_deviceid gd_devid; /* request */ ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxcount; /* request */ ++ u32 gd_notify_types;/* request */ ++ struct super_block *gd_sb; ++}; ++ ++struct nfsd4_pnfs_getdevlist { ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxdevices; /* request */ ++ u64 gd_cookie; /* request - response */ ++ u64 gd_verf; /* request - response */ ++ struct svc_fh *gd_fhp; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutget { ++ u64 lg_minlength; /* request */ ++ u32 lg_signal; /* request */ ++ u32 lg_maxcount; /* request */ ++ struct svc_fh *lg_fhp; /* request */ ++ stateid_t lg_sid; /* request/response */ ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_roc; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit { ++ struct nfsd4_pnfs_layoutcommit_arg args; ++ stateid_t lc_sid; /* request */ ++ struct nfsd4_pnfs_layoutcommit_res res; ++}; ++ ++enum layoutreturn_flags { ++ LR_FLAG_INTERN 
= 1 << 0, /* internal return */ ++ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */ ++}; ++ ++struct nfsd4_pnfs_layoutreturn { ++ struct nfsd4_pnfs_layoutreturn_arg args; ++ u32 lr_flags; ++ stateid_t lr_sid; /* request/response */ ++ u32 lrs_present; /* response */ ++}; ++ + struct nfsd4_op { + int opnum; + __be32 status; +@@ -426,6 +473,13 @@ struct nfsd4_op { + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; ++#if defined(CONFIG_PNFSD) ++ struct nfsd4_pnfs_getdevlist pnfs_getdevlist; ++ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo; ++ struct nfsd4_pnfs_layoutget pnfs_layoutget; ++ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit; ++ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn; ++#endif /* CONFIG_PNFSD */ + } u; + struct nfs4_replay * replay; + }; +diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-23 12:08:29.039491912 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-23 12:09:03.328501680 -0400 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -36,6 +37,7 @@ + #include "internal.h" + #include "iostat.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_FILE + +@@ -388,12 +390,17 @@ static int nfs_write_begin(struct file * + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + ++ pnfs_update_layout(mapping->host, ++ nfs_file_open_context(file), ++ 0, NFS4_MAX_UINT64, IOMODE_RW, ++ &lseg); + start: + /* + * Prevent starvation issues if someone is doing a consistency +@@ -402,17 +409,22 @@ start: + ret = wait_on_bit(&NFS_I(mapping->host)->flags,
NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) +- return ret; ++ goto out; + + page = grab_cache_page_write_begin(mapping, index, flags); +- if (!page) +- return -ENOMEM; ++ if (!page) { ++ ret = -ENOMEM; ++ goto out; ++ } + *pagep = page; + +- ret = nfs_flush_incompatible(file, page); ++ ret = nfs_flush_incompatible(file, page, lseg); + if (ret) { + unlock_page(page); + page_cache_release(page); ++ *pagep = NULL; ++ *fsdata = NULL; ++ goto out; + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; +@@ -421,6 +433,12 @@ start: + if (!ret) + goto start; + } ++ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); ++ out: ++ if (ret) { ++ put_lseg(lseg); ++ *fsdata = NULL; ++ } + return ret; + } + +@@ -430,6 +448,7 @@ static int nfs_write_end(struct file *fi + { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, +@@ -456,10 +475,17 @@ static int nfs_write_end(struct file *fi + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + +- status = nfs_updatepage(file, page, offset, copied); ++ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); ++ status = pnfs_write_end(file, page, pos, len, copied, lseg); ++ if (status) ++ goto out; ++ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); + ++ out: + unlock_page(page); + page_cache_release(page); ++ pnfs_write_end_cleanup(file, fsdata); ++ put_lseg(lseg); + + if (status < 0) + return status; +@@ -570,6 +596,8 @@ static int nfs_vm_page_mkwrite(struct vm + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + ++ /* XXX Do we want to call pnfs_update_layout here? 
*/ ++ + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) +@@ -580,11 +608,11 @@ static int nfs_vm_page_mkwrite(struct vm + if (pagelen == 0) + goto out_unlock; + +- ret = nfs_flush_incompatible(filp, page); ++ ret = nfs_flush_incompatible(filp, page, NULL); + if (ret != 0) + goto out_unlock; + +- ret = nfs_updatepage(filp, page, 0, pagelen); ++ ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL); + out_unlock: + if (!ret) + return VM_FAULT_LOCKED; +diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-23 12:08:29.042511552 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-23 12:09:03.329501644 -0400 +@@ -48,6 +48,7 @@ + #include "internal.h" + #include "fscache.h" + #include "dns_resolve.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -278,7 +279,7 @@ nfs_fhget(struct super_block *sb, struct + */ + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { +- inode->i_fop = &nfs_file_operations; ++ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; + inode->i_data.a_ops = &nfs_file_aops; + inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; + } else if (S_ISDIR(inode->i_mode)) { +@@ -530,6 +531,68 @@ out: + return err; + } + ++static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ atomic_set(&l_ctx->count, 1); ++ l_ctx->lockowner = current->files; ++ l_ctx->pid = current->tgid; ++ INIT_LIST_HEAD(&l_ctx->list); ++} ++ ++static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) ++{ ++ struct nfs_lock_context *pos; ++ ++ list_for_each_entry(pos, &ctx->lock_context.list, list) { ++ if (pos->lockowner != current->files) ++ continue; ++ if (pos->pid != current->tgid) ++ continue; ++ atomic_inc(&pos->count); ++ return pos; ++ } ++ return NULL; ++} ++ ++struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context 
*ctx) ++{ ++ struct nfs_lock_context *res, *new = NULL; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ spin_unlock(&inode->i_lock); ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (new == NULL) ++ return NULL; ++ nfs_init_lock_context(new); ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ list_add_tail(&new->list, &ctx->lock_context.list); ++ new->open_context = ctx; ++ res = new; ++ new = NULL; ++ } ++ } ++ spin_unlock(&inode->i_lock); ++ kfree(new); ++ return res; ++} ++ ++void nfs_put_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ struct nfs_open_context *ctx = l_ctx->open_context; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) ++ return; ++ list_del(&l_ctx->list); ++ spin_unlock(&inode->i_lock); ++ kfree(l_ctx); ++} ++ + /** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context +@@ -566,11 +629,11 @@ static struct nfs_open_context *alloc_nf + path_get(&ctx->path); + ctx->cred = get_rpccred(cred); + ctx->state = NULL; +- ctx->lockowner = current->files; + ctx->flags = 0; + ctx->error = 0; + ctx->dir_cookie = 0; +- atomic_set(&ctx->count, 1); ++ nfs_init_lock_context(&ctx->lock_context); ++ ctx->lock_context.open_context = ctx; + } + return ctx; + } +@@ -578,15 +641,16 @@ static struct nfs_open_context *alloc_nf + struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) + { + if (ctx != NULL) +- atomic_inc(&ctx->count); ++ atomic_inc(&ctx->lock_context.count); + return ctx; + } ++EXPORT_SYMBOL(get_nfs_open_context); + + static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) + { + struct inode *inode = ctx->path.dentry->d_inode; + +- if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) ++ if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; 
+ list_del(&ctx->list); + spin_unlock(&inode->i_lock); +@@ -933,6 +997,7 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + } ++EXPORT_SYMBOL(nfs_fattr_init); + + struct nfs_fattr *nfs_alloc_fattr(void) + { +@@ -1142,6 +1207,14 @@ static int nfs_update_inode(struct inode + server->fsid = fattr->fsid; + + /* ++ * file needs layout commit, server attributes may be stale ++ */ ++ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) { ++ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n", ++ __func__, inode->i_sb->s_id, inode->i_ino); ++ return 0; ++ } ++ /* + * Update the read time so we don't revalidate too often. + */ + nfsi->read_cache_jiffies = fattr->time_start; +@@ -1340,9 +1413,10 @@ static int nfs_update_inode(struct inode + */ + void nfs4_clear_inode(struct inode *inode) + { ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + /* If we are holding a delegation, return it! 
*/ + nfs_inode_return_delegation_noreclaim(inode); +- /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); + } + #endif +@@ -1367,7 +1441,10 @@ struct inode *nfs_alloc_inode(struct sup + + void nfs_destroy_inode(struct inode *inode) + { +- kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); ++ struct nfs_inode *nfsi = NFS_I(inode); ++ ++ pnfs_destroy_layout(nfsi); ++ kmem_cache_free(nfs_inode_cachep, nfsi); + } + + static inline void nfs4_init_once(struct nfs_inode *nfsi) +@@ -1377,6 +1454,11 @@ static inline void nfs4_init_once(struct + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); ++#ifdef CONFIG_NFS_V4_1 ++ init_waitqueue_head(&nfsi->lo_waitq); ++ nfsi->pnfs_layout_suspend = 0; ++ nfsi->layout = NULL; ++#endif /* CONFIG_NFS_V4_1 */ + #endif + } + +@@ -1488,6 +1570,12 @@ static int __init init_nfs_fs(void) + if (err) + goto out0; + ++#ifdef CONFIG_NFS_V4_1 ++ err = pnfs_initialize(); ++ if (err) ++ goto out00; ++#endif /* CONFIG_NFS_V4_1 */ ++ + #ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); + #endif +@@ -1498,6 +1586,10 @@ out: + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++out00: ++ pnfs_uninitialize(); ++#endif /* CONFIG_NFS_V4_1 */ + nfs_destroy_directcache(); + out0: + nfs_destroy_writepagecache(); +@@ -1531,6 +1623,9 @@ static void __exit exit_nfs_fs(void) + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_uninitialize(); ++#endif + unregister_nfs_fs(); + nfs_fs_proc_exit(); + nfsiod_stop(); +diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-23 12:08:29.042511552 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-23 12:09:03.330502148 -0400 +@@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv + struct nfs_fattr *); + extern void nfs_mark_client_ready(struct nfs_client *clp, int state); + extern 
int nfs4_check_client_ready(struct nfs_client *clp); ++extern int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++ const struct sockaddr *sa2); ++extern int nfs4_set_client(struct nfs_server *server, ++ const char *hostname, ++ const struct sockaddr *addr, ++ const size_t addrlen, ++ const char *ip_addr, ++ rpc_authflavor_t authflavour, ++ int proto, const struct rpc_timeout *timeparms, ++ u32 minorversion); + #ifdef CONFIG_PROC_FS + extern int __init nfs_fs_proc_init(void); + extern void nfs_fs_proc_exit(void); +@@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead + extern struct rpc_procinfo nfs4_procedures[]; + #endif + ++extern int nfs4_recover_expired_lease(struct nfs_client *clp); ++ + /* proc.c */ + void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + +@@ -248,10 +260,31 @@ extern int nfs4_get_rootfh(struct nfs_se + #endif + + /* read.c */ ++extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); ++extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); + extern void nfs_read_prepare(struct rpc_task *task, void *calldata); + + /* write.c */ ++extern int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs); + extern void nfs_write_prepare(struct rpc_task *task, void *calldata); ++extern void nfs_mark_list_commit(struct list_head *head); + #ifdef CONFIG_MIGRATION + extern int nfs_migrate_page(struct 
address_space *, + struct page *, struct page *); +diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig +--- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-23 12:09:03.331395814 -0400 +@@ -79,10 +79,48 @@ config NFS_V4_1 + depends on NFS_V4 && EXPERIMENTAL + help + This option enables support for minor version 1 of the NFSv4 protocol +- (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. ++ (RFC5661) including support for the parallel NFS (pNFS) features ++ in the kernel's NFS client. + + Unless you're an NFS developer, say N. + ++config PNFS_FILE_LAYOUT ++ tristate "NFS client support for the pNFS nfs-files layout (DEVELOPER ONLY)" ++ depends on NFS_FS && NFS_V4_1 ++ default y ++ help ++ This option enables support for the pNFS nfs-files layout. ++ ++ Unless you're an NFS developer, say N. ++ ++config PNFS_OBJLAYOUT ++ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD ++ help ++ Say M here if you want your pNFS client to support the Objects Layout Driver. ++ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and ++ upper level driver (SCSI_OSD_ULD). ++ ++ If unsure, say N. ++ ++config PNFS_PANLAYOUT ++ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on PNFS_OBJLAYOUT ++ help ++ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver. ++ ++ If unsure, say N. ++ ++config PNFS_BLOCK ++ tristate "Provide a pNFS block client (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 ++ select MD ++ select BLK_DEV_DM ++ help ++ Say M or y here if you want your pNFS client to support the block protocol. ++ ++ If unsure, say N.
++ + config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP +diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile +--- linux-2.6.34.noarch/fs/nfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-23 12:09:03.331395814 -0400 +@@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x + delegation.o idmap.o \ + callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o ++nfs-$(CONFIG_NFS_V4_1) += pnfs.o + nfs-$(CONFIG_SYSCTL) += sysctl.o + nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o ++ ++obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o ++nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o ++ ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ ++obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-23 12:08:29.045525837 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-23 12:09:03.332511640 -0400 +@@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs3_dir_inode_operations, + .file_inode_ops = &nfs3_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-23 12:09:03.333512111 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-23 12:09:03.334491472 -0400 +@@ -0,0 +1,765 @@ ++/* ++ * linux/fs/nfs/nfs4filelayout.c ++ * ++ * Module for the pnfs nfs4 file layout driver. ++ * Defines all I/O and Policy interface operations, plus code ++ * to register itself with the pNFS client. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. 
++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfs4filelayout.h" ++#include "nfs4_fs.h" ++#include "internal.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Dean Hildebrand "); ++MODULE_DESCRIPTION("The NFSv4 file layout driver"); ++ ++/* Callback operations to the pNFS client */ ++struct pnfs_client_operations *pnfs_callback_ops; ++ ++/* Forward declaration */ ++struct layoutdriver_io_operations filelayout_io_operations; ++ ++int ++filelayout_initialize_mountpoint(struct nfs_server *nfss, ++ const struct nfs_fh *mntfh) ++{ ++ int status = nfs4_alloc_init_deviceid_cache(nfss->nfs_client, ++ nfs4_fl_free_deviceid_callback); ++ if (status) { ++ printk(KERN_WARNING "%s: deviceid cache could not be " ++ "initialized\n", __func__); ++ return status; ++ } ++ dprintk("%s: deviceid cache has been initialized successfully\n", ++ __func__); ++ return 0; ++} ++ ++/* Uninitialize a mountpoint by destroying its device list */ ++int ++filelayout_uninitialize_mountpoint(struct nfs_server *nfss) ++{ ++ dprintk("--> %s\n", __func__); ++ ++ if (nfss->pnfs_curr_ld && nfss->nfs_client->cl_devid_cache) ++ nfs4_put_deviceid_cache(nfss->nfs_client); ++ return 0; ++} ++ ++/* This function is used by the layout driver to calculate the ++ * offset of the file on the dserver based on whether the ++ * layout type is STRIPE_DENSE or STRIPE_SPARSE ++ */ ++static loff_t ++filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ ++ switch (flseg->stripe_type) { ++ case STRIPE_SPARSE: ++ return offset; ++ ++ case STRIPE_DENSE: ++ { ++ u32 stripe_width; ++ u64 tmp, off; ++ u32 unit = flseg->stripe_unit; ++ ++ stripe_width = unit * FILE_DSADDR(lseg)->stripe_count; ++ tmp = off = offset - flseg->pattern_offset; ++ 
do_div(tmp, stripe_width); ++ return tmp * unit + do_div(off, unit); ++ } ++ default: ++ BUG(); ++ } ++ ++ /* We should never get here... just to stop the gcc warning */ ++ return 0; ++} ++ ++/* ++ * Call ops for the async read/write cases ++ * In the case of dense layouts, the offset needs to be reset to its ++ * original value. ++ */ ++static void filelayout_read_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ if (rdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ rdata->args.offset, rdata->fldata.orig_offset); ++ rdata->args.offset = rdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ rdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_read_release(void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ put_lseg(rdata->pdata.lseg); ++ rdata->pdata.lseg = NULL; ++ rdata->pdata.call_ops->rpc_release(data); ++} ++ ++static void filelayout_write_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ if (wdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ wdata->args.offset, wdata->fldata.orig_offset); ++ wdata->args.offset = wdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_write_release(void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ put_lseg(wdata->pdata.lseg); ++ wdata->pdata.lseg = NULL; ++ wdata->pdata.call_ops->rpc_release(data); ++} ++ ++struct rpc_call_ops filelayout_read_call_ops = { ++ .rpc_call_prepare = nfs_read_prepare, ++ .rpc_call_done = filelayout_read_call_done, ++ .rpc_release = filelayout_read_release, ++}; ++ ++struct rpc_call_ops filelayout_write_call_ops = { ++ .rpc_call_prepare = 
nfs_write_prepare, ++ .rpc_call_done = filelayout_write_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* Perform sync or async reads. ++ * ++ * An optimization for the NFS file layout driver ++ * allows the original read/write data structs to be passed in the ++ * last argument. ++ * ++ * TODO: join with write_pagelist? ++ */ ++static enum pnfs_try_status ++filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n", ++ __func__, data->inode->i_ino, nr_pages, ++ data->args.pgbase, (size_t)data->args.count, offset); ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s USE DS:ip %x %s\n", __func__, ++ htonl(ds->ds_ip_addr), ds->r_addr); ++ ++ /* just try the first data server for the index..*/ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ ++ /* ++ * Now get the file offset on the dserver ++ * Set the read offset to this offset, and ++ * save the original offset in orig_offset ++ * In the case of async reads, the offset will be reset in the ++ * call_ops->rpc_call_done() routine. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* Perform an asynchronous read */ ++ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_read_call_ops); ++ ++ data->pdata.pnfs_error = 0; ++ ++ return PNFS_ATTEMPTED; ++} ++ ++/* Perform async writes.
*/ ++static enum pnfs_try_status ++filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu %s\n", __func__, ++ data->inode->i_ino, sync, (size_t) data->args.count, offset, ++ htonl(ds->ds_ip_addr), ntohs(ds->ds_port), ds->r_addr); ++ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ /* ++ * Get the file offset on the dserver. Set the write offset to ++ * this offset and save the original offset. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* ++ * Perform an asynchronous write The offset will be reset in the ++ * call_ops->rpc_call_done() routine ++ */ ++ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_write_call_ops, sync); ++ ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++} ++ ++/* ++ * Create a filelayout layout structure and return it. The pNFS client ++ * will use the pnfs_layout_type type to refer to the layout for this ++ * inode from now on. ++ */ ++static struct pnfs_layout_type * ++filelayout_alloc_layout(struct inode *inode) ++{ ++ struct nfs4_filelayout *flp; ++ ++ dprintk("NFS_FILELAYOUT: allocating layout\n"); ++ flp = kzalloc(sizeof(struct nfs4_filelayout), GFP_KERNEL); ++ return flp ? 
&flp->fl_layout : NULL; ++} ++ ++/* Free a filelayout layout structure */ ++static void ++filelayout_free_layout(struct pnfs_layout_type *lo) ++{ ++ dprintk("NFS_FILELAYOUT: freeing layout\n"); ++ kfree(FILE_LO(lo)); ++} ++ ++/* ++ * filelayout_check_layout() ++ * ++ * Make sure layout segment parameters are sane WRT the device. ++ * ++ * Notes: ++ * 1) current code insists that # stripe index = # data servers in ds_list ++ * which is wrong. ++ * 2) pattern_offset is ignored and must == 0 which is wrong; ++ * 3) the pattern_offset needs to be a multiple of the stripe unit. ++ * 4) stripe unit is multiple of page size ++ */ ++ ++static int ++filelayout_check_layout(struct pnfs_layout_type *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs4_filelayout_segment *fl = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ int status = -EINVAL; ++ struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo)); ++ ++ dprintk("--> %s\n", __func__); ++ dsaddr = nfs4_pnfs_device_item_find(nfss->nfs_client, &fl->dev_id); ++ if (dsaddr == NULL) { ++ dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id); ++ if (dsaddr == NULL) { ++ dprintk("%s NO device for dev_id %s\n", ++ __func__, deviceid_fmt(&fl->dev_id)); ++ goto out; ++ } ++ } ++ /* '>=' also rejects an empty stripe list (stripe_count is a divisor) */ ++ if (fl->first_stripe_index >= dsaddr->stripe_count) { ++ dprintk("%s Bad first_stripe_index %u\n", ++ __func__, fl->first_stripe_index); ++ goto out; ++ } ++ ++ if (fl->pattern_offset != 0) { ++ dprintk("%s Unsupported no-zero pattern_offset %Ld\n", ++ __func__, fl->pattern_offset); ++ goto out; ++ } ++ ++ if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) { ++ dprintk("%s Stripe unit (%u) is zero or not page aligned\n", ++ __func__, fl->stripe_unit); ++ goto out; ++ } ++ ++ /* XXX only support SPARSE packing. 
Don't support use MDS open fh */ ++ if (!(fl->num_fh == 1 || fl->num_fh == dsaddr->ds_num)) { ++ dprintk("%s num_fh %u not equal to 1 or ds_num %u\n", ++ __func__, fl->num_fh, dsaddr->ds_num); ++ goto out; ++ } ++ ++ if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { ++ dprintk("%s Stripe unit (%u) not aligned with rsize %u " ++ "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, ++ nfss->wsize); ++ } ++ ++ /* reference the device */ ++ nfs4_set_layout_deviceid(lseg, &dsaddr->deviceid); ++ ++ status = 0; ++out: ++ dprintk("--> %s returns %d\n", __func__, status); ++ return status; ++} ++ ++static void _filelayout_free_lseg(struct pnfs_layout_segment *lseg); ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl); ++ ++/* Decode layout and store in layoutid. Overwrite any existing layout ++ * information for this file. ++ */ ++static int ++filelayout_set_layout(struct nfs4_filelayout *flo, ++ struct nfs4_filelayout_segment *fl, ++ struct nfs4_pnfs_layoutget_res *lgr) ++{ ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t nfl_util; ++ int i; ++ ++ dprintk("%s: set_layout_map Begin\n", __func__); ++ ++ memcpy(&fl->dev_id, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ nfl_util = be32_to_cpup(p++); ++ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) ++ fl->commit_through_mds = 1; ++ if (nfl_util & NFL4_UFLG_DENSE) ++ fl->stripe_type = STRIPE_DENSE; ++ else ++ fl->stripe_type = STRIPE_SPARSE; ++ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; ++ ++ if (!flo->stripe_unit) ++ flo->stripe_unit = fl->stripe_unit; ++ else if (flo->stripe_unit != fl->stripe_unit) { ++ printk(KERN_NOTICE "%s: updating strip_unit from %u to %u\n", ++ __func__, flo->stripe_unit, fl->stripe_unit); ++ flo->stripe_unit = fl->stripe_unit; ++ } ++ ++ fl->first_stripe_index = be32_to_cpup(p++); ++ p = xdr_decode_hyper(p, &fl->pattern_offset); ++ fl->num_fh = be32_to_cpup(p++); ++ ++ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po 
%llu dev_id %s\n", ++ __func__, nfl_util, fl->num_fh, fl->first_stripe_index, ++ fl->pattern_offset, deviceid_fmt(&fl->dev_id)); ++ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) { ++ fl->fh_array = vmalloc(fl->num_fh * sizeof(struct nfs_fh)); ++ if (fl->fh_array) ++ memset(fl->fh_array, 0, ++ fl->num_fh * sizeof(struct nfs_fh)); ++ } else { ++ fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh), ++ GFP_KERNEL); ++ } ++ if (!fl->fh_array) ++ return -ENOMEM; ++ ++ for (i = 0; i < fl->num_fh; i++) { ++ /* fh: the length must fit data[], not the whole struct */ ++ fl->fh_array[i].size = be32_to_cpup(p++); ++ if (fl->fh_array[i].size > sizeof(fl->fh_array[i].data)) { ++ printk(KERN_ERR "Too big fh %d received %d\n", ++ i, fl->fh_array[i].size); ++ /* Layout is now invalid, pretend it doesn't exist */ ++ filelayout_free_fh_array(fl); ++ fl->num_fh = 0; ++ break; ++ } ++ memcpy(fl->fh_array[i].data, p, fl->fh_array[i].size); ++ p += XDR_QUADLEN(fl->fh_array[i].size); ++ dprintk("DEBUG: %s: fh len %d\n", __func__, ++ fl->fh_array[i].size); ++ } ++ ++ return 0; ++} ++ ++static struct pnfs_layout_segment * ++filelayout_alloc_lseg(struct pnfs_layout_type *layoutid, ++ struct nfs4_pnfs_layoutget_res *lgr) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(layoutid); ++ struct pnfs_layout_segment *lseg; ++ int rc; ++ ++ dprintk("--> %s\n", __func__); ++ lseg = kzalloc(sizeof(struct pnfs_layout_segment) + ++ sizeof(struct nfs4_filelayout_segment), GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ ++ rc = filelayout_set_layout(flo, LSEG_LD_DATA(lseg), lgr); ++ ++ if (rc != 0 || filelayout_check_layout(layoutid, lseg)) { ++ _filelayout_free_lseg(lseg); ++ lseg = NULL; ++ } ++ return lseg; ++} ++ ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) ++{ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) ++ vfree(fl->fh_array); ++ else ++ kfree(fl->fh_array); ++ ++ fl->fh_array = NULL; ++} ++ ++static void ++_filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ 
filelayout_free_fh_array(LSEG_LD_DATA(lseg)); ++ kfree(lseg); ++} ++ ++static void ++filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("--> %s\n", __func__); ++ nfs4_unset_layout_deviceid(lseg, lseg->deviceid, ++ nfs4_fl_free_deviceid_callback); ++ _filelayout_free_lseg(lseg); ++} ++ ++/* Allocate a clone of 'old' that holds a ref on it; NULL if alloc fails */ ++static struct nfs_write_data * ++filelayout_clone_write_data(struct nfs_write_data *old) ++{ ++ struct nfs_write_data *new; ++ ++ new = nfs_commitdata_alloc(); ++ if (!new) ++ goto out; ++ kref_init(&new->refcount); ++ new->parent = old; ++ kref_get(&old->refcount); ++ new->inode = old->inode; ++ new->cred = old->cred; ++ new->args.offset = 0; ++ new->args.count = 0; ++ new->res.count = 0; ++ new->res.fattr = &new->fattr; ++ nfs_fattr_init(&new->fattr); ++ new->res.verf = &new->verf; ++ new->args.context = get_nfs_open_context(old->args.context); ++ new->pdata.lseg = NULL; ++ new->pdata.call_ops = old->pdata.call_ops; ++ new->pdata.how = old->pdata.how; ++out: ++ return new; ++} ++ ++static void filelayout_commit_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static struct rpc_call_ops filelayout_commit_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ .rpc_call_done = filelayout_commit_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* ++ * Execute a COMMIT op to the MDS or to each data server on which a page ++ * in 'pages' exists. ++ * Invoke the pnfs_commit_complete callback. 
++ */ ++enum pnfs_try_status ++filelayout_commit(struct nfs_write_data *data, int sync) ++{ ++ LIST_HEAD(head); ++ struct nfs_page *req; ++ loff_t file_offset = 0; ++ u16 idx, i; ++ struct list_head **ds_page_list = NULL; ++ u16 *indices_used; ++ int num_indices_seen = 0; ++ const struct rpc_call_ops *call_ops; ++ struct rpc_clnt *clnt; ++ struct nfs_write_data **clone_list = NULL; ++ struct nfs_write_data *dsdata; ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("%s data %p sync %d\n", __func__, data, sync); ++ ++ /* Alloc room for both in one go */ ++ ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) * ++ (sizeof(u16) + sizeof(struct list_head *)), ++ GFP_KERNEL); ++ if (!ds_page_list) ++ goto mem_error; ++ indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1); ++ /* ++ * Sort pages based on which ds to send to. ++ * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT. ++ * Note we are assuming there is only a single lseg in play. ++ * When that is not true, we could first sort on lseg, then ++ * sort within each as we do here. 
++ */ ++ while (!list_empty(&data->pages)) { ++ req = nfs_list_entry(data->pages.next); ++ nfs_list_remove_request(req); ++ if (!req->wb_lseg || ++ ((struct nfs4_filelayout_segment *) ++ LSEG_LD_DATA(req->wb_lseg))->commit_through_mds) ++ idx = NFS4_PNFS_MAX_MULTI_CNT; ++ else { ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset); ++ } ++ if (ds_page_list[idx]) { ++ /* Already seen this idx */ ++ list_add(&req->wb_list, ds_page_list[idx]); ++ } else { ++ /* New idx not seen so far */ ++ list_add_tail(&req->wb_list, &head); ++ indices_used[num_indices_seen++] = idx; ++ } ++ ds_page_list[idx] = &req->wb_list; ++ } ++ /* Once created, clone must be released via call_op */ ++ clone_list = kzalloc(num_indices_seen * ++ sizeof(struct nfs_write_data *), GFP_KERNEL); ++ if (!clone_list) ++ goto mem_error; ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ clone_list[i] = filelayout_clone_write_data(data); ++ if (!clone_list[i]) ++ goto mem_error; ++ } ++ clone_list[i] = data; ++ /* ++ * Now send off the RPCs to each ds. Note that it is important ++ * that any RPC to the MDS be sent last (or at least after all ++ * clones have been made.) 
++ */ ++ for (i = 0; i < num_indices_seen; i++) { ++ dsdata = clone_list[i]; ++ idx = indices_used[i]; ++ list_cut_position(&dsdata->pages, &head, ds_page_list[idx]); ++ if (idx == NFS4_PNFS_MAX_MULTI_CNT) { ++ call_ops = data->pdata.call_ops;; ++ clnt = NFS_CLIENT(dsdata->inode); ++ ds = NULL; ++ } else { ++ struct nfs_fh *fh; ++ ++ call_ops = &filelayout_commit_call_ops; ++ req = nfs_list_entry(dsdata->pages.next); ++ ds = nfs4_fl_prepare_ds(req->wb_lseg, idx); ++ if (!ds) { ++ /* Trigger retry of this chunk through MDS */ ++ dsdata->task.tk_status = -EIO; ++ data->pdata.call_ops->rpc_release(dsdata); ++ continue; ++ } ++ clnt = ds->ds_clp->cl_rpcclient; ++ dsdata->fldata.ds_nfs_client = ds->ds_clp; ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset); ++ if (fh) ++ dsdata->args.fh = fh; ++ } ++ dprintk("%s: Initiating commit: %llu USE DS:\n", ++ __func__, file_offset); ++ print_ds(ds); ++ ++ /* Send COMMIT to data server */ ++ nfs_initiate_commit(dsdata, clnt, call_ops, sync); ++ } ++ kfree(clone_list); ++ kfree(ds_page_list); ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++ ++ mem_error: ++ if (clone_list) { ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ if (!clone_list[i]) ++ break; ++ data->pdata.call_ops->rpc_release(clone_list[i]); ++ } ++ kfree(clone_list); ++ } ++ kfree(ds_page_list); ++ /* One of these will be empty, but doesn't hurt to do both */ ++ nfs_mark_list_commit(&head); ++ nfs_mark_list_commit(&data->pages); ++ data->pdata.call_ops->rpc_release(data); ++ return PNFS_ATTEMPTED; ++} ++ ++/* Return the stripesize for the specified file */ ++ssize_t ++filelayout_get_stripesize(struct pnfs_layout_type *layoutid) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(layoutid); ++ ++ return flo->stripe_unit; ++} ++ ++/* ++ * filelayout_pg_test(). 
Called by nfs_can_coalesce_requests() ++ * ++ * return 1 : coalesce page ++ * return 0 : don't coalesce page ++ */ ++int ++filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ u64 p_stripe, r_stripe; ++ ++ if (pgio->pg_boundary == 0) ++ return 1; ++ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; ++ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; ++ ++ do_div(p_stripe, pgio->pg_boundary); ++ do_div(r_stripe, pgio->pg_boundary); ++ ++ return (p_stripe == r_stripe); ++} ++ ++struct layoutdriver_io_operations filelayout_io_operations = { ++ .commit = filelayout_commit, ++ .read_pagelist = filelayout_read_pagelist, ++ .write_pagelist = filelayout_write_pagelist, ++ .alloc_layout = filelayout_alloc_layout, ++ .free_layout = filelayout_free_layout, ++ .alloc_lseg = filelayout_alloc_lseg, ++ .free_lseg = filelayout_free_lseg, ++ .initialize_mountpoint = filelayout_initialize_mountpoint, ++ .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, ++}; ++ ++struct layoutdriver_policy_operations filelayout_policy_operations = { ++ .flags = PNFS_USE_RPC_CODE, ++ .get_stripesize = filelayout_get_stripesize, ++ .pg_test = filelayout_pg_test, ++}; ++ ++struct pnfs_layoutdriver_type filelayout_type = { ++ .id = LAYOUT_NFSV4_1_FILES, ++ .name = "LAYOUT_NFSV4_1_FILES", ++ .ld_io_ops = &filelayout_io_operations, ++ .ld_policy_ops = &filelayout_policy_operations, ++}; ++ ++static int __init nfs4filelayout_init(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", ++ __func__); ++ ++ /* ++ * Need to register file_operations struct with global list to indicate ++ * that NFS4 file layout is a possible pNFS I/O module ++ */ ++ pnfs_callback_ops = pnfs_register_layoutdriver(&filelayout_type); ++ ++ return 0; ++} ++ ++static void __exit nfs4filelayout_exit(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", ++ __func__); ++ ++ /* Unregister NFS4 file layout driver 
with pNFS client*/ ++ pnfs_unregister_layoutdriver(&filelayout_type); ++} ++ ++module_init(nfs4filelayout_init); ++module_exit(nfs4filelayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-23 12:09:03.334491472 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-23 12:09:03.335501543 -0400 +@@ -0,0 +1,636 @@ ++/* ++ * linux/fs/nfs/nfs4filelayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 file layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * Garth Goodson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include "nfs4filelayout.h" ++#include "internal.h" ++#include "nfs4_fs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++DEFINE_SPINLOCK(nfs4_ds_cache_lock); ++static LIST_HEAD(nfs4_data_server_cache); ++ ++void ++print_ds(struct nfs4_pnfs_ds *ds) ++{ ++ if (ds == NULL) { ++ dprintk("%s NULL device \n", __func__); ++ return; ++ } ++ dprintk(" ip_addr %x\n", ntohl(ds->ds_ip_addr)); ++ dprintk(" port %hu\n", ntohs(ds->ds_port)); ++ dprintk(" client %p\n", ds->ds_clp); ++ dprintk(" ref count %d\n", atomic_read(&ds->ds_count)); ++ if (ds->ds_clp) ++ dprintk(" cl_exchange_flags %x\n", ++ ds->ds_clp->cl_exchange_flags); ++ dprintk(" ip:port %s\n", ds->r_addr); ++} ++ ++void ++print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ int i; ++ ++ dprintk("%s dsaddr->ds_num %d\n", __func__, ++ dsaddr->ds_num); ++ for (i = 0; i < dsaddr->ds_num; i++) ++ print_ds(dsaddr->ds_list[i]); ++} ++ ++/* Debugging function assuming a 64bit major/minor split of the deviceid */ ++char * ++deviceid_fmt(const struct pnfs_deviceid *dev_id) ++{ ++ static char buf[42]; /* two u64s of up to 20 digits, a space, NUL */ ++ uint32_t *p = (uint32_t *)dev_id->data; ++ uint64_t major, minor; ++ ++ p = xdr_decode_hyper(p, &major); ++ p = xdr_decode_hyper(p, &minor); ++ ++ snprintf(buf, sizeof(buf), "%08llu %08llu", major, minor); ++ return buf; ++} ++ ++/* nfs4_ds_cache_lock is held */ ++static inline 
struct nfs4_pnfs_ds * ++_data_server_lookup(u32 ip_addr, u32 port) ++{ ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", ++ ntohl(ip_addr), ntohs(port)); ++ ++ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { ++ if (ds->ds_ip_addr == ip_addr && ++ ds->ds_port == port) { ++ return ds; ++ } ++ } ++ return NULL; ++} ++ ++/* Create an rpc to the data server defined in 'dev_list' */ ++static int ++nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) ++{ ++ struct nfs_server *tmp; ++ struct sockaddr_in sin; ++ struct rpc_clnt *mds_clnt = mds_srv->client; ++ struct nfs_client *clp = mds_srv->nfs_client; ++ struct sockaddr *mds_addr; ++ int err = 0; ++ ++ dprintk("--> %s ip:port %s au_flavor %d\n", __func__, ++ ds->r_addr, mds_clnt->cl_auth->au_flavor); ++ ++ sin.sin_family = AF_INET; ++ sin.sin_addr.s_addr = ds->ds_ip_addr; ++ sin.sin_port = ds->ds_port; ++ ++ /* ++ * If this DS is also the MDS, use the MDS session only if the ++ * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role. ++ */ ++ mds_addr = (struct sockaddr *)&clp->cl_addr; ++ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) { ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data " ++ "Server\n", ds->r_addr); ++ err = -ENODEV; ++ } else { ++ atomic_inc(&clp->cl_count); ++ ds->ds_clp = clp; ++ dprintk("%s Using MDS Session for DS\n", __func__); ++ } ++ goto out; ++ } ++ ++ /* Temporary server for nfs4_set_client; fail (not 0) if it can't be allocated */ ++ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ /* ++ * Set a retrans, timeout interval, and authflavor equal to the MDS ++ * values. Use the MDS nfs_client cl_ipaddr field so as to use the ++ * same co_ownerid as the MDS. 
++ */ ++ err = nfs4_set_client(tmp, ++ mds_srv->nfs_client->cl_hostname, ++ (struct sockaddr *)&sin, ++ sizeof(struct sockaddr), ++ mds_srv->nfs_client->cl_ipaddr, ++ mds_clnt->cl_auth->au_flavor, ++ IPPROTO_TCP, ++ mds_clnt->cl_xprt->timeout, ++ 1 /* minorversion */); ++ if (err < 0) ++ goto out_free; ++ ++ clp = tmp->nfs_client; ++ ++ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */ ++ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp); ++ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS; ++ ++ err = nfs4_recover_expired_lease(clp); ++ if (!err) ++ err = nfs4_check_client_ready(clp); ++ if (err) ++ goto out_put; ++ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data Server\n", ++ ds->r_addr); ++ err = -ENODEV; ++ goto out_put; ++ } ++ /* ++ * Mask the (possibly) returned EXCHGID4_FLAG_USE_PNFS_MDS pNFS role ++ * The is_ds_only_session depends on this. ++ */ ++ clp->cl_exchange_flags &= ~EXCHGID4_FLAG_USE_PNFS_MDS; ++ /* ++ * Set DS lease equal to the MDS lease, renewal is scheduled in ++ * create_session ++ */ ++ spin_lock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; ++ spin_unlock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_last_renewal = jiffies; ++ ++ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); ++ ds->ds_clp = clp; ++ ++ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), ++ clp->cl_rpcclient); ++out_free: ++ kfree(tmp); ++out: ++ dprintk("%s Returns %d\n", __func__, err); ++ return err; ++out_put: ++ nfs_put_client(clp); ++ goto out_free; ++} ++ ++static void ++destroy_ds(struct nfs4_pnfs_ds *ds) ++{ ++ dprintk("--> %s\n", __func__); ++ print_ds(ds); ++ ++ if (ds->ds_clp) ++ nfs_put_client(ds->ds_clp); ++ kfree(ds); ++} ++ ++static void ++nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ struct nfs4_pnfs_ds *ds; ++ int i; ++ ++ dprintk("%s: device id=%s\n", 
__func__, ++ deviceid_fmt(&dsaddr->deviceid.de_id)); ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ ds = dsaddr->ds_list[i]; ++ if (ds != NULL) { ++ if (atomic_dec_and_lock(&ds->ds_count, ++ &nfs4_ds_cache_lock)) { ++ list_del_init(&ds->ds_node); ++ spin_unlock(&nfs4_ds_cache_lock); ++ destroy_ds(ds); ++ } ++ } ++ } ++ kfree(dsaddr->stripe_indices); ++ kfree(dsaddr); ++} ++ ++void ++nfs4_fl_free_deviceid_callback(struct kref *kref) ++{ ++ struct nfs4_deviceid *device = ++ container_of(kref, struct nfs4_deviceid, de_kref); ++ struct nfs4_file_layout_dsaddr *dsaddr = ++ container_of(device, struct nfs4_file_layout_dsaddr, deviceid); ++ ++ nfs4_fl_free_deviceid(dsaddr); ++} ++ ++static void ++nfs4_pnfs_ds_add(struct inode *inode, struct nfs4_pnfs_ds **dsp, ++ u32 ip_addr, u32 port, char *r_addr, int len) ++{ ++ struct nfs4_pnfs_ds *tmp_ds, *ds; ++ ++ *dsp = NULL; ++ ++ ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); ++ if (!ds) ++ return; ++ ++ spin_lock(&nfs4_ds_cache_lock); ++ tmp_ds = _data_server_lookup(ip_addr, port); ++ if (tmp_ds == NULL) { ++ ds->ds_ip_addr = ip_addr; ++ ds->ds_port = port; ++ strncpy(ds->r_addr, r_addr, len); ++ atomic_set(&ds->ds_count, 1); ++ INIT_LIST_HEAD(&ds->ds_node); ++ ds->ds_clp = NULL; ++ list_add(&ds->ds_node, &nfs4_data_server_cache); ++ *dsp = ds; ++ dprintk("%s add new data server ip 0x%x\n", __func__, ++ ds->ds_ip_addr); ++ spin_unlock(&nfs4_ds_cache_lock); ++ } else { ++ atomic_inc(&tmp_ds->ds_count); ++ *dsp = tmp_ds; ++ dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", ++ __func__, tmp_ds->ds_ip_addr, ++ atomic_read(&tmp_ds->ds_count)); ++ spin_unlock(&nfs4_ds_cache_lock); ++ kfree(ds); ++ } ++} ++ ++static struct nfs4_pnfs_ds * ++decode_and_add_ds(uint32_t **pp, struct inode *inode) ++{ ++ struct nfs4_pnfs_ds *ds = NULL; ++ char r_addr[29]; /* max size of ip/port string */ ++ int len; ++ u32 ip_addr, port; ++ int tmp[6]; ++ uint32_t *p = *pp; ++ ++ dprintk("%s enter\n", __func__); ++ /* check and skip r_netid */ 
++ len = be32_to_cpup(p++); ++ /* "tcp" */ ++ if (len != 3) { ++ printk("%s: ERROR: non TCP r_netid len %d\n", ++ __func__, len); ++ goto out_err; ++ } ++ /* ++ * Read the bytes into a temporary buffer ++ * XXX: should probably sanity check them ++ */ ++ tmp[0] = be32_to_cpup(p++); ++ ++ len = be32_to_cpup(p++); ++ if (len >= sizeof(r_addr)) { ++ printk("%s: ERROR: Device ip/port too long (%d)\n", ++ __func__, len); ++ goto out_err; ++ } ++ memcpy(r_addr, p, len); ++ p += XDR_QUADLEN(len); ++ *pp = p; ++ r_addr[len] = '\0'; ++ sscanf(r_addr, "%d.%d.%d.%d.%d.%d", &tmp[0], &tmp[1], ++ &tmp[2], &tmp[3], &tmp[4], &tmp[5]); ++ ip_addr = htonl((tmp[0]<<24) | (tmp[1]<<16) | (tmp[2]<<8) | (tmp[3])); ++ port = htons((tmp[4] << 8) | (tmp[5])); ++ ++ nfs4_pnfs_ds_add(inode, &ds, ip_addr, port, r_addr, len); ++ ++ dprintk("%s: addr:port string = %s\n", __func__, r_addr); ++ return ds; ++out_err: ++ dprintk("%s returned NULL\n", __func__); ++ return NULL; ++} ++ ++/* Decode opaque device data and return the result */ ++static struct nfs4_file_layout_dsaddr* ++decode_device(struct inode *ino, struct pnfs_device *pdev) ++{ ++ int i, dummy; ++ u32 cnt, num; ++ u8 *indexp; ++ uint32_t *p = (u32 *)pdev->area, *indicesp; ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ /* Get the stripe count (number of stripe index) */ ++ cnt = be32_to_cpup(p++); ++ dprintk("%s stripe count %d\n", __func__, cnt); ++ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { ++ printk(KERN_WARNING "%s: stripe count %d greater than " ++ "supported maximum %d\n", __func__, ++ cnt, NFS4_PNFS_MAX_STRIPE_CNT); ++ goto out_err; ++ } ++ ++ /* Check the multipath list count */ ++ indicesp = p; ++ p += XDR_QUADLEN(cnt << 2); ++ num = be32_to_cpup(p++); ++ dprintk("%s ds_num %u\n", __func__, num); ++ if (num > NFS4_PNFS_MAX_MULTI_CNT) { ++ printk(KERN_WARNING "%s: multipath count %d greater than " ++ "supported maximum %d\n", __func__, ++ num, NFS4_PNFS_MAX_MULTI_CNT); ++ goto out_err; ++ } ++ dsaddr = kzalloc(sizeof(*dsaddr) + ++ 
(sizeof(struct nfs4_pnfs_ds *) * (num - 1)), ++ GFP_KERNEL); ++ if (!dsaddr) ++ goto out_err; ++ ++ dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); ++ if (!dsaddr->stripe_indices) ++ goto out_err_free; ++ ++ dsaddr->stripe_count = cnt; ++ dsaddr->ds_num = num; ++ ++ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ ++ p = indicesp; /* go back and read the stripe indices */ ++ indexp = &dsaddr->stripe_indices[0]; ++ for (i = 0; i < dsaddr->stripe_count; i++) { ++ dummy = be32_to_cpup(p++); ++ if ((u32)dummy >= num) /* each index must select a ds_list[] slot */ ++ goto out_err_free; ++ *indexp++ = dummy; ++ } ++ /* Skip already read multipath list count */ ++ p++; ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ int j; ++ ++ dummy = be32_to_cpup(p++); /* multipath count */ ++ if (dummy > 1) { ++ printk(KERN_WARNING ++ "%s: Multipath count %d not supported, " ++ "skipping all greater than 1\n", __func__, ++ dummy); ++ } ++ for (j = 0; j < dummy; j++) { ++ if (j == 0) { ++ dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); ++ if (dsaddr->ds_list[i] == NULL) ++ goto out_err_free; ++ } else { ++ u32 len; ++ /* skip extra multipath */ ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ continue; ++ } ++ } ++ } ++ nfs4_init_deviceid_node(&dsaddr->deviceid); ++ ++ return dsaddr; ++ ++out_err_free: ++ nfs4_fl_free_deviceid(dsaddr); ++out_err: ++ dprintk("%s ERROR: returning NULL\n", __func__); ++ return NULL; ++} ++ ++/* ++ * Decode the opaque device specified in 'dev' ++ * and add it to the list of available devices. ++ * If the deviceid is already cached, nfs4_add_deviceid will return ++ * a pointer to the cached struct and throw away the new. 
++ */ ++static struct nfs4_file_layout_dsaddr* ++decode_and_add_device(struct inode *inode, struct pnfs_device *dev) ++{ ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ struct nfs4_deviceid *d; ++ ++ dsaddr = decode_device(inode, dev); ++ if (!dsaddr) { ++ printk(KERN_WARNING "%s: Could not decode or add device\n", ++ __func__); ++ return NULL; ++ } ++ ++ d = nfs4_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, ++ &dsaddr->deviceid); ++ ++ return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Retrieve the information for dev_id, add it to the list ++ * of available devices, and return it. ++ */ ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id) ++{ ++ struct pnfs_device *pdev = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ struct nfs4_file_layout_dsaddr *dsaddr = NULL; ++ int rc, i; ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s inode %p max_resp_sz %u max_pages %d\n", ++ __func__, inode, max_resp_sz, max_pages); ++ ++ pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); ++ if (pdev == NULL) ++ return NULL; ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(pdev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set pdev->area */ ++ pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!pdev->area) ++ goto out_free; ++ ++ memcpy(&pdev->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE); ++ pdev->layout_type = LAYOUT_NFSV4_1_FILES; ++ pdev->pages = pages; ++ pdev->pgbase = 0; ++ pdev->pglen = PAGE_SIZE * max_pages; ++ pdev->mincount = 0; ++ /* TODO: Update 
types when CB_NOTIFY_DEVICEID is available */ ++ pdev->dev_notify_types = 0; ++ ++ rc = pnfs_callback_ops->nfs_getdeviceinfo(server, pdev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ /* ++ * Found new device, need to decode it and then add it to the ++ * list of known devices for this mountpoint. ++ */ ++ dsaddr = decode_and_add_device(inode, pdev); ++out_free: ++ if (pdev->area != NULL) ++ vunmap(pdev->area); ++ for (i = 0; i < max_pages && pages[i]; i++) /* stop at first unallocated slot */ ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(pdev); ++ dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); ++ return dsaddr; ++} ++ ++struct nfs4_file_layout_dsaddr * ++nfs4_pnfs_device_item_find(struct nfs_client *clp, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ ++ d = nfs4_find_deviceid(clp->cl_devid_cache, id); ++ dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__, ++ deviceid_fmt(id), d); ++ return (d == NULL) ? NULL : ++ container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit ++ * Then: ((res + fsi) % dsaddr->stripe_count) ++ */ ++static inline u32 ++_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u64 tmp; ++ ++ tmp = offset - flseg->pattern_offset; ++ do_div(tmp, flseg->stripe_unit); ++ tmp += flseg->first_stripe_index; ++ return do_div(tmp, FILE_DSADDR(lseg)->stripe_count); ++} ++ ++u32 ++nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ u32 j; ++ ++ j = _nfs4_fl_calc_j_index(lseg, offset); ++ return FILE_DSADDR(lseg)->stripe_indices[j]; ++} ++ ++struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u32 i; ++ ++ if (flseg->stripe_type == STRIPE_SPARSE) { ++ if (flseg->num_fh == 1) ++ i = 0; ++ else if (flseg->num_fh == 0) ++ return 
NULL; ++ else ++ i = nfs4_fl_calc_ds_index(lseg, offset); ++ } else ++ i = _nfs4_fl_calc_j_index(lseg, offset); ++ return &flseg->fh_array[i]; ++} ++ ++struct nfs4_pnfs_ds * ++nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ dsaddr = FILE_DSADDR(lseg); ++ if (dsaddr->ds_list[ds_idx] == NULL) { ++ printk(KERN_ERR "%s: No data server for device id (%s)!!\n", ++ __func__, deviceid_fmt(&flseg->dev_id)); ++ return NULL; ++ } ++ ++ if (!dsaddr->ds_list[ds_idx]->ds_clp) { ++ int err; ++ ++ err = nfs4_pnfs_ds_create(PNFS_NFS_SERVER(lseg->layout), ++ dsaddr->ds_list[ds_idx]); ++ if (err) { ++ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n", ++ __func__, err); ++ return NULL; ++ } ++ } ++ dprintk("%s: dev_id=%s, ds_idx=%u\n", ++ __func__, deviceid_fmt(&flseg->dev_id), ds_idx); ++ ++ return dsaddr->ds_list[ds_idx]; ++} ++ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-23 12:09:03.335501543 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-23 12:09:03.335501543 -0400 +@@ -0,0 +1,97 @@ ++/* ++ * pnfs_nfs4filelayout.h ++ * ++ * NFSv4 file layout driver data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_NFS4FILELAYOUT_H ++#define FS_NFS_NFS4FILELAYOUT_H ++ ++#include ++#include ++#include ++ ++#define NFS4_PNFS_DEV_HASH_BITS 5 ++#define NFS4_PNFS_DEV_HASH_SIZE (1 << NFS4_PNFS_DEV_HASH_BITS) ++#define NFS4_PNFS_DEV_HASH_MASK (NFS4_PNFS_DEV_HASH_SIZE - 1) ++ ++#define NFS4_PNFS_MAX_STRIPE_CNT 4096 ++#define NFS4_PNFS_MAX_MULTI_CNT 64 /* 256 fit into a u8 stripe_index */ ++#define NFS4_PNFS_MAX_MULTI_DS 2 ++ ++#define FILE_DSADDR(lseg) (container_of(lseg->deviceid, \ ++ struct nfs4_file_layout_dsaddr, \ ++ deviceid)) ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++/* Individual ip address */ ++struct nfs4_pnfs_ds { ++ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ ++ u32 ds_ip_addr; ++ u32 ds_port; ++ struct nfs_client *ds_clp; ++ atomic_t ds_count; ++ char r_addr[29]; ++}; ++ ++struct nfs4_file_layout_dsaddr { ++ struct nfs4_deviceid deviceid; ++ u32 stripe_count; ++ u8 *stripe_indices; ++ u32 ds_num; ++ struct nfs4_pnfs_ds *ds_list[1]; ++}; ++ ++struct nfs4_pnfs_dev_hlist { ++ rwlock_t dev_lock; ++ struct hlist_head dev_list[NFS4_PNFS_DEV_HASH_SIZE]; ++}; ++ ++struct nfs4_filelayout_segment { ++ u32 stripe_type; ++ u32 commit_through_mds; ++ u32 stripe_unit; ++ u32 first_stripe_index; ++ u64 pattern_offset; ++ struct pnfs_deviceid dev_id; ++ unsigned int num_fh; ++ struct nfs_fh *fh_array; ++}; ++ ++struct nfs4_filelayout { ++ struct pnfs_layout_type fl_layout; ++ u32 stripe_unit; ++}; ++ ++extern struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset); ++ ++static inline struct nfs4_filelayout * ++FILE_LO(struct pnfs_layout_type *lo) ++{ ++ return container_of(lo, struct nfs4_filelayout, fl_layout); ++} ++ ++extern struct pnfs_client_operations *pnfs_callback_ops; ++ ++extern void nfs4_fl_free_deviceid_callback(struct kref *); ++extern void print_ds(struct nfs4_pnfs_ds *ds); ++char *deviceid_fmt(const struct pnfs_deviceid *dev_id); ++u32 
nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset); ++struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, ++ u32 ds_idx); ++extern struct nfs4_file_layout_dsaddr * ++nfs4_pnfs_device_item_find(struct nfs_client *, struct pnfs_deviceid *dev_id); ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id); ++ ++#endif /* FS_NFS_NFS4FILELAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-23 12:08:29.047512264 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-23 12:09:03.336490079 -0400 +@@ -45,8 +45,28 @@ enum nfs4_client_state { + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, + NFS4CLNT_SESSION_RESET, +- NFS4CLNT_SESSION_DRAINING, + NFS4CLNT_RECALL_SLOT, ++ NFS4CLNT_LAYOUT_RECALL, ++}; ++ ++enum nfs4_session_state { ++ NFS4_SESSION_INITING, ++ NFS4_SESSION_DRAINING, ++}; ++ ++struct nfs4_minor_version_ops { ++ u32 minor_version; ++ ++ int (*call_sync)(struct nfs_server *server, ++ struct rpc_message *msg, ++ struct nfs4_sequence_args *args, ++ struct nfs4_sequence_res *res, ++ int cache_reply); ++ int (*validate_stateid)(struct nfs_delegation *, ++ const nfs4_stateid *); ++ const struct nfs4_state_recovery_ops *reboot_recovery_ops; ++ const struct nfs4_state_recovery_ops *nograce_recovery_ops; ++ const struct nfs4_state_maintenance_ops *state_renewal_ops; + }; + + /* +@@ -89,7 +109,6 @@ struct nfs_unique_id { + */ + struct nfs4_state_owner { + struct nfs_unique_id so_owner_id; +- struct nfs_client *so_client; + struct nfs_server *so_server; + struct rb_node so_client_node; + +@@ -99,7 +118,6 @@ struct nfs4_state_owner { + atomic_t so_count; + unsigned long so_flags; + struct list_head so_states; +- struct list_head so_delegations; + struct nfs_seqid_counter so_seqid; + struct rpc_sequence so_sequence; + }; +@@ -125,10 +143,20 @@ enum { + * LOCK: one nfs4_state (LOCK) to 
hold the lock stateid nfs4_state(OPEN) + */ + ++struct nfs4_lock_owner { ++ unsigned int lo_type; ++#define NFS4_ANY_LOCK_TYPE (0U) ++#define NFS4_FLOCK_LOCK_TYPE (1U << 0) ++#define NFS4_POSIX_LOCK_TYPE (1U << 1) ++ union { ++ fl_owner_t posix_owner; ++ pid_t flock_owner; ++ } lo_u; ++}; ++ + struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ +- fl_owner_t ls_owner; /* POSIX lock owner */ + #define NFS_LOCK_INITIALIZED 1 + int ls_flags; + struct nfs_seqid_counter ls_seqid; +@@ -136,6 +164,7 @@ struct nfs4_lock_state { + struct nfs_unique_id ls_id; + nfs4_stateid ls_stateid; + atomic_t ls_count; ++ struct nfs4_lock_owner ls_owner; + }; + + /* bits for nfs4_state->flags */ +@@ -219,22 +248,34 @@ extern int nfs4_open_revalidate(struct i + extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); + extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations *fs_locations, struct page *page); ++extern void nfs4_release_lockowner(const struct nfs4_lock_state *); + +-extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[]; +-extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[]; + #if defined(CONFIG_NFS_V4_1) +-extern int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return server->nfs_client->cl_session; ++} ++ ++extern int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); + extern void nfs4_destroy_session(struct nfs4_session *session); + extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); ++extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_create_session(struct nfs_client *); + 
extern int nfs4_proc_destroy_session(struct nfs4_session *); + extern int nfs4_init_session(struct nfs_server *server); + extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); + #else /* CONFIG_NFS_v4_1 */ +-static inline int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return NULL; ++} ++ ++static inline int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task) + { +@@ -247,12 +288,12 @@ static inline int nfs4_init_session(stru + } + #endif /* CONFIG_NFS_V4_1 */ + +-extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; ++extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; + + extern const u32 nfs4_fattr_bitmap[2]; + extern const u32 nfs4_statfs_bitmap[2]; + extern const u32 nfs4_pathconf_bitmap[2]; +-extern const u32 nfs4_fsinfo_bitmap[2]; ++extern const u32 nfs4_fsinfo_bitmap[3]; + extern const u32 nfs4_fs_locations_bitmap[2]; + + /* nfs4renewd.c */ +@@ -284,7 +325,7 @@ extern void nfs41_handle_sequence_flag_e + extern void nfs41_handle_recall_slot(struct nfs_client *clp); + extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); + extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); +-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); ++extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); + + extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); + extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); +@@ -293,6 +334,7 @@ extern void nfs_increment_lock_seqid(int + extern void nfs_release_seqid(struct nfs_seqid *seqid); + extern void nfs_free_seqid(struct nfs_seqid *seqid); + ++/* write.c */ + extern const 
nfs4_stateid zero_stateid; + + /* nfs4xdr.c */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-23 12:08:29.050481368 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-23 12:09:03.339481253 -0400 +@@ -49,12 +49,15 @@ + #include + #include + #include ++#include ++#include + + #include "nfs4_fs.h" + #include "delegation.h" + #include "internal.h" + #include "iostat.h" + #include "callback.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PROC + +@@ -67,7 +70,7 @@ struct nfs4_opendata; + static int _nfs4_proc_open(struct nfs4_opendata *data); + static int _nfs4_recover_proc_open(struct nfs4_opendata *data); + static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); ++static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, +@@ -125,11 +128,16 @@ const u32 nfs4_pathconf_bitmap[2] = { + 0 + }; + +-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE ++const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, ++#ifdef CONFIG_NFS_V4_1 ++ FATTR4_WORD1_FS_LAYOUT_TYPES, ++ FATTR4_WORD2_LAYOUT_BLKSIZE ++#else /* CONFIG_NFS_V4_1 */ + 0 ++#endif /* CONFIG_NFS_V4_1 */ + }; + + const u32 nfs4_fs_locations_bitmap[2] = { +@@ -356,7 +364,7 @@ static void nfs41_check_drain_session_co + { + struct rpc_task *task; + +- if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { ++ 
if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); + if (task) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +@@ -370,12 +378,11 @@ static void nfs41_check_drain_session_co + complete(&ses->complete); + } + +-static void nfs41_sequence_free_slot(const struct nfs_client *clp, +- struct nfs4_sequence_res *res) ++static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) + { + struct nfs4_slot_table *tbl; + +- tbl = &clp->cl_session->fc_slot_table; ++ tbl = &res->sr_session->fc_slot_table; + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { + /* just wake up the next guy waiting since + * we may have not consumed a slot after all */ +@@ -385,18 +392,17 @@ static void nfs41_sequence_free_slot(con + + spin_lock(&tbl->slot_tbl_lock); + nfs4_free_slot(tbl, res->sr_slotid); +- nfs41_check_drain_session_complete(clp->cl_session); ++ nfs41_check_drain_session_complete(res->sr_session); + spin_unlock(&tbl->slot_tbl_lock); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + } + +-static void nfs41_sequence_done(struct nfs_client *clp, +- struct nfs4_sequence_res *res, +- int rpc_status) ++static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) + { + unsigned long timestamp; + struct nfs4_slot_table *tbl; + struct nfs4_slot *slot; ++ struct nfs_client *clp; + + /* + * sr_status remains 1 if an RPC level error occurred. 
The server +@@ -411,13 +417,16 @@ static void nfs41_sequence_done(struct n + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) + goto out; + ++ tbl = &res->sr_session->fc_slot_table; ++ slot = tbl->slots + res->sr_slotid; ++ + /* Check the SEQUENCE operation status */ +- if (res->sr_status == 0) { +- tbl = &clp->cl_session->fc_slot_table; +- slot = tbl->slots + res->sr_slotid; ++ switch (res->sr_status) { ++ case 0: + /* Update the slot's sequence and clientid lease timer */ + ++slot->seq_nr; + timestamp = res->sr_renewal_time; ++ clp = res->sr_session->clp; + spin_lock(&clp->cl_lock); + if (time_before(clp->cl_last_renewal, timestamp)) + clp->cl_last_renewal = timestamp; +@@ -425,11 +434,39 @@ static void nfs41_sequence_done(struct n + /* Check sequence flags */ + if (atomic_read(&clp->cl_count) > 1) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); ++ break; ++ case -NFS4ERR_DELAY: ++ /* The server detected a resend of the RPC call and ++ * returned NFS4ERR_DELAY as per Section 2.10.6.2 ++ * of RFC5661. ++ */ ++ dprintk("%s: slot=%d seq=%d: Operation in progress\n", ++ __func__, res->sr_slotid, slot->seq_nr); ++ goto out_retry; ++ default: ++ /* Just update the slot sequence no. */ ++ ++slot->seq_nr; + } + out: + /* The session may be reset by one of the error handlers. 
*/ + dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); +- nfs41_sequence_free_slot(clp, res); ++ nfs41_sequence_free_slot(res); ++ return 1; ++out_retry: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ rpc_restart_call(task); ++ /* FIXME: rpc_restart_call() should be made to return success/fail */ ++ if (RPC_ASSASSINATED(task)) ++ goto out; ++ return 0; ++} ++ ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ if (res->sr_session == NULL) ++ return 1; ++ return nfs41_sequence_done(task, res); + } + + /* +@@ -480,12 +517,11 @@ static int nfs41_setup_sequence(struct n + if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) + return 0; + +- memset(res, 0, sizeof(*res)); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + tbl = &session->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && ++ if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + /* + * The state manager will wait until the slot table is empty. +@@ -525,6 +561,7 @@ static int nfs41_setup_sequence(struct n + res->sr_session = session; + res->sr_slotid = slotid; + res->sr_renewal_time = jiffies; ++ res->sr_status_flags = 0; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. 
+@@ -533,33 +570,36 @@ static int nfs41_setup_sequence(struct n + return 0; + } + +-int nfs4_setup_sequence(struct nfs_client *clp, ++int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + struct rpc_task *task) + { ++ struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + ++ if (ds_session) ++ session = ds_session; ++ if (session == NULL) { ++ args->sa_session = NULL; ++ res->sr_session = NULL; ++ goto out; ++ } ++ + dprintk("--> %s clp %p session %p sr_slotid %d\n", +- __func__, clp, clp->cl_session, res->sr_slotid); ++ __func__, session->clp, session, res->sr_slotid); + +- if (!nfs4_has_session(clp)) +- goto out; +- ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, ++ ret = nfs41_setup_sequence(session, args, res, cache_reply, + task); +- if (ret && ret != -EAGAIN) { +- /* terminate rpc task */ +- task->tk_status = ret; +- task->tk_action = NULL; +- } + out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; + } + + struct nfs41_call_sync_data { +- struct nfs_client *clp; ++ const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; + int cache_reply; +@@ -569,9 +609,9 @@ static void nfs41_call_sync_prepare(stru + { + struct nfs41_call_sync_data *data = calldata; + +- dprintk("--> %s data->clp->cl_session %p\n", __func__, +- data->clp->cl_session); +- if (nfs4_setup_sequence(data->clp, data->seq_args, ++ dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); ++ ++ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args, + data->seq_res, data->cache_reply, task)) + return; + rpc_call_start(task); +@@ -587,7 +627,7 @@ static void nfs41_call_sync_done(struct + { + struct nfs41_call_sync_data *data = calldata; + +- nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); ++ nfs41_sequence_done(task, 
data->seq_res); + } + + struct rpc_call_ops nfs41_call_sync_ops = { +@@ -600,8 +640,7 @@ struct rpc_call_ops nfs41_call_priv_sync + .rpc_call_done = nfs41_call_sync_done, + }; + +-static int nfs4_call_sync_sequence(struct nfs_client *clp, +- struct rpc_clnt *clnt, ++static int nfs4_call_sync_sequence(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, +@@ -611,13 +650,13 @@ static int nfs4_call_sync_sequence(struc + int ret; + struct rpc_task *task; + struct nfs41_call_sync_data data = { +- .clp = clp, ++ .seq_server = server, + .seq_args = args, + .seq_res = res, + .cache_reply = cache_reply, + }; + struct rpc_task_setup task_setup = { +- .rpc_client = clnt, ++ .rpc_client = server->client, + .rpc_message = msg, + .callback_ops = &nfs41_call_sync_ops, + .callback_data = &data +@@ -642,10 +681,15 @@ int _nfs4_call_sync_session(struct nfs_s + struct nfs4_sequence_res *res, + int cache_reply) + { +- return nfs4_call_sync_sequence(server->nfs_client, server->client, +- msg, args, res, cache_reply, 0); ++ return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); + } + ++#else ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ return 1; ++} + #endif /* CONFIG_NFS_V4_1 */ + + int _nfs4_call_sync(struct nfs_server *server, +@@ -659,18 +703,9 @@ int _nfs4_call_sync(struct nfs_server *s + } + + #define nfs4_call_sync(server, msg, args, res, cache_reply) \ +- (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ ++ (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ + &(res)->seq_res, (cache_reply)) + +-static void nfs4_sequence_done(const struct nfs_server *server, +- struct nfs4_sequence_res *res, int rpc_status) +-{ +-#ifdef CONFIG_NFS_V4_1 +- if (nfs4_has_session(server->nfs_client)) +- nfs41_sequence_done(server->nfs_client, res, rpc_status); +-#endif /* CONFIG_NFS_V4_1 */ +-} +- + static void 
update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) + { + struct nfs_inode *nfsi = NFS_I(dir); +@@ -745,19 +780,14 @@ static struct nfs4_opendata *nfs4_openda + p->o_arg.server = server; + p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; +- if (flags & O_EXCL) { +- if (nfs4_has_persistent_session(server->nfs_client)) { +- /* GUARDED */ +- p->o_arg.u.attrs = &p->attrs; +- memcpy(&p->attrs, attrs, sizeof(p->attrs)); +- } else { /* EXCLUSIVE4_1 */ +- u32 *s = (u32 *) p->o_arg.u.verifier.data; +- s[0] = jiffies; +- s[1] = current->pid; +- } +- } else if (flags & O_CREAT) { ++ if (flags & O_CREAT) { ++ u32 *s; ++ + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); ++ s = (u32 *) p->o_arg.u.verifier.data; ++ s[0] = jiffies; ++ s[1] = current->pid; + } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; +@@ -851,8 +881,10 @@ static void update_open_stateflags(struc + static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) + { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); +- memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); ++ memcpy(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)); ++ memcpy(state->open_stateid.u.data, stateid->u.data, ++ sizeof(state->open_stateid.u.data)); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); +@@ -880,7 +912,8 @@ static void __update_open_stateid(struct + */ + write_seqlock(&state->seqlock); + if (deleg_stateid != NULL) { +- memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, deleg_stateid->u.data, ++ sizeof(state->stateid.u.data)); + set_bit(NFS_DELEGATED_STATE, &state->flags); + } + if (open_stateid != NULL) +@@ -911,7 +944,8 @@ static int 
update_open_stateid(struct nf + + if (delegation == NULL) + delegation = &deleg_cur->stateid; +- else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) ++ else if (memcmp(deleg_cur->stateid.u.data, delegation->u.data, ++ NFS4_STATEID_SIZE) != 0) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); +@@ -973,7 +1007,8 @@ static struct nfs4_state *nfs4_try_open_ + break; + } + /* Save the delegation */ +- memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); ++ memcpy(stateid.u.data, delegation->stateid.u.data, ++ sizeof(stateid.u.data)); + rcu_read_unlock(); + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) +@@ -1127,10 +1162,13 @@ static int nfs4_open_recover(struct nfs4 + * Check if we need to update the current stateid. + */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && +- memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { ++ memcmp(state->stateid.u.data, state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)) != 0) { + write_seqlock(&state->seqlock); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, ++ state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)); + write_sequnlock(&state->seqlock); + } + return 0; +@@ -1199,8 +1237,8 @@ static int _nfs4_open_delegation_recall( + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; +- memcpy(opendata->o_arg.u.delegation.data, stateid->data, +- sizeof(opendata->o_arg.u.delegation.data)); ++ memcpy(opendata->o_arg.u.delegation.u.data, stateid->u.data, ++ sizeof(opendata->o_arg.u.delegation.u.data)); + ret = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return ret; +@@ -1258,8 +1296,8 @@ static void nfs4_open_confirm_done(struc + if (RPC_ASSASSINATED(task)) 
+ return; + if (data->rpc_status == 0) { +- memcpy(data->o_res.stateid.data, data->c_res.stateid.data, +- sizeof(data->o_res.stateid.data)); ++ memcpy(data->o_res.stateid.u.data, data->c_res.stateid.u.data, ++ sizeof(data->o_res.stateid.u.data)); + nfs_confirm_seqid(&data->owner->so_seqid, 0); + renew_lease(data->o_res.server, data->timestamp); + data->rpc_done = 1; +@@ -1356,13 +1394,13 @@ static void nfs4_open_prepare(struct rpc + } + /* Update sequence id. */ + data->o_arg.id = sp->so_owner_id.id; +- data->o_arg.clientid = sp->so_client->cl_clientid; ++ data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->o_arg.server->nfs_client, ++ if (nfs4_setup_sequence(data->o_arg.server, NULL, + &data->o_arg.seq_args, + &data->o_res.seq_res, 1, task)) + return; +@@ -1385,8 +1423,8 @@ static void nfs4_open_done(struct rpc_ta + + data->rpc_status = task->tk_status; + +- nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->o_res.seq_res)) ++ return; + + if (RPC_ASSASSINATED(task)) + return; +@@ -1539,9 +1577,8 @@ static int _nfs4_proc_open(struct nfs4_o + return 0; + } + +-static int nfs4_recover_expired_lease(struct nfs_server *server) ++int nfs4_recover_expired_lease(struct nfs_client *clp) + { +- struct nfs_client *clp = server->nfs_client; + unsigned int loop; + int ret; + +@@ -1557,6 +1594,7 @@ static int nfs4_recover_expired_lease(st + } + return ret; + } ++EXPORT_SYMBOL(nfs4_recover_expired_lease); + + /* + * OPEN_EXPIRED: +@@ -1646,7 +1684,7 @@ static int _nfs4_do_open(struct inode *d + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } +- status = nfs4_recover_expired_lease(server); ++ status = 
nfs4_recover_expired_lease(server->nfs_client); + if (status != 0) + goto err_put_state_owner; + if (path->dentry->d_inode != NULL) +@@ -1773,7 +1811,7 @@ static int _nfs4_do_setattr(struct inode + if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { + /* Use that stateid */ + } else if (state != NULL) { +- nfs4_copy_stateid(&arg.stateid, state, current->files); ++ nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); + } else + memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); + +@@ -1838,7 +1876,8 @@ static void nfs4_close_done(struct rpc_t + struct nfs4_state *state = calldata->state; + struct nfs_server *server = NFS_SERVER(calldata->inode); + +- nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + /* hmm. we are done with the inode, and in the process of freeing +@@ -1858,7 +1897,7 @@ static void nfs4_close_done(struct rpc_t + if (calldata->arg.fmode == 0) + break; + default: +- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) ++ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } + nfs_release_seqid(calldata->arg.seqid); +@@ -1903,7 +1942,7 @@ static void nfs4_close_prepare(struct rp + + nfs_fattr_init(calldata->res.fattr); + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, ++ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL, + &calldata->arg.seq_args, &calldata->res.seq_res, + 1, task)) + return; +@@ -2323,6 +2362,9 @@ nfs4_proc_setattr(struct dentry *dentry, + struct nfs4_state *state = NULL; + int status; + ++ if (pnfs_ld_layoutret_on_setattr(inode)) ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + nfs_fattr_init(fattr); + + /* Search for an existing open(O_WRITE) file */ +@@ -2648,8 +2690,9 @@ static int nfs4_proc_unlink_done(struct + { + struct nfs_removeres 
*res = task->tk_msg.rpc_resp; + +- nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); +- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) ++ if (!nfs4_sequence_done(task, &res->seq_res)) ++ return 0; ++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); + nfs_post_op_update_inode(dir, res->dir_attr); +@@ -3090,18 +3133,31 @@ static int nfs4_proc_pathconf(struct nfs + static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) + { + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; + + dprintk("--> %s\n", __func__); + +- nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; + +- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, server->nfs_client); ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } + + nfs_invalidate_atime(data->inode); +- if (task->tk_status > 0) ++ if (task->tk_status > 0 && client == server->nfs_client) + renew_lease(server, data->timestamp); + return 0; + } +@@ -3112,20 +3168,56 @@ static void nfs4_proc_read_setup(struct + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + } + ++static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ + static int 
nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs_client *client = server->nfs_client; + +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++#ifdef CONFIG_NFS_V4_1 ++ /* restore original count after retry? */ ++ if (data->pdata.orig_count) { ++ dprintk("%s: restoring original count %u\n", __func__, ++ data->pdata.orig_count); ++ data->args.count = data->pdata.orig_count; ++ } ++ ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } ++ ++ /* ++ * MDS write: renew lease ++ * DS write: update lastbyte written, mark for layout commit ++ */ + if (task->tk_status >= 0) { +- renew_lease(NFS_SERVER(inode), data->timestamp); +- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ if (client == server->nfs_client) { ++ renew_lease(server, data->timestamp); ++ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ } else ++ pnfs4_update_write_done(NFS_I(inode), data); + } + return 0; + } +@@ -3138,20 +3230,42 @@ static void nfs4_proc_write_setup(struct + data->res.server = server; + data->timestamp = jiffies; + ++#ifdef CONFIG_NFS_V4_1 ++ /* writes to DS use pnfs vector */ ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = 
&nfs4_procedures[NFSPROC4_CLNT_WRITE]; + } + + static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { ++ struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; ++ ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS commit\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) { + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + return -EAGAIN; + } +- nfs_refresh_inode(inode, data->res.fattr); ++ if (client == server->nfs_client) ++ nfs_refresh_inode(inode, data->res.fattr); + return 0; + } + +@@ -3161,6 +3275,12 @@ static void nfs4_proc_commit_setup(struc + + data->args.bitmask = server->cache_consistency_bitmask; + data->res.server = server; ++#if defined(CONFIG_NFS_V4_1) ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + } + +@@ -3464,9 +3584,12 @@ static int nfs4_proc_set_acl(struct inod + } + + static int +-_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state) ++nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp) + { +- if (!clp || task->tk_status >= 0) ++ if (!clp) ++ clp = server->nfs_client; ++ ++ if (task->tk_status >= 0) + return 0; + switch(task->tk_status) { + 
case -NFS4ERR_ADMIN_REVOKED: +@@ -3491,8 +3614,9 @@ _nfs4_async_handle_error(struct rpc_task + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: +- dprintk("%s ERROR %d, Reset session\n", __func__, +- task->tk_status); ++ dprintk("%s ERROR %d, Reset session. Exchangeid " ++ "flags 0x%x\n", __func__, task->tk_status, ++ clp->cl_exchange_flags); + nfs4_schedule_state_recovery(clp); + task->tk_status = 0; + return -EAGAIN; +@@ -3512,6 +3636,8 @@ _nfs4_async_handle_error(struct rpc_task + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; + do_state_recovery: ++ if (is_ds_only_client(clp)) ++ return 0; + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + nfs4_schedule_state_recovery(clp); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) +@@ -3520,12 +3646,6 @@ do_state_recovery: + return -EAGAIN; + } + +-static int +-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) +-{ +- return _nfs4_async_handle_error(task, server, server->nfs_client, state); +-} +- + int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, + unsigned short port, struct rpc_cred *cred, + struct nfs4_setclientid_res *res) +@@ -3641,8 +3761,8 @@ static void nfs4_delegreturn_done(struct + { + struct nfs4_delegreturndata *data = calldata; + +- nfs4_sequence_done(data->res.server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + switch (task->tk_status) { + case -NFS4ERR_STALE_STATEID: +@@ -3651,8 +3771,8 @@ static void nfs4_delegreturn_done(struct + renew_lease(data->res.server, data->timestamp); + break; + default: +- if (nfs4_async_handle_error(task, data->res.server, NULL) == +- -EAGAIN) { ++ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) ++ == -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } +@@ -3672,7 +3792,7 @@ static void nfs4_delegreturn_prepare(str 
+ + d_data = (struct nfs4_delegreturndata *)data; + +- if (nfs4_setup_sequence(d_data->res.server->nfs_client, ++ if (nfs4_setup_sequence(d_data->res.server, NULL, + &d_data->args.seq_args, + &d_data->res.seq_res, 1, task)) + return; +@@ -3892,15 +4012,16 @@ static void nfs4_locku_done(struct rpc_t + { + struct nfs4_unlockdata *calldata = data; + +- nfs4_sequence_done(calldata->server, &calldata->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + switch (task->tk_status) { + case 0: +- memcpy(calldata->lsp->ls_stateid.data, +- calldata->res.stateid.data, +- sizeof(calldata->lsp->ls_stateid.data)); ++ memcpy(calldata->lsp->ls_stateid.u.data, ++ calldata->res.stateid.u.data, ++ sizeof(calldata->lsp->ls_stateid.u. ++ data)); + renew_lease(calldata->server, calldata->timestamp); + break; + case -NFS4ERR_BAD_STATEID: +@@ -3909,7 +4030,7 @@ static void nfs4_locku_done(struct rpc_t + case -NFS4ERR_EXPIRED: + break; + default: +- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN) + nfs_restart_rpc(task, + calldata->server->nfs_client); + } +@@ -3927,7 +4048,7 @@ static void nfs4_locku_prepare(struct rp + return; + } + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence(calldata->server->nfs_client, ++ if (nfs4_setup_sequence(calldata->server, NULL, + &calldata->arg.seq_args, + &calldata->res.seq_res, 1, task)) + return; +@@ -4082,7 +4203,8 @@ static void nfs4_lock_prepare(struct rpc + } else + data->arg.new_lock_owner = 0; + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, ++ if (nfs4_setup_sequence(data->server, NULL, ++ &data->arg.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +@@ -4101,8 +4223,8 @@ static void nfs4_lock_done(struct rpc_ta + + dprintk("%s: begin!\n", __func__); + +- 
nfs4_sequence_done(data->server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) +@@ -4114,8 +4236,8 @@ static void nfs4_lock_done(struct rpc_ta + goto out; + } + if (data->rpc_status == 0) { +- memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, +- sizeof(data->lsp->ls_stateid.data)); ++ memcpy(data->lsp->ls_stateid.u.data, data->res.stateid.u.data, ++ sizeof(data->lsp->ls_stateid.u.data)); + data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); + } +@@ -4424,6 +4546,34 @@ out: + return err; + } + ++static void nfs4_release_lockowner_release(void *calldata) ++{ ++ kfree(calldata); ++} ++ ++const struct rpc_call_ops nfs4_release_lockowner_ops = { ++ .rpc_release = nfs4_release_lockowner_release, ++}; ++ ++void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) ++{ ++ struct nfs_server *server = lsp->ls_state->owner->so_server; ++ struct nfs_release_lockowner_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], ++ }; ++ ++ if (server->nfs_client->cl_mvops->minor_version != 0) ++ return; ++ args = kmalloc(sizeof(*args), GFP_NOFS); ++ if (!args) ++ return; ++ args->lock_owner.clientid = server->nfs_client->cl_clientid; ++ args->lock_owner.id = lsp->ls_id.id; ++ msg.rpc_argp = args; ++ rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); ++} ++ + #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" + + int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, +@@ -4526,7 +4676,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + nfs4_verifier verifier; + struct nfs41_exchange_id_args args = { + .client = clp, +- .flags = clp->cl_exchange_flags, ++ .flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R, + }; + struct nfs41_exchange_id_res res = { + .client = clp, +@@ -4574,6 
+4724,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + dprintk("<-- %s status= %d\n", __func__, status); + return status; + } ++EXPORT_SYMBOL(nfs4_proc_exchange_id); + + struct nfs4_get_lease_time_data { + struct nfs4_get_lease_time_args *args; +@@ -4611,7 +4762,8 @@ static void nfs4_get_lease_time_done(str + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); ++ if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) ++ return; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: +@@ -4805,13 +4957,6 @@ struct nfs4_session *nfs4_alloc_session( + if (!session) + return NULL; + +- /* +- * The create session reply races with the server back +- * channel probe. Mark the client NFS_CS_SESSION_INITING +- * so that the client back channel can find the +- * nfs_client struct +- */ +- clp->cl_cons_state = NFS_CS_SESSION_INITING; + init_completion(&session->complete); + + tbl = &session->fc_slot_table; +@@ -4824,6 +4969,8 @@ struct nfs4_session *nfs4_alloc_session( + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + ++ session->session_state = 1<<NFS4_SESSION_INITING; ++ + session->clp = clp; + return session; + } +@@ -5040,6 +5187,10 @@ int nfs4_init_session(struct nfs_server + if (!nfs4_has_session(clp)) + return 0; + ++ session = clp->cl_session; ++ if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) ++ return 0; ++ + rsize = server->rsize; + if (rsize == 0) + rsize = NFS_MAX_FILE_IO_SIZE; +@@ -5047,11 +5198,10 @@ int nfs4_init_session(struct nfs_server + if (wsize == 0) + wsize = NFS_MAX_FILE_IO_SIZE; + +- session = clp->cl_session; + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + +- ret = nfs4_recover_expired_lease(server); ++ ret = nfs4_recover_expired_lease(server->nfs_client); + if (!ret) + ret = 
nfs4_check_client_ready(clp); + return ret; +@@ -5060,69 +5210,70 @@ int nfs4_init_session(struct nfs_server + /* + * Renew the cl_session lease. + */ +-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +-{ ++struct nfs4_sequence_data { ++ struct nfs_client *clp; + struct nfs4_sequence_args args; + struct nfs4_sequence_res res; +- +- struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], +- .rpc_argp = &args, +- .rpc_resp = &res, +- .rpc_cred = cred, +- }; +- +- args.sa_cache_this = 0; +- +- return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, +- &res, args.sa_cache_this, 1); +-} ++}; + + static void nfs41_sequence_release(void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); ++ kfree(calldata); ++} ++ ++static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; + } + + static void nfs41_sequence_call_done(struct rpc_task *task, void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + +- nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status); ++ if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) ++ return; + + if (task->tk_status < 0) { + dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; + +- if (_nfs4_async_handle_error(task, NULL, clp, NULL) +- == -EAGAIN) { +- nfs_restart_rpc(task, clp); ++ if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); + 
return; + } + } + dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); + out: +- kfree(task->tk_msg.rpc_argp); +- kfree(task->tk_msg.rpc_resp); +- + dprintk("<-- %s\n", __func__); + } + + static void nfs41_sequence_prepare(struct rpc_task *task, void *data) + { +- struct nfs_client *clp; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_args *args; + struct nfs4_sequence_res *res; + +- clp = (struct nfs_client *)data; + args = task->tk_msg.rpc_argp; + res = task->tk_msg.rpc_resp; + +- if (nfs4_setup_sequence(clp, args, res, 0, task)) ++ if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) + return; + rpc_call_start(task); + } +@@ -5133,32 +5284,67 @@ static const struct rpc_call_ops nfs41_s + .rpc_release = nfs41_sequence_release, + }; + +-static int nfs41_proc_async_sequence(struct nfs_client *clp, +- struct rpc_cred *cred) ++static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) + { +- struct nfs4_sequence_args *args; +- struct nfs4_sequence_res *res; ++ struct nfs4_sequence_data *calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], + .rpc_cred = cred, + }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = clp->cl_rpcclient, ++ .rpc_message = &msg, ++ .callback_ops = &nfs41_sequence_ops, ++ .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, ++ }; + + if (!atomic_inc_not_zero(&clp->cl_count)) +- return -EIO; +- args = kzalloc(sizeof(*args), GFP_NOFS); +- res = kzalloc(sizeof(*res), GFP_NOFS); +- if (!args || !res) { +- kfree(args); +- kfree(res); ++ return ERR_PTR(-EIO); ++ calldata = kmalloc(sizeof(*calldata), GFP_NOFS); ++ if (calldata == NULL) { + nfs_put_client(clp); +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + } +- res->sr_slotid = NFS4_MAX_SLOT_TABLE; +- msg.rpc_argp = args; +- msg.rpc_resp = res; ++ calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ msg.rpc_argp = &calldata->args; ++ 
msg.rpc_resp = &calldata->res; ++ calldata->clp = clp; ++ task_setup_data.callback_data = calldata; + +- return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, +- &nfs41_sequence_ops, (void *)clp); ++ return rpc_run_task(&task_setup_data); ++} ++ ++static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret = 0; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) ++ ret = PTR_ERR(task); ++ else ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; ++} ++ ++static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ ret = rpc_wait_for_completion_task(task); ++ if (!ret) ++ ret = task->tk_status; ++ rpc_put_task(task); ++out: ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; + } + + struct nfs4_reclaim_complete_data { +@@ -5172,13 +5358,31 @@ static void nfs4_reclaim_complete_prepar + struct nfs4_reclaim_complete_data *calldata = data; + + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +- if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, ++ if (nfs41_setup_sequence(calldata->clp->cl_session, ++ &calldata->arg.seq_args, + &calldata->res.seq_res, 0, task)) + return; + + rpc_call_start(task); + } + ++static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case 0: ++ case -NFS4ERR_COMPLETE_ALREADY: ++ case -NFS4ERR_WRONG_CRED: /* What to do here? 
*/ ++ break; ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; ++} ++ + static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) + { + struct nfs4_reclaim_complete_data *calldata = data; +@@ -5186,32 +5390,13 @@ static void nfs4_reclaim_complete_done(s + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(clp, res, task->tk_status); +- switch (task->tk_status) { +- case 0: +- case -NFS4ERR_COMPLETE_ALREADY: +- break; +- case -NFS4ERR_BADSESSION: +- case -NFS4ERR_DEADSESSION: +- /* +- * Handle the session error, but do not retry the operation, as +- * we have no way of telling whether the clientid had to be +- * reset before we got our reply. If reset, a new wave of +- * reclaim operations will follow, containing their own reclaim +- * complete. We don't want our retry to get on the way of +- * recovery by incorrectly indicating to the server that we're +- * done reclaiming state since the process had to be restarted. 
+- */ +- _nfs4_async_handle_error(task, NULL, clp, NULL); +- break; +- default: +- if (_nfs4_async_handle_error( +- task, NULL, clp, NULL) == -EAGAIN) { +- rpc_restart_call_prepare(task); +- return; +- } +- } ++ if (!nfs41_sequence_done(task, res)) ++ return; + ++ if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); ++ return; ++ } + dprintk("<-- %s\n", __func__); + } + +@@ -5268,6 +5453,404 @@ out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; + } ++ ++static void ++nfs4_pnfs_layoutget_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args, ++ &lgp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_pnfs_layoutget_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lgp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ pnfs_get_layout_done(lgp, task->tk_status); ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ lgp->status = task->tk_status; ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_pnfs_layoutget_release(void *calldata) ++{ ++ struct nfs4_pnfs_layoutget *lgp = calldata; ++ ++ dprintk("--> %s\n", __func__); ++ pnfs_layout_release(NFS_I(lgp->args.inode)->layout, NULL); ++ if (lgp->res.layout.buf != NULL) ++ free_page((unsigned long) lgp->res.layout.buf); ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_pnfs_layoutget_call_ops = { ++ .rpc_call_prepare = 
nfs4_pnfs_layoutget_prepare, ++ .rpc_call_done = nfs4_pnfs_layoutget_done, ++ .rpc_release = nfs4_pnfs_layoutget_release, ++}; ++ ++/* FIXME: We need to call nfs4_handle_exception ++ * and deal with retries. ++ * Currently we can't since we release lgp and its contents. ++ */ ++static int _pnfs4_proc_layoutget(struct nfs4_pnfs_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_LAYOUTGET], ++ .rpc_argp = &lgp->args, ++ .rpc_resp = &lgp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_pnfs_layoutget_call_ops, ++ .callback_data = lgp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); ++ if (lgp->res.layout.buf == NULL) { ++ nfs4_pnfs_layoutget_release(lgp); ++ return -ENOMEM; ++ } ++ ++ lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = lgp->status; ++ if (status != 0) ++ goto out; ++ status = pnfs_layout_process(lgp); ++out: ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int pnfs4_proc_layoutget(struct nfs4_pnfs_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, _pnfs4_proc_layoutget(lgp), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void pnfs_layoutcommit_prepare(struct rpc_task *task, void *data) ++{ ++ struct pnfs_layoutcommit_data *ldata = ++ (struct pnfs_layoutcommit_data *)data; ++ struct nfs_server *server = NFS_SERVER(ldata->args.inode); ++ 
++ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args, ++ &ldata->res.seq_res, 1, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void ++pnfs_layoutcommit_done(struct rpc_task *task, void *calldata) ++{ ++ struct pnfs_layoutcommit_data *data = ++ (struct pnfs_layoutcommit_data *)calldata; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ data->status = task->tk_status; ++} ++ ++static void pnfs_layoutcommit_release(void *lcdata) ++{ ++ struct pnfs_layoutcommit_data *data = ++ (struct pnfs_layoutcommit_data *)lcdata; ++ ++ put_rpccred(data->cred); ++ pnfs_cleanup_layoutcommit(lcdata); ++ pnfs_layoutcommit_free(lcdata); ++ /* Matched by get_layout in pnfs_layoutcommit_inode */ ++ put_layout(data->args.inode); ++} ++ ++static const struct rpc_call_ops pnfs_layoutcommit_ops = { ++ .rpc_call_prepare = pnfs_layoutcommit_prepare, ++ .rpc_call_done = pnfs_layoutcommit_done, ++ .rpc_release = pnfs_layoutcommit_release, ++}; ++ ++/* Execute a layoutcommit to the server */ ++static int ++_pnfs4_proc_layoutcommit(struct pnfs_layoutcommit_data *data, int issync) ++{ ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_LAYOUTCOMMIT], ++ .rpc_argp = &data->args, ++ .rpc_resp = &data->res, ++ .rpc_cred = data->cred, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .task = &data->task, ++ .rpc_client = NFS_CLIENT(data->args.inode), ++ .rpc_message = &msg, ++ .callback_ops = &pnfs_layoutcommit_ops, ++ .callback_data = data, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ struct rpc_task *task; ++ int status = 0; ++ ++ dprintk("NFS: %4d initiating layoutcommit call. 
%llu@%llu lbw: %llu " ++ "type: %d issync %d\n", ++ data->task.tk_pid, ++ data->args.lseg.length, ++ data->args.lseg.offset, ++ data->args.lastbytewritten, ++ data->args.layout_type, issync); ++ ++ data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = data->status; ++out: ++ dprintk("%s: status %d\n", __func__, status); ++ rpc_put_task(task); ++ return 0; ++} ++ ++int pnfs4_proc_layoutcommit(struct pnfs_layoutcommit_data *data, int issync) ++{ ++ struct nfs4_exception exception = { }; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _pnfs4_proc_layoutcommit(data, issync), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void ++nfs4_pnfs_layoutreturn_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args, ++ &lrp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_pnfs_layoutreturn_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lrp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_pnfs_layoutreturn_release(void *calldata) ++{ ++ struct nfs4_pnfs_layoutreturn *lrp = calldata; ++ struct 
pnfs_layout_type *lo = NFS_I(lrp->args.inode)->layout; ++ ++ dprintk("--> %s return_type %d lo %p\n", __func__, ++ lrp->args.return_type, lo); ++ ++ if (lrp->args.return_type == RETURN_FILE) { ++ if (!lrp->res.lrs_present) ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ pnfs_layout_release(lo, &lrp->args.lseg); ++ } ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_pnfs_layoutreturn_call_ops = { ++ .rpc_call_prepare = nfs4_pnfs_layoutreturn_prepare, ++ .rpc_call_done = nfs4_pnfs_layoutreturn_done, ++ .rpc_release = nfs4_pnfs_layoutreturn_release, ++}; ++ ++int _pnfs4_proc_layoutreturn(struct nfs4_pnfs_layoutreturn *lrp, bool issync) ++{ ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_LAYOUTRETURN], ++ .rpc_argp = &lrp->args, ++ .rpc_resp = &lrp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_pnfs_layoutreturn_call_ops, ++ .callback_data = lrp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = task->tk_status; ++out: ++ dprintk("<-- %s\n", __func__); ++ rpc_put_task(task); ++ return status; ++} ++ ++int pnfs4_proc_layoutreturn(struct nfs4_pnfs_layoutreturn *lrp, bool issync) ++{ ++ struct nfs_server *server = NFS_SERVER(lrp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, ++ _pnfs4_proc_layoutreturn(lrp, issync), ++ &exception); ++ } while (exception.retry); ++ ++ return err; ++} ++ ++/* ++ * Retrieve the list of 
Data Server devices from the MDS. ++ */ ++static int _nfs4_pnfs_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_pnfs_getdevicelist_arg arg = { ++ .fh = fh, ++ .layoutclass = server->pnfs_curr_ld->id, ++ }; ++ struct nfs4_pnfs_getdevicelist_res res = { ++ .devlist = devlist, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_GETDEVICELIST], ++ .rpc_argp = &arg, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &arg, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int nfs4_pnfs_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_pnfs_getdevicelist(server, fh, devlist), ++ &exception); ++ } while (exception.retry); ++ ++ dprintk("nfs4_pnfs_getdevlist: err=%d, num_devs=%u\n", ++ err, devlist->num_devs); ++ ++ return err; ++} ++ ++int nfs4_pnfs_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) ++{ ++ struct nfs4_pnfs_getdeviceinfo_arg args = { ++ .pdev = pdev, ++ }; ++ struct nfs4_pnfs_getdeviceinfo_res res = { ++ .pdev = pdev, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_GETDEVICEINFO], ++ .rpc_argp = &args, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ ++ return status; ++} ++ + #endif /* CONFIG_NFS_V4_1 */ + + struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { +@@ -5325,28 +5908,30 @@ struct nfs4_state_maintenance_ops nfs41_ + }; + #endif + +-/* +- * Per minor version reboot and network partition recovery ops +- */ +- +-struct nfs4_state_recovery_ops 
*nfs4_reboot_recovery_ops[] = { +- &nfs40_reboot_recovery_ops, +-#if defined(CONFIG_NFS_V4_1) +- &nfs41_reboot_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { ++ .minor_version = 0, ++ .call_sync = _nfs4_call_sync, ++ .validate_stateid = nfs4_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs40_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs40_nograce_recovery_ops, ++ .state_renewal_ops = &nfs40_state_renewal_ops, + }; + +-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = { +- &nfs40_nograce_recovery_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_nograce_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { ++ .minor_version = 1, ++ .call_sync = _nfs4_call_sync_session, ++ .validate_stateid = nfs41_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs41_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs41_nograce_recovery_ops, ++ .state_renewal_ops = &nfs41_state_renewal_ops, + }; ++#endif + +-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { +- &nfs40_state_renewal_ops, ++const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { ++ [0] = &nfs_v4_0_minor_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_state_renewal_ops, ++ [1] = &nfs_v4_1_minor_ops, + #endif + }; + +@@ -5364,6 +5949,7 @@ const struct nfs_rpc_ops nfs_v4_clientop + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, + .file_inode_ops = &nfs4_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs4_proc_get_root, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c +--- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-23 12:09:03.341491726 -0400 +@@ -54,17 +54,17 @@ + void + nfs4_renew_state(struct work_struct *work) + { +- struct 
nfs4_state_maintenance_ops *ops; ++ const struct nfs4_state_maintenance_ops *ops; + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_renewd.work); + struct rpc_cred *cred; + long lease; + unsigned long last, now; + +- ops = nfs4_state_renewal_ops[clp->cl_minorversion]; ++ ops = clp->cl_mvops->state_renewal_ops; + dprintk("%s: start\n", __func__); + /* Are there any active superblocks? */ +- if (list_empty(&clp->cl_superblocks)) ++ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) + goto out; + spin_lock(&clp->cl_lock); + lease = clp->cl_lease_time; +diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-23 12:08:29.052491341 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-23 12:09:03.342373443 -0400 +@@ -53,6 +53,9 @@ + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include ++#include ++#include "pnfs.h" + + #define OPENOWNER_POOL_SIZE 8 + +@@ -126,6 +129,11 @@ static int nfs41_setup_state_renewal(str + int status; + struct nfs_fsinfo fsinfo; + ++ if (is_ds_only_client(clp)) { ++ nfs4_schedule_state_renewal(clp); ++ return 0; ++ } ++ + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ +@@ -145,7 +153,9 @@ static void nfs4_end_drain_session(struc + struct nfs4_session *ses = clp->cl_session; + int max_slots; + +- if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { ++ if (ses == NULL) ++ return; ++ if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + spin_lock(&ses->fc_slot_table.slot_tbl_lock); + max_slots = ses->fc_slot_table.max_slots; + while (max_slots--) { +@@ -167,7 +177,7 @@ static int nfs4_begin_drain_session(stru + struct nfs4_slot_table *tbl = &ses->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); ++ set_bit(NFS4_SESSION_DRAINING, 
&ses->session_state); + if (tbl->highest_used_slotid != -1) { + INIT_COMPLETION(ses->complete); + spin_unlock(&tbl->slot_tbl_lock); +@@ -371,7 +381,6 @@ nfs4_alloc_state_owner(void) + return NULL; + spin_lock_init(&sp->so_lock); + INIT_LIST_HEAD(&sp->so_states); +- INIT_LIST_HEAD(&sp->so_delegations); + rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); + sp->so_seqid.sequence = &sp->so_sequence; + spin_lock_init(&sp->so_sequence.lock); +@@ -384,7 +393,7 @@ static void + nfs4_drop_state_owner(struct nfs4_state_owner *sp) + { + if (!RB_EMPTY_NODE(&sp->so_client_node)) { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + rb_erase(&sp->so_client_node, &clp->cl_state_owners); +@@ -406,7 +415,6 @@ struct nfs4_state_owner *nfs4_get_state_ + new = nfs4_alloc_state_owner(); + if (new == NULL) + return NULL; +- new->so_client = clp; + new->so_server = server; + new->so_cred = cred; + spin_lock(&clp->cl_lock); +@@ -423,7 +431,7 @@ struct nfs4_state_owner *nfs4_get_state_ + + void nfs4_put_state_owner(struct nfs4_state_owner *sp) + { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + struct rpc_cred *cred = sp->so_cred; + + if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) +@@ -583,8 +591,24 @@ static void __nfs4_close(struct path *pa + if (!call_close) { + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); +- } else ++ } else { ++ u32 roc_iomode; ++ struct nfs_inode *nfsi = NFS_I(state->inode); ++ ++ if (has_layout(nfsi) && ++ (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) { ++ struct nfs4_pnfs_layout_segment range = { ++ .iomode = roc_iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ pnfs_return_layout(state->inode, &range, NULL, ++ RETURN_FILE, wait); ++ } ++ + nfs4_do_close(path, state, gfp_mask, wait); ++ } + } + + void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) +@@ 
-602,12 +626,21 @@ void nfs4_close_sync(struct path *path, + * that is compatible with current->files + */ + static struct nfs4_lock_state * +-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *pos; + list_for_each_entry(pos, &state->lock_states, ls_locks) { +- if (pos->ls_owner != fl_owner) ++ if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) + continue; ++ switch (pos->ls_owner.lo_type) { ++ case NFS4_POSIX_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.posix_owner != fl_owner) ++ continue; ++ break; ++ case NFS4_FLOCK_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.flock_owner != fl_pid) ++ continue; ++ } + atomic_inc(&pos->ls_count); + return pos; + } +@@ -619,10 +652,10 @@ __nfs4_find_lock_state(struct nfs4_state + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *lsp; +- struct nfs_client *clp = state->owner->so_client; ++ struct nfs_client *clp = state->owner->so_server->nfs_client; + + lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) +@@ -633,7 +666,18 @@ static struct nfs4_lock_state *nfs4_allo + lsp->ls_seqid.sequence = &lsp->ls_sequence; + atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; +- lsp->ls_owner = fl_owner; ++ lsp->ls_owner.lo_type = type; ++ switch (lsp->ls_owner.lo_type) { ++ case NFS4_FLOCK_LOCK_TYPE: ++ lsp->ls_owner.lo_u.flock_owner = fl_pid; ++ break; ++ case NFS4_POSIX_LOCK_TYPE: ++ lsp->ls_owner.lo_u.posix_owner = fl_owner; ++ break; ++ default: ++ kfree(lsp); ++ return NULL; ++ } + spin_lock(&clp->cl_lock); + nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); + spin_unlock(&clp->cl_lock); +@@ -643,7 
+687,7 @@ static struct nfs4_lock_state *nfs4_allo + + static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) + { +- struct nfs_client *clp = lsp->ls_state->owner->so_client; ++ struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); +@@ -657,13 +701,13 @@ static void nfs4_free_lock_state(struct + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) ++static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) + { + struct nfs4_lock_state *lsp, *new = NULL; + + for(;;) { + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, owner); ++ lsp = __nfs4_find_lock_state(state, owner, pid, type); + if (lsp != NULL) + break; + if (new != NULL) { +@@ -674,7 +718,7 @@ static struct nfs4_lock_state *nfs4_get_ + break; + } + spin_unlock(&state->state_lock); +- new = nfs4_alloc_lock_state(state, owner); ++ new = nfs4_alloc_lock_state(state, owner, pid, type); + if (new == NULL) + return NULL; + } +@@ -701,6 +745,8 @@ void nfs4_put_lock_state(struct nfs4_loc + if (list_empty(&state->lock_states)) + clear_bit(LK_STATE_IN_USE, &state->flags); + spin_unlock(&state->state_lock); ++ if (lsp->ls_flags & NFS_LOCK_INITIALIZED) ++ nfs4_release_lockowner(lsp); + nfs4_free_lock_state(lsp); + } + +@@ -728,7 +774,12 @@ int nfs4_set_lock_state(struct nfs4_stat + + if (fl->fl_ops != NULL) + return 0; +- lsp = nfs4_get_lock_state(state, fl->fl_owner); ++ if (fl->fl_flags & FL_POSIX) ++ lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); ++ else if (fl->fl_flags & FL_FLOCK) ++ lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); ++ else ++ return -EINVAL; + if (lsp == NULL) + return -ENOMEM; + fl->fl_u.nfs4_fl.owner = lsp; +@@ -740,7 +791,7 @@ int nfs4_set_lock_state(struct 
nfs4_stat + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. + */ +-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) ++void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) + { + struct nfs4_lock_state *lsp; + int seq; +@@ -753,7 +804,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst + return; + + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, fl_owner); ++ lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); + if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + spin_unlock(&state->state_lock); +@@ -1031,8 +1082,8 @@ restart: + * Open state on this file cannot be recovered + * All we can do is revert to using the zero stateid. + */ +- memset(state->stateid.data, 0, +- sizeof(state->stateid.data)); ++ memset(state->stateid.u.data, 0, ++ sizeof(state->stateid.u.data)); + /* Mark the file as being 'closed' */ + state->state = 0; + break; +@@ -1041,11 +1092,11 @@ restart: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: +@@ -1120,8 +1171,7 @@ static void nfs4_state_end_reclaim_reboo + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return; + +- nfs4_reclaim_complete(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); + + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { + sp = rb_entry(pos, 
struct nfs4_state_owner, so_client_node); +@@ -1211,8 +1261,8 @@ restart: + static int nfs4_check_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_maintenance_ops *ops = +- nfs4_state_renewal_ops[clp->cl_minorversion]; ++ const struct nfs4_state_maintenance_ops *ops = ++ clp->cl_mvops->state_renewal_ops; + int status = -NFS4ERR_EXPIRED; + + /* Is the client already known to have an expired lease? */ +@@ -1235,8 +1285,8 @@ out: + static int nfs4_reclaim_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_recovery_ops *ops = +- nfs4_reboot_recovery_ops[clp->cl_minorversion]; ++ const struct nfs4_state_recovery_ops *ops = ++ clp->cl_mvops->reboot_recovery_ops; + int status = -ENOENT; + + cred = ops->get_clid_cred(clp); +@@ -1421,6 +1471,7 @@ static void nfs4_state_manager(struct nf + } + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); ++ pnfs_destroy_all_layouts(clp); + } + + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { +@@ -1444,7 +1495,7 @@ static void nfs4_state_manager(struct nf + /* First recover reboot state... */ + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->reboot_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + continue; +@@ -1458,7 +1509,7 @@ static void nfs4_state_manager(struct nf + /* Now recover expired state... 
*/ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_nograce_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->nograce_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || + test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) +diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-23 12:08:29.054481400 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-23 12:09:03.346481283 -0400 +@@ -50,8 +50,11 @@ + #include + #include + #include ++#include ++#include + #include "nfs4_fs.h" + #include "internal.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_XDR + +@@ -89,7 +92,7 @@ static int nfs4_stat_to_errno(int); + #define encode_getfh_maxsz (op_encode_hdr_maxsz) + #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +-#define nfs4_fattr_bitmap_maxsz 3 ++#define nfs4_fattr_bitmap_maxsz 4 + #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) + #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +@@ -111,7 +114,11 @@ static int nfs4_stat_to_errno(int); + #define encode_restorefh_maxsz (op_encode_hdr_maxsz) + #define decode_restorefh_maxsz (op_decode_hdr_maxsz) + #define encode_fsinfo_maxsz (encode_getattr_maxsz) +-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) ++/* The 5 accounts for the PNFS attributes, and assumes that at most three ++ * layout types will be returned. 
++ */ ++#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ ++ nfs4_fattr_bitmap_maxsz + 8 + 5) + #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) + #define decode_renew_maxsz (op_decode_hdr_maxsz) + #define encode_setclientid_maxsz \ +@@ -202,14 +209,17 @@ static int nfs4_stat_to_errno(int); + #define encode_link_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) + #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) ++#define encode_lockowner_maxsz (7) + #define encode_lock_maxsz (op_encode_hdr_maxsz + \ + 7 + \ +- 1 + encode_stateid_maxsz + 8) ++ 1 + encode_stateid_maxsz + 1 + \ ++ encode_lockowner_maxsz) + #define decode_lock_denied_maxsz \ + (8 + decode_lockowner_maxsz) + #define decode_lock_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +-#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) ++#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ ++ encode_lockowner_maxsz) + #define decode_lockt_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) + #define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ +@@ -217,6 +227,11 @@ static int nfs4_stat_to_errno(int); + 4) + #define decode_locku_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) ++#define encode_release_lockowner_maxsz \ ++ (op_encode_hdr_maxsz + \ ++ encode_lockowner_maxsz) ++#define decode_release_lockowner_maxsz \ ++ (op_decode_hdr_maxsz) + #define encode_access_maxsz (op_encode_hdr_maxsz + 1) + #define decode_access_maxsz (op_decode_hdr_maxsz + 2) + #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ +@@ -302,6 +317,35 @@ static int nfs4_stat_to_errno(int); + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) + #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) + #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) ++#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ ++ encode_verifier_maxsz) ++#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + \ ++ decode_verifier_maxsz + \ ++ 
XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ ++ NFS4_PNFS_DEVICEID4_SIZE)) ++#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ ++ XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE)) ++#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ ++ 4 /*layout type */ + \ ++ 4 /* opaque devaddr4 length */ +\ ++ 4 /* notification bitmap length */ + \ ++ 4 /* notification bitmap */) ++#define encode_layoutget_sz (op_encode_hdr_maxsz + 10 + \ ++ encode_stateid_maxsz) ++#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ ++ decode_stateid_maxsz + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) ++#define encode_layoutcommit_sz (18 + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ ++ op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz) ++#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz) ++#define encode_layoutreturn_sz (8 + op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz + \ ++ 1 /* FIXME: opaque lrf_body always empty at ++ *the moment */) ++#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ ++ 1 + decode_stateid_maxsz) + #else /* CONFIG_NFS_V4_1 */ + #define encode_sequence_maxsz 0 + #define decode_sequence_maxsz 0 +@@ -471,6 +515,12 @@ static int nfs4_stat_to_errno(int); + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_locku_maxsz) ++#define NFS4_enc_release_lockowner_sz \ ++ (compound_encode_hdr_maxsz + \ ++ encode_lockowner_maxsz) ++#define NFS4_dec_release_lockowner_sz \ ++ (compound_decode_hdr_maxsz + \ ++ decode_lockowner_maxsz) + #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ +@@ -685,6 +735,60 @@ static int nfs4_stat_to_errno(int); + #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) ++#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_getdevicelist_maxsz) ++#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ ++ 
decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_getdevicelist_maxsz) ++#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_getdeviceinfo_maxsz) ++#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_getdeviceinfo_maxsz) ++#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutget_sz) ++#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutget_maxsz) ++#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_layoutcommit_sz + \ ++ encode_getattr_maxsz) ++#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutcommit_maxsz + \ ++ decode_getattr_maxsz) ++#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutreturn_sz) ++#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutreturn_maxsz) ++#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_write_maxsz) ++#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_write_maxsz) ++#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_commit_maxsz) ++#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_commit_maxsz) + + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + +@@ -915,7 +1019,7 @@ static void encode_close(struct xdr_stre + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); + *p++ = 
cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); +- xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_close_maxsz; + } +@@ -989,6 +1093,35 @@ static void encode_getattr_two(struct xd + hdr->replen += decode_getattr_maxsz; + } + ++static void ++encode_getattr_three(struct xdr_stream *xdr, ++ uint32_t bm0, uint32_t bm1, uint32_t bm2, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_GETATTR); ++ if (bm2) { ++ p = reserve_space(xdr, 16); ++ *p++ = cpu_to_be32(3); ++ *p++ = cpu_to_be32(bm0); ++ *p++ = cpu_to_be32(bm1); ++ *p = cpu_to_be32(bm2); ++ } else if (bm1) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(2); ++ *p++ = cpu_to_be32(bm0); ++ *p = cpu_to_be32(bm1); ++ } else { ++ p = reserve_space(xdr, 8); ++ *p++ = cpu_to_be32(1); ++ *p = cpu_to_be32(bm0); ++ } ++ hdr->nops++; ++ hdr->replen += decode_getattr_maxsz; ++} ++ + static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { + encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], +@@ -997,8 +1130,11 @@ static void encode_getfattr(struct xdr_s + + static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { +- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], +- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); ++ encode_getattr_three(xdr, ++ bitmask[0] & nfs4_fsinfo_bitmap[0], ++ bitmask[1] & nfs4_fsinfo_bitmap[1], ++ bitmask[2] & nfs4_fsinfo_bitmap[2], ++ hdr); + } + + static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +@@ -1042,6 +1178,17 @@ static inline uint64_t nfs4_lock_length( + return fl->fl_end - fl->fl_start + 1; + } + ++static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) ++{ ++ __be32 *p; ++ ++ p = 
reserve_space(xdr, 28); ++ p = xdr_encode_hyper(p, lowner->clientid); ++ *p++ = cpu_to_be32(16); ++ p = xdr_encode_opaque_fixed(p, "lock id:", 8); ++ xdr_encode_hyper(p, lowner->id); ++} ++ + /* + * opcode,type,reclaim,offset,length,new_lock_owner = 32 + * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 +@@ -1058,18 +1205,16 @@ static void encode_lock(struct xdr_strea + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); + if (args->new_lock_owner){ +- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); ++ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->open_stateid->u.data, ++ NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + } + else { + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); +- p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->lock_stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); + } + hdr->nops++; +@@ -1080,15 +1225,12 @@ static void encode_lockt(struct xdr_stre + { + __be32 *p; + +- p = reserve_space(xdr, 52); ++ p = reserve_space(xdr, 24); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + hdr->nops++; + 
hdr->replen += decode_lockt_maxsz; + } +@@ -1101,13 +1243,25 @@ static void encode_locku(struct xdr_stre + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->stateid->u.data, ++ NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + hdr->nops++; + hdr->replen += decode_locku_maxsz; + } + ++static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); ++ encode_lockowner(xdr, lowner); ++ hdr->nops++; ++ hdr->replen += decode_release_lockowner_maxsz; ++} ++ + static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) + { + int len = name->len; +@@ -1172,7 +1326,7 @@ static inline void encode_createmode(str + break; + default: + clp = arg->server->nfs_client; +- if (clp->cl_minorversion > 0) { ++ if (clp->cl_mvops->minor_version > 0) { + if (nfs4_has_persistent_session(clp)) { + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); +@@ -1251,7 +1405,7 @@ static inline void encode_claim_delegate + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + encode_string(xdr, name->len, name->name); + } + +@@ -1282,7 +1436,7 @@ static void encode_open_confirm(struct x + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = 
cpu_to_be32(arg->seqid->sequence->counter); + hdr->nops++; + hdr->replen += decode_open_confirm_maxsz; +@@ -1294,7 +1448,7 @@ static void encode_open_downgrade(struct + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_share_access(xdr, arg->fmode); + hdr->nops++; +@@ -1324,17 +1478,17 @@ static void encode_putrootfh(struct xdr_ + hdr->replen += decode_putrootfh_maxsz; + } + +-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) ++static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) + { + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, NFS4_STATEID_SIZE); + if (ctx->state != NULL) { +- nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); +- xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); ++ nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); ++ xdr_encode_opaque_fixed(p, stateid.u.data, NFS4_STATEID_SIZE); + } else +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + } + + static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +@@ -1344,7 +1498,7 @@ static void encode_read(struct xdr_strea + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); +@@ -1448,7 +1602,7 @@ encode_setacl(struct xdr_stream *xdr, st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 
++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); +@@ -1479,7 +1633,7 @@ static void encode_setattr(struct xdr_st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid.u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_setattr_maxsz; + encode_attrs(xdr, arg->iap, server); +@@ -1523,7 +1677,7 @@ static void encode_write(struct xdr_stre + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); +@@ -1542,7 +1696,7 @@ static void encode_delegreturn(struct xd + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + + *p++ = cpu_to_be32(OP_DELEGRETURN); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_delegreturn_maxsz; + } +@@ -1696,6 +1850,162 @@ static void encode_sequence(struct xdr_s + #endif /* CONFIG_NFS_V4_1 */ + } + ++#ifdef CONFIG_NFS_V4_1 ++static void ++encode_getdevicelist(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_getdevicelist_arg *args, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ nfs4_verifier dummy = { ++ .data = "dummmmmy", ++ }; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_GETDEVICELIST); ++ *p++ = cpu_to_be32(args->layoutclass); ++ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); ++ xdr_encode_hyper(p, 0ULL); /* cookie */ ++ encode_nfs4_verifier(xdr, &dummy); ++ hdr->nops++; ++} ++ ++static void ++encode_getdeviceinfo(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_getdeviceinfo_arg *args, ++ struct compound_hdr *hdr) ++{ ++ int has_bitmap = (args->pdev->dev_notify_types != 
0); ++ int len = 16 + NFS4_PNFS_DEVICEID4_SIZE + (has_bitmap * 4); ++ __be32 *p; ++ ++ p = reserve_space(xdr, len); ++ *p++ = cpu_to_be32(OP_GETDEVICEINFO); ++ p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ *p++ = cpu_to_be32(args->pdev->layout_type); ++ *p++ = cpu_to_be32(args->pdev->pglen + len); /* gdia_maxcount */ ++ *p++ = cpu_to_be32(has_bitmap); /* bitmap length [01] */ ++ if (has_bitmap) ++ *p = cpu_to_be32(args->pdev->dev_notify_types); ++ hdr->nops++; ++} ++ ++static void ++encode_layoutget(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_layoutget_arg *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(OP_LAYOUTGET); ++ *p++ = cpu_to_be32(0); /* Signal layout available */ ++ *p++ = cpu_to_be32(args->type); ++ *p++ = cpu_to_be32(args->lseg.iomode); ++ p = xdr_encode_hyper(p, args->lseg.offset); ++ p = xdr_encode_hyper(p, args->lseg.length); ++ p = xdr_encode_hyper(p, args->minlength); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, NFS4_STATEID_SIZE); ++ *p = cpu_to_be32(args->maxcount); ++ ++ dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", ++ __func__, ++ args->type, ++ args->lseg.iomode, ++ (unsigned long)args->lseg.offset, ++ (unsigned long)args->lseg.length, ++ args->maxcount); ++ hdr->nops++; ++ hdr->replen += decode_layoutget_maxsz; ++} ++ ++static int ++encode_layoutcommit(struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *args, ++ struct compound_hdr *hdr) ++{ ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ __be32 *p; ++ ++ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__, ++ args->lseg.length, args->lseg.offset, args->lastbytewritten, ++ args->layout_type); ++ ++ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE); ++ *p++ = 
cpu_to_be32(OP_LAYOUTCOMMIT); ++ p = xdr_encode_hyper(p, args->lseg.offset); ++ p = xdr_encode_hyper(p, args->lseg.length); ++ *p++ = cpu_to_be32(0); /* reclaim */ ++ p = xdr_encode_opaque_fixed(p, args->stateid.u.data, NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(1); /* newoffset = TRUE */ ++ p = xdr_encode_hyper(p, args->lastbytewritten); ++ *p = cpu_to_be32(args->time_modify_changed != 0); ++ if (args->time_modify_changed) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(0); ++ *p++ = cpu_to_be32(args->time_modify.tv_sec); ++ *p = cpu_to_be32(args->time_modify.tv_nsec); ++ } ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(args->layout_type); ++ ++ if (ld_io_ops->encode_layoutcommit) { ++ ld_io_ops->encode_layoutcommit(NFS_I(args->inode)->layout, ++ xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ xdr_encode_opaque(p, NULL, 0); ++ } ++ ++ hdr->nops++; ++ hdr->replen += decode_layoutcommit_maxsz; ++ return 0; ++} ++ ++static void ++encode_layoutreturn(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_layoutreturn_arg *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_LAYOUTRETURN); ++ *p++ = cpu_to_be32(args->reclaim); ++ *p++ = cpu_to_be32(args->layout_type); ++ *p++ = cpu_to_be32(args->lseg.iomode); ++ *p = cpu_to_be32(args->return_type); ++ if (args->return_type == RETURN_FILE) { ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ ++ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); ++ p = xdr_encode_hyper(p, args->lseg.offset); ++ p = xdr_encode_hyper(p, args->lseg.length); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, ++ NFS4_STATEID_SIZE); ++ dprintk("%s: call %pF\n", __func__, ++ ld_io_ops->encode_layoutreturn); ++ if (ld_io_ops->encode_layoutreturn) { ++ ld_io_ops->encode_layoutreturn( ++ NFS_I(args->inode)->layout, xdr, args); ++ } 
else { ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(0); ++ } ++ } ++ hdr->nops++; ++ hdr->replen += decode_layoutreturn_maxsz; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" ENCODE ROUTINES. + */ +@@ -1704,7 +2014,7 @@ static u32 nfs4_xdr_minorversion(const s + { + #if defined(CONFIG_NFS_V4_1) + if (args->sa_session) +- return args->sa_session->clp->cl_minorversion; ++ return args->sa_session->clp->cl_mvops->minor_version; + #endif /* CONFIG_NFS_V4_1 */ + return 0; + } +@@ -2048,6 +2358,20 @@ static int nfs4_xdr_enc_locku(struct rpc + return 0; + } + ++static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = 0, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_release_lockowner(&xdr, &args->lock_owner, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ + /* + * Encode a READLINK request + */ +@@ -2330,7 +2654,7 @@ static int nfs4_xdr_enc_setclientid_conf + struct compound_hdr hdr = { + .nops = 0, + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2395,7 +2719,7 @@ static int nfs4_xdr_enc_exchange_id(stru + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2413,7 +2737,7 @@ static int nfs4_xdr_enc_create_session(s + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2431,7 +2755,7 @@ static int nfs4_xdr_enc_destroy_session( + { + struct 
xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = session->clp->cl_minorversion, ++ .minorversion = session->clp->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2469,7 +2793,7 @@ static int nfs4_xdr_enc_get_lease_time(s + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2499,6 +2823,159 @@ static int nfs4_xdr_enc_reclaim_complete + return 0; + } + ++/* ++ * Encode GETDEVICELIST request ++ */ ++static int ++nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_getdevicelist_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_getdevicelist(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode GETDEVICEINFO request ++ */ ++static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_getdeviceinfo_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ int replen; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_getdeviceinfo(&xdr, args, &hdr); ++ ++ /* set up reply kvec. 
Subtract notification bitmap max size (8) ++ * so that notification bitmap is put in xdr_buf tail */ ++ replen = (RPC_REPHDRSIZE + auth->au_rslack + ++ NFS4_dec_getdeviceinfo_sz - 8) << 2; ++ xdr_inline_pages(&req->rq_rcv_buf, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", ++ __func__, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTGET request ++ */ ++static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_layoutget_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutget(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTCOMMIT request ++ */ ++static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p, ++ struct pnfs_layoutcommit_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_layoutcommit(&xdr, args, &hdr); ++ encode_getfattr(&xdr, args->bitmask, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTRETURN request ++ */ ++static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_layoutreturn_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, 
&hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutreturn(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server WRITE request ++ */ ++static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_write(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server COMMIT request ++ */ ++static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_commit(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} + #endif /* CONFIG_NFS_V4_1 */ + + static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +@@ -2599,14 +3076,17 @@ static int decode_attr_bitmap(struct xdr + goto out_overflow; + bmlen = be32_to_cpup(p); + +- bitmap[0] = bitmap[1] = 0; ++ bitmap[0] = bitmap[1] = bitmap[2] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); +- if (bmlen > 1) +- bitmap[1] = be32_to_cpup(p); ++ if (bmlen > 1) { ++ bitmap[1] = be32_to_cpup(p++); ++ if (bmlen > 2) ++ bitmap[2] = be32_to_cpup(p); ++ } + } + return 0; + out_overflow: +@@ -2635,8 +3115,9 @@ static int decode_attr_supported(struct + 
decode_attr_bitmap(xdr, bitmask); + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else +- bitmask[0] = bitmask[1] = 0; +- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); ++ bitmask[0] = bitmask[1] = bitmask[2] = 0; ++ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, ++ bitmask[0], bitmask[1], bitmask[2]); + return 0; + } + +@@ -3565,7 +4046,7 @@ static int decode_opaque_fixed(struct xd + + static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) + { +- return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); ++ return decode_opaque_fixed(xdr, stateid->u.data, NFS4_STATEID_SIZE); + } + + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +@@ -3621,7 +4102,7 @@ out_overflow: + static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3647,7 +4128,7 @@ xdr_error: + static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3679,7 +4160,7 @@ xdr_error: + static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3705,7 +4186,7 @@ static int decode_getfattr(struct xdr_st + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}, ++ bitmap[3] = {0}, + type; + int status; + umode_t fmode = 0; +@@ -3824,24 +4305,101 @@ xdr_error: + return status; + } + +- +-static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * Decode potentially multiple layout types. 
Currently we only support ++ * one layout driver per file system. ++ */ ++static int decode_pnfs_list(struct xdr_stream *xdr, uint32_t *layoutclass) + { +- __be32 *savep; +- uint32_t attrlen, bitmap[2]; +- int status; ++ __be32 *p; ++ uint32_t num; + +- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +- goto xdr_error; +- if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) +- goto xdr_error; +- if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) +- goto xdr_error; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ num = be32_to_cpup(p); + +- fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ /* pNFS is not supported by the underlying file system */ ++ if (num == 0) { ++ *layoutclass = 0; ++ return 0; ++ } + +- if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) +- goto xdr_error; ++ /* TODO: We will eventually support multiple layout drivers ? */ ++ if (num > 1) ++ printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " ++ "per filesystem not supported\n", __func__); ++ ++ /* Consume all num types, but only the first one is used */ ++ p = xdr_inline_decode(xdr, num * 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ *layoutclass = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++/* ++ * The type of file system exported ++ */ ++static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *layoutclass) ++{ ++ int status = 0; ++ ++ dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); ++ if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) ++ return -EIO; ++ if (likely(bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES)) { ++ status = decode_pnfs_list(xdr, layoutclass); ++ bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; ++ } ++ return status; ++} ++ ++/* ++ * The preferred block size for layout directed io ++ */ ++static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *res) ++{ ++ __be32 *p; ++
dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); ++ *res = 0; ++ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) { ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++ } ++ *res = be32_to_cpup(p); ++ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; ++ } ++ return 0; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ ++static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++{ ++ __be32 *savep; ++ uint32_t attrlen, bitmap[3]; ++ int status; ++ ++ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) ++ goto xdr_error; ++ ++ fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ ++ if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) ++ goto xdr_error; + if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0) + goto xdr_error; + if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0) +@@ -3850,6 +4408,14 @@ static int decode_fsinfo(struct xdr_stre + if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) + goto xdr_error; + fsinfo->wtpref = fsinfo->wtmax; ++#if defined(CONFIG_NFS_V4_1) ++ status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); ++ if (status) ++ goto xdr_error; ++ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); ++ if (status) ++ goto xdr_error; ++#endif /* CONFIG_NFS_V4_1 */ + + status = verify_attr_len(xdr, savep, attrlen); + xdr_error: +@@ -3973,6 +4539,11 @@ static int decode_locku(struct xdr_strea + return status; + } + ++static int decode_release_lockowner(struct xdr_stream *xdr) ++{ ++ return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); ++} ++ + static int decode_lookup(struct xdr_stream *xdr) + { + return decode_op_hdr(xdr, OP_LOOKUP); +@@ -4333,7 +4904,7 @@ static int decode_getacl(struct xdr_stre + { + __be32 *savep; + 
uint32_t attrlen, +- bitmap[2] = {0}; ++ bitmap[3] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + +@@ -4682,6 +5253,226 @@ out_overflow: + #endif /* CONFIG_NFS_V4_1 */ + } + ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * TODO: Need to handle case when EOF != true; ++ */ ++static int decode_getdevicelist(struct xdr_stream *xdr, ++ struct pnfs_devicelist *res) ++{ ++ __be32 *p; ++ int status, i; ++ struct nfs_writeverf verftemp; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICELIST); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 8 + 8 + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ ++ /* TODO: Skip cookie for now */ ++ p += 2; ++ ++ /* Read verifier */ ++ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); ++ ++ res->num_devs = be32_to_cpup(p); ++ ++ dprintk("%s: num_dev %d\n", __func__, res->num_devs); ++ ++ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) ++ return -NFS4ERR_REP_TOO_BIG; ++ ++ p = xdr_inline_decode(xdr, ++ res->num_devs * NFS4_PNFS_DEVICEID4_SIZE + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ for (i = 0; i < res->num_devs; i++) ++ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ res->eof = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_getdeviceinfo(struct xdr_stream *xdr, ++ struct pnfs_device *pdev) ++{ ++ __be32 *p; ++ uint32_t len, type; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICEINFO); ++ if (status) { ++ if (status == -ETOOSMALL) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->mincount = be32_to_cpup(p); ++ dprintk("%s: Min count too small. 
mincnt = %u\n", ++ __func__, pdev->mincount); ++ } ++ return status; ++ } ++ ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ type = be32_to_cpup(p++); ++ if (type != pdev->layout_type) { ++ dprintk("%s: layout mismatch req: %u pdev: %u\n", ++ __func__, pdev->layout_type, type); ++ return -EINVAL; ++ } ++ /* ++ * Get the length of the opaque device_addr4. xdr_read_pages places ++ * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) ++ * and places the remaining xdr data in xdr_buf->tail ++ */ ++ pdev->mincount = be32_to_cpup(p); ++ xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ ++ ++ /* At most one bitmap word */ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ len = be32_to_cpup(p); ++ if (len) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->dev_notify_types = be32_to_cpup(p); ++ } else ++ pdev->dev_notify_types = 0; ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, ++ struct nfs4_pnfs_layoutget_res *res) ++{ ++ __be32 *p; ++ int status; ++ u32 layout_count, dummy; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTGET); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->return_on_close = be32_to_cpup(p++); ++ p = xdr_decode_opaque_fixed(p, res->stateid.u.data, NFS4_STATEID_SIZE); ++ layout_count = be32_to_cpup(p); ++ if (!layout_count) { ++ dprintk("%s: server responded with empty layout array\n", ++ __func__); ++ return -EINVAL; ++ } ++ ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ p = xdr_decode_hyper(p, &res->lseg.offset); ++ p = xdr_decode_hyper(p, &res->lseg.length); ++ res->lseg.iomode = be32_to_cpup(p++); ++ res->type = be32_to_cpup(p++); ++ ++ status = decode_opaque_inline(xdr, 
&res->layout.len, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ ++ dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", ++ __func__, ++ (unsigned long)res->lseg.offset, ++ (unsigned long)res->lseg.length, ++ res->lseg.iomode, ++ res->type, ++ res->layout.len); ++ ++ /* presumably, pnfs4_proc_layoutget allocated a single page */ ++ if (res->layout.len > PAGE_SIZE) ++ return -ENOMEM; ++ memcpy(res->layout.buf, p, res->layout.len); ++ ++ /* FIXME: the whole layout array should be passed up to the pnfs ++ * client */ ++ if (layout_count > 1) { ++ dprintk("%s: server responded with %d layouts, dropping tail\n", ++ __func__, layout_count); ++ ++ while (--layout_count) { ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ status = decode_opaque_inline(xdr, &dummy, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ } ++ } ++ ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutreturn(struct xdr_stream *xdr, ++ struct nfs4_pnfs_layoutreturn_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTRETURN); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->lrs_present = be32_to_cpup(p); ++ if (res->lrs_present) ++ status = decode_stateid(xdr, &res->stateid); ++ return status; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutcommit(struct xdr_stream *xdr, ++ struct rpc_rqst *req, ++ struct pnfs_layoutcommit_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->sizechanged = be32_to_cpup(p); ++ ++ if (res->sizechanged) { ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ xdr_decode_hyper(p, &res->newsize); ++ }
++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" DECODE ROUTINES. + */ +@@ -5259,6 +6050,19 @@ out: + return status; + } + ++static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (!status) ++ status = decode_release_lockowner(&xdr); ++ return status; ++} ++ + /* + * Decode READLINK response + */ +@@ -5696,6 +6500,186 @@ static int nfs4_xdr_dec_reclaim_complete + status = decode_reclaim_complete(&xdr, (void *)NULL); + return status; + } ++ ++/* ++ * Decode GETDEVICELIST response ++ */ ++static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_getdevicelist_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ dprintk("decoding getdevicelist!\n"); ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status != 0) ++ goto out; ++ status = decode_getdevicelist(&xdr, res->devlist); ++out: ++ return status; ++} ++ ++/* ++ * Decode GETDEVICEINFO response ++ */ ++static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_getdeviceinfo_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_getdeviceinfo(&xdr, res->pdev); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTGET response ++ */ ++static
int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_layoutget_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutget(&xdr, rqstp, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTRETURN response ++ */ ++static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_layoutreturn_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutreturn(&xdr, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTCOMMIT response ++ */ ++static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct pnfs_layoutcommit_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutcommit(&xdr, rqstp, res); ++ if (status) ++ goto out; ++ decode_getfattr(&xdr, res->fattr, res->server, ++ !RPC_IS_ASYNC(rqstp->rq_task)); ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server WRITE response ++ */ ++static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream 
xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_write(&xdr, res); ++ if (!status) ++ return res->count; ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server COMMIT response ++ */ ++static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_commit(&xdr, res); ++out: ++ return status; ++} + #endif /* CONFIG_NFS_V4_1 */ + + __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +@@ -5866,6 +6850,7 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), ++ PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + #if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), +@@ -5873,6 +6858,13 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), ++ PROC(PNFS_GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), ++ PROC(PNFS_GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), ++ PROC(PNFS_LAYOUTGET, enc_layoutget, dec_layoutget), ++ PROC(PNFS_LAYOUTCOMMIT, 
enc_layoutcommit, dec_layoutcommit), ++ PROC(PNFS_LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), ++ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite), ++ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit), + #endif /* CONFIG_NFS_V4_1 */ + }; + +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-23 12:09:03.348511665 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-23 12:09:03.348511665 -0400 +@@ -0,0 +1,11 @@ ++# ++# Makefile for the pNFS Objects Layout Driver kernel module ++# ++objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o ++ ++# ++# Panasas pNFS Layout Driver kernel module ++# ++panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o ++obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-23 12:09:03.349501459 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-23 12:09:03.349501459 -0400 +@@ -0,0 +1,1087 @@ ++/* ++ * objio_osd.c ++ * ++ * pNFS Objects layout implementation over open-osd initiator library ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++#define _LLU(x) ((unsigned long long)x) ++ ++enum { BIO_MAX_PAGES_KMALLOC = ++ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), ++}; ++ ++/* A per mountpoint struct currently for device cache */ ++struct objio_mount_type { ++ struct list_head dev_list; ++ spinlock_t dev_list_lock; ++}; ++ ++struct _dev_ent { ++ struct list_head list; ++ struct pnfs_deviceid d_id; ++ struct osd_dev *od; ++}; ++ ++static void _dev_list_remove_all(struct objio_mount_type *omt) ++{ ++ spin_lock(&omt->dev_list_lock); ++ ++ while (!list_empty(&omt->dev_list)) { ++ struct _dev_ent *de = list_entry(omt->dev_list.next, ++ struct _dev_ent, list); ++ ++ list_del_init(&de->list); ++ osduld_put_device(de->od); ++ kfree(de); ++ } ++ ++ spin_unlock(&omt->dev_list_lock); ++} ++ ++static struct osd_dev *___dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct list_head *le; ++ ++ list_for_each(le, &omt->dev_list) { ++ struct _dev_ent *de = list_entry(le, struct _dev_ent, list); ++ ++ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id))) ++ return de->od; ++ } ++ ++ return NULL; ++} ++ ++static struct osd_dev *_dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct osd_dev *od; ++ ++ spin_lock(&omt->dev_list_lock); ++ od = ___dev_list_find(omt, d_id); ++ spin_unlock(&omt->dev_list_lock); ++ return od; ++} ++ ++static int _dev_list_add(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id, struct osd_dev *od) ++{ ++ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL); ++ ++ if (!de) ++ return -ENOMEM; ++ ++ spin_lock(&omt->dev_list_lock); ++ ++ if (___dev_list_find(omt, d_id)) { ++ kfree(de); ++ goto out; ++ } ++ ++ de->d_id = *d_id; ++ de->od = od; ++ list_add(&de->list, &omt->dev_list); ++ ++out: ++ spin_unlock(&omt->dev_list_lock); ++ return 0; ++} ++ ++struct 
objio_segment { ++ struct pnfs_osd_layout *layout; ++ ++ unsigned mirrors_p1; ++ unsigned stripe_unit; ++ unsigned group_width; /* Data stripe_units without integrity comps */ ++ u64 group_depth; ++ unsigned group_count; ++ ++ unsigned num_comps; ++ /* variable length */ ++ struct osd_dev *ods[1]; ++}; ++ ++struct objio_state; ++typedef ssize_t (*objio_done_fn)(struct objio_state *ios); ++ ++struct objio_state { ++ /* Generic layer */ ++ struct objlayout_io_state ol_state; ++ ++ struct objio_segment *objio_seg; ++ ++ struct kref kref; ++ objio_done_fn done; ++ void *private; ++ ++ unsigned long length; ++ unsigned numdevs; /* Actually used devs in this IO */ ++ /* A per-device variable array of size numdevs */ ++ struct _objio_per_comp { ++ struct bio *bio; ++ struct osd_request *or; ++ unsigned long length; ++ u64 offset; ++ unsigned dev; ++ } per_dev[]; ++}; ++ ++/* Send and wait for a get_device_info of devices in the layout, ++ then look them up with the osd_initiator library */ ++static struct osd_dev *_device_lookup(struct pnfs_layout_type *pnfslay, ++ struct objio_segment *objio_seg, unsigned comp) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ struct pnfs_osd_deviceaddr *deviceaddr; ++ struct pnfs_deviceid *d_id; ++ struct osd_dev *od; ++ struct osd_dev_info odi; ++ struct objio_mount_type *omt = PNFS_NFS_SERVER(pnfslay)->pnfs_ld_data; ++ int err; ++ ++ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id; ++ ++ od = _dev_list_find(omt, d_id); ++ if (od) ++ return od; ++ ++ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr); ++ if (unlikely(err)) { ++ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err); ++ return ERR_PTR(err); ++ } ++ ++ odi.systemid_len = deviceaddr->oda_systemid.len; ++ if (odi.systemid_len > sizeof(odi.systemid)) { ++ err = -EINVAL; ++ goto out; ++ } else if (odi.systemid_len) ++ memcpy(odi.systemid, deviceaddr->oda_systemid.data, ++ odi.systemid_len); ++ odi.osdname_len = 
deviceaddr->oda_osdname.len; ++ odi.osdname = (u8 *)deviceaddr->oda_osdname.data; ++ ++ if (!odi.osdname_len && !odi.systemid_len) { ++ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", ++ __func__); ++ err = -ENODEV; ++ goto out; ++ } ++ ++ od = osduld_info_lookup(&odi); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ dprintk("%s: osduld_info_lookup => %d\n", __func__, err); ++ goto out; ++ } ++ ++ _dev_list_add(omt, d_id, od); ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ objlayout_put_deviceinfo(deviceaddr); ++ return err ? ERR_PTR(err) : od; ++} ++ ++static int objio_devices_lookup(struct pnfs_layout_type *pnfslay, ++ struct objio_segment *objio_seg) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ unsigned i, num_comps = layout->olo_num_comps; ++ int err; ++ ++ /* lookup all devices */ ++ for (i = 0; i < num_comps; i++) { ++ struct osd_dev *od; ++ ++ od = _device_lookup(pnfslay, objio_seg, i); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ goto out; ++ } ++ objio_seg->ods[i] = od; ++ } ++ objio_seg->num_comps = num_comps; ++ err = 0; ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ return err; ++} ++ ++static int _verify_data_map(struct pnfs_osd_layout *layout) ++{ ++ struct pnfs_osd_data_map *data_map = &layout->olo_map; ++ u64 stripe_length; ++ u32 group_width; ++ ++/* FIXME: Only raid0 for now. 
if not go through MDS */ ++ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { ++ printk(KERN_ERR "Only RAID_0 for now\n"); ++ return -ENOTSUPP; ++ } ++ if (!data_map->odm_num_comps || 0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { ++ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", ++ data_map->odm_num_comps, data_map->odm_mirror_cnt); ++ return -EINVAL; ++ } ++ ++ if (data_map->odm_group_width) ++ group_width = data_map->odm_group_width; ++ else ++ group_width = data_map->odm_num_comps / ++ (data_map->odm_mirror_cnt + 1); ++ ++ stripe_length = (u64)data_map->odm_stripe_unit * group_width; ++ if (stripe_length >= (1ULL << 32)) { ++ printk(KERN_ERR "Total Stripe length(0x%llx)" ++ " >= 32bit is not supported\n", _LLU(stripe_length)); ++ return -ENOTSUPP; ++ } ++ ++ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { ++ printk(KERN_ERR "Stripe Unit(0x%llx)" ++ " must be Multiples of PAGE_SIZE(0x%lx)\n", ++ _LLU(data_map->odm_stripe_unit), PAGE_SIZE); ++ return -ENOTSUPP; ++ } ++ ++ return 0; ++} ++ ++int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_type *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) ++{ ++ struct objio_segment *objio_seg; ++ int err; ++ ++ err = _verify_data_map(layout); ++ if (unlikely(err)) ++ return err; ++ ++ objio_seg = kzalloc(sizeof(*objio_seg) + ++ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]), ++ GFP_KERNEL); ++ if (!objio_seg) ++ return -ENOMEM; ++ ++ objio_seg->layout = layout; ++ err = objio_devices_lookup(pnfslay, objio_seg); ++ if (err) ++ goto free_seg; ++ ++ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1; ++ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit; ++ if (layout->olo_map.odm_group_width) { ++ objio_seg->group_width = layout->olo_map.odm_group_width; ++ objio_seg->group_depth = layout->olo_map.odm_group_depth; ++ objio_seg->group_count = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1 / ++ objio_seg->group_width; ++ } else { ++
objio_seg->group_width = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1; ++ objio_seg->group_depth = -1; ++ objio_seg->group_count = 1; ++ } ++ ++ *outp = objio_seg; ++ return 0; ++ ++free_seg: ++ dprintk("%s: Error: return %d\n", __func__, err); ++ kfree(objio_seg); ++ *outp = NULL; ++ return err; ++} ++ ++void objio_free_lseg(void *p) ++{ ++ struct objio_segment *objio_seg = p; ++ ++ kfree(objio_seg); ++} ++ ++int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct objio_segment *objio_seg = seg; ++ struct objio_state *ios; ++ const unsigned first_size = sizeof(*ios) + ++ objio_seg->num_comps * sizeof(ios->per_dev[0]); ++ const unsigned sec_size = objio_seg->num_comps * ++ sizeof(ios->ol_state.ioerrs[0]); ++ ++ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps); ++ ios = kzalloc(first_size + sec_size, GFP_KERNEL); ++ if (unlikely(!ios)) ++ return -ENOMEM; ++ ++ ios->objio_seg = objio_seg; ++ ios->ol_state.ioerrs = ((void *)ios) + first_size; ++ ios->ol_state.num_comps = objio_seg->num_comps; ++ ++ *outp = &ios->ol_state; ++ return 0; ++} ++ ++void objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ ++ kfree(ios); ++} ++ ++enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) ++{ ++ switch (oep) { ++ case OSD_ERR_PRI_NO_ERROR: ++ return (enum pnfs_osd_errno)0; ++ ++ case OSD_ERR_PRI_CLEAR_PAGES: ++ BUG_ON(1); ++ return 0; ++ ++ case OSD_ERR_PRI_RESOURCE: ++ return PNFS_OSD_ERR_RESOURCE; ++ case OSD_ERR_PRI_BAD_CRED: ++ return PNFS_OSD_ERR_BAD_CRED; ++ case OSD_ERR_PRI_NO_ACCESS: ++ return PNFS_OSD_ERR_NO_ACCESS; ++ case OSD_ERR_PRI_UNREACHABLE: ++ return PNFS_OSD_ERR_UNREACHABLE; ++ case OSD_ERR_PRI_NOT_FOUND: ++ return PNFS_OSD_ERR_NOT_FOUND; ++ case OSD_ERR_PRI_NO_SPACE: ++ return PNFS_OSD_ERR_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case OSD_ERR_PRI_EIO: ++ return PNFS_OSD_ERR_EIO; ++ 
} ++} ++ ++static void _clear_bio(struct bio *bio) ++{ ++ struct bio_vec *bv; ++ unsigned i; ++ ++ __bio_for_each_segment(bv, bio, i, 0) { ++ unsigned this_count = bv->bv_len; ++ ++ if (likely(PAGE_SIZE == this_count)) ++ clear_highpage(bv->bv_page); ++ else ++ zero_user(bv->bv_page, bv->bv_offset, this_count); ++ } ++} ++ ++static int _io_check(struct objio_state *ios, bool is_write) ++{ ++ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; ++ int lin_ret = 0; ++ int i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_sense_info osi; ++ struct osd_request *or = ios->per_dev[i].or; ++ int ret; ++ ++ if (!or) ++ continue; ++ ++ ret = osd_req_decode_sense(or, &osi); ++ if (likely(!ret)) ++ continue; ++ ++ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { ++ /* start read offset passed endof file */ ++ BUG_ON(is_write); ++ _clear_bio(ios->per_dev[i].bio); ++ dprintk("%s: start read offset passed end of file " ++ "offset=0x%llx, length=0x%lx\n", __func__, ++ _LLU(ios->per_dev[i].offset), ++ ios->per_dev[i].length); ++ ++ continue; /* we recovered */ ++ } ++ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev, ++ osd_pri_2_pnfs_err(osi.osd_err_pri), ++ ios->per_dev[i].offset, ++ ios->per_dev[i].length, ++ is_write); ++ ++ if (osi.osd_err_pri >= oep) { ++ oep = osi.osd_err_pri; ++ lin_ret = ret; ++ } ++ } ++ ++ return lin_ret; ++} ++ ++/* ++ * Common IO state helpers. 
++ */ ++static void _io_free(struct objio_state *ios) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[i]; ++ ++ if (per_dev->or) { ++ osd_end_request(per_dev->or); ++ per_dev->or = NULL; ++ } ++ ++ if (per_dev->bio) { ++ bio_put(per_dev->bio); ++ per_dev->bio = NULL; ++ } ++ } ++} ++ ++struct osd_dev * _io_od(struct objio_state *ios, unsigned dev) ++{ ++ unsigned min_dev = ios->objio_seg->layout->olo_comps_index; ++ unsigned max_dev = min_dev + ios->ol_state.num_comps; ++ ++ BUG_ON(dev < min_dev || max_dev <= dev); ++ return ios->objio_seg->ods[dev - min_dev]; ++} ++ ++struct _striping_info { ++ u64 obj_offset; ++ u64 group_length; ++ u64 total_group_length; ++ u64 Major; ++ unsigned dev; ++ unsigned unit_off; ++}; ++ ++static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, ++ struct _striping_info *si) ++{ ++ u32 stripe_unit = ios->objio_seg->stripe_unit; ++ u32 group_width = ios->objio_seg->group_width; ++ u64 group_depth = ios->objio_seg->group_depth; ++ u32 U = stripe_unit * group_width; ++ ++ u64 T = U * group_depth; ++ u64 S = T * ios->objio_seg->group_count; ++ u64 M = div64_u64(file_offset, S); ++ ++ /* ++ G = (L - (M * S)) / T ++ H = (L - (M * S)) % T ++ */ ++ u64 LmodU = file_offset - M * S; ++ u32 G = div64_u64(LmodU, T); ++ u64 H = LmodU - G * T; ++ ++ u32 N = div_u64(H, U); ++ ++ div_u64_rem(file_offset, stripe_unit, &si->unit_off); ++ si->obj_offset = si->unit_off + (N * stripe_unit) + ++ (M * group_depth * stripe_unit); ++ ++ /* "H - (N * U)" is just "H % U" so it's bound to u32 */ ++ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; ++ si->dev *= ios->objio_seg->mirrors_p1; ++ ++ si->group_length = T - H; ++ si->total_group_length = T; ++ si->Major = M; ++} ++ ++static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, ++ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len) ++{ ++ unsigned pg = *cur_pg; ++ struct request_queue *q = ++ 
osd_request_queue(_io_od(ios, per_dev->dev)); ++ ++ per_dev->length += cur_len; ++ ++ if (per_dev->bio == NULL) { ++ unsigned stripes = ios->ol_state.num_comps / ++ ios->objio_seg->mirrors_p1; ++ unsigned pages_in_stripe = stripes * ++ (ios->objio_seg->stripe_unit / PAGE_SIZE); ++ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / ++ stripes; ++ ++ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); ++ if (unlikely(!per_dev->bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", bio_size); ++ return -ENOMEM; ++ } ++ } ++ ++ while (cur_len > 0) { ++ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); ++ unsigned added_len; ++ ++ BUG_ON(ios->ol_state.nr_pages <= pg); ++ cur_len -= pglen; ++ ++ added_len = bio_add_pc_page(q, per_dev->bio, ++ ios->ol_state.pages[pg], pglen, pgbase); ++ if (unlikely(pglen != added_len)) ++ return -ENOMEM; ++ pgbase = 0; ++ ++pg; ++ } ++ BUG_ON(cur_len); ++ ++ *cur_pg = pg; ++ return 0; ++} ++ ++static int _prepare_one_group(struct objio_state *ios, u64 length, ++ struct _striping_info *si, unsigned first_comp, ++ unsigned *last_pg) ++{ ++ unsigned stripe_unit = ios->objio_seg->stripe_unit; ++ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1; ++ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1; ++ unsigned dev = si->dev; ++ unsigned first_dev = dev - (dev % devs_in_group); ++ unsigned comp = first_comp + (dev - first_dev); ++ unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; ++ unsigned cur_pg = *last_pg; ++ int ret = 0; ++ ++ while (length) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[comp]; ++ unsigned cur_len, page_off = 0; ++ ++ if (!per_dev->length) { ++ per_dev->dev = dev; ++ if (dev < si->dev) { ++ per_dev->offset = si->obj_offset + stripe_unit - ++ si->unit_off; ++ cur_len = stripe_unit; ++ } else if (dev == si->dev) { ++ per_dev->offset = si->obj_offset; ++ cur_len = stripe_unit - si->unit_off; ++ page_off = si->unit_off & ~PAGE_MASK; ++ BUG_ON(page_off && ++ (page_off != ios->ol_state.pgbase)); ++ } else { /* dev > si->dev */ ++ per_dev->offset = si->obj_offset - si->unit_off; ++ cur_len = stripe_unit; ++ } ++ ++ if (max_comp < comp) ++ max_comp = comp; ++ ++ dev += mirrors_p1; ++ dev = (dev % devs_in_group) + first_dev; ++ } else { ++ cur_len = stripe_unit; ++ } ++ if (cur_len >= length) ++ cur_len = length; ++ ++ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, ++ cur_len); ++ if (unlikely(ret)) ++ goto out; ++ ++ comp += mirrors_p1; ++ comp = (comp % devs_in_group) + first_comp; ++ ++ length -= cur_len; ++ ios->length += cur_len; ++ } ++out: ++ ios->numdevs = max_comp + mirrors_p1; ++ *last_pg = cur_pg; ++ return ret; ++} ++ ++static int _io_rw_pagelist(struct objio_state *ios) ++{ ++ u64 length = ios->ol_state.count; ++ struct _striping_info si; ++ unsigned devs_in_group = ios->objio_seg->group_width * ++ ios->objio_seg->mirrors_p1; ++ unsigned first_comp = 0; ++ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps; ++ unsigned last_pg = 0; ++ int ret = 0; ++ ++ _calc_stripe_info(ios, ios->ol_state.offset, &si); ++ while (length) { ++ if (length < si.group_length) ++ si.group_length = length; ++ ++ ret = _prepare_one_group(ios, si.group_length, &si, first_comp, ++ &last_pg); ++ if (unlikely(ret)) ++ goto out; ++ ++ length -= si.group_length; ++ ++ si.group_length = si.total_group_length; ++ si.unit_off = 0; ++ ++si.Major; ++ si.obj_offset = si.Major * 
ios->objio_seg->stripe_unit * ++ ios->objio_seg->group_depth; ++ ++ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; ++ si.dev %= num_comps; ++ ++ first_comp += devs_in_group; ++ first_comp %= num_comps; ++ } ++ ++out: ++ if (!ios->length) ++ return ret; ++ ++ return 0; ++} ++ ++static ssize_t _sync_done(struct objio_state *ios) ++{ ++ struct completion *waiting = ios->private; ++ ++ complete(waiting); ++ return 0; ++} ++ ++static void _last_io(struct kref *kref) ++{ ++ struct objio_state *ios = container_of(kref, struct objio_state, kref); ++ ++ ios->done(ios); ++} ++ ++static void _done_io(struct osd_request *or, void *p) ++{ ++ struct objio_state *ios = p; ++ ++ kref_put(&ios->kref, _last_io); ++} ++ ++static ssize_t _io_exec(struct objio_state *ios) ++{ ++ DECLARE_COMPLETION_ONSTACK(wait); ++ ssize_t status = 0; /* sync status */ ++ unsigned i; ++ objio_done_fn saved_done_fn = ios->done; ++ bool sync = ios->ol_state.sync; ++ ++ if (sync) { ++ ios->done = _sync_done; ++ ios->private = &wait; ++ } ++ ++ kref_init(&ios->kref); ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_request *or = ios->per_dev[i].or; ++ ++ if (!or) ++ continue; ++ ++ kref_get(&ios->kref); ++ osd_execute_request_async(or, _done_io, ios); ++ } ++ ++ kref_put(&ios->kref, _last_io); ++ ++ if (sync) { ++ wait_for_completion(&wait); ++ status = saved_done_fn(ios); ++ } ++ ++ return status; ++} ++ ++/* ++ * read ++ */ ++static ssize_t _read_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, false); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) ++ status = ios->length; ++ else ++ status = ret; ++ ++ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct osd_request *or = NULL; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = per_dev->dev; ++ struct pnfs_osd_object_cred *cred = ++ 
&ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ int ret; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ ++err: ++ return ret; ++} ++ ++static ssize_t _read_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = _read_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _read_done; ++ return _io_exec(ios); /* In sync mode exec returns the io status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _read_exec(ios); ++} ++ ++/* ++ * write ++ */ ++static ssize_t _write_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, true); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) { ++ /* FIXME: should be based on the OSD's persistence model ++ * See OSD2r05 Section 4.13 Data persistence model */ ++ ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC; ++ status = ios->length; ++ } else { ++ status = ret; ++ } ++ ++ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int 
_write_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = ios->per_dev[cur_comp].dev; ++ unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1; ++ int ret; ++ ++ for (; cur_comp < last_comp; ++cur_comp, ++dev) { ++ struct osd_request *or = NULL; ++ struct pnfs_osd_object_cred *cred = ++ &ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ struct bio *bio; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ if (per_dev != master_dev) { ++ bio = bio_kmalloc(GFP_KERNEL, ++ master_dev->bio->bi_max_vecs); ++ if (unlikely(!bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", ++ master_dev->bio->bi_max_vecs); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ __bio_clone(bio, master_dev->bio); ++ bio->bi_bdev = NULL; ++ bio->bi_next = NULL; ++ per_dev->bio = bio; ++ per_dev->dev = dev; ++ per_dev->length = master_dev->length; ++ per_dev->offset = master_dev->offset; ++ } else { ++ bio = master_dev->bio; ++ /* FIXME: bio_set_dir() */ ++ bio->bi_rw |= (1 << BIO_RW); ++ } ++ ++ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ } ++ ++err: ++ return ret; ++} ++ ++static ssize_t _write_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = 
_write_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _write_done; ++ return _io_exec(ios); /* In sync mode exec returns the io->status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ /* TODO: ios->stable = stable; */ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _write_exec(ios); ++} ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++objlayout_get_stripesize(struct pnfs_layout_type *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zx\n", __func__, maxsz); ++ return maxsz; ++} ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++objlayout_get_blocksize(void) ++{ ++ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE; ++ ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations objlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. 
++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = objlayout_get_stripesize, ++ .get_blocksize = objlayout_get_blocksize, ++}; ++ ++static struct pnfs_layoutdriver_type objlayout_type = { ++ .id = LAYOUT_OSD2_OBJECTS, ++ .name = "LAYOUT_OSD2_OBJECTS", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &objlayout_policy_operations, ++}; ++ ++void *objio_init_mt(void) ++{ ++ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL); ++ ++ if (!omt) ++ return ERR_PTR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&omt->dev_list); ++ spin_lock_init(&omt->dev_list_lock); ++ return omt; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++ _dev_list_remove_all(mountid); ++ kfree(mountid); ++} ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++objlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++objlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", ++ __func__); ++} ++ ++module_init(objlayout_init); ++module_exit(objlayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-23 12:09:03.350491564 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-23 12:09:03.350491564 -0400 +@@ -0,0 +1,790 @@ ++/* ++ * objlayout.c ++ * ++ * pNFS layout driver for Panasas OSDs ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. 
++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pnfs_client_operations *pnfs_client_ops; ++ ++/* ++ * Create a objlayout layout structure for the given inode and return it. 
++ */ ++static struct pnfs_layout_type * ++objlayout_alloc_layout(struct inode *inode) ++{ ++ struct objlayout *objlay; ++ ++ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL); ++ if (objlay) { ++ spin_lock_init(&objlay->lock); ++ INIT_LIST_HEAD(&objlay->err_list); ++ } ++ dprintk("%s: Return %p\n", __func__, objlay); ++ return &objlay->pnfs_layout; ++} ++ ++/* ++ * Free an objlayout layout structure ++ */ ++static void ++objlayout_free_layout(struct pnfs_layout_type *lo) ++{ ++ struct objlayout *objlay = OBJLAYOUT(lo); ++ ++ dprintk("%s: objlay %p\n", __func__, objlay); ++ ++ WARN_ON(!list_empty(&objlay->err_list)); ++ kfree(objlay); ++} ++ ++/* ++ * Unmarshall layout and store it in pnfslay. ++ */ ++static struct pnfs_layout_segment * ++objlayout_alloc_lseg(struct pnfs_layout_type *pnfslay, ++ struct nfs4_pnfs_layoutget_res *lgr) ++{ ++ int status; ++ void *layout = lgr->layout.buf; ++ struct pnfs_layout_segment *lseg; ++ struct objlayout_segment *objlseg; ++ struct pnfs_osd_layout *pnfs_osd_layout; ++ ++ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout); ++ ++ BUG_ON(!layout); ++ ++ status = -ENOMEM; ++ lseg = kzalloc(sizeof(*lseg) + sizeof(*objlseg) + ++ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL); ++ if (!lseg) ++ goto err; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout); ++ ++ status = objio_alloc_lseg(&objlseg->internal, pnfslay, lseg, ++ pnfs_osd_layout); ++ if (status) ++ goto err; ++ ++ dprintk("%s: Return %p\n", __func__, lseg); ++ return lseg; ++ ++ err: ++ kfree(lseg); ++ return ERR_PTR(status); ++} ++ ++/* ++ * Free a layout segement ++ */ ++static void ++objlayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ struct objlayout_segment *objlseg; ++ ++ dprintk("%s: freeing layout segment %p\n", __func__, lseg); ++ ++ if (unlikely(!lseg)) ++ return; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++ 
objio_free_lseg(objlseg->internal); ++ kfree(lseg); ++} ++ ++/* ++ * I/O Operations ++ */ ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1 : NFS4_MAX_UINT64; ++} ++ ++static struct objlayout_io_state * ++objlayout_alloc_io_state(struct pnfs_layout_type *pnfs_layout_type, ++ struct page **pages, ++ unsigned pgbase, ++ unsigned nr_pages, ++ loff_t offset, ++ size_t count, ++ struct pnfs_layout_segment *lseg, ++ void *rpcdata) ++{ ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct objlayout_io_state *state; ++ u64 lseg_end_offset; ++ size_t size_nr_pages; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ if (objio_alloc_io_state(objlseg->internal, &state)) ++ return NULL; ++ ++ BUG_ON(offset < lseg->range.offset); ++ lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length); ++ BUG_ON(offset >= lseg_end_offset); ++ if (offset + count > lseg_end_offset) { ++ count = lseg->range.length - (offset - lseg->range.offset); ++ dprintk("%s: truncated count %Zd\n", __func__, count); ++ } ++ ++ if (pgbase > PAGE_SIZE) { ++ unsigned n = pgbase >> PAGE_SHIFT; ++ ++ pgbase &= ~PAGE_MASK; ++ pages += n; ++ nr_pages -= n; ++ } ++ ++ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ BUG_ON(nr_pages < size_nr_pages); ++ if (nr_pages > size_nr_pages) ++ nr_pages = size_nr_pages; ++ ++ INIT_LIST_HEAD(&state->err_list); ++ state->lseg = lseg; ++ state->rpcdata = rpcdata; ++ state->pages = pages; ++ state->pgbase = pgbase; ++ state->nr_pages = nr_pages; ++ state->offset = offset; ++ state->count = count; ++ state->sync = 0; ++ ++ return state; ++} ++ ++static void ++objlayout_free_io_state(struct objlayout_io_state *state) ++{ ++ dprintk("%s: freeing io_state\n", 
__func__); ++ if (unlikely(!state)) ++ return; ++ ++ objio_free_io_state(state); ++} ++ ++/* ++ * I/O done common code ++ */ ++static void ++objlayout_iodone(struct objlayout_io_state *state) ++{ ++ dprintk("%s: state %p status\n", __func__, state); ++ ++ if (likely(state->status >= 0)) { ++ objlayout_free_io_state(state); ++ } else { ++ struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); ++ ++ spin_lock(&objlay->lock); ++ objlay->delta_space_valid = OBJ_DSU_INVALID; ++ list_add(&objlay->err_list, &state->err_list); ++ spin_unlock(&objlay->lock); ++ } ++} ++ ++/* ++ * objlayout_io_set_result - Set an osd_error code on a specific osd comp. ++ * ++ * The @index component IO failed (error returned from target). Register ++ * the error for later reporting at layout-return. ++ */ ++void ++objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, ++ int osd_error, u64 offset, u64 length, bool is_write) ++{ ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; ++ ++ BUG_ON(index >= state->num_comps); ++ if (osd_error) { ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(state->lseg); ++ struct pnfs_osd_layout *layout = ++ (typeof(layout))objlseg->pnfs_osd_layout; ++ ++ ioerr->oer_component = layout->olo_comps[index].oc_object_id; ++ ioerr->oer_comp_offset = offset; ++ ioerr->oer_comp_length = length; ++ ioerr->oer_iswrite = is_write; ++ ioerr->oer_errno = osd_error; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " ++ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", ++ __func__, index, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ } else { ++ /* User need not call if no error is reported */ ++ ioerr->oer_errno = 0; ++ } ++} ++ ++static void _rpc_commit_complete(struct work_struct *work) ++{ 
++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_client_ops->nfs_commit_complete(wdata); ++} ++ ++/* ++ * Commit data remotely on OSDs ++ */ ++enum pnfs_try_status ++objlayout_commit(struct nfs_write_data *wdata, int how) ++{ ++ int status = PNFS_ATTEMPTED; ++ ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ dprintk("%s: Return %d\n", __func__, status); ++ return status; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). ++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_read_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++} ++ ++void ++objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) ++{ ++ int eof = state->eof; ++ struct nfs_read_data *rdata; ++ ++ state->status = status; ++ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); ++ rdata = state->rpcdata; ++ rdata->task.tk_status = status; ++ if (status >= 0) { ++ rdata->res.count = status; ++ rdata->res.eof = eof; ++ } ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++ else { ++ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); ++ schedule_work(&rdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async reads. 
++ */ ++enum pnfs_try_status ++objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages) ++{ ++ loff_t offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct objlayout_io_state *state; ++ ssize_t status = 0; ++ loff_t eof; ++ ++ dprintk("%s: Begin inode %p offset %llu count %d\n", ++ __func__, rdata->inode, offset, (int)count); ++ ++ eof = i_size_read(rdata->inode); ++ if (unlikely(offset + count > eof)) { ++ if (offset >= eof) { ++ status = 0; ++ rdata->res.count = 0; ++ rdata->res.eof = 1; ++ goto out; ++ } ++ count = eof - offset; ++ } ++ ++ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, ++ rdata->args.pages, rdata->args.pgbase, ++ nr_pages, offset, count, ++ rdata->pdata.lseg, rdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->eof = state->offset + state->count >= eof; ++ ++ status = objio_read_pagelist(state); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ rdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). 
++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_write_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_client_ops->nfs_writelist_complete(wdata); ++} ++ ++void ++objlayout_write_done(struct objlayout_io_state *state, ssize_t status, ++ bool sync) ++{ ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s: Begin\n", __func__); ++ wdata = state->rpcdata; ++ state->status = status; ++ wdata->task.tk_status = status; ++ if (status >= 0) { ++ wdata->res.count = status; ++ wdata->verf.committed = state->committed; ++ dprintk("%s: Return status %d committed %d\n", ++ __func__, wdata->task.tk_status, ++ wdata->verf.committed); ++ } else ++ dprintk("%s: Return status %d\n", ++ __func__, wdata->task.tk_status); ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_client_ops->nfs_writelist_complete(wdata); ++ else { ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async writes. 
++ */ ++enum pnfs_try_status ++objlayout_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int how) ++{ ++ struct objlayout_io_state *state; ++ ssize_t status; ++ ++ dprintk("%s: Begin inode %p offset %llu count %u\n", ++ __func__, wdata->inode, wdata->args.offset, wdata->args.count); ++ ++ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, ++ wdata->args.pages, ++ wdata->args.pgbase, ++ nr_pages, ++ wdata->args.offset, ++ wdata->args.count, ++ wdata->pdata.lseg, wdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->sync = how & FLUSH_SYNC; ++ ++ status = objio_write_pagelist(state, how & FLUSH_STABLE); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ wdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++void ++objlayout_encode_layoutcommit(struct pnfs_layout_type *pnfslay, ++ struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct pnfs_osd_layoutupdate lou; ++ __be32 *start; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ spin_lock(&objlay->lock); ++ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); ++ lou.dsu_delta = objlay->delta_space_used; ++ objlay->delta_space_used = 0; ++ objlay->delta_space_valid = OBJ_DSU_INIT; ++ lou.olu_ioerr_flag = !list_empty(&objlay->err_list); ++ spin_unlock(&objlay->lock); ++ ++ start = xdr_reserve_space(xdr, 4); ++ ++ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ ++ dprintk("%s: Return delta_space_used %lld err %d\n", __func__, ++ lou.dsu_delta, lou.olu_ioerr_flag); ++} ++ ++static int ++err_prio(u32 oer_errno) ++{ ++ switch (oer_errno) { ++ case 0: ++ return 0; ++ ++ case PNFS_OSD_ERR_RESOURCE: ++ return OSD_ERR_PRI_RESOURCE; ++ case PNFS_OSD_ERR_BAD_CRED: ++ return OSD_ERR_PRI_BAD_CRED; ++ case PNFS_OSD_ERR_NO_ACCESS: ++ return OSD_ERR_PRI_NO_ACCESS; ++ case PNFS_OSD_ERR_UNREACHABLE: 
++ return OSD_ERR_PRI_UNREACHABLE; ++ case PNFS_OSD_ERR_NOT_FOUND: ++ return OSD_ERR_PRI_NOT_FOUND; ++ case PNFS_OSD_ERR_NO_SPACE: ++ return OSD_ERR_PRI_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case PNFS_OSD_ERR_EIO: ++ return OSD_ERR_PRI_EIO; ++ } ++} ++ ++static void ++merge_ioerr(struct pnfs_osd_ioerr *dest_err, ++ const struct pnfs_osd_ioerr *src_err) ++{ ++ u64 dest_end, src_end; ++ ++ if (!dest_err->oer_errno) { ++ *dest_err = *src_err; ++ /* accumulated device must be blank */ ++ memset(&dest_err->oer_component.oid_device_id, 0, ++ sizeof(dest_err->oer_component.oid_device_id)); ++ ++ return; ++ } ++ ++ if (dest_err->oer_component.oid_partition_id != ++ src_err->oer_component.oid_partition_id) ++ dest_err->oer_component.oid_partition_id = 0; ++ ++ if (dest_err->oer_component.oid_object_id != ++ src_err->oer_component.oid_object_id) ++ dest_err->oer_component.oid_object_id = 0; ++ ++ if (dest_err->oer_comp_offset > src_err->oer_comp_offset) ++ dest_err->oer_comp_offset = src_err->oer_comp_offset; ++ ++ dest_end = end_offset(dest_err->oer_comp_offset, ++ dest_err->oer_comp_length); ++ src_end = end_offset(src_err->oer_comp_offset, ++ src_err->oer_comp_length); ++ if (dest_end < src_end) ++ dest_end = src_end; ++ ++ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; ++ ++ if ((src_err->oer_iswrite == dest_err->oer_iswrite) && ++ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { ++ dest_err->oer_errno = src_err->oer_errno; ++ } else if (src_err->oer_iswrite) { ++ dest_err->oer_iswrite = true; ++ dest_err->oer_errno = src_err->oer_errno; ++ } ++} ++ ++static void ++encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr) ++{ ++ struct objlayout_io_state *state, *tmp; ++ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ ++ for (i = 0; i < state->num_comps; i++) { ++ struct pnfs_osd_ioerr *ioerr = 
&state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ merge_ioerr(&accumulated_err, ioerr); ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++ ++ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err)); ++} ++ ++void ++objlayout_encode_layoutreturn(struct pnfs_layout_type *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_pnfs_layoutreturn_arg *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct objlayout_io_state *state, *tmp; ++ __be32 *start, *uninitialized_var(last_xdr); ++ ++ dprintk("%s: Begin\n", __func__); ++ start = xdr_reserve_space(xdr, 4); ++ BUG_ON(!start); ++ ++ spin_lock(&objlay->lock); ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ int res = 0; ++ ++ for (i = 0; i < state->num_comps && !res; i++) { ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ last_xdr = xdr->p; ++ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]); ++ } ++ if (unlikely(res)) { ++ /* no space for even one error descriptor */ ++ BUG_ON(last_xdr == start + 1); ++ ++ /* we've 
encountered a situation with lots and lots of ++ * errors and no space to encode them all. Use the last ++ * available slot to report the union of all the ++ * remaining errors. ++ */ ++ xdr_rewind_stream(xdr, last_xdr - ++ pnfs_osd_ioerr_xdr_sz() / 4); ++ encode_accumulated_error(objlay, xdr); ++ goto loop_done; ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++loop_done: ++ spin_unlock(&objlay->lock); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ dprintk("%s: Return\n", __func__); ++} ++ ++struct objlayout_deviceinfo { ++ struct page *page; ++ struct pnfs_osd_deviceaddr da; /* This must be last */ ++}; ++ ++/* Initialize and call nfs_getdeviceinfo, then decode and return a ++ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() ++ * should be called. ++ */ ++int objlayout_get_deviceinfo(struct pnfs_layout_type *pnfslay, ++ struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi; ++ struct pnfs_device pd; ++ struct super_block *sb; ++ struct page *page; ++ size_t sz; ++ u32 *p; ++ int err; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ pd.area = page_address(page); ++ ++ memcpy(&pd.dev_id, d_id, sizeof(*d_id)); ++ pd.layout_type = LAYOUT_OSD2_OBJECTS; ++ pd.dev_notify_types = 0; ++ pd.pages = &page; ++ pd.pgbase = 0; ++ pd.pglen = PAGE_SIZE; ++ pd.mincount = 0; ++ ++ sb = PNFS_INODE(pnfslay)->i_sb; ++ err = pnfs_client_ops->nfs_getdeviceinfo(PNFS_NFS_SERVER(pnfslay), &pd); ++ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); ++ if (err) ++ goto err_out; ++ ++ p = pd.area; ++ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p); ++ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL); ++ if (!odi) { ++ err = -ENOMEM; ++ goto err_out; ++ } ++ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); ++ odi->page = page; ++ *deviceaddr = &odi->da; ++ return 0; ++ ++err_out: ++ __free_page(page); ++ return err; ++} ++ ++void 
objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi = container_of(deviceaddr, ++ struct objlayout_deviceinfo, ++ da); ++ ++ __free_page(odi->page); ++ kfree(odi); ++} ++ ++/* ++ * Initialize a mountpoint by retrieving the list of ++ * available devices for it. ++ * Return the pnfs_mount_type structure so the ++ * pNFS_client can refer to the mount point later on. ++ */ ++static int ++objlayout_initialize_mountpoint(struct nfs_server *server, ++ const struct nfs_fh *mntfh) ++{ ++ void *data; ++ ++ data = objio_init_mt(); ++ if (IS_ERR(data)) { ++ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n", ++ __func__, PTR_ERR(data)); ++ return PTR_ERR(data); ++ } ++ server->pnfs_ld_data = data; ++ ++ dprintk("%s: Return data=%p\n", __func__, data); ++ return 0; ++} ++ ++/* ++ * Uninitialize a mountpoint ++ */ ++static int ++objlayout_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data); ++ objio_fini_mt(server->pnfs_ld_data); ++ return 0; ++} ++ ++struct layoutdriver_io_operations objlayout_io_operations = { ++ .commit = objlayout_commit, ++ .read_pagelist = objlayout_read_pagelist, ++ .write_pagelist = objlayout_write_pagelist, ++ .alloc_layout = objlayout_alloc_layout, ++ .free_layout = objlayout_free_layout, ++ .alloc_lseg = objlayout_alloc_lseg, ++ .free_lseg = objlayout_free_lseg, ++ .encode_layoutcommit = objlayout_encode_layoutcommit, ++ .encode_layoutreturn = objlayout_encode_layoutreturn, ++ .initialize_mountpoint = objlayout_initialize_mountpoint, ++ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, ++}; +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-23 12:09:03.351434439 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-23 12:09:03.351434439 -0400 +@@ -0,0 +1,171 @@ ++/* ++ * 
objlayout.h ++ * ++ * Data types and function declarations for interfacing with the ++ * pNFS standard object layout driver. ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */ ++ ++#ifndef _OBJLAYOUT_H ++#define _OBJLAYOUT_H ++ ++#include ++#include ++#include ++ ++/* ++ * in-core layout segment ++ */ ++struct objlayout_segment { ++ void *internal; /* for provider internal use */ ++ u8 pnfs_osd_layout[]; ++}; ++ ++/* ++ * per-inode layout ++ */ ++struct objlayout { ++ struct pnfs_layout_type pnfs_layout; ++ ++ /* for layout_commit */ ++ enum osd_delta_space_valid_enum { ++ OBJ_DSU_INIT = 0, ++ OBJ_DSU_VALID, ++ OBJ_DSU_INVALID, ++ } delta_space_valid; ++ s64 delta_space_used; /* consumed by write ops */ ++ ++ /* for layout_return */ ++ spinlock_t lock; ++ struct list_head err_list; ++}; ++ ++static inline struct objlayout * ++OBJLAYOUT(struct pnfs_layout_type *lo) ++{ ++ return container_of(lo, struct objlayout, pnfs_layout); ++} ++ ++/* ++ * per-I/O operation state ++ * embedded in objects provider io_state data structure ++ */ ++struct objlayout_io_state { ++ struct pnfs_layout_segment *lseg; ++ ++ struct page **pages; ++ unsigned pgbase; ++ unsigned nr_pages; ++ unsigned long count; ++ loff_t offset; ++ bool sync; ++ ++ void *rpcdata; ++ int status; /* res */ ++ int eof; /* res */ ++ int committed; /* res */ ++ ++ /* Error reporting (layout_return) */ ++ struct list_head err_list; ++ unsigned num_comps; ++ /* Pointer to array of error descriptors of size num_comps. ++ * It should contain as many entries as devices in the osd_layout ++ * that participate in the I/O. It is up to the io_engine to allocate ++ * needed space and set num_comps. 
++ */ ++ struct pnfs_osd_ioerr *ioerrs; ++}; ++ ++/* ++ * Raid engine I/O API ++ */ ++extern void *objio_init_mt(void); ++extern void objio_fini_mt(void *mt); ++ ++extern int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_type *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout); ++extern void objio_free_lseg(void *p); ++ ++extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp); ++extern void objio_free_io_state(struct objlayout_io_state *state); ++ ++extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); ++extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable); ++ ++/* ++ * callback API ++ */ ++extern void objlayout_io_set_result(struct objlayout_io_state *state, ++ unsigned index, int osd_error, ++ u64 offset, u64 length, bool is_write); ++ ++static inline void ++objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) ++{ ++ struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); ++ ++ /* If one of the I/Os errored out and the delta_space_used was ++ * invalid we render the complete report as invalid. Protocol mandate ++ * the DSU be accurate or not reported. 
++ */ ++ spin_lock(&objlay->lock); ++ if (objlay->delta_space_valid != OBJ_DSU_INVALID) { ++ objlay->delta_space_valid = OBJ_DSU_VALID; ++ objlay->delta_space_used += space_used; ++ } ++ spin_unlock(&objlay->lock); ++} ++ ++extern void objlayout_read_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++extern void objlayout_write_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++ ++extern int objlayout_get_deviceinfo(struct pnfs_layout_type *pnfslay, ++ struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr); ++extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); ++ ++/* ++ * exported generic objects function vectors ++ */ ++extern struct layoutdriver_io_operations objlayout_io_operations; ++extern struct pnfs_client_operations *pnfs_client_ops; ++ ++#endif /* _OBJLAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-23 12:09:03.352501716 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-23 12:09:03.352501716 -0400 +@@ -0,0 +1,734 @@ ++/* ++ * panfs_shim.c ++ * ++ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "objlayout.h" ++#include "panfs_shim.h" ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct panfs_export_operations *panfs_export_ops; ++ ++void * ++objio_init_mt(void) ++{ ++ return panfs_export_ops == NULL ? 
ERR_PTR(-EAGAIN) : NULL; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++} ++ ++static int ++panfs_shim_conv_raid01(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) { ++ hdr->type = PAN_AGG_RAID1; ++ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1; ++ } else if (layout->olo_num_comps > 1) { ++ hdr->type = PAN_AGG_RAID0; ++ hdr->hdr.raid0.num_comps = layout->olo_num_comps; ++ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit; ++ } else ++ hdr->type = PAN_AGG_SIMPLE; ++ return 0; ++} ++ ++static int ++panfs_shim_conv_raid5(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) ++ goto err; ++ ++ if (lo_map->odm_group_width || lo_map->odm_group_depth) { ++ if (!lo_map->odm_group_width || !lo_map->odm_group_depth) ++ goto err; ++ ++ hdr->type = PAN_AGG_GRP_RAID5_LEFT; ++ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit; ++ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width; ++ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth; ++ /* this is a guess, panasas server is not supposed to ++ hand out layotu otherwise */ ++ hdr->hdr.grp_raid5_left.group_layout_policy = ++ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN; ++ } else { ++ hdr->type = PAN_AGG_RAID5_LEFT; ++ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.raid5_left.stripe_unit2 = ++ hdr->hdr.raid5_left.stripe_unit1 = ++ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit; ++ } ++ ++ return 0; ++err: ++ return -EINVAL; ++} ++ ++/* ++ * Convert a pnfs_osd data map into Panasas aggregation layout header ++ */ ++static int ++panfs_shim_conv_pnfs_osd_data_map( ++ struct pnfs_osd_layout 
*layout, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ int status = -EINVAL; ++ struct pnfs_osd_data_map *lo_map = &layout->olo_map; ++ ++ if (!layout->olo_num_comps) { ++ dprintk("%s: !!layout.n_comps(%u)\n", __func__, ++ layout->olo_num_comps); ++ goto err; ++ } ++ ++ switch (lo_map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_0 " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid01(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_5: ++ if (!lo_map->odm_group_width) { ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width " ++ "layout.n_comps(%u)!=map.n_comps(%u) " ++ "|| comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ } else if ((layout->olo_num_comps != lo_map->odm_num_comps && ++ layout->olo_num_comps > lo_map->odm_group_width) || ++ (layout->olo_comps_index % lo_map->odm_group_width)){ ++ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ lo_map->odm_group_width, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid5(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_PQ: ++ default: ++ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__, ++ lo_map->odm_raid_algorithm); ++ goto err; ++ } ++ ++ return 0; ++ ++err: ++ return status; ++} ++ ++/* ++ * Convert pnfs_osd layout into Panasas map and caps type ++ */ ++int ++objio_alloc_lseg(void **outp, ++ struct pnfs_layout_type *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) 
++{ ++ int i, total_comps; ++ int status; ++ struct pnfs_osd_object_cred *lo_comp; ++ pan_size_t alloc_sz, local_sz; ++ pan_sm_map_cap_t *mcs = NULL; ++ u8 *buf; ++ pan_agg_comp_obj_t *pan_comp; ++ pan_sm_sec_t *pan_sec; ++ ++ status = -EINVAL; ++ if (layout->olo_num_comps < layout->olo_map.odm_group_width) { ++ total_comps = layout->olo_comps_index + layout->olo_num_comps; ++ } else { ++ /* allocate full map, otherwise SAM gets confused */ ++ total_comps = layout->olo_map.odm_num_comps; ++ } ++ alloc_sz = total_comps * ++ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t)); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p = layout->olo_comps[i].oc_cap.cred; ++ if (panfs_export_ops->sm_sec_t_get_size_otw( ++ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL)) ++ goto err; ++ alloc_sz += local_sz; ++ } ++ ++ status = -ENOMEM; ++ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL); ++ if (!mcs) ++ goto err; ++ buf = (u8 *)&mcs[1]; ++ ++ mcs->offset = lseg->range.offset; ++ mcs->length = lseg->range.length; ++#if 0 ++ /* FIXME: for now */ ++ mcs->expiration_time.ts_sec = 0; ++ mcs->expiration_time.ts_nsec = 0; ++#endif ++ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL; ++ status = panfs_shim_conv_pnfs_osd_data_map(layout, ++ &mcs->full_map.layout_hdr); ++ if (status) ++ goto err; ++ ++ mcs->full_map.components.size = total_comps; ++ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf; ++ buf += total_comps * sizeof(pan_agg_comp_obj_t); ++ ++ mcs->secs.size = total_comps; ++ mcs->secs.data = (pan_sm_sec_t *)buf; ++ buf += total_comps * sizeof(pan_sm_sec_t); ++ ++ lo_comp = layout->olo_comps; ++ pan_comp = mcs->full_map.components.data + layout->olo_comps_index; ++ pan_sec = mcs->secs.data + layout->olo_comps_index; ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p; ++ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id; ++ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id; ++ u64 dev_id = __be64_to_cpup( ++ (__be64 
*)oc_obj_id->oid_device_id.data + 1); ++ ++ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n", ++ __func__, i, ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data), ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1), ++ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id); ++ ++ if (i == 0) { ++ /* make up mgr_id to calm sam down */ ++ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0, ++ &obj_id->dev_id); ++ obj_id->grp_id = oc_obj_id->oid_partition_id; ++ obj_id->obj_id = oc_obj_id->oid_object_id; ++ } ++ ++ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) { ++ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n", ++ __func__, i, (u64)obj_id->grp_id, ++ lo_comp->oc_object_id.oid_partition_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) { ++ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n", ++ __func__, i, obj_id->obj_id, ++ lo_comp->oc_object_id.oid_object_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ pan_comp->dev_id = dev_id; ++ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) { ++ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n", ++ __func__, i, obj_id->dev_id); ++ status = -EINVAL; ++ goto err; ++ } ++ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) { ++ dprintk("%s: degraded maps not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL; ++ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) { ++ dprintk("%s: cap key security not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ ++ p = lo_comp->oc_cap.cred; ++ panfs_export_ops->sm_sec_t_unmarshall( ++ (pan_sm_sec_otw_t *)&p, ++ pan_sec, ++ buf, ++ alloc_sz, ++ NULL, ++ &local_sz); ++ buf += local_sz; ++ alloc_sz -= local_sz; ++ ++ lo_comp++; ++ pan_comp++; ++ pan_sec++; ++ } ++ ++ *outp = mcs; ++ dprintk("%s:Return mcs=%p\n", __func__, mcs); ++ return 0; ++ ++err: ++ 
objio_free_lseg(mcs); ++ dprintk("%s:Error %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Free a Panasas map and caps type ++ */ ++void ++objio_free_lseg(void *p) ++{ ++ kfree(p); ++} ++ ++/* ++ * I/O routines ++ */ ++int ++objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct panfs_shim_io_state *p; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ p = kzalloc(sizeof(*p), GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ *outp = &p->ol_state; ++ return 0; ++} ++ ++/* ++ * Free an I/O state ++ */ ++void ++objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ int i; ++ ++ dprintk("%s: freeing io_state\n", __func__); ++ for (i = 0; i < state->ol_state.nr_pages; i++) ++ kunmap(state->ol_state.pages[i]); ++ ++ if (state->ucreds) ++ panfs_export_ops->ucreds_put(state->ucreds); ++ kfree(state->sg_list); ++ kfree(state); ++} ++ ++static int ++panfs_shim_pages_to_sg( ++ struct panfs_shim_io_state *state, ++ struct page **pages, ++ unsigned int pgbase, ++ unsigned nr_pages, ++ size_t count) ++{ ++ unsigned i, n; ++ pan_sg_entry_t *sg; ++ ++ dprintk("%s pgbase %u nr_pages %u count %d " ++ "pg0 %p flags 0x%x index %llu\n", ++ __func__, pgbase, nr_pages, (int)count, pages[0], ++ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index); ++ ++ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL); ++ if (sg == NULL) ++ return -ENOMEM; ++ ++ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n", ++ __func__, sg, pages, pgbase, nr_pages); ++ ++ for (i = 0; i < nr_pages; i++) { ++ sg[i].buffer = (char *)kmap(pages[i]) + pgbase; ++ n = PAGE_SIZE - pgbase; ++ pgbase = 0; ++ if (n > count) ++ n = count; ++ sg[i].chunk_size = n; ++ count -= n; ++ if (likely(count)) { ++ sg[i].next = &sg[i+1]; ++ } else { ++ /* we're done */ ++ sg[i].next = NULL; ++ break; ++ } ++ } ++ BUG_ON(count); ++ ++ state->sg_list = sg; ++ return 0; 
++} ++ ++/* ++ * Callback function for async reads ++ */ ++static void ++panfs_shim_read_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.read.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++ status = res_p->length; ++ WARN_ON(status < 0); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_read rc %d: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_read_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.read.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP, ++ &state->u.read.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? 
++ NULL : panfs_shim_read_done, ++ state, NULL, ++ &state->u.read.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_read_done(state, NULL, &state->u.read.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Callback function for async writes ++ */ ++static void ++panfs_shim_write_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.write.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++/* state->ol_state.committed = NFS_FILE_SYNC;*/ ++ state->ol_state.committed = NFS_UNSTABLE; ++ status = res_p->length; ++ WARN_ON(status < 0); ++ ++ objlayout_add_delta_space_used(&state->ol_state, ++ res_p->delta_capacity_used); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_write rc %u: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_write_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable /* unused, PanOSD writes are stable */) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ 
state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.write.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE, ++ &state->u.write.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? ++ NULL : panfs_shim_write_done, ++ state, ++ NULL, ++ &state->u.write.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_write_done(state, NULL, &state->u.write.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++int ++panfs_shim_register(struct panfs_export_operations *ops) ++{ ++ if (panfs_export_ops) { ++ printk(KERN_INFO ++ "%s: panfs already registered (panfs ops %p)\n", ++ __func__, panfs_export_ops); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: registering panfs ops %p\n", ++ __func__, ops); ++ ++ panfs_export_ops = ops; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_register); ++ ++int ++panfs_shim_unregister(void) ++{ ++ if (!panfs_export_ops) { ++ printk(KERN_INFO "%s: panfs is not registered\n", __func__); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: unregistering panfs ops %p\n", ++ __func__, panfs_export_ops); ++ ++ panfs_export_ops = NULL; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_unregister); ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++panlayout_get_stripesize(struct pnfs_layout_type *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *panlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)panlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: 
++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ n *= 8; /* FIXME: until we have 2-D coalescing */ ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zd\n", __func__, maxsz); ++ return maxsz; ++} ++ ++#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024) ++#define PANLAYOUT_DEF_STRIPE_WIDTH 9 ++#define PANLAYOUT_MAX_STRIPE_WIDTH 11 ++#define PANLAYOUT_MAX_GATHER_STRIPES 8 ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++panlayout_get_blocksize(void) ++{ ++ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) * ++ PANLAYOUT_DEF_STRIPE_UNIT * ++ PANLAYOUT_MAX_GATHER_STRIPES; ++ dprintk("%s: Return %Zd\n", __func__, sz); ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations panlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. ++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = panlayout_get_stripesize, ++ .get_blocksize = panlayout_get_blocksize, ++}; ++ ++#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS) ++ ++static struct pnfs_layoutdriver_type panlayout_type = { ++ .id = PNFS_LAYOUT_PANOSD, ++ .name = "PNFS_LAYOUT_PANOSD", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &panlayout_policy_operations, ++}; ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++panlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++panlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++} ++ 
++module_init(panlayout_init); ++module_exit(panlayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-23 12:09:03.353501685 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-23 12:09:03.353501685 -0400 +@@ -0,0 +1,482 @@ ++/* ++ * panfs_shim.h ++ * ++ * Data types and external function declarations for interfacing with ++ * panfs (Panasas DirectFlow) I/O stack ++ * ++ * Copyright (C) 2007 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#ifndef _PANLAYOUT_PANFS_SHIM_H ++#define _PANLAYOUT_PANFS_SHIM_H ++ ++typedef s8 pan_int8_t; ++typedef u8 pan_uint8_t; ++typedef s16 pan_int16_t; ++typedef u16 pan_uint16_t; ++typedef s32 pan_int32_t; ++typedef u32 pan_uint32_t; ++typedef s64 pan_int64_t; ++typedef u64 pan_uint64_t; ++ ++/* ++ * from pan_base_types.h ++ */ ++typedef pan_uint64_t pan_rpc_none_t; ++typedef pan_uint32_t pan_rpc_arrdim_t; ++typedef pan_uint32_t pan_status_t; ++typedef pan_uint8_t pan_otw_t; ++typedef pan_uint8_t pan_pad_t; ++ ++typedef pan_uint32_t pan_timespec_sec_t; ++typedef pan_uint32_t pan_timespec_nsec_t; ++ ++typedef struct pan_timespec_s pan_timespec_t; ++struct pan_timespec_s { ++ pan_timespec_sec_t ts_sec; ++ pan_timespec_nsec_t ts_nsec; ++}; ++ ++/* ++ * from pan_std_types.h ++ */ ++typedef pan_uint32_t pan_size_t; ++typedef int pan_bool_t; ++ ++/* ++ * from pan_common_error.h ++ */ ++#define PAN_SUCCESS ((pan_status_t)0) ++#define PAN_ERR_IN_PROGRESS ((pan_status_t)55) ++ ++/* ++ * from pan_sg.h ++ */ ++typedef struct pan_sg_entry_s pan_sg_entry_t; ++struct pan_sg_entry_s { ++ void *buffer; /* pointer to memory */ ++ pan_uint32_t chunk_size; /* size of each chunk (bytes) */ ++ pan_sg_entry_t *next; ++}; ++ ++/* ++ * from pan_storage.h ++ */ ++typedef pan_uint64_t pan_stor_dev_id_t; ++typedef pan_uint32_t pan_stor_obj_grp_id_t; ++typedef 
pan_uint64_t pan_stor_obj_uniq_t; ++typedef pan_uint32_t pan_stor_action_t; ++typedef pan_uint8_t pan_stor_cap_key_t[20]; ++ ++typedef pan_uint8_t pan_stor_key_type_t; ++typedef pan_uint64_t pan_stor_len_t; ++typedef pan_int64_t pan_stor_delta_len_t; ++typedef pan_uint64_t pan_stor_offset_t; ++typedef pan_uint16_t pan_stor_op_t; ++ ++typedef pan_uint16_t pan_stor_sec_level_t; ++ ++struct pan_stor_obj_id_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_obj_uniq_t obj_id; ++ pan_stor_obj_grp_id_t grp_id; ++}; ++ ++typedef struct pan_stor_obj_id_s pan_stor_obj_id_t; ++ ++#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U) ++#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U) ++#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U) ++#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U) ++#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U) ++#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U) ++#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U) ++#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U) ++ ++/* ++ * from pan_aggregation_map.h ++ */ ++typedef pan_uint8_t pan_agg_type_t; ++typedef pan_uint64_t pan_agg_map_version_t; ++typedef pan_uint8_t pan_agg_obj_state_t; ++typedef pan_uint8_t pan_agg_comp_state_t; ++typedef pan_uint8_t pan_agg_comp_flag_t; ++ ++#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00) ++#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01) ++#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02) ++#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03) ++#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04) ++#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05) ++#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06) ++#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07) ++#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00) ++#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01) ++#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02) ++#define 
PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03) ++#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00) ++#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01) ++#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02) ++#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04) ++ ++struct pan_aggregation_map_s { ++ pan_agg_map_version_t version; ++ pan_agg_obj_state_t avail_state; ++ pan_stor_obj_id_t obj_id; ++}; ++ ++typedef struct pan_aggregation_map_s pan_aggregation_map_t; ++ ++struct pan_agg_comp_obj_s { ++ pan_stor_dev_id_t dev_id; ++ pan_agg_comp_state_t avail_state; ++ pan_agg_comp_flag_t comp_flags; ++}; ++ ++typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t; ++ ++struct pan_agg_simple_header_s { ++ pan_uint8_t unused; ++}; ++ ++typedef struct pan_agg_simple_header_s pan_agg_simple_header_t; ++ ++struct pan_agg_raid1_header_s { ++ pan_uint16_t num_comps; ++}; ++ ++typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t; ++ ++struct pan_agg_raid0_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++}; ++ ++typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t; ++ ++struct pan_agg_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit0; ++ pan_uint32_t stripe_unit1; ++ pan_uint32_t stripe_unit2; ++}; ++ ++typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t; ++ ++typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t; ++ ++struct pan_agg_grp_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++ pan_uint16_t rg_width; ++ pan_uint16_t rg_depth; ++ pan_uint8_t group_layout_policy; ++}; ++ ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00) ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01) ++ ++#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00) ++#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02) ++#define PAN_AGG_RAID0 
((pan_agg_type_t) 0x03) ++#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04) ++#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06) ++#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06) ++ ++struct pan_agg_layout_hdr_s { ++ pan_agg_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_uint64_t null; ++ pan_agg_simple_header_t simple; ++ pan_agg_raid1_header_t raid1; ++ pan_agg_raid0_header_t raid0; ++ pan_agg_raid5_left_header_t raid5_left; ++ pan_agg_grp_raid5_left_header_t grp_raid5_left; ++ } hdr; ++}; ++ ++typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t; ++ ++struct pan_agg_comp_obj_a_s { ++ pan_rpc_arrdim_t size; ++ pan_agg_comp_obj_t *data; ++}; ++typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a; ++ ++struct pan_agg_full_map_s { ++ pan_aggregation_map_t map_hdr; ++ pan_agg_layout_hdr_t layout_hdr; ++ pan_agg_comp_obj_a components; ++}; ++ ++typedef struct pan_agg_full_map_s pan_agg_full_map_t; ++ ++/* ++ * from pan_obsd_rpc_types.h ++ */ ++typedef pan_uint8_t pan_obsd_security_key_a[16]; ++ ++typedef pan_uint8_t pan_obsd_capability_key_a[20]; ++ ++typedef pan_uint8_t pan_obsd_key_holder_id_t; ++ ++#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01) ++#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02) ++ ++struct pan_obsd_key_holder_s { ++ pan_obsd_key_holder_id_t select; ++ pan_pad_t pad[3]; ++ union { ++ pan_obsd_security_key_a basis_key; ++ pan_obsd_capability_key_a cap_key; ++ } key; ++}; ++ ++typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t; ++ ++/* ++ * from pan_sm_sec.h ++ */ ++typedef pan_uint8_t pan_sm_sec_type_t; ++typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t; ++ ++struct pan_obsd_capability_generic_otw_t_s { ++ pan_rpc_arrdim_t size; ++ pan_uint8_t *data; ++}; ++typedef struct pan_obsd_capability_generic_otw_t_s ++ pan_obsd_capability_generic_otw_t; ++ ++struct pan_sm_sec_obsd_s { ++ pan_obsd_key_holder_t key; ++ 
pan_obsd_capability_generic_otw_t cap_otw; ++ pan_sm_sec_otw_allo_mode_t allo_mode; ++}; ++ ++typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t; ++ ++struct pan_sm_sec_s { ++ pan_sm_sec_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_rpc_none_t none; ++ pan_sm_sec_obsd_t obsd; ++ } variant; ++}; ++ ++typedef struct pan_sm_sec_s pan_sm_sec_t; ++ ++struct pan_sm_sec_a_s { ++ pan_rpc_arrdim_t size; ++ pan_sm_sec_t *data; ++}; ++typedef struct pan_sm_sec_a_s pan_sm_sec_a; ++typedef pan_otw_t *pan_sm_sec_otw_t; ++ ++/* ++ * from pan_sm_types.h ++ */ ++typedef pan_uint64_t pan_sm_cap_handle_t; ++ ++struct pan_sm_map_cap_s { ++ pan_agg_full_map_t full_map; ++ pan_stor_offset_t offset; ++ pan_stor_len_t length; ++ pan_sm_sec_a secs; ++ pan_sm_cap_handle_t handle; ++ pan_timespec_t expiration_time; ++ pan_stor_action_t action_mask; ++ pan_uint32_t flags; ++}; ++ ++typedef struct pan_sm_map_cap_s pan_sm_map_cap_t; ++ ++/* ++ * from pan_sm_ops.h ++ */ ++typedef pan_rpc_none_t pan_sm_cache_ptr_t; ++ ++/* ++ * from pan_sam_api.h ++ */ ++typedef pan_uint32_t pan_sam_access_flags_t; ++ ++typedef struct pan_sam_dev_error_s pan_sam_dev_error_t; ++struct pan_sam_dev_error_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_op_t stor_op; ++ pan_status_t error; ++}; ++ ++typedef struct pan_sam_ext_status_s pan_sam_ext_status_t; ++struct pan_sam_ext_status_s { ++ pan_uint32_t available; ++ pan_uint32_t size; ++ pan_sam_dev_error_t *errors; ++}; ++ ++enum pan_sam_rpc_sec_sel_e { ++ PAN_SAM_RPC_SEC_DEFAULT, ++ PAN_SAM_RPC_SEC_ATLEAST, ++ PAN_SAM_RPC_SEC_EXACTLY ++}; ++typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t; ++ ++typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t; ++struct pan_sam_obj_sec_s { ++ pan_stor_sec_level_t min_security; ++ pan_sm_map_cap_t *map_ccaps; ++}; ++ ++typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t; ++struct pan_sam_rpc_sec_s { ++ pan_sam_rpc_sec_sel_t selector; ++}; ++ ++typedef struct pan_sam_read_args_s pan_sam_read_args_t; ++struct 
pan_sam_read_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ void *return_attr; ++ void *checksum; ++ pan_stor_offset_t offset; ++ pan_uint16_t sm_options; ++ void *callout; ++ void *callout_arg; ++}; ++ ++typedef struct pan_sam_read_res_s pan_sam_read_res_t; ++struct pan_sam_read_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ void *attr; ++ void *checksum; ++}; ++ ++typedef void (*pan_sam_read_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t status); ++ ++#define PAN_SAM_ACCESS_NONE 0x0000 ++#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020 ++ ++typedef struct pan_sam_write_args_s pan_sam_write_args_t; ++struct pan_sam_write_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ pan_stor_offset_t offset; ++ void *attr; ++ void *return_attr; ++}; ++ ++typedef struct pan_sam_write_res_s pan_sam_write_res_t; ++struct pan_sam_write_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ pan_stor_delta_len_t delta_capacity_used; ++ pan_bool_t parity_dirty; ++ void *attr; ++}; ++ ++typedef void (*pan_sam_write_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t status); ++ ++/* ++ * from pan_mgr_types.h ++ */ ++#define PAN_MGR_ID_TYPE_SHIFT 56 ++#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL) ++#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL) ++ ++typedef pan_uint16_t pan_mgr_type_t; ++typedef pan_uint64_t pan_mgr_id_t; ++ ++#define PAN_MGR_SM ((pan_mgr_type_t) 2U) ++#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U) ++ ++/* ++ * from pan_mgr_types_c.h ++ */ ++#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \ ++ pan_mgr_id_t _id1, _id2; \ ++\ ++ _id1 = (_mgr_type_); \ ++ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \ ++ _id1 &= PAN_MGR_ID_TYPE_MASK; \ ++ _id2 = (_mgr_uniq_); \ ++ _id2 &= PAN_MGR_ID_UNIQ_MASK; \ 
++ _id1 |= _id2; \ ++ *(_mgr_id_p_) = _id1; \ ++} ++ ++/* ++ * from pan_storage_c.h ++ */ ++#define pan_stor_is_device_id_an_obsd_id(_device_id_) \ ++ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \ ++ == PAN_MGR_OBSD) ++ ++/* ++ * pnfs_shim internal definitions ++ */ ++ ++struct panfs_shim_io_state { ++ struct objlayout_io_state ol_state; ++ ++ pan_sg_entry_t *sg_list; ++ pan_sam_obj_sec_t obj_sec; ++ void *ucreds; ++ union { ++ struct { ++ pan_sam_read_args_t args; ++ pan_sam_read_res_t res; ++ } read; ++ struct { ++ pan_sam_write_args_t args; ++ pan_sam_write_res_t res; ++ } write; ++ } u; ++}; ++ ++#endif /* _PANLAYOUT_PANFS_SHIM_H */ +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-23 12:09:03.354501721 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-23 12:09:03.354501721 -0400 +@@ -0,0 +1,435 @@ ++/* ++ * pnfs_osd_xdr.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* ++ * The following implementation is based on these Internet Drafts: ++ * ++ * draft-ietf-nfsv4-minorversion-21 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid) ++{ ++ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data)); ++ READ64(objid->oid_partition_id); ++ READ64(objid->oid_object_id); ++ return p; ++} ++ ++static inline u32 * ++pnfs_osd_xdr_decode_opaque_cred(u32 *p, ++ struct pnfs_osd_opaque_cred *opaque_cred) ++{ ++ READ32(opaque_cred->cred_len); ++ COPYMEM(opaque_cred->cred, opaque_cred->cred_len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct 
pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp, ++ u8 **credp) ++{ ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id); ++ READ32(comp->oc_osd_version); ++ READ32(comp->oc_cap_key_sec); ++ ++ cred = *credp; ++ comp->oc_cap_key.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len)); ++ comp->oc_cap.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len)); ++ *credp = cred; ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map) ++{ ++ READ32(data_map->odm_num_comps); ++ READ64(data_map->odm_stripe_unit); ++ READ32(data_map->odm_group_width); ++ READ32(data_map->odm_group_depth); ++ READ32(data_map->odm_mirror_cnt); ++ READ32(data_map->odm_raid_algorithm); ++ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " ++ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", ++ __func__, ++ data_map->odm_num_comps, ++ (unsigned long long)data_map->odm_stripe_unit, ++ data_map->odm_group_width, ++ data_map->odm_group_depth, ++ data_map->odm_mirror_cnt, ++ data_map->odm_raid_algorithm); ++ return p; ++} ++ ++struct pnfs_osd_layout * ++pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p) ++{ ++ int i; ++ u32 *start = p; ++ struct pnfs_osd_object_cred *comp; ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map); ++ READ32(layout->olo_comps_index); ++ READ32(layout->olo_num_comps); ++ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1); ++ comp = 
layout->olo_comps; ++ cred = (u8 *)(comp + layout->olo_num_comps); ++ dprintk("%s: comps_index=%u num_comps=%u\n", ++ __func__, layout->olo_comps_index, layout->olo_num_comps); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred); ++ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "key_len=%u cap_len=%u\n", ++ __func__, i, ++ _DEVID_LO(&comp->oc_object_id.oid_device_id), ++ _DEVID_HI(&comp->oc_object_id.oid_device_id), ++ comp->oc_object_id.oid_partition_id, ++ comp->oc_object_id.oid_object_id, ++ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); ++ comp++; ++ } ++ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__, ++ (char *)p - (char *)start, cred, (char *)cred - (char *)layout); ++ return layout; ++} ++ ++/* ++ * Get Device Information Decoding ++ * ++ * Note: since Device Information is currently done synchronously, most ++ * of the actual fields are left inside the rpc buffer and are only ++ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer ++ * should not be freed while the returned information is in use. 
++ */ ++ ++u32 *__xdr_read_calc_nfs4_string( ++ u32 *p, struct nfs4_string *str, u8 **freespace) ++{ ++ u32 len; ++ char *data; ++ bool need_copy; ++ ++ READ32(len); ++ data = (char *)p; ++ ++ if (data[len]) { /* Not null terminated we'll need extra space */ ++ data = *freespace; ++ *freespace += len + 1; ++ need_copy = true; ++ } else { ++ need_copy = false; ++ } ++ ++ if (str) { ++ str->len = len; ++ str->data = data; ++ if (need_copy) { ++ memcpy(data, p, len); ++ data[len] = 0; ++ } ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++u32 *__xdr_read_calc_u8_opaque( ++ u32 *p, struct nfs4_string *str) ++{ ++ u32 len; ++ ++ READ32(len); ++ ++ if (str) { ++ str->len = len; ++ str->data = (char *)p; ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetid { ++ * u32 oti_type; ++ * struct nfs4_string oti_scsi_device_id; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetid( ++ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace) ++{ ++ u32 oti_type; ++ ++ READ32(oti_type); ++ if (targetid) ++ targetid->oti_type = oti_type; ++ ++ switch (oti_type) { ++ case OBJ_TARGET_SCSI_NAME: ++ case OBJ_TARGET_SCSI_DEVICE_ID: ++ p = __xdr_read_calc_u8_opaque(p, ++ targetid ? &targetid->oti_scsi_device_id : NULL); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_net_addr { ++ * struct nfs4_string r_netid; ++ * struct nfs4_string r_addr; ++ * }; ++ */ ++u32 *__xdr_read_calc_net_addr( ++ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace) ++{ ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_netid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? 
&netaddr->r_addr : NULL, ++ freespace); ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetaddr { ++ * u32 ota_available; ++ * struct pnfs_osd_net_addr ota_netaddr; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetaddr( ++ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace) ++{ ++ u32 ota_available; ++ ++ READ32(ota_available); ++ if (targetaddr) ++ targetaddr->ota_available = ota_available; ++ ++ if (ota_available) { ++ p = __xdr_read_calc_net_addr(p, ++ targetaddr ? &targetaddr->ota_netaddr : NULL, ++ freespace); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++u32 *__xdr_read_calc_deviceaddr( ++ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace) ++{ ++ p = __xdr_read_calc_targetid(p, ++ deviceaddr ? &deviceaddr->oda_targetid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_targetaddr(p, ++ deviceaddr ? &deviceaddr->oda_targetaddr : NULL, ++ freespace); ++ ++ if (deviceaddr) ++ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun)); ++ else ++ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun)); ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? &deviceaddr->oda_systemid : NULL); ++ ++ if (deviceaddr) { ++ p = pnfs_osd_xdr_decode_object_cred(p, ++ &deviceaddr->oda_root_obj_cred, freespace); ++ } else { ++ *freespace += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? 
&deviceaddr->oda_osdname : NULL); ++ ++ return p; ++} ++ ++size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p) ++{ ++ u8 *null_freespace = NULL; ++ size_t sz; ++ ++ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace); ++ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace; ++ ++ return sz; ++} ++ ++void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p) ++{ ++ u8 *freespace = (u8 *)(deviceaddr + 1); ++ ++ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace); ++} ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; (4 bytes, always encoded) ++ * s64 dsu_delta; (8 bytes, encoded only when dsu_valid != 0) ++ * u32 olu_ioerr_flag; (4 bytes, always encoded) ++ * }; Returns 0, or -E2BIG if the stream has no room. ++ */ ++int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou) ++{ ++ __be32 *p = xdr_reserve_space(xdr, lou->dsu_valid ? 16 : 8); ++ ++ if (!p) ++ return -E2BIG; ++ ++ *p++ = cpu_to_be32(lou->dsu_valid); ++ if (lou->dsu_valid) ++ p = xdr_encode_hyper(p, lou->dsu_delta); ++ *p++ = cpu_to_be32(lou->olu_ioerr_flag); ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ */ ++static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p; ++ ++ p = xdr_reserve_space(xdr, 32); ++ if (!p) ++ return -E2BIG; ++ ++ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, ++ sizeof(object_id->oid_device_id.data)); ++ p = xdr_encode_hyper(p, object_id->oid_partition_id); ++ p = xdr_encode_hyper(p, object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, ++ struct pnfs_osd_ioerr *ioerr) ++{ ++ __be32 *p; ++ int ret; ++ ++ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component); ++ if (ret) ++ return ret; ++ ++ p = xdr_reserve_space(xdr, 24); ++ if
(!p) ++ return -E2BIG; ++ ++ p = xdr_encode_hyper(p, ioerr->oer_comp_offset); ++ p = xdr_encode_hyper(p, ioerr->oer_comp_length); ++ *p++ = cpu_to_be32(ioerr->oer_iswrite); ++ *p = cpu_to_be32(ioerr->oer_errno); ++ ++ return 0; ++} +diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-23 12:08:29.056411363 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-23 12:09:03.355511659 -0400 +@@ -20,6 +20,7 @@ + #include + + #include "internal.h" ++#include "pnfs.h" + + static struct kmem_cache *nfs_page_cachep; + +@@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p) + struct nfs_page * + nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + +@@ -79,7 +81,11 @@ nfs_create_request(struct nfs_open_conte + req->wb_pgbase = offset; + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); ++ req->wb_lock_context = nfs_get_lock_context(ctx); + kref_init(&req->wb_kref); ++ req->wb_lseg = lseg; ++ if (lseg) ++ get_lseg(lseg); + return req; + } + +@@ -141,18 +147,26 @@ void nfs_clear_request(struct nfs_page * + { + struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; ++ struct nfs_lock_context *l_ctx = req->wb_lock_context; + + if (page != NULL) { + page_cache_release(page); + req->wb_page = NULL; + } ++ if (l_ctx != NULL) { ++ nfs_put_lock_context(l_ctx); ++ req->wb_lock_context = NULL; ++ } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } ++ if (req->wb_lseg != NULL) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } + } + +- + /** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release +@@ -231,11 +245,12 @@ void nfs_pageio_init(struct nfs_pageio_d + * Return 'true' 
if this is the case, else return 'false'. + */ + static int nfs_can_coalesce_requests(struct nfs_page *prev, +- struct nfs_page *req) ++ struct nfs_page *req, ++ struct nfs_pageio_descriptor *pgio) + { + if (req->wb_context->cred != prev->wb_context->cred) + return 0; +- if (req->wb_context->lockowner != prev->wb_context->lockowner) ++ if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) + return 0; + if (req->wb_context->state != prev->wb_context->state) + return 0; +@@ -245,6 +260,12 @@ static int nfs_can_coalesce_requests(str + return 0; + if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return 0; ++ if (req->wb_lseg != prev->wb_lseg) ++ return 0; ++#ifdef CONFIG_NFS_V4_1 ++ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) ++ return 0; ++#endif /* CONFIG_NFS_V4_1 */ + return 1; + } + +@@ -277,7 +298,7 @@ static int nfs_pageio_do_add_request(str + if (newlen > desc->pg_bsize) + return 0; + prev = nfs_list_entry(desc->pg_list.prev); +- if (!nfs_can_coalesce_requests(prev, req)) ++ if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + } else + desc->pg_base = req->wb_pgbase; +@@ -366,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for ++ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver + * + * Moves elements from one of the inode request lists. 
+ * If the number of requests is set to 0, the entire address_space +@@ -375,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs + */ + int nfs_scan_list(struct nfs_inode *nfsi, + struct list_head *dst, pgoff_t idx_start, +- unsigned int npages, int tag) ++ unsigned int npages, int tag, int *use_pnfs) + { + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; +@@ -406,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); + nfs_list_add_request(req, dst); ++ if (req->wb_lseg) ++ *use_pnfs = 1; + res++; + if (res == INT_MAX) + goto out; +diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-23 12:09:03.356501413 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-23 12:09:03.357481204 -0400 +@@ -0,0 +1,2027 @@ ++/* ++ * linux/fs/nfs/pnfs.c ++ * ++ * pNFS functions to call and manage layout drivers. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "internal.h" ++#include "nfs4_fs.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS ++ ++#define MIN_POOL_LC (4) ++ ++static int pnfs_initialized; ++ ++static void pnfs_free_layout(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range); ++static inline void get_layout(struct pnfs_layout_type *lo); ++ ++/* Locking: ++ * ++ * pnfs_spinlock: ++ * protects pnfs_modules_tbl. 
++ */ ++static spinlock_t pnfs_spinlock = __SPIN_LOCK_UNLOCKED(pnfs_spinlock); ++ ++/* ++ * pnfs_modules_tbl holds all pnfs modules ++ */ ++static struct list_head pnfs_modules_tbl; ++static struct kmem_cache *pnfs_cachep; ++static mempool_t *pnfs_layoutcommit_mempool; ++ ++static inline struct pnfs_layoutcommit_data *pnfs_layoutcommit_alloc(void) ++{ ++ struct pnfs_layoutcommit_data *p = ++ mempool_alloc(pnfs_layoutcommit_mempool, GFP_NOFS); ++ if (p) ++ memset(p, 0, sizeof(*p)); ++ ++ return p; ++} ++ ++void pnfs_layoutcommit_free(struct pnfs_layoutcommit_data *p) ++{ ++ mempool_free(p, pnfs_layoutcommit_mempool); ++} ++ ++/* ++ * struct pnfs_module - One per pNFS device module. ++ */ ++struct pnfs_module { ++ struct pnfs_layoutdriver_type *pnfs_ld_type; ++ struct list_head pnfs_tblid; ++}; ++ ++int ++pnfs_initialize(void) ++{ ++ INIT_LIST_HEAD(&pnfs_modules_tbl); ++ ++ pnfs_cachep = kmem_cache_create("pnfs_layoutcommit_data", ++ sizeof(struct pnfs_layoutcommit_data), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (pnfs_cachep == NULL) ++ return -ENOMEM; ++ ++ pnfs_layoutcommit_mempool = mempool_create(MIN_POOL_LC, ++ mempool_alloc_slab, ++ mempool_free_slab, ++ pnfs_cachep); ++ if (pnfs_layoutcommit_mempool == NULL) { ++ kmem_cache_destroy(pnfs_cachep); ++ return -ENOMEM; ++ } ++ ++ pnfs_initialized = 1; ++ return 0; ++} ++ ++void pnfs_uninitialize(void) ++{ ++ mempool_destroy(pnfs_layoutcommit_mempool); ++ kmem_cache_destroy(pnfs_cachep); ++} ++ ++/* search pnfs_modules_tbl for right pnfs module */ ++static int ++find_pnfs(u32 id, struct pnfs_module **module) { ++ struct pnfs_module *local = NULL; ++ ++ dprintk("PNFS: %s: Searching for %u\n", __func__, id); ++ list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) { ++ if (local->pnfs_ld_type->id == id) { ++ *module = local; ++ return(1); ++ } ++ } ++ return 0; ++} ++ ++/* Set lo_cred to indicate we require a layoutcommit ++ * If we don't even have a layout, we don't need to commit it. 
++ */ ++void ++pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx) ++{ ++ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (has_layout(nfsi) && ++ !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->pnfs_layout_state)) { ++ nfsi->layout->lo_cred = get_rpccred(ctx->state->owner->so_cred); ++ __set_bit(NFS_INO_LAYOUTCOMMIT, ++ &nfsi->layout->pnfs_layout_state); ++ nfsi->change_attr++; ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s: Set layoutcommit\n", __func__); ++ return; ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Update last_write_offset for layoutcommit. ++ * TODO: We should only use commited extents, but the current nfs ++ * implementation does not calculate the written range in nfs_commit_done. ++ * We therefore update this field in writeback_done. ++ */ ++void ++pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent) ++{ ++ loff_t end_pos; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (offset < nfsi->layout->pnfs_write_begin_pos) ++ nfsi->layout->pnfs_write_begin_pos = offset; ++ end_pos = offset + extent - 1; /* I'm being inclusive */ ++ if (end_pos > nfsi->layout->pnfs_write_end_pos) ++ nfsi->layout->pnfs_write_end_pos = end_pos; ++ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n", ++ __func__, ++ (unsigned long) extent, ++ (unsigned long) offset , ++ (unsigned long) nfsi->layout->pnfs_write_begin_pos, ++ (unsigned long) nfsi->layout->pnfs_write_end_pos); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Unitialize a mountpoint in a layout driver */ ++void ++unmount_pnfs_layoutdriver(struct nfs_server *nfss) ++{ ++ if (PNFS_EXISTS_LDIO_OP(nfss, uninitialize_mountpoint)) ++ nfss->pnfs_curr_ld->ld_io_ops->uninitialize_mountpoint(nfss); ++} ++ ++/* ++ * Set the server pnfs module to the first registered pnfs_type. ++ * Only one pNFS layout driver is supported. 
++ */ ++void ++set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, ++ u32 id) ++{ ++ struct pnfs_module *mod = NULL; ++ ++ if (server->pnfs_curr_ld) ++ return; ++ ++ if (!find_pnfs(id, &mod)) { ++ request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); ++ find_pnfs(id, &mod); ++ } ++ ++ if (!mod) { ++ dprintk("%s: No pNFS module found for %u. ", __func__, id); ++ goto out_err; ++ } ++ ++ server->pnfs_curr_ld = mod->pnfs_ld_type; ++ if (mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint( ++ server, mntfh)) { ++ printk(KERN_ERR "%s: Error initializing mount point " ++ "for layout driver %u. ", __func__, id); ++ goto out_err; ++ } ++ ++ dprintk("%s: pNFS module for %u set\n", __func__, id); ++ return; ++ ++out_err: ++ dprintk("Using NFSv4 I/O\n"); ++ server->pnfs_curr_ld = NULL; ++} ++ ++/* Allow I/O module to set its functions structure */ ++struct pnfs_client_operations* ++pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) ++{ ++ struct pnfs_module *pnfs_mod; ++ struct layoutdriver_io_operations *io_ops = ld_type->ld_io_ops; ++ ++ if (!pnfs_initialized) { ++ printk(KERN_ERR "%s Registration failure. 
" ++ "pNFS not initialized.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops || !io_ops->alloc_layout || !io_ops->free_layout) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_layout and free_layout.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->alloc_lseg || !io_ops->free_lseg) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_lseg and free_lseg.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->read_pagelist || !io_ops->write_pagelist || ++ !io_ops->commit) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "read_pagelist, write_pagelist, and commit.\n", ++ __func__); ++ return NULL; ++ } ++ ++ pnfs_mod = kmalloc(sizeof(struct pnfs_module), GFP_KERNEL); ++ if (pnfs_mod != NULL) { ++ dprintk("%s Registering id:%u name:%s\n", ++ __func__, ++ ld_type->id, ++ ld_type->name); ++ pnfs_mod->pnfs_ld_type = ld_type; ++ INIT_LIST_HEAD(&pnfs_mod->pnfs_tblid); ++ ++ spin_lock(&pnfs_spinlock); ++ list_add(&pnfs_mod->pnfs_tblid, &pnfs_modules_tbl); ++ spin_unlock(&pnfs_spinlock); ++ } ++ ++ return &pnfs_ops; ++} ++ ++/* Allow I/O module to set its functions structure */ ++void ++pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) ++{ ++ struct pnfs_module *pnfs_mod; ++ ++ if (find_pnfs(ld_type->id, &pnfs_mod)) { ++ dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); ++ spin_lock(&pnfs_spinlock); ++ list_del(&pnfs_mod->pnfs_tblid); ++ spin_unlock(&pnfs_spinlock); ++ kfree(pnfs_mod); ++ } ++} ++ ++/* ++ * pNFS client layout cache ++ */ ++#if defined(CONFIG_SMP) ++#define BUG_ON_UNLOCKED_INO(ino) \ ++ BUG_ON(!spin_is_locked(&ino->i_lock)) ++#define BUG_ON_UNLOCKED_LO(lo) \ ++ BUG_ON_UNLOCKED_INO(PNFS_INODE(lo)) ++#else /* CONFIG_SMP */ ++#define BUG_ON_UNLOCKED_INO(lo) do {} while (0) ++#define BUG_ON_UNLOCKED_LO(lo) do {} while (0) ++#endif /* CONFIG_SMP */ ++ ++static inline void ++get_layout(struct pnfs_layout_type *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ lo->refcount++; ++} ++ ++static inline void 
++put_layout_locked(struct pnfs_layout_type *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ BUG_ON(lo->refcount <= 0); ++ ++ lo->refcount--; ++ if (!lo->refcount) { ++ struct layoutdriver_io_operations *io_ops = PNFS_LD_IO_OPS(lo); ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ dprintk("%s: freeing layout cache %p\n", __func__, lo); ++ WARN_ON(!list_empty(&lo->lo_layouts)); ++ io_ops->free_layout(lo); ++ nfsi->layout = NULL; ++ } ++} ++ ++void ++put_layout(struct inode *inode) ++{ ++ spin_lock(&inode->i_lock); ++ put_layout_locked(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++ ++} ++ ++void ++pnfs_layout_release(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (range) ++ pnfs_free_layout(lo, range); ++ /* ++ * Matched in _pnfs_update_layout for layoutget ++ * and by get_layout in _pnfs_return_layout for layoutreturn ++ */ ++ put_layout_locked(lo); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ wake_up_all(&nfsi->lo_waitq); ++} ++ ++void ++pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++ struct pnfs_layout_type *lo; ++ struct nfs4_pnfs_layout_segment range = { ++ .iomode = IOMODE_ANY, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ lo = nfsi->layout; ++ if (lo) { ++ pnfs_free_layout(lo, &range); ++ WARN_ON(!list_empty(&nfsi->layout->segs)); ++ WARN_ON(!list_empty(&nfsi->layout->lo_layouts)); ++ ++ if (nfsi->layout->refcount != 1) ++ printk(KERN_WARNING "%s: layout refcount not=1 %d\n", ++ __func__, nfsi->layout->refcount); ++ WARN_ON(nfsi->layout->refcount != 1); ++ ++ /* Matched by refcount set to 1 in alloc_init_layout */ ++ put_layout_locked(lo); ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* ++ * Called by the state manger to remove all layouts established under an ++ * expired lease. 
++ */
++void
++pnfs_destroy_all_layouts(struct nfs_client *clp)
++{
++	struct pnfs_layout_type *lo;
++
++	/*
++	 * Keep destroying the first entry until the list drains.
++	 * NOTE(review): cl_layouts is inspected here without clp->cl_lock,
++	 * and the loop only terminates if pnfs_destroy_layout() unlinks the
++	 * layout from cl_layouts - confirm both with the callers.
++	 */
++	while (!list_empty(&clp->cl_layouts)) {
++		lo = list_entry(clp->cl_layouts.next, struct pnfs_layout_type,
++				lo_layouts);
++		dprintk("%s freeing layout for inode %lu\n", __func__,
++			lo->lo_inode->i_ino);
++		pnfs_destroy_layout(NFS_I(lo->lo_inode));
++	}
++}
++
++/*
++ * Initialize the generic part of a freshly allocated segment: empty list
++ * linkage, a fresh kref, marked valid, and pointing back at @lo.
++ */
++static inline void
++init_lseg(struct pnfs_layout_type *lo, struct pnfs_layout_segment *lseg)
++{
++	INIT_LIST_HEAD(&lseg->fi_list);
++	kref_init(&lseg->kref);
++	lseg->valid = true;
++	lseg->layout = lo;
++}
++
++/*
++ * kref release callback for a layout segment.
++ *
++ * The segment is handed back to the layout driver before the layout
++ * reference is dropped: put_layout_locked() may free the layout when this
++ * was its last reference, so the layout must not be touched (not even to
++ * look up the driver's io operations) after the put.
++ */
++static void
++destroy_lseg(struct kref *kref)
++{
++	struct pnfs_layout_segment *lseg =
++		container_of(kref, struct pnfs_layout_segment, kref);
++	struct pnfs_layout_type *lo = lseg->layout;
++
++	dprintk("--> %s\n", __func__);
++	PNFS_LD_IO_OPS(lo)->free_lseg(lseg);
++	/* Matched by get_layout in pnfs_insert_layout */
++	put_layout_locked(lo);
++}
++
++/*
++ * Drop a segment reference without taking the i_lock here (the release
++ * path, put_layout_locked(), asserts on SMP that it is held). NULL is
++ * ignored. If the segment was already marked invalid, waiters on
++ * lo_waitq are woken after the put.
++ */
++static void
++put_lseg_locked(struct pnfs_layout_segment *lseg)
++{
++	bool do_wake_up;
++	struct nfs_inode *nfsi;
++
++	if (!lseg)
++		return;
++
++	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
++		atomic_read(&lseg->kref.refcount), lseg->valid);
++	/* Sample everything needed later: lseg may be gone after the put */
++	do_wake_up = !lseg->valid;
++	nfsi = PNFS_NFS_INODE(lseg->layout);
++	kref_put(&lseg->kref, destroy_lseg);
++	if (do_wake_up)
++		wake_up(&nfsi->lo_waitq);
++}
++
++/*
++ * Drop a segment reference, taking the inode's i_lock around the put.
++ * NULL is ignored. If the segment was already marked invalid, waiters on
++ * lo_waitq are woken once the lock has been released.
++ */
++void
++put_lseg(struct pnfs_layout_segment *lseg)
++{
++	bool do_wake_up;
++	struct nfs_inode *nfsi;
++
++	if (!lseg)
++		return;
++
++	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
++		atomic_read(&lseg->kref.refcount), lseg->valid);
++	/* Sample everything needed later: lseg may be gone after the put */
++	do_wake_up = !lseg->valid;
++	nfsi = PNFS_NFS_INODE(lseg->layout);
++	spin_lock(&nfsi->vfs_inode.i_lock);
++	kref_put(&lseg->kref, destroy_lseg);
++	spin_unlock(&nfsi->vfs_inode.i_lock);
++	if (do_wake_up)
++		wake_up(&nfsi->lo_waitq);
++}
++EXPORT_SYMBOL(put_lseg);
++
++/* Take an additional reference on @lseg. */
++void get_lseg(struct pnfs_layout_segment *lseg)
++{
++	kref_get(&lseg->kref);
++}
++EXPORT_SYMBOL(get_lseg);
++
++static inline u64
++end_offset(u64 start, u64 len)
++{
++	u64 end = start + len;
++
++	/* A sum that wrapped around saturates at NFS4_MAX_UINT64 */
++	if (end < start)
++		return NFS4_MAX_UINT64;
++	return end;
++}
++
++/* last octet in a range; @len must not be zero */
++static inline u64
++last_byte_offset(u64 start, u64 len)
++{
++	u64 end;
++
++	BUG_ON(!len);
++	end = start + len;
++	/* A sum that wrapped around saturates at NFS4_MAX_UINT64 */
++	if (end <= start)
++		return NFS4_MAX_UINT64;
++	return end - 1;
++}
++
++/*
++ * is l2 fully contained in l1?
++ *   start1                             end1
++ *   [----------------------------------)
++ *           start2           end2
++ *           [----------------)
++ */
++static inline int
++lo_seg_contained(struct nfs4_pnfs_layout_segment *l1,
++		 struct nfs4_pnfs_layout_segment *l2)
++{
++	u64 outer_start = l1->offset;
++	u64 outer_end = end_offset(outer_start, l1->length);
++	u64 inner_start = l2->offset;
++	u64 inner_end = end_offset(inner_start, l2->length);
++
++	return (outer_start <= inner_start) && (outer_end >= inner_end);
++}
++
++/*
++ * is l1 and l2 intersecting?
++ *   start1                             end1
++ *   [----------------------------------)
++ *                              start2           end2
++ *                              [----------------)
++ *
++ * An end equal to NFS4_MAX_UINT64 is treated as open-ended.
++ */
++static inline int
++lo_seg_intersecting(struct nfs4_pnfs_layout_segment *l1,
++		    struct nfs4_pnfs_layout_segment *l2)
++{
++	u64 start1 = l1->offset;
++	u64 end1 = end_offset(start1, l1->length);
++	u64 start2 = l2->offset;
++	u64 end2 = end_offset(start2, l2->length);
++	int l1_reaches_l2 = (end1 == NFS4_MAX_UINT64 || end1 > start2);
++	int l2_reaches_l1 = (end2 == NFS4_MAX_UINT64 || end2 > start1);
++
++	return l1_reaches_l2 && l2_reaches_l1;
++}
++
++/* Replace the layout stateid of @lo with @stateid under lo->seqlock. */
++void
++pnfs_set_layout_stateid(struct pnfs_layout_type *lo,
++			const nfs4_stateid *stateid)
++{
++	write_seqlock(&lo->seqlock);
++	memcpy(lo->stateid.u.data, stateid->u.data,
++	       sizeof(lo->stateid.u.data));
++	write_sequnlock(&lo->seqlock);
++}
++
++/*
++ * Copy the layout stateid of @lo into @dst, retrying until the copy was
++ * not disturbed by a concurrent writer of lo->seqlock.
++ */
++void
++pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_type *lo)
++{
++	int seq;
++
++	dprintk("--> %s\n", __func__);
++
++	do {
++		seq = read_seqbegin(&lo->seqlock);
++		memcpy(dst->u.data, lo->stateid.u.data,
++		       sizeof(lo->stateid.u.data));
++	} while (read_seqretry(&lo->seqlock, seq));
++
++	dprintk("<-- %s\n", __func__);
++}
++
++static void
++pnfs_layout_from_open_stateid(struct pnfs_layout_type
*lo,
++			      struct nfs4_state *state)
++{
++	int seq;
++
++	dprintk("--> %s\n", __func__);
++
++	write_seqlock(&lo->seqlock);
++	/* Only seed the layout stateid while it still equals zero_stateid */
++	if (memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE) == 0) {
++		/* Retry until the open stateid was copied undisturbed */
++		do {
++			seq = read_seqbegin(&state->seqlock);
++			memcpy(lo->stateid.u.data, state->stateid.u.data,
++					sizeof(state->stateid.u.data));
++		} while (read_seqretry(&state->seqlock, seq));
++	}
++	write_sequnlock(&lo->seqlock);
++	dprintk("<-- %s\n", __func__);
++}
++
++/*
++ * Get layout from server.
++ *    for now, assume that whole file layouts are requested.
++ *    arg->offset: 0
++ *    arg->length: all ones
++ *
++ * Only the iomode of @range is used. When the layout stateid is still
++ * zero it is first seeded from an open stateid; if @ctx is NULL a suitable
++ * open context is looked up (and released again) for that purpose.
++ *
++ * Returns -ENOMEM if the request cannot be allocated, in which case the
++ * layout reference held for the layoutget is released here; otherwise the
++ * result of pnfs4_proc_layoutget().
++ */
++static int
++send_layoutget(struct inode *ino,
++	   struct nfs_open_context *ctx,
++	   struct nfs4_pnfs_layout_segment *range,
++	   struct pnfs_layout_segment **lsegpp,
++	   struct pnfs_layout_type *lo)
++{
++	struct nfs_server *server = NFS_SERVER(ino);
++	struct nfs4_pnfs_layoutget *lgp;
++	int status;
++
++	dprintk("--> %s\n", __func__);
++
++	lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
++	if (lgp == NULL) {
++		pnfs_layout_release(lo, NULL);
++		return -ENOMEM;
++	}
++	lgp->args.minlength = NFS4_MAX_UINT64;
++	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
++	lgp->args.lseg.iomode = range->iomode;
++	lgp->args.lseg.offset = 0;
++	lgp->args.lseg.length = NFS4_MAX_UINT64;
++	lgp->args.type = server->pnfs_curr_ld->id;
++	lgp->args.inode = ino;
++	lgp->lsegpp = lsegpp;
++
++	if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) {
++		/* True when we must find (and later put) a context ourselves */
++		bool borrowed_ctx = (ctx == NULL);
++
++		if (borrowed_ctx) {
++			ctx = nfs_find_open_context(ino, NULL,
++					(range->iomode == IOMODE_READ) ?
++					FMODE_READ: FMODE_WRITE);
++			BUG_ON(!ctx);
++		}
++		/* Set the layout stateid from the open stateid */
++		pnfs_layout_from_open_stateid(NFS_I(ino)->layout, ctx->state);
++		if (borrowed_ctx)
++			put_nfs_open_context(ctx);
++	}
++
++	/* Retrieve layout information from server */
++	status = pnfs4_proc_layoutget(lgp);
++
++	dprintk("<-- %s status %d\n", __func__, status);
++	return status;
++}
++
++/*
++ * iomode matching rules:
++ * range	lseg	match
++ * -----	-----	-----
++ * ANY		READ	true
++ * ANY		RW	true
++ * RW		READ	false
++ * RW		RW	true
++ * READ		READ	true
++ * READ		RW	false
++ *
++ * In addition to the iomode rule the two ranges must intersect.
++ */
++static inline int
++should_free_lseg(struct pnfs_layout_segment *lseg,
++		   struct nfs4_pnfs_layout_segment *range)
++{
++	if (range->iomode != IOMODE_ANY &&
++	    lseg->range.iomode != range->iomode)
++		return 0;
++
++	return lo_seg_intersecting(&lseg->range, range);
++}
++
++/*
++ * Return the first segment of @lo that should_free_lseg() accepts for
++ * @range, or NULL if there is none. Asserts (on SMP) that the i_lock is
++ * held.
++ */
++static struct pnfs_layout_segment *
++has_layout_to_return(struct pnfs_layout_type *lo,
++		     struct nfs4_pnfs_layout_segment *range)
++{
++	struct pnfs_layout_segment *found = NULL, *cur;
++
++	dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
++		__func__, lo, range->offset, range->length, range->iomode);
++
++	BUG_ON_UNLOCKED_LO(lo);
++	list_for_each_entry(cur, &lo->segs, fi_list) {
++		if (should_free_lseg(cur, range)) {
++			found = cur;
++			break;
++		}
++	}
++
++	dprintk("%s:Return lseg=%p\n", __func__, found);
++	return found;
++}
++
++/* A segment can be returned once only a single reference to it is left. */
++static inline bool
++_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg)
++{
++	return atomic_read(&lseg->kref.refcount) == 1;
++}
++
++
++/*
++ * Unlink and put every segment of @lo that matches @range and has no
++ * other users. If the segment list is empty afterwards, the layout is
++ * taken off its client's list (under cl_lock) and its stateid is reset
++ * to zero_stateid. Asserts (on SMP) that the i_lock is held.
++ */
++static void
++pnfs_free_layout(struct pnfs_layout_type *lo,
++		 struct nfs4_pnfs_layout_segment *range)
++{
++	struct pnfs_layout_segment *lseg, *tmp;
++
++	dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
++		__func__, lo, range->offset, range->length, range->iomode);
++
++	BUG_ON_UNLOCKED_LO(lo);
++	list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list) {
++		if (!should_free_lseg(lseg, range))
++			continue;
++		/* Still referenced by somebody else: leave it alone */
++		if (!_pnfs_can_return_lseg(lseg))
++			continue;
++		dprintk("%s: freeing lseg %p iomode %d "
++			"offset %llu length %llu\n", __func__,
++			lseg, lseg->range.iomode, lseg->range.offset,
++			lseg->range.length);
++		list_del(&lseg->fi_list);
++		put_lseg_locked(lseg);
++	}
++
++	if (list_empty(&lo->segs)) {
++		struct nfs_client *clp = PNFS_NFS_SERVER(lo)->nfs_client;
++
++		spin_lock(&clp->cl_lock);
++		list_del_init(&lo->lo_layouts);
++		spin_unlock(&clp->cl_lock);
++		pnfs_set_layout_stateid(lo, &zero_stateid);
++	}
++
++	dprintk("%s:Return\n", __func__);
++}
++
++/*
++ * Mark every segment matching @range invalid and report whether any of
++ * them is still referenced elsewhere. The walk runs under the i_lock.
++ *
++ * Returns true while the caller has to wait, false once all matching
++ * segments are down to their last reference.
++ */
++static bool
++pnfs_return_layout_barrier(struct nfs_inode *nfsi,
++			   struct nfs4_pnfs_layout_segment *range)
++{
++	struct pnfs_layout_segment *lseg;
++	bool busy = false;
++
++	spin_lock(&nfsi->vfs_inode.i_lock);
++	list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) {
++		if (!should_free_lseg(lseg, range))
++			continue;
++		lseg->valid = false;
++		if (_pnfs_can_return_lseg(lseg))
++			continue;
++		dprintk("%s: wait on lseg %p refcount %d\n",
++			__func__, lseg,
++			atomic_read(&lseg->kref.refcount));
++		busy = true;
++	}
++	spin_unlock(&nfsi->vfs_inode.i_lock);
++	dprintk("%s:Return %d\n", __func__, busy);
++	return busy;
++}
++
++/*
++ * Build a layoutreturn request for @range and pass it to
++ * pnfs4_proc_layoutreturn(). Only RETURN_FILE is supported (BUG otherwise).
++ *
++ * Returns -ENOMEM if the request cannot be allocated, in which case the
++ * layout reference held for the layoutreturn is released here; otherwise
++ * the result of pnfs4_proc_layoutreturn().
++ */
++static int
++return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
++	      enum pnfs_layoutreturn_type type, struct pnfs_layout_type *lo,
++	      bool wait)
++{
++	struct nfs_server *server = NFS_SERVER(ino);
++	struct nfs4_pnfs_layoutreturn *lrp;
++	int status = -ENOMEM;
++
++	dprintk("--> %s\n", __func__);
++
++	BUG_ON(type != RETURN_FILE);
++
++	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
++	if (lrp != NULL) {
++		lrp->args.reclaim = 0;
++		lrp->args.layout_type = server->pnfs_curr_ld->id;
++		lrp->args.return_type = type;
++		lrp->args.lseg = *range;
++		lrp->args.inode = ino;
++
++		status = pnfs4_proc_layoutreturn(lrp, wait);
++	} else if (lo && (type == RETURN_FILE)) {
++		pnfs_layout_release(lo, NULL);
++	}
++
++	dprintk("<-- %s status: %d\n", __func__, status);
++	return status;
++}
++
++int
++_pnfs_return_layout(struct inode
*ino, struct nfs4_pnfs_layout_segment *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct pnfs_layout_type *lo = NULL; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs4_pnfs_layout_segment arg; ++ int status = 0; ++ ++ dprintk("--> %s type %d\n", __func__, type); ++ ++ ++ arg.iomode = range ? range->iomode : IOMODE_ANY; ++ arg.offset = 0; ++ arg.length = NFS4_MAX_UINT64; ++ ++ if (type == RETURN_FILE) { ++ spin_lock(&ino->i_lock); ++ lo = nfsi->layout; ++ if (lo && !has_layout_to_return(lo, &arg)) { ++ lo = NULL; ++ } ++ if (!lo) { ++ spin_unlock(&ino->i_lock); ++ dprintk("%s: no layout segments to return\n", __func__); ++ goto out; ++ } ++ ++ /* Reference for layoutreturn matched in pnfs_layout_release */ ++ get_layout(lo); ++ ++ spin_unlock(&ino->i_lock); ++ ++ if (pnfs_return_layout_barrier(nfsi, &arg)) { ++ if (stateid) { /* callback */ ++ status = -EAGAIN; ++ goto out_put; ++ } ++ dprintk("%s: waiting\n", __func__); ++ wait_event(nfsi->lo_waitq, ++ !pnfs_return_layout_barrier(nfsi, &arg)); ++ } ++ ++ if (layoutcommit_needed(nfsi)) { ++ if (stateid && !wait) { /* callback */ ++ dprintk("%s: layoutcommit pending\n", __func__); ++ status = -EAGAIN; ++ goto out_put; ++ } ++ status = pnfs_layoutcommit_inode(ino, wait); ++ if (status) { ++ /* Return layout even if layoutcommit fails */ ++ dprintk("%s: layoutcommit failed, status=%d. 
" ++ "Returning layout anyway\n", ++ __func__, status); ++ } ++ } ++ ++ if (!stateid) ++ status = return_layout(ino, &arg, type, lo, wait); ++ else ++ pnfs_layout_release(lo, &arg); ++ } ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++out_put: ++ put_layout(ino); ++ goto out; ++} ++ ++/* ++ * cmp two layout segments for sorting into layout cache ++ */ ++static inline s64 ++cmp_layout(struct nfs4_pnfs_layout_segment *l1, ++ struct nfs4_pnfs_layout_segment *l2) ++{ ++ s64 d; ++ ++ /* higher offset > lower offset */ ++ d = l1->offset - l2->offset; ++ if (d) ++ return d; ++ ++ /* longer length > shorter length */ ++ d = l1->length - l2->length; ++ if (d) ++ return d; ++ ++ /* read > read/write */ ++ return (int)(l1->iomode == IOMODE_READ) - ++ (int)(l2->iomode == IOMODE_READ); ++} ++ ++static void ++pnfs_insert_layout(struct pnfs_layout_type *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct pnfs_layout_segment *lp; ++ int found = 0; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ ++ spin_lock(&clp->cl_lock); ++ BUG_ON(!list_empty(&lo->lo_layouts)); ++ list_add_tail(&lo->lo_layouts, &clp->cl_layouts); ++ spin_unlock(&clp->cl_lock); ++ } ++ list_for_each_entry (lp, &lo->segs, fi_list) { ++ if (cmp_layout(&lp->range, &lseg->range) > 0) ++ continue; ++ list_add_tail(&lseg->fi_list, &lp->fi_list); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu before " ++ "lp %p iomode %d offset %llu length %llu\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length, ++ lp, lp->range.iomode, lp->range.offset, ++ lp->range.length); ++ found = 1; ++ break; ++ } ++ if (!found) { ++ list_add_tail(&lseg->fi_list, &lo->segs); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu at tail\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, 
lseg->range.length); ++ } ++ get_layout(lo); ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++/* ++ * Each layoutdriver embeds pnfs_layout_type as the first field in it's ++ * per-layout type layout cache structure and returns it ZEROed ++ * from layoutdriver_io_ops->alloc_layout ++ */ ++static struct pnfs_layout_type * ++alloc_init_layout(struct inode *ino) ++{ ++ struct pnfs_layout_type *lo; ++ struct layoutdriver_io_operations *io_ops; ++ ++ io_ops = NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops; ++ lo = io_ops->alloc_layout(ino); ++ if (!lo) { ++ printk(KERN_ERR ++ "%s: out of memory: io_ops->alloc_layout failed\n", ++ __func__); ++ return NULL; ++ } ++ lo->refcount = 1; ++ INIT_LIST_HEAD(&lo->lo_layouts); ++ INIT_LIST_HEAD(&lo->segs); ++ seqlock_init(&lo->seqlock); ++ lo->lo_inode = ino; ++ return lo; ++} ++ ++/* ++ * Retrieve and possibly allocate the inode layout ++ * ++ * ino->i_lock must be taken by the caller. ++ */ ++static struct pnfs_layout_type * ++pnfs_alloc_layout(struct inode *ino) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_type *new = NULL; ++ ++ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); ++ ++ BUG_ON_UNLOCKED_INO(ino); ++ if (likely(nfsi->layout)) ++ return nfsi->layout; ++ ++ spin_unlock(&ino->i_lock); ++ new = alloc_init_layout(ino); ++ spin_lock(&ino->i_lock); ++ ++ if (likely(nfsi->layout == NULL)) { /* Won the race? 
*/ ++ nfsi->layout = new; ++ } else if (new) { ++ /* Reference the layout accross i_lock release and grab */ ++ get_layout(nfsi->layout); ++ spin_unlock(&ino->i_lock); ++ NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops->free_layout(new); ++ spin_lock(&ino->i_lock); ++ put_layout_locked(nfsi->layout); ++ } ++ return nfsi->layout; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW true ++ */ ++static inline int ++has_matching_lseg(struct pnfs_layout_segment *lseg, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct nfs4_pnfs_layout_segment range1; ++ ++ if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) || ++ !lo_seg_intersecting(&lseg->range, range)) ++ return 0; ++ ++ /* range1 covers only the first byte in the range */ ++ range1 = *range; ++ range1.length = 1; ++ return lo_seg_contained(&lseg->range, &range1); ++} ++ ++/* ++ * lookup range in layout ++ */ ++static struct pnfs_layout_segment * ++pnfs_has_layout(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range, ++ bool take_ref, ++ bool only_valid) ++{ ++ struct pnfs_layout_segment *lseg, *ret = NULL; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) { ++ if (has_matching_lseg(lseg, range) && ++ (lseg->valid || !only_valid)) { ++ ret = lseg; ++ if (take_ref) ++ get_lseg(ret); ++ break; ++ } ++ if (cmp_layout(range, &lseg->range) > 0) ++ break; ++ } ++ ++ dprintk("%s:Return lseg %p take_ref %d ref %d valid %d\n", ++ __func__, ret, take_ref, ++ ret ? atomic_read(&ret->kref.refcount) : 0, ++ ret ? ret->valid : 0); ++ return ret; ++} ++ ++/* Update the file's layout for the given range and iomode. ++ * Layout is retreived from the server if needed. ++ * If lsegpp is given, the appropriate layout segment is referenced and ++ * returned to the caller. 
++ */
++void
++_pnfs_update_layout(struct inode *ino,
++		   struct nfs_open_context *ctx,
++		   loff_t pos,
++		   u64 count,
++		   enum pnfs_iomode iomode,
++		   struct pnfs_layout_segment **lsegpp)
++{
++	/* @pos and @count are not consulted: the whole file is requested */
++	struct nfs4_pnfs_layout_segment arg = {
++		.iomode = iomode,
++		.offset = 0,
++		.length = NFS4_MAX_UINT64,
++	};
++	struct nfs_inode *nfsi = NFS_I(ino);
++	struct pnfs_layout_type *lo;
++	struct pnfs_layout_segment *lseg = NULL;
++	bool take_ref = (lsegpp != NULL);
++
++	if (take_ref)
++		*lsegpp = NULL;
++	spin_lock(&ino->i_lock);
++	lo = pnfs_alloc_layout(ino);
++	if (lo == NULL) {
++		dprintk("%s ERROR: can't get pnfs_layout_type\n", __func__);
++		goto out_unlock;
++	}
++
++	/* Check to see if the layout for the given range already exists */
++	lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref);
++	if (lseg && !lseg->valid) {
++		if (take_ref)
++			put_lseg_locked(lseg);
++		/* someone is cleaning the layout */
++		lseg = NULL;
++		goto out_unlock;
++	}
++
++	if (lseg) {
++		dprintk("%s: Using cached lseg %p for %llu@%llu iomode %d)\n",
++			__func__,
++			lseg,
++			arg.length,
++			arg.offset,
++			arg.iomode);
++
++		goto out_unlock;
++	}
++
++	/* if get layout already failed once goto out */
++	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->pnfs_layout_state)) {
++		/* Retry only once a non-zero suspend deadline has passed */
++		if (unlikely(nfsi->pnfs_layout_suspend &&
++		    get_seconds() >= nfsi->pnfs_layout_suspend)) {
++			dprintk("%s: layout_get resumed\n", __func__);
++			clear_bit(lo_fail_bit(iomode),
++				  &nfsi->layout->pnfs_layout_state);
++			nfsi->pnfs_layout_suspend = 0;
++		} else
++			goto out_unlock;
++	}
++
++	/* Reference the layout for layoutget matched in pnfs_layout_release */
++	get_layout(lo);
++	spin_unlock(&ino->i_lock);
++
++	send_layoutget(ino, ctx, &arg, lsegpp, lo);
++out:
++	/*
++	 * nfsi->layout is NULL when pnfs_alloc_layout() failed above, so it
++	 * must not be dereferenced unconditionally for the trace message.
++	 */
++	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
++		nfsi->layout ? nfsi->layout->pnfs_layout_state : 0UL, lseg);
++	return;
++out_unlock:
++	if (lsegpp)
++		*lsegpp = lseg;
++	spin_unlock(&ino->i_lock);
++	goto out;
++}
++
++void
++pnfs_get_layout_done(struct nfs4_pnfs_layoutget
*lgp, int rpc_status) ++{ ++ struct pnfs_layout_segment *lseg = NULL; ++ struct nfs_inode *nfsi = NFS_I(lgp->args.inode); ++ time_t suspend = 0; ++ ++ dprintk("-->%s\n", __func__); ++ ++ lgp->status = rpc_status; ++ if (likely(!rpc_status)) { ++ if (unlikely(lgp->res.layout.len < 0)) { ++ printk(KERN_ERR ++ "%s: ERROR Returned layout size is ZERO\n", __func__); ++ lgp->status = -EIO; ++ } ++ goto out; ++ } ++ ++ dprintk("%s: ERROR retrieving layout %d\n", __func__, rpc_status); ++ switch (rpc_status) { ++ case -NFS4ERR_BADLAYOUT: ++ lgp->status = -ENOENT; ++ /* FALLTHROUGH */ ++ case -EACCES: /* NFS4ERR_ACCESS */ ++ /* transient error, don't mark with NFS_INO_LAYOUT_FAILED */ ++ goto out; ++ ++ case -NFS4ERR_LAYOUTTRYLATER: ++ case -NFS4ERR_RECALLCONFLICT: ++ case -NFS4ERR_OLD_STATEID: ++ case -EAGAIN: /* NFS4ERR_LOCKED */ ++ lgp->status = -NFS4ERR_DELAY; /* for nfs4_handle_exception */ ++ /* FALLTHROUGH */ ++ case -NFS4ERR_GRACE: ++ case -NFS4ERR_DELAY: ++ goto out; ++ ++ case -NFS4ERR_ADMIN_REVOKED: ++ case -NFS4ERR_DELEG_REVOKED: ++ /* The layout is expected to be returned at this point. 
++ * This should clear the layout stateid as well */ ++ suspend = get_seconds() + 1; ++ break; ++ ++ case -NFS4ERR_LAYOUTUNAVAILABLE: ++ lgp->status = -ENOTSUPP; ++ break; ++ ++ case -NFS4ERR_REP_TOO_BIG: ++ case -NFS4ERR_REP_TOO_BIG_TO_CACHE: ++ lgp->status = -E2BIG; ++ break; ++ ++ /* Leave the following errors untranslated */ ++ case -NFS4ERR_DEADSESSION: ++ case -NFS4ERR_DQUOT: ++ case -EINVAL: /* NFS4ERR_INVAL */ ++ case -EIO: /* NFS4ERR_IO */ ++ case -NFS4ERR_FHEXPIRED: ++ case -NFS4ERR_MOVED: ++ case -NFS4ERR_NOSPC: ++ case -ESERVERFAULT: /* NFS4ERR_SERVERFAULT */ ++ case -ESTALE: /* NFS4ERR_STALE */ ++ case -ETOOSMALL: /* NFS4ERR_TOOSMALL */ ++ break; ++ ++ /* The following errors are our fault and should never happen */ ++ case -NFS4ERR_BADIOMODE: ++ case -NFS4ERR_BADXDR: ++ case -NFS4ERR_REQ_TOO_BIG: ++ case -NFS4ERR_UNKNOWN_LAYOUTTYPE: ++ case -NFS4ERR_WRONG_TYPE: ++ lgp->status = -EINVAL; ++ /* FALLTHROUGH */ ++ case -NFS4ERR_BAD_STATEID: ++ case -NFS4ERR_NOFILEHANDLE: ++ case -ENOTSUPP: /* NFS4ERR_NOTSUPP */ ++ case -NFS4ERR_OPENMODE: ++ case -NFS4ERR_OP_NOT_IN_SESSION: ++ case -NFS4ERR_TOO_MANY_OPS: ++ dprintk("%s: error %d: should never happen\n", __func__, ++ rpc_status); ++ break; ++ ++ /* The following errors are the server's fault */ ++ default: ++ dprintk("%s: illegal error %d\n", __func__, rpc_status); ++ lgp->status = -EIO; ++ break; ++ } ++ ++ /* remember that get layout failed and suspend trying */ ++ nfsi->pnfs_layout_suspend = suspend; ++ set_bit(lo_fail_bit(lgp->args.lseg.iomode), ++ &nfsi->layout->pnfs_layout_state); ++ dprintk("%s: layout_get suspended until %ld\n", ++ __func__, suspend); ++out: ++ dprintk("%s end (err:%d) state 0x%lx lseg %p\n", ++ __func__, lgp->status, nfsi->layout->pnfs_layout_state, lseg); ++ return; ++} ++ ++int ++pnfs_layout_process(struct nfs4_pnfs_layoutget *lgp) ++{ ++ struct pnfs_layout_type *lo = NFS_I(lgp->args.inode)->layout; ++ struct nfs4_pnfs_layoutget_res *res = &lgp->res; ++ struct pnfs_layout_segment 
*lseg; ++ struct inode *ino = PNFS_INODE(lo); ++ int status = 0; ++ ++ /* Inject layout blob into I/O device driver */ ++ lseg = PNFS_LD_IO_OPS(lo)->alloc_lseg(lo, res); ++ if (!lseg || IS_ERR(lseg)) { ++ if (!lseg) ++ status = -ENOMEM; ++ else ++ status = PTR_ERR(lseg); ++ dprintk("%s: Could not allocate layout: error %d\n", ++ __func__, status); ++ goto out; ++ } ++ ++ spin_lock(&ino->i_lock); ++ init_lseg(lo, lseg); ++ lseg->range = res->lseg; ++ if (lgp->lsegpp) { ++ get_lseg(lseg); ++ *lgp->lsegpp = lseg; ++ } ++ pnfs_insert_layout(lo, lseg); ++ ++ if (res->return_on_close) { ++ lo->roc_iomode |= res->lseg.iomode; ++ if (!lo->roc_iomode) ++ lo->roc_iomode = IOMODE_ANY; ++ } ++ ++ /* Done processing layoutget. Set the layout stateid */ ++ pnfs_set_layout_stateid(lo, &res->stateid); ++ spin_unlock(&ino->i_lock); ++out: ++ return status; ++} ++ ++void ++readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset, ++ size_t *count) ++{ ++ struct page *first, *last; ++ loff_t foff, i_size = i_size_read(inode); ++ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; ++ size_t range; ++ ++ ++ first = list_entry((pages)->prev, struct page, lru); ++ last = list_entry((pages)->next, struct page, lru); ++ ++ foff = (loff_t)first->index << PAGE_CACHE_SHIFT; ++ ++ range = (last->index - first->index) * PAGE_CACHE_SIZE; ++ if (last->index == end_index) ++ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; ++ else ++ range += PAGE_CACHE_SIZE; ++ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff, ++ range); ++ *offset = foff; ++ *count = range; ++} ++ ++void ++pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) ++{ ++ struct pnfs_layout_type *laytype; ++ struct pnfs_layoutdriver_type *ld; ++ ++ pgio->pg_test = NULL; ++ ++ laytype = NFS_I(inode)->layout; ++ ld = NFS_SERVER(inode)->pnfs_curr_ld; ++ if (!pnfs_enabled_sb(NFS_SERVER(inode)) || !laytype) ++ return; ++ ++ if (ld->ld_policy_ops) ++ pgio->pg_test = 
ld->ld_policy_ops->pg_test; ++} ++ ++static u32 ++pnfs_getboundary(struct inode *inode) ++{ ++ u32 stripe_size = 0; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct layoutdriver_policy_operations *policy_ops; ++ ++ if (!nfss->pnfs_curr_ld) ++ goto out; ++ ++ policy_ops = nfss->pnfs_curr_ld->ld_policy_ops; ++ if (!policy_ops || !policy_ops->get_stripesize) ++ goto out; ++ ++ /* The default is to not gather across stripes */ ++ if (pnfs_ld_gather_across_stripes(nfss->pnfs_curr_ld)) ++ goto out; ++ ++ spin_lock(&inode->i_lock); ++ if (NFS_I(inode)->layout) ++ stripe_size = policy_ops->get_stripesize(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++out: ++ return stripe_size; ++} ++ ++/* ++ * rsize is already set by caller to MDS rsize. ++ */ ++void ++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, ++ struct inode *inode, ++ struct nfs_open_context *ctx, ++ struct list_head *pages, ++ size_t *rsize) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ size_t count = 0; ++ loff_t loff; ++ ++ pgio->pg_iswrite = 0; ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ pgio->pg_lseg = NULL; ++ ++ if (!pnfs_enabled_sb(nfss)) ++ return; ++ ++ /* Calculate the total read-ahead count */ ++ readahead_range(inode, pages, &loff, &count); ++ ++ if (count > 0) { ++ _pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ, ++ &pgio->pg_lseg); ++ if (!pgio->pg_lseg) ++ return; ++ ++ *rsize = NFS_SERVER(inode)->ds_rsize; ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ if (pgio->pg_boundary) ++ pnfs_set_pg_test(inode, pgio); ++ } ++} ++ ++void ++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, ++ size_t *wsize) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ pgio->pg_iswrite = 1; ++ if (!pnfs_enabled_sb(server)) { ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ return; ++ } ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ pnfs_set_pg_test(inode, pgio); ++ *wsize = server->ds_wsize; ++} ++ ++/* Return I/O 
buffer size for a layout driver ++ * This value will determine what size reads and writes ++ * will be gathered into and sent to the data servers. ++ * blocksize must be a multiple of the page cache size. ++ */ ++unsigned int ++pnfs_getiosize(struct nfs_server *server) ++{ ++ if (!PNFS_EXISTS_LDPOLICY_OP(server, get_blocksize)) ++ return 0; ++ return server->pnfs_curr_ld->ld_policy_ops->get_blocksize(); ++} ++ ++void ++pnfs_set_ds_iosize(struct nfs_server *server) ++{ ++ unsigned dssize = pnfs_getiosize(server); ++ ++ /* Set buffer size for data servers */ ++ if (dssize > 0) { ++ server->ds_rsize = server->ds_wsize = ++ nfs_block_size(dssize, NULL); ++ } else { ++ server->ds_wsize = server->wsize; ++ server->ds_rsize = server->rsize; ++ } ++} ++ ++static int ++pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data) ++{ ++ put_lseg(pdata->lseg); ++ pdata->lseg = NULL; ++ pdata->call_ops->rpc_call_done(task, data); ++ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN) ++ return -EAGAIN; ++ if (pdata->pnfsflags & PNFS_NO_RPC) { ++ pdata->call_ops->rpc_release(data); ++ } else { ++ /* ++ * just restore original rpc call ops ++ * rpc_release will be called later by the rpc scheduling layer. ++ */ ++ task->tk_ops = pdata->call_ops; ++ } ++ return 0; ++} ++ ++/* Post-write completion function ++ * Invoked by all layout drivers when write_pagelist is done. ++ * ++ * NOTE: callers set data->pnfsflags PNFS_NO_RPC ++ * so that the NFS cleanup routines perform only the page cache ++ * cleanup. 
++ */ ++static void ++pnfs_write_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ struct nfs4_pnfs_layout_segment range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = wdata->args.offset; ++ range.length = wdata->args.count; ++ _pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true); ++ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode), ++ wdata->pdata.call_ops, wdata->pdata.how); ++} ++ ++static void ++pnfs_writeback_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ /* update last write offset and need layout commit ++ * for non-files layout types (files layout calls ++ * pnfs4_write_done for this) ++ */ ++ if ((pdata->pnfsflags & PNFS_NO_RPC) && ++ data->task.tk_status >= 0 && data->res.count > 0) { ++ struct nfs_inode *nfsi = NFS_I(data->inode); ++ ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++ } ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++ ++static void _pnfs_clear_lseg_from_pages(struct list_head *head) ++{ ++ struct nfs_page *req; ++ ++ list_for_each_entry(req, head, wb_list) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem write function. ++ * If no I/O device driver exists, or one does not match the returned ++ * fstype, then return a positive status for regular NFS processing. ++ * ++ * TODO: Is wdata->how and wdata->args.stable always the same value?
++ * TODO: It seems in NFS, the server may not do a stable write even ++ * though it was requested (and vice-versa?). To check, it looks ++ * in data->res.verf->committed. Do we need this ability ++ * for non-file layout drivers? ++ */ ++enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *wdata, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ struct inode *inode = wdata->inode; ++ enum pnfs_try_status trypnfs; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg; ++ ++ wdata->pdata.call_ops = call_ops; ++ wdata->pdata.pnfs_error = 0; ++ wdata->pdata.how = how; ++ ++ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, ++ inode->i_ino, wdata->args.count, wdata->args.offset, how); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ wdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ wdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->write_pagelist(wdata, ++ nfs_page_array_len(wdata->args.pgbase, wdata->args.count), ++ how); ++ ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ wdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&wdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* Post-read completion function. 
Invoked by all layout drivers when ++ * read_pagelist is done ++ */ ++static void ++pnfs_read_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ struct nfs4_pnfs_layout_segment range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = rdata->args.offset; ++ range.length = rdata->args.count; ++ _pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true); ++ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode), ++ rdata->pdata.call_ops); ++} ++ ++static void ++pnfs_read_done(struct nfs_read_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem read function. ++ * If no I/O device driver exists, or one does not match the returned ++ * fstype, then return a positive status for regular NFS processing.
++ */ ++enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *rdata, ++ const struct rpc_call_ops *call_ops) ++{ ++ struct inode *inode = rdata->inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg; ++ enum pnfs_try_status trypnfs; ++ ++ rdata->pdata.call_ops = call_ops; ++ rdata->pdata.pnfs_error = 0; ++ ++ dprintk("%s: Reading ino:%lu %u@%llu\n", ++ __func__, inode->i_ino, rdata->args.count, rdata->args.offset); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ rdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ rdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->read_pagelist(rdata, ++ nfs_page_array_len(rdata->args.pgbase, rdata->args.count)); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ rdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&rdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_READ); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* ++ * This gives the layout driver an opportunity to read in page "around" ++ * the data to be written. It returns 0 on success, otherwise an error code ++ * which will either be passed up to user, or ignored if ++ * some previous part of write succeeded. ++ * Note the range [pos, pos+len-1] is entirely within the page. 
++ */ ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata) ++{ ++ struct pnfs_fsdata *data; ++ int status = 0; ++ ++ dprintk("--> %s: pos=%llu len=%u\n", ++ __func__, (unsigned long long)pos, len); ++ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL); ++ if (!data) { ++ status = -ENOMEM; ++ goto out; ++ } ++ data->lseg = lseg; /* refcount passed into data to be managed there */ ++ status = NFS_SERVER(inode)->pnfs_curr_ld->ld_io_ops->write_begin( ++ lseg, page, pos, len, data); ++ if (status) { ++ kfree(data); ++ data = NULL; ++ } ++out: ++ *fsdata = data; ++ dprintk("<-- %s: status=%d\n", __func__, status); ++ return status; ++} ++ ++/* Return 0 on success, negative on failure */ ++/* CAREFUL - what happens if copied < len??? */ ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status; ++ ++ status = nfss->pnfs_curr_ld->ld_io_ops->write_end(inode, page, ++ pos, len, copied, lseg); ++ return status; ++} ++ ++/* pNFS Commit callback function for all layout drivers */ ++static void ++pnfs_commit_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ struct nfs4_pnfs_layout_segment range = { ++ .iomode = IOMODE_RW, ++ .offset = data->args.offset, ++ .length = data->args.count, ++ }; ++ dprintk("%s: retrying\n", __func__); ++ _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, ++ true); ++ pnfs_initiate_commit(data, NFS_CLIENT(data->inode), ++ pdata->call_ops, pdata->how, 1); ++ } ++} ++ ++enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int sync) ++{ ++ struct inode
*inode = data->inode; ++ struct nfs_server *nfss = NFS_SERVER(data->inode); ++ enum pnfs_try_status trypnfs; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ /* We need to account for possibility that ++ * each nfs_page can point to a different lseg (or be NULL). ++ * For the immediate case of whole-file-only layouts, we at ++ * least know there can be only a single lseg. ++ * We still have to account for the possibility of some being NULL. ++ * This will be done by passing the buck to the layout driver. ++ */ ++ data->pdata.call_ops = call_ops; ++ data->pdata.pnfs_error = 0; ++ data->pdata.how = sync; ++ data->pdata.lseg = NULL; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->commit(data, sync); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ data->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ _pnfs_clear_lseg_from_pages(&data->pages); ++ } else ++ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT); ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++void pnfs_cleanup_layoutcommit(struct pnfs_layoutcommit_data *data) ++{ ++ struct nfs_server *nfss = NFS_SERVER(data->args.inode); ++ ++ /* TODO: Maybe we should avoid this by allowing the layout driver ++ * to directly xdr its layout on the wire. ++ */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit) ++ nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit( ++ NFS_I(data->args.inode)->layout, ++ &data->args, data->status); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int ++pnfs_layoutcommit_setup(struct inode *inode, ++ struct pnfs_layoutcommit_data *data, ++ loff_t write_begin_pos, loff_t write_end_pos) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int result = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ data->args.inode = inode; ++ data->args.fh = NFS_FH(inode); ++ data->args.layout_type = nfss->pnfs_curr_ld->id; ++ data->res.fattr = &data->fattr; ++ nfs_fattr_init(&data->fattr); ++ ++ /* TODO: Need to determine the correct values */ ++ data->args.time_modify_changed = 0; ++ ++ /* Set values from inode so it can be reset ++ */ ++ data->args.lseg.iomode = IOMODE_RW; ++ data->args.lseg.offset = write_begin_pos; ++ data->args.lseg.length = write_end_pos - write_begin_pos + 1; ++ data->args.lastbytewritten = min(write_end_pos, ++ i_size_read(inode) - 1); ++ data->args.bitmask = nfss->attr_bitmask; ++ data->res.server = nfss; ++ ++ /* Call layout driver to set the arguments */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit) ++ result = nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit( ++ NFS_I(inode)->layout, &data->args); ++ ++ dprintk("<-- %s Status %d\n", __func__, result); ++ return result; ++} ++ ++/* Issue an async layoutcommit for an inode.
++ */ ++int ++pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ struct pnfs_layoutcommit_data *data; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ ++ int status = 0; ++ ++ dprintk("%s Begin (sync:%d)\n", __func__, sync); ++ ++ BUG_ON(!has_layout(nfsi)); ++ ++ data = pnfs_layoutcommit_alloc(); ++ if (!data) ++ return -ENOMEM; ++ ++ spin_lock(&inode->i_lock); ++ if (!layoutcommit_needed(nfsi)) { ++ spin_unlock(&inode->i_lock); ++ goto out_free; ++ } ++ ++ /* Clear layoutcommit properties in the inode so ++ * new lc info can be generated ++ */ ++ write_begin_pos = nfsi->layout->pnfs_write_begin_pos; ++ write_end_pos = nfsi->layout->pnfs_write_end_pos; ++ data->cred = nfsi->layout->lo_cred; ++ nfsi->layout->pnfs_write_begin_pos = 0; ++ nfsi->layout->pnfs_write_end_pos = 0; ++ nfsi->layout->lo_cred = NULL; ++ __clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->pnfs_layout_state); ++ pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout); ++ ++ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */ ++ get_layout(NFS_I(inode)->layout); ++ ++ spin_unlock(&inode->i_lock); ++ ++ /* Set up layout commit args */ ++ status = pnfs_layoutcommit_setup(inode, data, write_begin_pos, ++ write_end_pos); ++ if (status) { ++ /* The layout driver failed to setup the layoutcommit */ ++ put_rpccred(data->cred); ++ put_layout(inode); ++ goto out_free; ++ } ++ status = pnfs4_proc_layoutcommit(data, sync); ++out: ++ dprintk("%s end (err:%d)\n", __func__, status); ++ return status; ++out_free: ++ pnfs_layoutcommit_free(data); ++ goto out; ++} ++ ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata) ++{ ++ if (fsdata) { ++ /* lseg refcounting handled directly in nfs_Write_end */ ++ kfree(fsdata); ++ } ++} ++ ++/* Callback operations for layout drivers. 
++ */ ++struct pnfs_client_operations pnfs_ops = { ++ .nfs_getdevicelist = nfs4_pnfs_getdevicelist, ++ .nfs_getdeviceinfo = nfs4_pnfs_getdeviceinfo, ++ .nfs_readlist_complete = pnfs_read_done, ++ .nfs_writelist_complete = pnfs_writeback_done, ++ .nfs_commit_complete = pnfs_commit_done, ++}; ++ ++EXPORT_SYMBOL(pnfs_unregister_layoutdriver); ++EXPORT_SYMBOL(pnfs_register_layoutdriver); ++ ++ ++/* Device ID cache. Supports one layout type per struct nfs_client */ ++int ++nfs4_alloc_init_deviceid_cache(struct nfs_client *clp, ++ void (*free_callback)(struct kref *)) ++{ ++ struct nfs4_deviceid_cache *c; ++ ++ c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL); ++ if (!c) ++ return -ENOMEM; ++ spin_lock(&clp->cl_lock); ++ if (clp->cl_devid_cache != NULL) { ++ kref_get(&clp->cl_devid_cache->dc_kref); ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [kref [%d]]\n", __func__, ++ atomic_read(&clp->cl_devid_cache->dc_kref.refcount)); ++ kfree(c); ++ } else { ++ int i; ++ ++ spin_lock_init(&c->dc_lock); ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++) ++ INIT_HLIST_HEAD(&c->dc_deviceids[i]); ++ kref_init(&c->dc_kref); ++ c->dc_free_callback = free_callback; ++ clp->cl_devid_cache = c; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [new]\n", __func__); ++ } ++ return 0; ++} ++EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache); ++ ++void ++nfs4_init_deviceid_node(struct nfs4_deviceid *d) ++{ ++ INIT_HLIST_NODE(&d->de_node); ++ kref_init(&d->de_kref); ++} ++EXPORT_SYMBOL(nfs4_init_deviceid_node); ++ ++/* Called from layoutdriver_io_operations->alloc_lseg */ ++void ++nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = d; ++ kref_get(&d->de_kref); ++} ++EXPORT_SYMBOL(nfs4_set_layout_deviceid); ++ ++/* Called from layoutdriver_io_operations->free_lseg */ ++void ++nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l, ++ struct nfs4_deviceid *d, ++ void 
(*free_callback)(struct kref *)) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = NULL; ++ kref_put(&d->de_kref, free_callback); ++} ++EXPORT_SYMBOL(nfs4_unset_layout_deviceid); ++ ++struct nfs4_deviceid * ++nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ rcu_read_unlock(); ++ return d; ++ } ++ } ++ rcu_read_unlock(); ++ return NULL; ++} ++EXPORT_SYMBOL(nfs4_find_deviceid); ++ ++/* ++ * Add or kref_get a deviceid. ++ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new ++ */ ++struct nfs4_deviceid * ++nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(&new->de_id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [discard]\n", __func__); ++ c->dc_free_callback(&new->de_kref); ++ return d; ++ } ++ } ++ hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [new]\n", __func__); ++ return new; ++} ++EXPORT_SYMBOL(nfs4_add_deviceid); ++ ++static int ++nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash, ++ struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (id && memcmp(id, &d->de_id, NFS4_PNFS_DEVICEID4_SIZE)) ++ 
continue; ++ hlist_del_rcu(&d->de_node); ++ spin_unlock(&c->dc_lock); ++ synchronize_rcu(); ++ dprintk("%s [%d]\n", __func__, ++ atomic_read(&d->de_kref.refcount)); ++ kref_put(&d->de_kref, c->dc_free_callback); ++ return 1; ++ } ++ spin_unlock(&c->dc_lock); ++ return 0; ++} ++ ++void ++nfs4_delete_device(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ long hash = nfs4_deviceid_hash(id); ++ ++ nfs4_remove_deviceid(c, hash, id); ++} ++EXPORT_SYMBOL(nfs4_delete_device); ++ ++static void ++nfs4_free_deviceid_cache(struct kref *kref) ++{ ++ struct nfs4_deviceid_cache *cache = ++ container_of(kref, struct nfs4_deviceid_cache, dc_kref); ++ long i; ++ ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) ++ while (nfs4_remove_deviceid(cache, i, NULL)) ++ ; ++ kfree(cache); ++} ++ ++void ++nfs4_put_deviceid_cache(struct nfs_client *clp) ++{ ++ struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache; ++ int refcount; ++ ++ dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); ++ spin_lock(&clp->cl_lock); ++ refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount); ++ if (refcount == 1) ++ clp->cl_devid_cache = NULL; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [%d]\n", __func__, refcount); ++ kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache); ++} ++EXPORT_SYMBOL(nfs4_put_deviceid_cache); +diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h +--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-23 12:09:03.358501440 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-23 12:09:03.358501440 -0400 +@@ -0,0 +1,355 @@ ++/* ++ * fs/nfs/pnfs.h ++ * ++ * pNFS client data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_PNFS_H ++#define FS_NFS_PNFS_H ++ ++#include ++ ++#ifdef CONFIG_NFS_V4_1 ++ ++#include ++#include ++#include ++#include "iostat.h" ++ ++/* nfs4proc.c */ ++extern int nfs4_pnfs_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++extern int nfs4_pnfs_getdeviceinfo(struct nfs_server *server, ++ struct pnfs_device *dev); ++extern int pnfs4_proc_layoutget(struct nfs4_pnfs_layoutget *lgp); ++extern int pnfs4_proc_layoutcommit(struct pnfs_layoutcommit_data *data, ++ int issync); ++extern int pnfs4_proc_layoutreturn(struct nfs4_pnfs_layoutreturn *lrp, bool wait); ++ ++/* pnfs.c */ ++extern const nfs4_stateid zero_stateid; ++ ++void _pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp); ++ ++int _pnfs_return_layout(struct inode *, struct nfs4_pnfs_layout_segment *, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type, bool wait); ++void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); ++void unmount_pnfs_layoutdriver(struct nfs_server *); ++enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, ++ const struct rpc_call_ops *); ++int pnfs_initialize(void); ++void pnfs_uninitialize(void); ++void pnfs_layoutcommit_free(struct pnfs_layoutcommit_data *data); ++void pnfs_cleanup_layoutcommit(struct pnfs_layoutcommit_data *data); ++int pnfs_layoutcommit_inode(struct inode *inode, int sync); ++void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent); ++void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx); ++unsigned int pnfs_getiosize(struct nfs_server *server); ++void pnfs_set_ds_iosize(struct nfs_server *server); ++enum pnfs_try_status 
pnfs_try_to_commit(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, ++ struct nfs_open_context *, struct list_head *, ++ size_t *); ++void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, ++ size_t *); ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); ++void pnfs_get_layout_done(struct nfs4_pnfs_layoutget *, int rpc_status); ++int pnfs_layout_process(struct nfs4_pnfs_layoutget *lgp); ++void pnfs_layout_release(struct pnfs_layout_type *, struct nfs4_pnfs_layout_segment *range); ++void pnfs_set_layout_stateid(struct pnfs_layout_type *lo, ++ const nfs4_stateid *stateid); ++void pnfs_destroy_layout(struct nfs_inode *); ++void pnfs_destroy_all_layouts(struct nfs_client *); ++void put_layout(struct inode *inode); ++void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_type *lo); ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata); ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ ++#define PNFS_EXISTS_LDIO_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_io_ops && \ ++ (srv)->pnfs_curr_ld->ld_io_ops->opname) ++#define PNFS_EXISTS_LDPOLICY_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops->opname) ++ ++#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" ++ ++static inline int lo_fail_bit(u32 iomode) ++{ ++ return iomode == IOMODE_RW ? 
++ NFS_INO_RW_LAYOUT_FAILED : NFS_INO_RO_LAYOUT_FAILED; ++} ++ ++/* Return true if a layout driver is being used for this mountpoint */ ++static inline int pnfs_enabled_sb(struct nfs_server *nfss) ++{ ++ return nfss->pnfs_curr_ld != NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || ++ !fsdata->bypass_eof; ++} ++ ++/* Should the pNFS client commit and return the layout upon a setattr */ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ if (!pnfs_enabled_sb(NFS_SERVER(inode))) ++ return false; ++ return NFS_SERVER(inode)->pnfs_curr_ld->ld_policy_ops->flags & ++ PNFS_LAYOUTRET_ON_SETATTR; ++} ++ ++/* Should the pNFS client commit and return the layout on close ++ */ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout->roc_iomode; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status = 0; ++ ++ *fsdata = lseg; ++ if (lseg && PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ status = _pnfs_write_begin(inode, page, pos, len, lseg, ++ (struct pnfs_fsdata **) fsdata); ++ return status; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_end)) ++ return _pnfs_write_end(inode, page, pos, len, copied, lseg); ++ else ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ 
if (PNFS_EXISTS_LDIO_OP(nfss, write_end_cleanup)) ++ nfss->pnfs_curr_ld->ld_io_ops->write_end_cleanup(filp, fsdata); ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ pnfs_free_fsdata(fsdata); ++ } ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct nfs4_pnfs_layout_segment *lseg, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss) && ++ (type != RETURN_FILE || has_layout(nfsi))) ++ return _pnfs_return_layout(ino, lseg, stateid, type, wait); ++ ++ return 0; ++} ++ ++static inline void pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss)) ++ _pnfs_update_layout(ino, ctx, pos, count, access_type, lsegpp); ++ else { ++ if (lsegpp) ++ *lsegpp = NULL; ++ } ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ if (pnfs_enabled_sb(nfss)) ++ return pnfs_ld_use_rpc_code(nfss->pnfs_curr_ld); ++ ++ return 1; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ return ((struct pnfs_fsdata *) fsdata)->lseg; ++ } ++ return fsdata; ++} ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++} ++ ++static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++} ++ ++static inline void get_lseg(struct 
pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void put_lseg(struct pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void ++pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ if (lsegpp) ++ *lsegpp = NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return 1; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ *fsdata = NULL; ++ return 0; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ return 1; ++} ++ ++static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ return 0; ++} ++ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ return false; ++} ++ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode 
*nfsi) ++{ ++ return 0; ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct nfs4_pnfs_layout_segment *lseg, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ return 0; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ return NULL; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++#endif /* FS_NFS_PNFS_H */ +diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-23 12:08:29.057511533 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-23 12:09:03.359501471 -0400 +@@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) +- goto out; ++ goto out_free; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); +@@ -455,7 +455,7 @@ nfs_proc_symlink(struct inode *dir, stru + */ + if (status == 0) + status = nfs_instantiate(dentry, fh, fattr); +- ++out_free: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); + out: +@@ -694,6 +694,7 @@ const struct nfs_rpc_ops nfs_v2_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, + .file_inode_ops = &nfs_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-23 12:08:29.057511533 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-23 12:09:03.359501471 -0400 +@@ -18,8 +18,12 @@ + #include + #include + #include ++#include ++#include + + #include ++#include ++#include "pnfs.h" + + #include "nfs4_fs.h" + #include "internal.h" +@@ -117,11 +121,14 @@ int nfs_readpage_async(struct nfs_open_c + LIST_HEAD(one_request); + struct nfs_page 
*new; + unsigned int len; ++ struct pnfs_layout_segment *lseg; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); +- new = nfs_create_request(ctx, inode, page, 0, len); ++ pnfs_update_layout(inode, ctx, 0, NFS4_MAX_UINT64, IOMODE_READ, &lseg); ++ new = nfs_create_request(ctx, inode, page, 0, len, lseg); ++ put_lseg(lseg); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); +@@ -155,24 +162,20 @@ static void nfs_readpage_release(struct + nfs_release_request(req); + } + +-/* +- * Set up the NFS read request struct +- */ +-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset) ++int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, +@@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_ + .flags = RPC_TASK_ASYNC | swap_flags, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->read_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_read); ++ ++int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) ++ return pnfs_get_read_status(data); ++ ++ return nfs_initiate_read(data, clnt, call_ops); ++} ++ ++/* ++ * Set up the NFS read request struct ++ */ ++static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + data->req = req; + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -190,27 +230,14 @@ static int nfs_read_rpcsetup(struct nfs_ + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops); + } + + static void +@@ -354,7 +381,14 @@ static void nfs_readpage_retry(struct rp + { + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; ++ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client; + ++#ifdef CONFIG_NFS_V4_1 ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + if (resp->eof || resp->count == argp->count) + return; + +@@ -368,7 +402,10 @@ static void nfs_readpage_retry(struct rp + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; +- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + } + + /* +@@ -409,13 +446,19 @@ static void nfs_readpage_release_partial + void nfs_read_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_read_data *data = calldata; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, + &data->args.seq_args, &data->res.seq_res, + 0, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_read_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_read_partial_ops = { +@@ -568,7 +611,8 
@@ readpage_async_filler(void *data, struct + if (len == 0) + return nfs_return_empty_page(page); + +- new = nfs_create_request(desc->ctx, inode, page, 0, len); ++ new = nfs_create_request(desc->ctx, inode, page, 0, len, ++ desc->pgio->pg_lseg); + if (IS_ERR(new)) + goto out_error; + +@@ -624,6 +668,9 @@ int nfs_readpages(struct file *filp, str + if (ret == 0) + goto read_complete; /* all pages were read */ + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize); ++#endif /* CONFIG_NFS_V4_1 */ + if (rsize < PAGE_CACHE_SIZE) + nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); + else +@@ -632,6 +679,7 @@ int nfs_readpages(struct file *filp, str + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + + nfs_pageio_complete(&pgio); ++ put_lseg(pgio.pg_lseg); + npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); + read_complete: +diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-23 12:08:29.059491391 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-23 12:09:03.361501458 -0400 +@@ -64,6 +64,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -669,6 +670,28 @@ static int nfs_show_options(struct seq_f + + return 0; + } ++#ifdef CONFIG_NFS_V4_1 ++void show_sessions(struct seq_file *m, struct nfs_server *server) ++{ ++ if (nfs4_has_session(server->nfs_client)) ++ seq_printf(m, ",sessions"); ++} ++#else ++void show_sessions(struct seq_file *m, struct nfs_server *server) {} ++#endif ++ ++#ifdef CONFIG_NFS_V4_1 ++void show_pnfs(struct seq_file *m, struct nfs_server *server) ++{ ++ seq_printf(m, ",pnfs="); ++ if (server->pnfs_curr_ld) ++ seq_printf(m, "%s", server->pnfs_curr_ld->name); ++ else ++ seq_printf(m, "not configured"); ++} ++#else /* CONFIG_NFS_V4_1 */ ++void 
show_pnfs(struct seq_file *m, struct nfs_server *server) {} ++#endif /* CONFIG_NFS_V4_1 */ + + /* + * Present statistical information for this VFS mountpoint +@@ -707,6 +730,8 @@ static int nfs_show_stats(struct seq_fil + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); ++ show_sessions(m, nfss); ++ show_pnfs(m, nfss); + } + #endif + +diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-23 12:08:29.060501485 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-23 12:09:03.362419975 -0400 +@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task + struct nfs_unlinkdata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->dir); + +- if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-23 12:08:27.630563929 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-23 12:09:03.364491337 -0400 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + +@@ -28,6 +29,7 @@ + #include "iostat.h" + #include "nfs4_fs.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PAGECACHE + +@@ -59,6 +61,7 @@ struct nfs_write_data *nfs_commitdata_al + } + return p; + } ++EXPORT_SYMBOL(nfs_commitdata_alloc); + + void nfs_commit_free(struct nfs_write_data *p) + { +@@ -66,6 +69,7 @@ void nfs_commit_free(struct nfs_write_da + kfree(p->pagevec); + mempool_free(p, nfs_commit_mempool); + } ++EXPORT_SYMBOL(nfs_commit_free); + + struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) + { +@@ -418,6 +422,17 @@ static void nfs_inode_remove_request(str + 
nfs_clear_request(req); + nfs_release_request(req); + } ++static void ++nfs_mark_request_nopnfs(struct nfs_page *req) ++{ ++ struct pnfs_layout_segment *lseg = req->wb_lseg; ++ ++ if (req->wb_lseg == NULL) ++ return; ++ req->wb_lseg = NULL; ++ put_lseg(lseg); ++ dprintk(" retry through MDS\n"); ++} + + static void + nfs_mark_request_dirty(struct nfs_page *req) +@@ -523,7 +538,7 @@ nfs_need_commit(struct nfs_inode *nfsi) + * The requests are *not* checked to ensure that they form a contiguous set. + */ + static int +-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) ++nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs) + { + struct nfs_inode *nfsi = NFS_I(inode); + int ret; +@@ -531,7 +546,8 @@ nfs_scan_commit(struct inode *inode, str + if (!nfs_need_commit(nfsi)) + return 0; + +- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); ++ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT, ++ use_pnfs); + if (ret > 0) + nfsi->ncommit -= ret; + if (nfs_need_commit(NFS_I(inode))) +@@ -560,7 +576,8 @@ static inline int nfs_scan_commit(struct + static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int bytes) ++ unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + unsigned int rqend; +@@ -585,8 +602,8 @@ static struct nfs_page *nfs_try_to_updat + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ +- if (offset > rqend +- || end < req->wb_offset) ++ if (offset > rqend || end < req->wb_offset || ++ req->wb_lseg != lseg) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) +@@ -634,16 +651,17 @@ out_err: + * already called nfs_flush_incompatible() if necessary. 
+ */ + static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, +- struct page *page, unsigned int offset, unsigned int bytes) ++ struct page *page, unsigned int offset, unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + +- req = nfs_try_to_update_request(inode, page, offset, bytes); ++ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg); + if (req != NULL) + goto out; +- req = nfs_create_request(ctx, inode, page, offset, bytes); ++ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); +@@ -656,23 +674,27 @@ out: + } + + static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, ++ void *fsdata) + { + struct nfs_page *req; + +- req = nfs_setup_write_request(ctx, page, offset, count); ++ req = nfs_setup_write_request(ctx, page, offset, count, lseg); + if (IS_ERR(req)) + return PTR_ERR(req); + nfs_mark_request_dirty(req); + /* Update file length */ +- nfs_grow_file(page, offset, count); ++ if (pnfs_grow_ok(lseg, fsdata)) ++ nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + return 0; + } + +-int nfs_flush_incompatible(struct file *file, struct page *page) ++int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_page *req; +@@ -689,7 +711,10 @@ int nfs_flush_incompatible(struct file * + req = nfs_page_find_request(page); + if (req == NULL) + return 0; +- do_flush = req->wb_page != page || req->wb_context != ctx; ++ do_flush = req->wb_page != page || req->wb_context != ctx || ++ 
req->wb_lock_context->lockowner != current->files || ++ req->wb_lock_context->pid != current->tgid || ++ req->wb_lseg != lseg; + nfs_release_request(req); + if (!do_flush) + return 0; +@@ -716,7 +741,8 @@ static int nfs_write_pageuptodate(struct + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ + int nfs_updatepage(struct file *file, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page->mapping->host; +@@ -741,7 +767,7 @@ int nfs_updatepage(struct file *file, st + offset = 0; + } + +- status = nfs_writepage_setup(ctx, page, offset, count); ++ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); + if (status < 0) + nfs_set_pageerror(page); + +@@ -771,25 +797,21 @@ static int flush_task_priority(int how) + return RPC_PRIORITY_NORMAL; + } + +-/* +- * Set up the argument/result storage required for the RPC call. 
+- */ +-static int nfs_write_rpcsetup(struct nfs_page *req, +- struct nfs_write_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset, +- int how) ++int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, +@@ -800,12 +822,62 @@ static int nfs_write_rpcsetup(struct nfs + }; + int ret = 0; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->write_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ if (how & FLUSH_SYNC) { ++ ret = rpc_wait_for_completion_task(task); ++ if (ret == 0) ++ ret = task->tk_status; ++ } ++ rpc_put_task(task); ++out: ++ return ret; ++} ++EXPORT_SYMBOL(nfs_initiate_write); ++ ++int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_write(data, clnt, call_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int nfs_write_rpcsetup(struct nfs_page *req, ++ struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset, ++ int how) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + data->req = req; + data->inode = inode = req->wb_context->path.dentry->d_inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -813,6 +885,7 @@ static int nfs_write_rpcsetup(struct nfs + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + if (how & FLUSH_STABLE) { + data->args.stable = NFS_DATA_SYNC; +@@ -825,30 +898,7 @@ static int nfs_write_rpcsetup(struct nfs + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- dprintk("NFS: %5u initiated write call " +- "(req %s/%lld, %u bytes @ offset %llu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) { +- ret = PTR_ERR(task); +- goto out; +- } +- if (how & FLUSH_SYNC) { +- ret = rpc_wait_for_completion_task(task); +- if (ret == 0) +- ret = task->tk_status; +- } +- rpc_put_task(task); +-out: +- return ret; ++ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); + } + + /* If a nfs_flush_* function fails, it should remove reqs from @head and +@@ -859,6 +909,7 @@ static void nfs_redirty_request(struct n + { + struct page *page = req->wb_page; + ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +@@ -971,6 +1022,10 @@ static void nfs_pageio_init_write(struct + { + size_t wsize = NFS_SERVER(inode)->wsize; + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_write(pgio, inode, &wsize); ++#endif /* CONFIG_NFS_V4_1 */ ++ + if (wsize < PAGE_CACHE_SIZE) + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); + else +@@ -1036,13 +1091,27 @@ out: + void nfs_write_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_write_data *data = calldata; +- struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(clp, &data->args.seq_args, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) { ++ /* retrying via MDS? 
*/ ++ data->pdata.orig_count = data->args.count; ++ data->args.count = NFS_SERVER(data->inode)->wsize; ++ dprintk("%s: trimmed count %u to wsize %u\n", __func__, ++ data->pdata.orig_count, data->args.count); ++ } else ++ data->pdata.orig_count = 0; ++ ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, ++ &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_write_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_write_partial_ops = { +@@ -1126,10 +1195,11 @@ int nfs_writeback_done(struct rpc_task * + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *clp = server->nfs_client; + int status; + +- dprintk("NFS: %5u nfs_writeback_done (status %d)\n", +- task->tk_pid, task->tk_status); ++ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n", ++ task->tk_pid, task->tk_status, resp->count); + + /* + * ->write_done will attempt to use post-op attributes to detect +@@ -1142,6 +1212,13 @@ int nfs_writeback_done(struct rpc_task * + if (status != 0) + return status; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); ++#ifdef CONFIG_NFS_V4_1 ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && task->tk_status >= 0) { +@@ -1158,7 +1235,7 @@ int nfs_writeback_done(struct rpc_task * + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", +- server->nfs_client->cl_hostname, ++ clp->cl_hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } +@@ -1168,6 +1245,9 @@ int nfs_writeback_done(struct rpc_task * + if (task->tk_status >= 
0 && resp->count < argp->count) { + static unsigned long complain; + ++ dprintk("NFS: short write:" ++ " (resp->count %u) < (argp->count = %u)\n", ++ resp->count, argp->count); + nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + + /* Has the server at least made some progress? */ +@@ -1184,7 +1264,10 @@ int nfs_writeback_done(struct rpc_task * + */ + argp->stable = NFS_FILE_SYNC; + } +- nfs_restart_rpc(task, server->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + return -EAGAIN; + } + if (time_before(complain, jiffies)) { +@@ -1228,40 +1311,73 @@ static void nfs_commitdata_release(void + nfs_commit_free(wdata); + } + +-/* +- * Set up the argument/result storage required for the RPC call. +- */ +-static int nfs_commit_rpcsetup(struct list_head *head, +- struct nfs_write_data *data, +- int how) ++int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct nfs_page *first = nfs_list_entry(head->next); +- struct inode *inode = first->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = first->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, +- .callback_ops = &nfs_commit_ops, ++ .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->commit_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_commit); ++ ++ ++int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs) ++{ ++ if (pnfs && ++ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. ++ */ ++static int nfs_commit_rpcsetup(struct list_head *head, ++ struct nfs_write_data *data, ++ int how, int pnfs) ++{ ++ struct nfs_page *first = nfs_list_entry(head->next); ++ struct inode *inode = first->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + list_splice_init(head, &data->pages); + + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = first->wb_context->cred; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ +@@ -1272,45 +1388,47 @@ static int nfs_commit_rpcsetup(struct li + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); ++ kref_init(&data->refcount); ++ data->parent = NULL; ++ data->args.context = first->wb_context; /* used by commit done */ + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->commit_setup(data, &msg); ++ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops, ++ how, pnfs); ++} + +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++/* Handle memory error during commit */ ++void nfs_mark_list_commit(struct list_head *head) ++{ ++ struct nfs_page *req; + +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ while (!list_empty(head)) { ++ req = nfs_list_entry(head->next); ++ nfs_list_remove_request(req); ++ nfs_mark_request_commit(req); ++ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); ++ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, ++ BDI_RECLAIMABLE); ++ nfs_clear_page_tag_locked(req); ++ } + } ++EXPORT_SYMBOL(nfs_mark_list_commit); + + /* + * Commit dirty pages + */ + static int +-nfs_commit_list(struct inode *inode, struct list_head *head, int how) ++nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs) + { + struct nfs_write_data *data; +- struct nfs_page *req; + + data = nfs_commitdata_alloc(); +- + if (!data) + goto out_bad; + + /* Set up the argument struct */ +- return nfs_commit_rpcsetup(head, data, how); ++ return nfs_commit_rpcsetup(head, data, how, pnfs); + out_bad: +- while (!list_empty(head)) { +- req = nfs_list_entry(head->next); +- nfs_list_remove_request(req); +- nfs_mark_request_commit(req); +- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); +- dec_bdi_stat(req->wb_page->mapping->backing_dev_info, +- BDI_RECLAIMABLE); +- nfs_clear_page_tag_locked(req); +- } ++ nfs_mark_list_commit(head); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; + } +@@ -1330,6 +1448,19 @@ static void nfs_commit_done(struct rpc_t + return; + } + ++static inline void nfs_commit_cleanup(struct kref *kref) ++{ ++ struct nfs_write_data *data; ++ ++ data = container_of(kref, struct nfs_write_data, refcount); ++ /* Clear lock only when all cloned commits are finished */ ++ if 
(data->parent) ++ kref_put(&data->parent->refcount, nfs_commit_cleanup); ++ else ++ nfs_commit_clear_lock(NFS_I(data->inode)); ++ nfs_commitdata_release(data); ++} ++ + static void nfs_commit_release(void *calldata) + { + struct nfs_write_data *data = calldata; +@@ -1347,6 +1478,11 @@ static void nfs_commit_release(void *cal + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { ++ if (req->wb_lseg) { ++ nfs_mark_request_nopnfs(req); ++ nfs_mark_request_dirty(req); ++ goto next; ++ } + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); +@@ -1363,12 +1499,12 @@ static void nfs_commit_release(void *cal + } + /* We have a mismatch. Write the page again */ + dprintk(" mismatch\n"); ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + next: + nfs_clear_page_tag_locked(req); + } +- nfs_commit_clear_lock(NFS_I(data->inode)); +- nfs_commitdata_release(calldata); ++ kref_put(&data->refcount, nfs_commit_cleanup); + } + + static const struct rpc_call_ops nfs_commit_ops = { +@@ -1384,21 +1520,22 @@ int nfs_commit_inode(struct inode *inode + LIST_HEAD(head); + int may_wait = how & FLUSH_SYNC; + int res = 0; ++ int use_pnfs = 0; + + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out_mark_dirty; + spin_lock(&inode->i_lock); +- res = nfs_scan_commit(inode, &head, 0, 0); ++ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs); + spin_unlock(&inode->i_lock); + if (res) { +- int error = nfs_commit_list(inode, &head, how); ++ int error = nfs_commit_list(inode, &head, how, use_pnfs); + if (error < 0) + return error; +- if (may_wait) ++ if (may_wait) { + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); +- else ++ } else + goto out_mark_dirty; + } else + nfs_commit_clear_lock(NFS_I(inode)); +@@ -1451,7 +1588,18 @@ static int nfs_commit_unstable_pages(str + + int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) + { +- 
return nfs_commit_unstable_pages(inode, wbc); ++ int ret; ++ ret = nfs_commit_unstable_pages(inode, wbc); ++ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) { ++ int err, sync = wbc->sync_mode; ++ ++ if (wbc->nonblocking || wbc->for_background) ++ sync = 0; ++ err = pnfs_layoutcommit_inode(inode, sync); ++ if (err < 0) ++ ret = err; ++ } ++ return ret; + } + + /* +@@ -1459,6 +1607,7 @@ int nfs_write_inode(struct inode *inode, + */ + int nfs_wb_all(struct inode *inode) + { ++ int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, +@@ -1466,7 +1615,8 @@ int nfs_wb_all(struct inode *inode) + .range_end = LLONG_MAX, + }; + +- return sync_inode(inode, &wbc); ++ ret = sync_inode(inode, &wbc); ++ return ret; + } + + int nfs_wb_page_cancel(struct inode *inode, struct page *page) +diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/include/linux/exportfs.h +--- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-23 12:09:03.365501459 -0400 +@@ -2,6 +2,7 @@ + #define LINUX_EXPORTFS_H 1 + + #include ++#include + + struct dentry; + struct inode; +@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_pare + struct fid *fid, int fh_len, int fh_type, + struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); + ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct pnfs_filelayout_device; ++struct pnfs_filelayout_layout; ++ ++extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev); ++extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct list_head; ++ ++extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes); ++ ++extern enum nfsstat4 
blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *layouts); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_PNFSD) ++#include ++ ++struct pnfsd_cb_operations; ++ ++struct pnfsd_cb_ctl { ++ spinlock_t lock; ++ struct module *module; ++ const struct pnfsd_cb_operations *cb_op; ++}; ++ ++/* in expfs.c so that file systems can depend on it */ ++extern struct pnfsd_cb_ctl pnfsd_cb_ctl; ++ ++static inline int ++pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ int ret = -ENOENT; ++ ++ spin_lock(&pnfsd_cb_ctl.lock); ++ if (!pnfsd_cb_ctl.cb_op) ++ goto out; ++ if (!try_module_get(pnfsd_cb_ctl.module)) ++ goto out; ++ ctl->cb_op = pnfsd_cb_ctl.cb_op; ++ ctl->module = pnfsd_cb_ctl.module; ++ ret = 0; ++out: ++ spin_unlock(&pnfsd_cb_ctl.lock); ++ return ret; ++} ++ ++static inline void ++pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ module_put(ctl->module); ++} ++#endif /* CONFIG_PNFSD */ + #endif /* LINUX_EXPORTFS_H */ +diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-23 12:09:03.367491365 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-23 12:09:03.367491365 -0400 +@@ -0,0 +1,141 @@ ++#ifndef _LINUX_EXP_XDR_H ++#define _LINUX_EXP_XDR_H ++ ++#include ++#include ++#include ++ ++struct exp_xdr_stream { ++ __be32 *p; ++ __be32 *end; ++}; ++ ++/** ++ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline size_t ++exp_xdr_qwords(__u32 nbytes) ++{ ++ return DIV_ROUND_UP(nbytes, 4); ++} ++ ++/** ++ * exp_xdr_qbytes - Calculate the number of bytes holding qwords ++ * @qwords: number of quad-words to encode ++ */ ++static inline size_t ++exp_xdr_qbytes(size_t qwords) ++{ ++ return qwords << 2; ++} ++ ++/** ++ * exp_xdr_reserve_space - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nbytes: 
number of bytes to reserve ++ * ++ * Checks that we have enough buffer space to encode 'nbytes' more ++ * bytes of data. If so, update the xdr stream. ++ */ ++static inline __be32 * ++exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes) ++{ ++ __be32 *p = xdr->p; ++ __be32 *q; ++ ++ /* align nbytes on the next 32-bit boundary */ ++ q = p + exp_xdr_qwords(nbytes); ++ if (unlikely(q > xdr->end || q < p)) ++ return NULL; ++ xdr->p = q; ++ return p; ++} ++ ++/** ++ * exp_xdr_reserve_qwords - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @qwords: number of quad words (u32's) to reserve ++ */ ++static inline __be32 * ++exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords) ++{ ++ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords)); ++} ++ ++/** ++ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u32(__be32 *p, __u32 val) ++{ ++ *p = cpu_to_be32(val); ++ return p + 1; ++} ++ ++/** ++ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u64(__be32 *p, __u64 val) ++{ ++ put_unaligned_be64(val, p); ++ return p + 2; ++} ++ ++/** ++ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the array of bytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ if (likely(nbytes != 0)) { ++ unsigned int qwords = exp_xdr_qwords(nbytes); ++ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes; ++ ++ memcpy(p, ptr, nbytes); ++ if (padding != 0) ++ memset((char *)p + nbytes, 0, padding); ++ p += qwords; ++ } ++ return p; ++} ++ ++/** ++ * exp_xdr_encode_opaque - Encode an opaque type onto a
xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the opaque array ++ * @nbytes: number of bytes to encode ++ * ++ * Encodes the 32-bit opaque size in bytes followed by the opaque value. ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ p = exp_xdr_encode_u32(p, nbytes); ++ return exp_xdr_encode_bytes(p, ptr, nbytes); ++} ++ ++/** ++ * exp_xdr_encode_opaque_len - Encode the opaque length onto a xdr stream ++ * @lenp: pointer to the opaque length destination ++ * @endp: pointer to the end of the opaque array ++ * ++ * Encodes the 32-bit opaque size in bytes given the start and end pointers ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp) ++{ ++ size_t nbytes = (char *)endp - (char *)(lenp + 1); ++ ++ exp_xdr_encode_u32(lenp, nbytes); ++ return lenp + 1 + exp_xdr_qwords(nbytes); ++} ++#endif /* _LINUX_EXP_XDR_H */ +diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-23 12:08:29.021511898 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-23 12:09:03.369481147 -0400 +@@ -387,6 +387,7 @@ struct inodes_stat_t { + #include + + struct export_operations; ++struct pnfs_export_operations; + struct hd_geometry; + struct iovec; + struct nameidata; +@@ -1329,6 +1330,7 @@ struct super_block { + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; ++ const struct pnfs_export_operations *s_pnfs_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; +diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h +--- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-23 12:09:03.371491472 -0400 +@@ -17,7 +17,10 @@ + + #define NFS4_BITMAP_SIZE 2 + #define
NFS4_VERIFIER_SIZE 8 +-#define NFS4_STATEID_SIZE 16 ++#define NFS4_CLIENTID_SIZE 8 ++#define NFS4_STATEID_SEQID_SIZE 4 ++#define NFS4_STATEID_OTHER_SIZE 12 ++#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE) + #define NFS4_FHSIZE 128 + #define NFS4_MAXPATHLEN PATH_MAX + #define NFS4_MAXNAMLEN NAME_MAX +@@ -119,6 +122,13 @@ + #define EXCHGID4_FLAG_MASK_A 0x40070003 + #define EXCHGID4_FLAG_MASK_R 0x80070003 + ++static inline bool ++is_ds_only_session(u32 exchange_flags) ++{ ++ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS; ++ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS; ++} ++ + #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 +@@ -166,8 +176,25 @@ struct nfs4_acl { + struct nfs4_ace aces[0]; + }; + ++struct nfs4_fsid { ++ u64 major; ++ u64 minor; ++}; ++ + typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; +-typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid; ++typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid; ++ ++struct nfs41_stateid { ++ __be32 seqid; ++ char other[NFS4_STATEID_OTHER_SIZE]; ++} __attribute__ ((packed)); ++ ++typedef struct { ++ union { ++ char data[NFS4_STATEID_SIZE]; ++ struct nfs41_stateid stateid; ++ } u; ++} nfs4_stateid; + + enum nfs_opnum4 { + OP_ACCESS = 3, +@@ -471,6 +498,8 @@ enum lock_type4 { + #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) + #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) + #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) ++#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) ++#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) + + #define NFSPROC4_NULL 0 + #define NFSPROC4_COMPOUND 1 +@@ -523,6 +552,7 @@ enum { + NFSPROC4_CLNT_GETACL, + NFSPROC4_CLNT_SETACL, + NFSPROC4_CLNT_FS_LOCATIONS, ++ NFSPROC4_CLNT_RELEASE_LOCKOWNER, + + /* nfs41 */ + NFSPROC4_CLNT_EXCHANGE_ID, +@@ -531,6 +561,13 @@ enum { + 
NFSPROC4_CLNT_SEQUENCE, + NFSPROC4_CLNT_GET_LEASE_TIME, + NFSPROC4_CLNT_RECLAIM_COMPLETE, ++ NFSPROC4_CLNT_PNFS_LAYOUTGET, ++ NFSPROC4_CLNT_PNFS_LAYOUTCOMMIT, ++ NFSPROC4_CLNT_PNFS_LAYOUTRETURN, ++ NFSPROC4_CLNT_PNFS_GETDEVICELIST, ++ NFSPROC4_CLNT_PNFS_GETDEVICEINFO, ++ NFSPROC4_CLNT_PNFS_WRITE, ++ NFSPROC4_CLNT_PNFS_COMMIT, + }; + + /* nfs41 types */ +@@ -549,6 +586,43 @@ enum state_protect_how4 { + SP4_SSV = 2 + }; + ++enum pnfs_layouttype { ++ LAYOUT_NFSV4_1_FILES = 1, ++ LAYOUT_OSD2_OBJECTS = 2, ++ LAYOUT_BLOCK_VOLUME = 3, ++}; ++ ++/* used for both layout return and recall */ ++enum pnfs_layoutreturn_type { ++ RETURN_FILE = 1, ++ RETURN_FSID = 2, ++ RETURN_ALL = 3 ++}; ++ ++enum pnfs_iomode { ++ IOMODE_READ = 1, ++ IOMODE_RW = 2, ++ IOMODE_ANY = 3, ++}; ++ ++enum pnfs_notify_deviceid_type4 { ++ NOTIFY_DEVICEID4_CHANGE = 1 << 1, ++ NOTIFY_DEVICEID4_DELETE = 1 << 2, ++}; ++ ++#define NFL4_UFLG_MASK 0x0000003F ++#define NFL4_UFLG_DENSE 0x00000001 ++#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 ++#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0 ++ ++/* Encoded in the loh_body field of type layouthint4 */ ++enum filelayout_hint_care4 { ++ NFLH4_CARE_DENSE = NFL4_UFLG_DENSE, ++ NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS, ++ NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040, ++ NFLH4_CARE_STRIPE_COUNT = 0x00000080 ++}; ++ + #endif + #endif + +diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-23 12:09:03.372501550 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-23 12:09:03.372501550 -0400 +@@ -0,0 +1,330 @@ ++/* ++ * include/linux/nfs4_pnfs.h ++ * ++ * Common data structures needed by the pnfs client and pnfs layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef LINUX_NFS4_PNFS_H ++#define LINUX_NFS4_PNFS_H ++ ++#include ++#include ++ ++enum pnfs_try_status { ++ PNFS_ATTEMPTED = 0, ++ PNFS_NOT_ATTEMPTED = 1, ++}; ++ ++#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 ++ ++/* Per-layout driver specific registration structure */ ++struct pnfs_layoutdriver_type { ++ const u32 id; ++ const char *name; ++ struct layoutdriver_io_operations *ld_io_ops; ++ struct layoutdriver_policy_operations *ld_policy_ops; ++}; ++ ++struct pnfs_fsdata { ++ int bypass_eof; ++ struct pnfs_layout_segment *lseg; ++ void *private; ++}; ++ ++#if defined(CONFIG_NFS_V4_1) ++ ++static inline struct nfs_inode * ++PNFS_NFS_INODE(struct pnfs_layout_type *lo) ++{ ++ return NFS_I(lo->lo_inode); ++} ++ ++static inline struct inode * ++PNFS_INODE(struct pnfs_layout_type *lo) ++{ ++ return lo->lo_inode; ++} ++ ++static inline struct nfs_server * ++PNFS_NFS_SERVER(struct pnfs_layout_type *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo)); ++} ++ ++static inline struct pnfs_layoutdriver_type * ++PNFS_LD(struct pnfs_layout_type *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo))->pnfs_curr_ld; ++} ++ ++static inline struct layoutdriver_io_operations * ++PNFS_LD_IO_OPS(struct pnfs_layout_type *lo) ++{ ++ return PNFS_LD(lo)->ld_io_ops; ++} ++ ++static inline struct layoutdriver_policy_operations * ++PNFS_LD_POLICY_OPS(struct pnfs_layout_type *lo) ++{ ++ return PNFS_LD(lo)->ld_policy_ops; ++} ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout != NULL; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return has_layout(nfsi) && ++ test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->pnfs_layout_state); ++} ++ ++extern void put_lseg(struct pnfs_layout_segment *lseg); ++extern void get_lseg(struct pnfs_layout_segment *lseg); ++ ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return false; ++} ++ ++static inline bool 
++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++struct pnfs_layout_segment { ++ struct list_head fi_list; ++ struct nfs4_pnfs_layout_segment range; ++ struct kref kref; ++ bool valid; ++ struct pnfs_layout_type *layout; ++ struct nfs4_deviceid *deviceid; ++ u8 ld_data[]; /* layout driver private data */ ++}; ++ ++static inline void * ++LSEG_LD_DATA(struct pnfs_layout_segment *lseg) ++{ ++ return lseg->ld_data; ++} ++ ++/* Layout driver I/O operations. ++ * Either the pagecache or non-pagecache read/write operations must be implemented ++ */ ++struct layoutdriver_io_operations { ++ /* Functions that use the pagecache. ++ * If use_pagecache == 1, then these functions must be implemented. ++ */ ++ /* read and write pagelist should return just 0 (to indicate that ++ * the layout code has taken control) or 1 (to indicate that the ++ * layout code wishes to fall back to normal nfs.) If 0 is returned, ++ * information can be passed back through nfs_data->res and ++ * nfs_data->task.tk_status, and the appropriate pnfs done function ++ * MUST be called. ++ */ ++ enum pnfs_try_status ++ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages); ++ enum pnfs_try_status ++ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how); ++ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page, ++ loff_t pos, unsigned count, ++ struct pnfs_fsdata *fsdata); ++ int (*write_end)(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ void (*write_end_cleanup)(struct file *filp, ++ struct pnfs_fsdata *fsdata); ++ ++ /* Consistency ops */ ++ /* 2 problems: ++ * 1) the page list contains nfs_pages, NOT pages ++ * 2) currently the NFS code doesn't create a page array (as it does with read/write) ++ */ ++ enum pnfs_try_status ++ (*commit) (struct nfs_write_data *nfs_data, int how); ++ ++ /* Layout information. 
For each inode, alloc_layout is executed once to retrieve an ++ * inode specific layout structure. Each subsequent layoutget operation results in ++ * a set_layout call to set the opaque layout in the layout driver.*/ ++ struct pnfs_layout_type * (*alloc_layout) (struct inode *inode); ++ void (*free_layout) (struct pnfs_layout_type *); ++ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_type *layoutid, struct nfs4_pnfs_layoutget_res *lgr); ++ void (*free_lseg) (struct pnfs_layout_segment *lseg); ++ ++ int (*setup_layoutcommit) (struct pnfs_layout_type *layoutid, ++ struct pnfs_layoutcommit_arg *args); ++ ++ void (*encode_layoutcommit) (struct pnfs_layout_type *layoutid, ++ struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *args); ++ void (*cleanup_layoutcommit) (struct pnfs_layout_type *layoutid, ++ struct pnfs_layoutcommit_arg *args, ++ int status); ++ void (*encode_layoutreturn) (struct pnfs_layout_type *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_pnfs_layoutreturn_arg *args); ++ ++ /* Registration information for a new mounted file system ++ */ ++ int (*initialize_mountpoint) (struct nfs_server *, ++ const struct nfs_fh * mntfh); ++ int (*uninitialize_mountpoint) (struct nfs_server *server); ++}; ++ ++enum layoutdriver_policy_flags { ++ /* Should the full nfs rpc cleanup code be used after io */ ++ PNFS_USE_RPC_CODE = 1 << 0, ++ ++ /* Should the NFS req. gather algorithm cross stripe boundaries? */ ++ PNFS_GATHER_ACROSS_STRIPES = 1 << 1, ++ ++ /* Should the pNFS client commit and return the layout upon a setattr */ ++ PNFS_LAYOUTRET_ON_SETATTR = 1 << 3, ++}; ++ ++struct layoutdriver_policy_operations { ++ unsigned flags; ++ ++ /* The stripe size of the file system */ ++ ssize_t (*get_stripesize) (struct pnfs_layout_type *layoutid); ++ ++ /* test for nfs page cache coalescing */ ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++ ++ /* Retrieve the block size of the file system.
++ * If gather_across_stripes == 1, then the file system will gather ++ * requests into the block size. ++ * TODO: Where will the layout driver get this info? It is hard ++ * coded in PVFS2. ++ */ ++ ssize_t (*get_blocksize) (void); ++}; ++ ++/* Should the full nfs rpc cleanup code be used after io */ ++static inline int ++pnfs_ld_use_rpc_code(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_USE_RPC_CODE; ++} ++ ++/* Should the NFS req. gather algorithm cross stripe boundaries? */ ++static inline int ++pnfs_ld_gather_across_stripes(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_GATHER_ACROSS_STRIPES; ++} ++ ++struct pnfs_device { ++ struct pnfs_deviceid dev_id; ++ unsigned int layout_type; ++ unsigned int mincount; ++ struct page **pages; ++ void *area; ++ unsigned int pgbase; ++ unsigned int pglen; ++ unsigned int dev_notify_types; ++}; ++ ++struct pnfs_devicelist { ++ unsigned int eof; ++ unsigned int num_devs; ++ struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; ++}; ++ ++/* ++ * Device ID RCU cache. A device ID is unique per client ID and layout type. 
++ */ ++#define NFS4_DEVICE_ID_HASH_BITS 5 ++#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) ++#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) ++ ++static inline u32 ++nfs4_deviceid_hash(struct pnfs_deviceid *id) ++{ ++ unsigned char *cptr = (unsigned char *)id->data; ++ unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE; ++ u32 x = 0; ++ ++ while (nbytes--) { ++ x *= 37; ++ x += *cptr++; ++ } ++ return x & NFS4_DEVICE_ID_HASH_MASK; ++} ++ ++struct nfs4_deviceid_cache { ++ spinlock_t dc_lock; ++ struct kref dc_kref; ++ void (*dc_free_callback)(struct kref *); ++ struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; ++}; ++ ++/* Device ID cache node */ ++struct nfs4_deviceid { ++ struct hlist_node de_node; ++ struct pnfs_deviceid de_id; ++ struct kref de_kref; ++}; ++ ++extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_put_deviceid_cache(struct nfs_client *); ++extern void nfs4_init_deviceid_node(struct nfs4_deviceid *); ++extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid_cache *, ++ struct nfs4_deviceid *); ++extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *); ++extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_delete_device(struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++ ++/* pNFS client callback functions. ++ * These operations allow the layout driver to access pNFS client ++ * specific information or call pNFS client->server operations. 
++ * E.g., getdeviceinfo, I/O callbacks, etc ++ */ ++struct pnfs_client_operations { ++ int (*nfs_getdevicelist) (struct nfs_server *, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++ int (*nfs_getdeviceinfo) (struct nfs_server *, ++ struct pnfs_device *dev); ++ ++ /* Post read callback. */ ++ void (*nfs_readlist_complete) (struct nfs_read_data *nfs_data); ++ ++ /* Post write callback. */ ++ void (*nfs_writelist_complete) (struct nfs_write_data *nfs_data); ++ ++ /* Post commit callback. */ ++ void (*nfs_commit_complete) (struct nfs_write_data *nfs_data); ++ void (*nfs_return_layout) (struct inode *); ++}; ++ ++extern struct pnfs_client_operations pnfs_ops; ++ ++extern struct pnfs_client_operations *pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); ++extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); ++ ++#define NFS4_PNFS_MAX_LAYOUTS 4 ++#define NFS4_PNFS_PRIVATE_LAYOUT 0x80000000 ++ ++#endif /* LINUX_NFS4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-23 12:09:03.373491892 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-23 12:09:03.374491393 -0400 +@@ -0,0 +1,101 @@ ++#ifndef NFSD4_BLOCK ++#define NFSD4_BLOCK ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_BLOCK_SUCCESS 1 ++#define PNFS_BLOCK_FAILURE 0 ++ ++#define PNFS_BLOCK_CTL_START 1 ++#define PNFS_BLOCK_CTL_STOP 2 ++#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current ++ * version from kernel via an upcall. 
++ */ ++ ++#define PNFS_UPCALL_MSG_STOP 0 ++#define PNFS_UPCALL_MSG_GETSIG 1 ++#define PNFS_UPCALL_MSG_GETSLICE 2 ++#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume ++#define PNFS_UPCALL_MSG_DMGET 4 ++#define PNFS_UPCALL_MSG_VERS 5 ++ ++#define PNFS_UPCALL_VERS 8 ++ ++typedef struct stripe_dev { ++ int major, ++ minor, ++ offset; ++} stripe_dev_t; ++ ++typedef struct bl_comm_res { ++ int res_status; ++ union { ++ struct { ++ long long start, ++ length; ++ } slice; ++ struct { ++ int num_stripes, ++ stripe_size; ++ stripe_dev_t devs[]; ++ } stripe; ++ struct { ++ long long sector; ++ int offset, ++ len; ++ char sig[]; ++ } sig; ++ int vers, ++ dm_vol; ++ } u; ++} bl_comm_res_t; ++ ++typedef struct bl_comm_msg { ++ int msg_type, ++ msg_status; ++ union { ++ dev_t msg_dev; ++ int msg_vers; ++ } u; ++ bl_comm_res_t *msg_res; ++} bl_comm_msg_t; ++ ++#ifdef __KERNEL__ ++ ++typedef struct bl_comm { ++ /* ---- protects access to this structure ---- */ ++ struct mutex lock; ++ /* ---- protects access to rpc pipe ---- */ ++ struct mutex pipe_lock; ++ struct dentry *pipe_dentry; ++ wait_queue_head_t pipe_wq; ++ bl_comm_msg_t msg; ++} bl_comm_t; ++ ++int pnfs_block_enabled(struct inode *, int); ++int bl_layout_type(struct super_block *sb); ++int bl_getdeviceiter(struct super_block *, u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int bl_layoutcommit(struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++int bl_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len); ++int bl_init_proc(void); ++int bl_upcall(bl_comm_t *, 
bl_comm_msg_t *, bl_comm_res_t **); ++ ++extern bl_comm_t *bl_comm_global; // Ugly... ++#endif /* __KERNEL__ */ ++ ++#endif /* NFSD4_BLOCK */ ++ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-23 12:09:03.375501481 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-23 12:09:03.375501481 -0400 +@@ -0,0 +1,345 @@ ++/* ++ * include/linux/nfsd4_spnfs.h ++ * ++ * spNFS - simple pNFS implementation with userspace daemon ++ * ++ */ ++ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++******************************************************************************/ ++ ++#ifndef NFS_SPNFS_H ++#define NFS_SPNFS_H ++ ++ ++#ifdef __KERNEL__ ++#include "exportfs.h" ++#include "sunrpc/svc.h" ++#include "nfsd/nfsfh.h" ++#else ++#include ++#endif /* __KERNEL__ */ ++ ++#define SPNFS_STATUS_INVALIDMSG 0x01 ++#define SPNFS_STATUS_AGAIN 0x02 ++#define SPNFS_STATUS_FAIL 0x04 ++#define SPNFS_STATUS_SUCCESS 0x08 ++ ++#define SPNFS_TYPE_LAYOUTGET 0x01 ++#define SPNFS_TYPE_LAYOUTCOMMIT 0x02 ++#define SPNFS_TYPE_LAYOUTRETURN 0x03 ++#define SPNFS_TYPE_GETDEVICEITER 0x04 ++#define SPNFS_TYPE_GETDEVICEINFO 0x05 ++#define SPNFS_TYPE_SETATTR 0x06 ++#define SPNFS_TYPE_OPEN 0x07 ++#define SPNFS_TYPE_CLOSE 0x08 ++#define SPNFS_TYPE_CREATE 0x09 ++#define SPNFS_TYPE_REMOVE 0x0a ++#define SPNFS_TYPE_COMMIT 0x0b ++#define SPNFS_TYPE_READ 0x0c ++#define SPNFS_TYPE_WRITE 0x0d ++ ++#define SPNFS_MAX_DEVICES 1 ++#define SPNFS_MAX_DATA_SERVERS 16 ++#define SPNFS_MAX_IO 512 ++ ++/* layout */ ++struct spnfs_msg_layoutget_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_filelayout_list { ++ u_int32_t fh_len; ++ unsigned char fh_val[128]; /* DMXXX fix this const */ ++}; ++ ++struct spnfs_msg_layoutget_res { ++ int status; ++ u_int64_t devid; ++ u_int64_t stripe_size; ++ u_int32_t stripe_type; ++ u_int32_t stripe_count; ++ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++/* layoutcommit */ ++struct spnfs_msg_layoutcommit_args { ++ unsigned long inode; ++ unsigned long generation; ++ u_int64_t file_size; ++}; ++ ++struct spnfs_msg_layoutcommit_res { ++ int status; ++}; ++ ++/* layoutreturn */ ++/* No op for the daemon */ ++/* ++struct spnfs_msg_layoutreturn_args { ++}; ++ ++struct spnfs_msg_layoutreturn_res { ++}; ++*/ ++ ++/* getdeviceiter */ ++struct spnfs_msg_getdeviceiter_args { ++ unsigned long inode; ++ u_int64_t cookie; ++ u_int64_t verf; ++}; ++ ++struct spnfs_msg_getdeviceiter_res { ++ int status; ++ u_int64_t devid; ++ 
u_int64_t cookie; ++ u_int64_t verf; ++ u_int32_t eof; ++}; ++ ++/* getdeviceinfo */ ++struct spnfs_data_server { ++ u_int32_t dsid; ++ char netid[5]; ++ char addr[29]; ++}; ++ ++struct spnfs_device { ++ u_int64_t devid; ++ int dscount; ++ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++struct spnfs_msg_getdeviceinfo_args { ++ u_int64_t devid; ++}; ++ ++struct spnfs_msg_getdeviceinfo_res { ++ int status; ++ struct spnfs_device devinfo; ++}; ++ ++/* setattr */ ++struct spnfs_msg_setattr_args { ++ unsigned long inode; ++ unsigned long generation; ++ int file_size; ++}; ++ ++struct spnfs_msg_setattr_res { ++ int status; ++}; ++ ++/* open */ ++struct spnfs_msg_open_args { ++ unsigned long inode; ++ unsigned long generation; ++ int create; ++ int createmode; ++ int truncate; ++}; ++ ++struct spnfs_msg_open_res { ++ int status; ++}; ++ ++/* close */ ++/* No op for daemon */ ++struct spnfs_msg_close_args { ++ int x; ++}; ++ ++struct spnfs_msg_close_res { ++ int y; ++}; ++ ++/* create */ ++/* ++struct spnfs_msg_create_args { ++ int x; ++}; ++ ++struct spnfs_msg_create_res { ++ int y; ++}; ++*/ ++ ++/* remove */ ++struct spnfs_msg_remove_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_msg_remove_res { ++ int status; ++}; ++ ++/* commit */ ++/* ++struct spnfs_msg_commit_args { ++ int x; ++}; ++ ++struct spnfs_msg_commit_res { ++ int y; ++}; ++*/ ++ ++/* read */ ++struct spnfs_msg_read_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++}; ++ ++struct spnfs_msg_read_res { ++ int status; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++/* write */ ++struct spnfs_msg_write_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++struct spnfs_msg_write_res { ++ int status; ++}; ++ ++/* bundle args and responses */ ++union spnfs_msg_args { ++ struct spnfs_msg_layoutget_args layoutget_args; ++ struct 
spnfs_msg_layoutcommit_args layoutcommit_args; ++/* ++ struct spnfs_msg_layoutreturn_args layoutreturn_args; ++*/ ++ struct spnfs_msg_getdeviceiter_args getdeviceiter_args; ++ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args; ++ struct spnfs_msg_setattr_args setattr_args; ++ struct spnfs_msg_open_args open_args; ++ struct spnfs_msg_close_args close_args; ++/* ++ struct spnfs_msg_create_args create_args; ++*/ ++ struct spnfs_msg_remove_args remove_args; ++/* ++ struct spnfs_msg_commit_args commit_args; ++*/ ++ struct spnfs_msg_read_args read_args; ++ struct spnfs_msg_write_args write_args; ++}; ++ ++union spnfs_msg_res { ++ struct spnfs_msg_layoutget_res layoutget_res; ++ struct spnfs_msg_layoutcommit_res layoutcommit_res; ++/* ++ struct spnfs_msg_layoutreturn_res layoutreturn_res; ++*/ ++ struct spnfs_msg_getdeviceiter_res getdeviceiter_res; ++ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res; ++ struct spnfs_msg_setattr_res setattr_res; ++ struct spnfs_msg_open_res open_res; ++ struct spnfs_msg_close_res close_res; ++/* ++ struct spnfs_msg_create_res create_res; ++*/ ++ struct spnfs_msg_remove_res remove_res; ++/* ++ struct spnfs_msg_commit_res commit_res; ++*/ ++ struct spnfs_msg_read_res read_res; ++ struct spnfs_msg_write_res write_res; ++}; ++ ++/* a spnfs message, args and response */ ++struct spnfs_msg { ++ unsigned char im_type; ++ unsigned char im_status; ++ union spnfs_msg_args im_args; ++ union spnfs_msg_res im_res; ++}; ++ ++/* spnfs configuration info */ ++struct spnfs_config { ++ unsigned char dense_striping; ++ int stripe_size; ++ int num_ds; ++ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */ ++}; ++ ++#if defined(__KERNEL__) && defined(CONFIG_SPNFS) ++ ++#include ++ ++/* pipe mgmt structure. 
messages flow through here */ ++struct spnfs { ++ struct dentry *spnfs_dentry; /* dentry for pipe */ ++ wait_queue_head_t spnfs_wq; ++ struct spnfs_msg spnfs_im; /* spnfs message */ ++ struct mutex spnfs_lock; /* Serializes upcalls */ ++ struct mutex spnfs_plock; ++}; ++ ++struct nfsd4_open; ++ ++int spnfs_layout_type(struct super_block *); ++enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int spnfs_layoutcommit(void); ++int spnfs_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int spnfs_getdeviceiter(struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++int spnfs_setattr(void); ++int spnfs_open(struct inode *, struct nfsd4_open *); ++int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *); ++int spnfs_remove(unsigned long, unsigned long); ++__be32 spnfs_read(struct inode *, loff_t, unsigned long *, ++ int, struct svc_rqst *); ++__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *); ++int spnfs_getfh(int, struct nfs_fh *); ++int spnfs_test_layoutrecall(char *, u64, u64); ++int spnfs_layoutrecall(struct inode *, int, u64, u64); ++ ++int nfsd_spnfs_new(void); ++void nfsd_spnfs_delete(void); ++int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *); ++int spnfs_enabled(void); ++int spnfs_init_proc(void); ++ ++extern struct spnfs_config *spnfs_config; ++ ++#endif /* __KERNEL__ && CONFIG_SPNFS */ ++ ++#endif /* NFS_SPNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h +--- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-23 12:09:03.376401789 
-0400 +@@ -29,6 +29,7 @@ + #ifdef __KERNEL__ + + #include ++#include + + /* + * Largest number of bytes we need to allocate for an NFS +diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h +--- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-23 12:09:03.376401789 -0400 +@@ -32,6 +32,8 @@ + #define NFSDDBG_REPCACHE 0x0080 + #define NFSDDBG_XDR 0x0100 + #define NFSDDBG_LOCKD 0x0200 ++#define NFSDDBG_PNFS 0x0400 ++#define NFSDDBG_FILELAYOUT 0x0800 + #define NFSDDBG_ALL 0x7FFF + #define NFSDDBG_NOCHANGE 0xFFFF + +diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h +--- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-23 12:09:03.377481954 -0400 +@@ -100,6 +100,7 @@ struct svc_export { + uid_t ex_anon_uid; + gid_t ex_anon_gid; + int ex_fsid; ++ int ex_pnfs; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + int ex_nflavors; +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-23 12:09:03.377481954 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-23 12:09:03.378501747 -0400 +@@ -0,0 +1,132 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef NFSD_NFS4LAYOUTXDR_H ++#define NFSD_NFS4LAYOUTXDR_H ++ ++#include ++#include ++ ++/* the nfsd4_pnfs_devlist dev_addr for the file layout type */ ++struct pnfs_filelayout_devaddr { ++ struct xdr_netobj r_netid; ++ struct xdr_netobj r_addr; ++}; ++ ++/* list of multipath servers */ ++struct pnfs_filelayout_multipath { ++ u32 fl_multipath_length; ++ struct pnfs_filelayout_devaddr *fl_multipath_list; ++}; ++ ++struct pnfs_filelayout_device { ++ u32 fl_stripeindices_length; ++ u32 *fl_stripeindices_list; ++ u32 fl_device_length; ++ struct pnfs_filelayout_multipath *fl_device_list; ++}; ++ ++struct pnfs_filelayout_layout { ++ u32 lg_layout_type; /* response */ ++ u32 lg_stripe_type; /* response */ ++ u32 lg_commit_through_mds; /* response */ ++ u64 lg_stripe_unit; /* response */ ++ u64 lg_pattern_offset; /* response */ ++ u32 lg_first_stripe_index; /* response */ ++ struct nfsd4_pnfs_deviceid device_id; /* response */ ++ u32 lg_fh_length; /* response */ ++ struct knfsd_fh *lg_fh_list; /* response */ ++}; ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++enum pnfs_block_extent_state4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, ++ PNFS_BLOCK_NONE_DATA = 3 ++}; ++ ++enum pnfs_block_volume_type4 { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, ++ PNFS_BLOCK_VOLUME_SLICE = 1, ++ PNFS_BLOCK_VOLUME_CONCAT = 2, ++ PNFS_BLOCK_VOLUME_STRIPE = 3, ++}; ++typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4; ++ ++enum bl_cache_state { ++ BLOCK_LAYOUT_NEW = 0, ++ BLOCK_LAYOUT_CACHE = 1, ++ BLOCK_LAYOUT_UPDATE = 2, ++}; ++ ++typedef struct pnfs_blocklayout_layout { ++ struct list_head bll_list; ++ struct nfsd4_pnfs_deviceid bll_vol_id; ++ u64 bll_foff; // file offset ++ u64 bll_len; ++ u64 bll_soff; // storage offset ++ int bll_recalled; ++ enum pnfs_block_extent_state4 bll_es; ++ enum bl_cache_state bll_cache_state; ++} pnfs_blocklayout_layout_t; ++ ++typedef struct 
pnfs_blocklayout_devinfo { ++ struct list_head bld_list; ++ pnfs_block_volume_type4 bld_type; ++ struct nfsd4_pnfs_deviceid bld_devid; ++ int bld_index_loc; ++ union { ++ struct { ++ u64 bld_offset; ++ u32 bld_sig_len, ++ *bld_sig; ++ } simple; ++ struct { ++ u64 bld_start, ++ bld_len; ++ u32 bld_index; /* Index of Simple Volume */ ++ } slice; ++ struct { ++ u32 bld_stripes; ++ u64 bld_chunk_size; ++ u32 *bld_stripe_indexs; ++ } stripe; ++ } u; ++} pnfs_blocklayout_devinfo_t; ++ ++#endif /* NFSD_NFS4LAYOUTXDR_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-23 12:09:03.378501747 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-23 12:09:03.378501747 -0400 +@@ -0,0 +1,54 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++#include ++ ++/* ++ * Length of comma separated pnfs data server IPv4 addresses. Enough room for ++ * 32 addresses. ++ */ ++#define NFSD_DLM_DS_LIST_MAX 512 ++/* ++ * Length of colon separated pnfs dlm device of the form ++ * disk_name:comma separated data server IPv4 address ++ */ ++#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1) ++ ++#ifdef CONFIG_PNFSD ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++extern const struct pnfs_export_operations pnfs_dlm_export_ops; ++ ++int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len); ++ ++void nfsd4_pnfs_dlm_shutdown(void); ++ ++ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen); ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-23 12:09:03.379487099 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-23 12:09:03.379487099 -0400 +@@ -0,0 +1,271 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef _LINUX_NFSD_NFSD4_PNFS_H ++#define _LINUX_NFSD_NFSD4_PNFS_H ++ ++#include ++#include ++#include ++ ++struct nfsd4_pnfs_deviceid { ++ u64 sbid; /* per-superblock unique ID */ ++ u64 devid; /* filesystem-wide unique device ID */ ++}; ++ ++struct nfsd4_pnfs_dev_iter_res { ++ u64 gd_cookie; /* request/response */ ++ u64 gd_verf; /* request/response */ ++ u64 gd_devid; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++/* Arguments for set_device_notify */ ++struct pnfs_devnotify_arg { ++ struct nfsd4_pnfs_deviceid dn_devid; /* request */ ++ u32 dn_layout_type; /* request */ ++ u32 dn_notify_types; /* request/response */ ++}; ++ ++struct nfsd4_layout_seg { ++ u64 clientid; ++ u32 layout_type; ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++/* Used by layout_get to encode layout (loc_body var in spec) ++ * Args: ++ * minlength - min number of accessible bytes given by layout ++ * fsid - Major part of struct pnfs_deviceid. File system uses this ++ * to build the deviceid returned in the layout.
++ * fh - fs can modify the file handle for use on data servers ++ * seg - layout info requested and layout info returned ++ * xdr - xdr info ++ * return_on_close - true if layout to be returned on file close ++ */ ++ ++struct nfsd4_pnfs_layoutget_arg { ++ u64 lg_minlength; ++ u64 lg_sbid; ++ const struct knfsd_fh *lg_fh; ++}; ++ ++struct nfsd4_pnfs_layoutget_res { ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_return_on_close; ++}; ++ ++struct nfsd4_pnfs_layoutcommit_arg { ++ struct nfsd4_layout_seg lc_seg; /* request */ ++ u32 lc_reclaim; /* request */ ++ u32 lc_newoffset; /* request */ ++ u64 lc_last_wr; /* request */ ++ struct nfstime4 lc_mtime; /* request */ ++ u32 lc_up_len; /* layout length */ ++ void *lc_up_layout; /* decoded by callback */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit_res { ++ u32 lc_size_chg; /* boolean for response */ ++ u64 lc_newsize; /* response */ ++}; ++ ++#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */ ++ ++struct nfsd4_pnfs_layoutreturn_arg { ++ u32 lr_return_type; /* request */ ++ struct nfsd4_layout_seg lr_seg; /* request */ ++ u32 lr_reclaim; /* request */ ++ u32 lrf_body_len; /* request */ ++ void *lrf_body; /* request */ ++ void *lr_cookie; /* fs private */ ++}; ++ ++/* pNFS Metadata to Data server state communication */ ++struct pnfs_get_state { ++ u32 dsid; /* request */ ++ u64 ino; /* request */ ++ nfs4_stateid stid; /* request;response */ ++ nfs4_clientid clid; /* response */ ++ u32 access; /* response */ ++ u32 stid_gen; /* response */ ++ u32 verifier[2]; /* response */ ++}; ++ ++/* ++ * pNFS export operations vector. ++ * ++ * The filesystem must implement the following methods: ++ * layout_type ++ * get_device_info ++ * layout_get ++ * ++ * All other methods are optional and can be set to NULL if not implemented. ++ */ ++struct pnfs_export_operations { ++ /* Returns the supported pnfs_layouttype4.
*/ ++ int (*layout_type) (struct super_block *); ++ ++ /* Encode device info onto the xdr stream. */ ++ int (*get_device_info) (struct super_block *, ++ struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++ ++ /* Retrieve all available devices via an iterator. ++ * arg->cookie == 0 indicates the beginning of the list, ++ * otherwise arg->verf is used to verify that the list hasn't changed ++ * while retrieved. ++ * ++ * On output, the filesystem sets the devid based on the current cookie ++ * and sets res->cookie and res->verf corresponding to the next entry. ++ * When the last entry in the list is retrieved, res->eof is set to 1. ++ */ ++ int (*get_device_iter) (struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++ ++ int (*set_device_notify) (struct super_block *, ++ struct pnfs_devnotify_arg *); ++ ++ /* Retrieve and encode a layout for inode onto the xdr stream. ++ * arg->minlength is the minimum number of accessible bytes required ++ * by the client. ++ * The maximum number of bytes to encode the layout is given by ++ * the xdr stream end pointer. ++ * arg->fsid contains the major part of struct pnfs_deviceid. ++ * The file system uses this to build the deviceid returned ++ * in the layout. ++ * res->seg - layout segment requested and layout info returned. 
++ * res->fh can be modified the file handle for use on data servers ++ * res->return_on_close - true if layout to be returned on file close ++ * ++ * return one of the following nfs errors: ++ * NFS_OK Success ++ * NFS4ERR_ACCESS Permission error ++ * NFS4ERR_BADIOMODE Server does not support requested iomode ++ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules ++ * NFS4ERR_INVAL Parameter other than layout is invalid ++ * NFS4ERR_IO I/O error ++ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later ++ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file ++ * NFS4ERR_LOCKED Lock conflict ++ * NFS4ERR_NOSPC Out-of-space error occurred ++ * NFS4ERR_RECALLCONFLICT Layout currently unavailable due to ++ * a conflicting CB_LAYOUTRECALL ++ * NFS4ERR_SERVERFAULT Server went berserk ++ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout ++ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file) ++ */ ++ enum nfsstat4 (*layout_get) (struct inode *, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++ ++ /* Commit changes to layout */ ++ int (*layout_commit) (struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++ ++ /* Returns the layout */ ++ int (*layout_return) (struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++ ++ /* Can layout segments be merged for this layout type?
*/ ++ int (*can_merge_layouts) (u32 layout_type); ++ ++ /* pNFS Files layout specific operations */ ++ ++ /* Get the write verifier for DS (called on MDS only) */ ++ void (*get_verifier) (struct super_block *, u32 *p); ++ /* Call fs on DS only */ ++ int (*get_state) (struct inode *, struct knfsd_fh *, ++ struct pnfs_get_state *); ++}; ++ ++struct nfsd4_pnfs_cb_layout { ++ u32 cbl_recall_type; /* request */ ++ struct nfsd4_layout_seg cbl_seg; /* request */ ++ u32 cbl_layoutchanged; /* request */ ++ nfs4_stateid cbl_sid; /* request */ ++ struct nfs4_fsid cbl_fsid; ++ void *cbl_cookie; /* fs private */ ++}; ++ ++/* layoutrecall request (from exported filesystem) */ ++struct nfs4_layoutrecall { ++ struct kref clr_ref; ++ struct nfsd4_pnfs_cb_layout cb; /* request */ ++ struct list_head clr_perclnt; /* on cl_layoutrecalls */ ++ struct nfs4_client *clr_client; ++ struct nfs4_file *clr_file; ++ struct timespec clr_time; /* last activity */ ++ struct super_block *clr_sb; /* We might not have a file */ ++ struct nfs4_layoutrecall *parent; /* The initiating recall */ ++ ++ void *clr_args; /* nfsd internal */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_item { ++ u32 cbd_notify_type; /* request */ ++ u32 cbd_layout_type; /* request */ ++ struct nfsd4_pnfs_deviceid cbd_devid; /* request */ ++ u32 cbd_immediate; /* request */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_list { ++ u32 cbd_len; /* request */ ++ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */ ++}; ++ ++/* ++ * callbacks provided by the nfsd ++ */ ++struct pnfsd_cb_operations { ++ /* Generic callbacks */ ++ int (*cb_layout_recall) (struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++ int (*cb_device_notify) (struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++ ++ /* pNFS Files layout specific callbacks */ ++ ++ /* Callback from fs on MDS only */ ++ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *); ++ /* Callback from fs on DS only */ ++ int (*cb_change_state) (struct 
pnfs_get_state *); ++}; ++ ++#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h +--- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-23 12:09:03.380502500 -0400 +@@ -29,6 +29,7 @@ + /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ + #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ + #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ ++#define NFSCTL_FD2FH 9 /* get a fh from a fd */ + + /* SVC */ + struct nfsctl_svc { +@@ -71,6 +72,11 @@ struct nfsctl_fsparm { + int gd_maxlen; + }; + ++/* FD2FH */ ++struct nfsctl_fd2fh { ++ int fd; ++}; ++ + /* + * This is the argument union. + */ +@@ -82,6 +88,7 @@ struct nfsctl_arg { + struct nfsctl_export u_export; + struct nfsctl_fdparm u_getfd; + struct nfsctl_fsparm u_getfs; ++ struct nfsctl_fd2fh u_fd2fh; + /* + * The following dummy member is needed to preserve binary compatibility + * on platforms where alignof(void*)>alignof(int). 
It's needed because +@@ -95,6 +102,7 @@ struct nfsctl_arg { + #define ca_export u.u_export + #define ca_getfd u.u_getfd + #define ca_getfs u.u_getfs ++#define ca_fd2fh u.u_fd2fh + }; + + union nfsctl_res { +diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-23 12:08:29.061494081 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-23 12:09:03.381511751 -0400 +@@ -72,13 +72,20 @@ struct nfs_access_entry { + int mask; + }; + ++struct nfs_lock_context { ++ atomic_t count; ++ struct list_head list; ++ struct nfs_open_context *open_context; ++ fl_owner_t lockowner; ++ pid_t pid; ++}; ++ + struct nfs4_state; + struct nfs_open_context { +- atomic_t count; ++ struct nfs_lock_context lock_context; + struct path path; + struct rpc_cred *cred; + struct nfs4_state *state; +- fl_owner_t lockowner; + fmode_t mode; + + unsigned long flags; +@@ -97,6 +104,26 @@ struct nfs_delegation; + + struct posix_acl; + ++struct pnfs_layout_type { ++ int refcount; ++ struct list_head lo_layouts; /* other client layouts */ ++ struct list_head segs; /* layout segments list */ ++ int roc_iomode; /* iomode to return on close, 0=none */ ++ seqlock_t seqlock; /* Protects the stateid */ ++ nfs4_stateid stateid; ++ unsigned long pnfs_layout_state; ++ #define NFS_INO_RO_LAYOUT_FAILED 0 /* get ro layout failed stop trying */ ++ #define NFS_INO_RW_LAYOUT_FAILED 1 /* get rw layout failed stop trying */ ++ #define NFS_INO_LAYOUTCOMMIT 3 /* LAYOUTCOMMIT needed */ ++ struct rpc_cred *lo_cred; /* layoutcommit credential */ ++ /* DH: These vars keep track of the maximum write range ++ * so the values can be used for layoutcommit. 
++ */ ++ loff_t pnfs_write_begin_pos; ++ loff_t pnfs_write_end_pos; ++ struct inode *lo_inode; ++}; ++ + /* + * nfs fs inode data in memory + */ +@@ -181,6 +208,13 @@ struct nfs_inode { + struct nfs_delegation *delegation; + fmode_t delegation_state; + struct rw_semaphore rwsem; ++ ++ /* pNFS layout information */ ++#if defined(CONFIG_NFS_V4_1) ++ wait_queue_head_t lo_waitq; ++ struct pnfs_layout_type *layout; ++ time_t pnfs_layout_suspend; ++#endif /* CONFIG_NFS_V4_1 */ + #endif /* CONFIG_NFS_V4*/ + #ifdef CONFIG_NFS_FSCACHE + struct fscache_cookie *fscache; +@@ -353,6 +387,8 @@ extern void nfs_setattr_update_inode(str + extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); + extern void put_nfs_open_context(struct nfs_open_context *ctx); + extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); ++extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); ++extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); + extern u64 nfs_compat_user_ino64(u64 fileid); + extern void nfs_fattr_init(struct nfs_fattr *fattr); + +@@ -481,8 +517,12 @@ extern void nfs_unblock_sillyrename(stru + extern int nfs_congestion_kb; + extern int nfs_writepage(struct page *page, struct writeback_control *wbc); + extern int nfs_writepages(struct address_space *, struct writeback_control *); +-extern int nfs_flush_incompatible(struct file *file, struct page *page); +-extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); ++struct pnfs_layout_segment; ++extern int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg); ++extern int nfs_updatepage(struct file *, struct page *, ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata); + extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); + + /* +@@ -604,6 +644,8 @@ extern void * 
nfs_root_data(void); + #define NFSDBG_CLIENT 0x0200 + #define NFSDBG_MOUNT 0x0400 + #define NFSDBG_FSCACHE 0x0800 ++#define NFSDBG_PNFS 0x1000 ++#define NFSDBG_PNFS_LD 0x2000 + #define NFSDBG_ALL 0xFFFF + + #ifdef __KERNEL__ +diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-23 12:08:29.062501618 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-23 12:09:03.383491395 -0400 +@@ -15,6 +15,7 @@ struct nlm_host; + struct nfs4_sequence_args; + struct nfs4_sequence_res; + struct nfs_server; ++struct nfs4_minor_version_ops; + + /* + * The nfs_client identifies our client state to the server. +@@ -70,11 +71,7 @@ struct nfs_client { + */ + char cl_ipaddr[48]; + unsigned char cl_id_uniquifier; +- int (* cl_call_sync)(struct nfs_server *server, +- struct rpc_message *msg, +- struct nfs4_sequence_args *args, +- struct nfs4_sequence_res *res, +- int cache_reply); ++ const struct nfs4_minor_version_ops *cl_mvops; + #endif /* CONFIG_NFS_V4 */ + + #ifdef CONFIG_NFS_V4_1 +@@ -85,6 +82,8 @@ struct nfs_client { + /* The flags used for obtaining the clientid during EXCHANGE_ID */ + u32 cl_exchange_flags; + struct nfs4_session *cl_session; /* sharred session */ ++ struct list_head cl_layouts; ++ struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ + #endif /* CONFIG_NFS_V4_1 */ + + #ifdef CONFIG_NFS_FSCACHE +@@ -92,6 +91,16 @@ struct nfs_client { + #endif + }; + ++static inline bool ++is_ds_only_client(struct nfs_client *clp) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ return is_ds_only_session(clp->cl_exchange_flags); ++#else ++ return false; ++#endif ++} ++ + /* + * NFS client parameters stored in the superblock. 
+ */ +@@ -136,7 +145,7 @@ struct nfs_server { + #endif + + #ifdef CONFIG_NFS_V4 +- u32 attr_bitmask[2];/* V4 bitmask representing the set ++ u32 attr_bitmask[3];/* V4 bitmask representing the set + of attributes supported on this + filesystem */ + u32 cache_consistency_bitmask[2]; +@@ -148,6 +157,15 @@ struct nfs_server { + that are supported on this + filesystem */ + #endif ++ ++#ifdef CONFIG_NFS_V4_1 ++ u32 pnfs_blksize; /* layout_blksize attr */ ++ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ ++ void *pnfs_ld_data; /* Per-mount data */ ++ unsigned int ds_rsize; /* Data server read size */ ++ unsigned int ds_wsize; /* Data server write size */ ++#endif /* CONFIG_NFS_V4_1 */ ++ + void (*destroy)(struct nfs_server *); + + atomic_t active; /* Keep trace of any activity to this server */ +diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h +--- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-23 12:09:03.384501540 -0400 +@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, ++ NFSIOS_PNFS_READ, ++ NFSIOS_PNFS_WRITE, ++ NFSIOS_PNFS_COMMIT, + __NFSIOS_COUNTSMAX, + }; + +diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h +--- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-23 12:09:03.385491518 -0400 +@@ -39,6 +39,7 @@ struct nfs_page { + struct list_head wb_list; /* Defines state of page: */ + struct page *wb_page; /* page to read in/write out */ + struct nfs_open_context *wb_context; /* File state context info */ ++ struct nfs_lock_context *wb_lock_context; /* lock context info */ + atomic_t wb_complete; /* i/os we're waiting for */ + pgoff_t wb_index; /* Offset >> PAGE_CACHE_SHIFT */ + 
unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ +@@ -47,6 +48,7 @@ struct nfs_page { + struct kref wb_kref; /* reference count */ + unsigned long wb_flags; + struct nfs_writeverf wb_verf; /* Commit cookie */ ++ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */ + }; + + struct nfs_pageio_descriptor { +@@ -60,6 +62,12 @@ struct nfs_pageio_descriptor { + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); + int pg_ioflags; + int pg_error; ++ struct pnfs_layout_segment *pg_lseg; ++#ifdef CONFIG_NFS_V4_1 ++ int pg_iswrite; ++ int pg_boundary; ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++#endif /* CONFIG_NFS_V4_1 */ + }; + + #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) +@@ -68,13 +76,15 @@ extern struct nfs_page *nfs_create_reque + struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int count); ++ unsigned int count, ++ struct pnfs_layout_segment *lseg); + extern void nfs_clear_request(struct nfs_page *req); + extern void nfs_release_request(struct nfs_page *req); + + + extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, +- pgoff_t idx_start, unsigned int npages, int tag); ++ pgoff_t idx_start, unsigned int npages, int tag, ++ int *use_pnfs); + extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), +diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-23 12:08:29.062501618 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-23 12:09:03.387491422 -0400 +@@ -3,6 +3,8 @@ + + #include + #include ++#include ++#include + + /* + * To change the maximum rsize and wsize supported by the NFS client, adjust +@@ -10,7 +12,7 @@ + * support a megabyte or more. 
The default is left at 4096 bytes, which is + * reasonable for NFS over UDP. + */ +-#define NFS_MAX_FILE_IO_SIZE (1048576U) ++#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U) + #define NFS_DEF_FILE_IO_SIZE (4096U) + #define NFS_MIN_FILE_IO_SIZE (1024U) + +@@ -113,6 +115,10 @@ struct nfs_fsinfo { + __u32 dtpref; /* pref. readdir transfer size */ + __u64 maxfilesize; + __u32 lease_time; /* in seconds */ ++#if defined(CONFIG_NFS_V4_1) ++ __u32 layouttype; /* supported pnfs layout driver */ ++ __u32 blksize; /* preferred pnfs io block size */ ++#endif + }; + + struct nfs_fsstat { +@@ -196,8 +202,10 @@ struct nfs_openargs { + __u64 clientid; + __u64 id; + union { +- struct iattr * attrs; /* UNCHECKED, GUARDED */ +- nfs4_verifier verifier; /* EXCLUSIVE */ ++ struct { ++ struct iattr * attrs; /* UNCHECKED, GUARDED */ ++ nfs4_verifier verifier; /* EXCLUSIVE */ ++ }; + nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ + fmode_t delegation_type; /* CLAIM_PREVIOUS */ + } u; +@@ -313,6 +321,10 @@ struct nfs_lockt_res { + struct nfs4_sequence_res seq_res; + }; + ++struct nfs_release_lockowner_args { ++ struct nfs_lowner lock_owner; ++}; ++ + struct nfs4_delegreturnargs { + const struct nfs_fh *fhandle; + const nfs4_stateid *stateid; +@@ -332,6 +344,7 @@ struct nfs4_delegreturnres { + struct nfs_readargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + unsigned int pgbase; +@@ -352,6 +365,7 @@ struct nfs_readres { + struct nfs_writeargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + enum nfs3_stable_how stable; +@@ -846,7 +860,7 @@ struct nfs4_server_caps_arg { + }; + + struct nfs4_server_caps_res { +- u32 attr_bitmask[2]; ++ u32 attr_bitmask[3]; + u32 acl_bitmask; + u32 has_links; + u32 has_symlinks; +@@ -961,6 +975,27 @@ struct nfs_page; + + #define NFS_PAGEVEC_SIZE (8U) + ++#if defined(CONFIG_NFS_V4_1) ++/* 
pnfsflag values */ ++#define PNFS_NO_RPC 0x0001 /* non rpc result callback switch */ ++ ++/* pnfs-specific data needed for read, write, and commit calls */ ++struct pnfs_call_data { ++ struct pnfs_layout_segment *lseg; ++ const struct rpc_call_ops *call_ops; ++ u32 orig_count; /* for retry via MDS */ ++ int pnfs_error; ++ u8 pnfsflags; ++ u8 how; /* for FLUSH_STABLE */ ++}; ++ ++/* files layout-type specific data for read, write, and commit */ ++struct pnfs_fl_call_data { ++ struct nfs_client *ds_nfs_client; ++ __u64 orig_offset; ++}; ++#endif /* CONFIG_NFS_V4_1 */ ++ + struct nfs_read_data { + int flags; + struct rpc_task task; +@@ -976,10 +1011,16 @@ struct nfs_read_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + + struct nfs_write_data { ++ struct kref refcount; /* For pnfs commit splitting */ ++ struct nfs_write_data *parent; /* For pnfs commit splitting */ + int flags; + struct rpc_task task; + struct inode *inode; +@@ -995,6 +1036,10 @@ struct nfs_write_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + +@@ -1008,6 +1053,7 @@ struct nfs_rpc_ops { + const struct dentry_operations *dentry_ops; + const struct inode_operations *dir_inode_ops; + const struct inode_operations *file_inode_ops; ++ const struct file_operations *file_ops; + + int (*getroot) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); +@@ -1072,6 +1118,7 @@ struct nfs_rpc_ops { + extern const struct nfs_rpc_ops nfs_v2_clientops; + extern const struct nfs_rpc_ops nfs_v3_clientops; + extern const struct nfs_rpc_ops nfs_v4_clientops; ++extern const 
struct nfs_rpc_ops pnfs_v4_clientops; + extern struct rpc_version nfs_version2; + extern struct rpc_version nfs_version3; + extern struct rpc_version nfs_version4; +diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-23 12:09:03.388491527 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-23 12:09:03.388491527 -0400 +@@ -0,0 +1,57 @@ ++#ifndef _PANFS_SHIM_API_H ++#define _PANFS_SHIM_API_H ++ ++/* ++ * imported panfs functions ++ */ ++struct panfs_export_operations { ++ int (*convert_rc)(pan_status_t rc); ++ ++ int (*sm_sec_t_get_size_otw)( ++ pan_sm_sec_otw_t *var, ++ pan_size_t *core_sizep, ++ pan_size_t *wire_size, ++ void *buf_end); ++ ++ int (*sm_sec_t_unmarshall)( ++ pan_sm_sec_otw_t *in, ++ pan_sm_sec_t *out, ++ void *buf, ++ pan_size_t size, ++ pan_size_t *otw_consumed, ++ pan_size_t *in_core_consumed); ++ ++ int (*ucreds_get)(void **ucreds_pp); ++ ++ void (*ucreds_put)(void *ucreds); ++ ++ int (*sam_read)( ++ pan_sam_access_flags_t flags, ++ pan_sam_read_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_read_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p); ++ ++ int (*sam_write)( ++ pan_sam_access_flags_t flags, ++ pan_sam_write_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_write_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p); ++}; ++ ++extern int ++panfs_shim_register(struct panfs_export_operations *ops); ++ ++extern int ++panfs_shim_unregister(void); ++ ++#endif /* _PANFS_SHIM_API_H */ +diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-23 12:09:03.390501461 -0400 ++++ 
linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-23 12:09:03.390501461 -0400 +@@ -0,0 +1,440 @@ ++/* ++ * pnfs_osd_xdr.h ++ * ++ * pNFS-osd on-the-wire data structures ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++#ifndef __PNFS_OSD_XDR_H__ ++#define __PNFS_OSD_XDR_H__ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#define PNFS_OSD_OSDNAME_MAXSIZE 256 ++ ++/* ++ * START OF "GENERIC" DECODE ROUTINES. ++ * These may look a little ugly since they are imported from a "generic" ++ * set of XDR encode/decode routines which are intended to be shared by ++ * all of our NFSv4 implementations (OpenBSD, MacOS X...). ++ * ++ * If the pain of reading these is too great, it should be a straightforward ++ * task to translate them into Linux-specific versions which are more ++ * consistent with the style used in NFSv2/v3... ++ */ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (u64)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++ ++/* ++ * draft-ietf-nfsv4-minorversion-22 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* Layout Structure */ ++ ++enum pnfs_osd_raid_algorithm4 { ++ PNFS_OSD_RAID_0 = 1, ++ PNFS_OSD_RAID_4 = 2, ++ PNFS_OSD_RAID_5 = 3, ++ PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ ++}; ++ ++/* struct pnfs_osd_data_map4 { ++ * uint32_t odm_num_comps; ++ * length4 odm_stripe_unit; ++ * uint32_t odm_group_width; ++ * uint32_t odm_group_depth; ++ * uint32_t odm_mirror_cnt; ++ * pnfs_osd_raid_algorithm4 odm_raid_algorithm; ++ * }; ++ */ ++struct pnfs_osd_data_map { ++ u32 odm_num_comps; ++ u64 odm_stripe_unit; ++ u32 odm_group_width; ++ u32 odm_group_depth; ++ u32 odm_mirror_cnt; ++ u32 odm_raid_algorithm; ++}; ++ ++static inline int ++pnfs_osd_data_map_xdr_sz(void) ++{ ++ return 1 + 2 + 1 + 1 + 1 + 1; ++} ++ ++static inline size_t ++pnfs_osd_data_map_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_data_map); ++} ++ ++/* struct pnfs_osd_objid4 { ++ * deviceid4 oid_device_id; ++ * uint64_t oid_partition_id; ++ * uint64_t oid_object_id; ++ * }; ++ */ ++struct pnfs_osd_objid { ++ struct pnfs_deviceid 
oid_device_id; ++ u64 oid_partition_id; ++ u64 oid_object_id; ++}; ++ ++/* For printout. I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */ ++#define _DEVID_LO(oid_device_id) \ ++ (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data) ++ ++#define _DEVID_HI(oid_device_id) \ ++ (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1) ++ ++static inline int ++pnfs_osd_objid_xdr_sz(void) ++{ ++ return (NFS4_PNFS_DEVICEID4_SIZE / 4) + 2 + 2; ++} ++ ++static inline size_t ++pnfs_osd_objid_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_objid); ++} ++ ++enum pnfs_osd_version { ++ PNFS_OSD_MISSING = 0, ++ PNFS_OSD_VERSION_1 = 1, ++ PNFS_OSD_VERSION_2 = 2 ++}; ++ ++struct pnfs_osd_opaque_cred { ++ u32 cred_len; ++ u8 *cred; ++}; ++ ++static inline int ++pnfs_osd_opaque_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ READ32(n); ++ p += XDR_QUADLEN(n); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_opaque_cred_incore_sz(u32 *p) ++{ ++ u32 n; ++ ++ READ32(n); ++ return XDR_QUADLEN(n) * 4; ++} ++ ++enum pnfs_osd_cap_key_sec { ++ PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ PNFS_OSD_CAP_KEY_SEC_SSV = 1, ++}; ++ ++/* struct pnfs_osd_object_cred4 { ++ * pnfs_osd_objid4 oc_object_id; ++ * pnfs_osd_version4 oc_osd_version; ++ * pnfs_osd_cap_key_sec4 oc_cap_key_sec; ++ * opaque oc_capability_key<>; ++ * opaque oc_capability<>; ++ * }; ++ */ ++struct pnfs_osd_object_cred { ++ struct pnfs_osd_objid oc_object_id; ++ u32 oc_osd_version; ++ u32 oc_cap_key_sec; ++ struct pnfs_osd_opaque_cred oc_cap_key; ++ struct pnfs_osd_opaque_cred oc_cap; ++}; ++ ++static inline int ++pnfs_osd_object_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_object_cred_incore_sz(u32 *p) ++{ ++ size_t sz = sizeof(struct pnfs_osd_object_cred); ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ sz += 
pnfs_osd_opaque_cred_incore_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ sz += pnfs_osd_opaque_cred_incore_sz(p); ++ return sz; ++} ++ ++/* struct pnfs_osd_layout4 { ++ * pnfs_osd_data_map4 olo_map; ++ * uint32_t olo_comps_index; ++ * pnfs_osd_object_cred4 olo_components<>; ++ * }; ++ */ ++struct pnfs_osd_layout { ++ struct pnfs_osd_data_map olo_map; ++ u32 olo_comps_index; ++ u32 olo_num_comps; ++ struct pnfs_osd_object_cred *olo_comps; ++}; ++ ++static inline int ++pnfs_osd_layout_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ while ((int)(n--) > 0) ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_layout_incore_sz(u32 *p) ++{ ++ u32 n; ++ size_t sz; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ sz = sizeof(struct pnfs_osd_layout); ++ while ((int)(n--) > 0) { ++ sz += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ return sz; ++} ++ ++/* Device Address */ ++ ++enum pnfs_osd_targetid_type { ++ OBJ_TARGET_ANON = 1, ++ OBJ_TARGET_SCSI_NAME = 2, ++ OBJ_TARGET_SCSI_DEVICE_ID = 3, ++}; ++ ++/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) { ++ * case OBJ_TARGET_SCSI_NAME: ++ * string oti_scsi_name<>; ++ * ++ * case OBJ_TARGET_SCSI_DEVICE_ID: ++ * opaque oti_scsi_device_id<>; ++ * ++ * default: ++ * void; ++ * }; ++ * ++ * union pnfs_osd_targetaddr4 switch (bool ota_available) { ++ * case TRUE: ++ * netaddr4 ota_netaddr; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_deviceaddr4 { ++ * pnfs_osd_targetid4 oda_targetid; ++ * pnfs_osd_targetaddr4 oda_targetaddr; ++ * uint64_t oda_lun; ++ * opaque oda_systemid<>; ++ * pnfs_osd_object_cred4 oda_root_obj_cred; ++ * opaque oda_osdname<>; ++ * }; ++ */ ++struct pnfs_osd_targetid { ++ u32 oti_type; ++ struct nfs4_string oti_scsi_device_id; ++}; ++ ++enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 }; ++ ++/* struct 
netaddr4 { ++ * // see struct rpcb in RFC1833 ++ * string r_netid<>; // network id ++ * string r_addr<>; // universal address ++ * }; ++ */ ++struct pnfs_osd_net_addr { ++ struct nfs4_string r_netid; ++ struct nfs4_string r_addr; ++}; ++ ++struct pnfs_osd_targetaddr { ++ u32 ota_available; ++ struct pnfs_osd_net_addr ota_netaddr; ++}; ++ ++enum { ++ NETWORK_ID_MAX = 16 / 4, ++ UNIVERSAL_ADDRESS_MAX = 64 / 4, ++ PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX, ++}; ++ ++struct pnfs_osd_deviceaddr { ++ struct pnfs_osd_targetid oda_targetid; ++ struct pnfs_osd_targetaddr oda_targetaddr; ++ u8 oda_lun[8]; ++ struct nfs4_string oda_systemid; ++ struct pnfs_osd_object_cred oda_root_obj_cred; ++ struct nfs4_string oda_osdname; ++}; ++ ++enum { ++ ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4, ++ PNFS_OSD_DEVICEADDR_MAX = ++ PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX + ++ 2 /*oda_lun*/ + ++ 1 + OSD_SYSTEMID_LEN + ++ 1 + ODA_OSDNAME_MAX, ++}; ++ ++/* LAYOUTCOMMIT: layoutupdate */ ++ ++/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) { ++ * case TRUE: ++ * int64_t dsu_delta; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_layoutupdate4 { ++ * pnfs_osd_deltaspaceused4 olu_delta_space_used; ++ * bool olu_ioerr_flag; ++ * }; ++ */ ++struct pnfs_osd_layoutupdate { ++ u32 dsu_valid; ++ s64 dsu_delta; ++ u32 olu_ioerr_flag; ++}; ++ ++/* LAYOUTRETURN: I/O Error Report */ ++ ++enum pnfs_osd_errno { ++ PNFS_OSD_ERR_EIO = 1, ++ PNFS_OSD_ERR_NOT_FOUND = 2, ++ PNFS_OSD_ERR_NO_SPACE = 3, ++ PNFS_OSD_ERR_BAD_CRED = 4, ++ PNFS_OSD_ERR_NO_ACCESS = 5, ++ PNFS_OSD_ERR_UNREACHABLE = 6, ++ PNFS_OSD_ERR_RESOURCE = 7 ++}; ++ ++/* struct pnfs_osd_ioerr4 { ++ * pnfs_osd_objid4 oer_component; ++ * length4 oer_comp_offset; ++ * length4 oer_comp_length; ++ * bool oer_iswrite; ++ * pnfs_osd_errno4 oer_errno; ++ * }; ++ */ ++struct pnfs_osd_ioerr { ++ struct pnfs_osd_objid oer_component; ++ u64 oer_comp_offset; ++ u64 oer_comp_length; ++ u32 oer_iswrite; 
++ u32 oer_errno; ++}; ++ ++static inline unsigned ++pnfs_osd_ioerr_xdr_sz(void) ++{ ++ return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1; ++} ++ ++/* OSD XDR API */ ++ ++/* Layout helpers */ ++extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout( ++ struct pnfs_osd_layout *layout, u32 *p); ++ ++extern int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *layout); ++ ++/* Device Info helpers */ ++ ++/* First pass calculate total size for space needed */ ++extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p); ++ ++/* Note: some strings pointed to inside @deviceaddr might point ++ * to space inside @p. @p should stay valid while @deviceaddr ++ * is in use. ++ * It is assumed that @deviceaddr points to bigger memory of size ++ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz() ++ */ ++extern void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p); ++ ++/* For Servers */ ++extern int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr); ++ ++/* layoutupdate (layout_commit) xdr helpers */ ++extern int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou); ++extern __be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p); ++ ++/* osd_ioerror encoding/decoding (layout_return) */ ++extern int ++pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr); ++extern __be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p); ++ ++#endif /* __PNFS_OSD_XDR_H__ */ +diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_xdr.h +--- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-23 12:09:03.391491550 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-23 12:09:03.391491550 -0400 +@@ -0,0 +1,134 @@ ++/* ++ * include/linux/pnfs_xdr.h ++ * ++ * Common xdr data structures 
needed by pnfs client. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef LINUX_PNFS_XDR_H ++#define LINUX_PNFS_XDR_H ++ ++#define PNFS_LAYOUT_MAXSIZE 4096 ++#define NFS4_PNFS_DEVICEID4_SIZE 16 ++ ++struct pnfs_deviceid { ++ char data[NFS4_PNFS_DEVICEID4_SIZE]; ++}; ++ ++struct nfs4_pnfs_layout { ++ __u32 len; ++ void *buf; ++}; ++ ++struct nfs4_pnfs_layout_segment { ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++struct nfs4_pnfs_layoutget_arg { ++ __u32 type; ++ struct nfs4_pnfs_layout_segment lseg; ++ __u64 minlength; ++ __u32 maxcount; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_layoutget_res { ++ __u32 return_on_close; ++ struct nfs4_pnfs_layout_segment lseg; ++ __u32 type; ++ nfs4_stateid stateid; ++ struct nfs4_pnfs_layout layout; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_pnfs_layoutget { ++ struct nfs4_pnfs_layoutget_arg args; ++ struct nfs4_pnfs_layoutget_res res; ++ struct pnfs_layout_segment **lsegpp; ++ int status; ++}; ++ ++struct pnfs_layoutcommit_arg { ++ nfs4_stateid stateid; ++ __u64 lastbytewritten; ++ __u32 time_modify_changed; ++ struct timespec time_modify; ++ const u32 *bitmask; ++ struct nfs_fh *fh; ++ struct inode *inode; ++ ++ /* Values set by layout driver */ ++ struct nfs4_pnfs_layout_segment lseg; ++ __u32 layout_type; ++ void *layoutdriver_data; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct pnfs_layoutcommit_res { ++ __u32 sizechanged; ++ __u64 newsize; ++ struct nfs_fattr *fattr; ++ const struct nfs_server *server; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct pnfs_layoutcommit_data { ++ struct rpc_task task; ++ struct rpc_cred *cred; ++ struct nfs_fattr fattr; ++ struct pnfs_layoutcommit_arg args; ++ struct pnfs_layoutcommit_res res; ++ int status; ++}; ++ ++struct nfs4_pnfs_layoutreturn_arg { ++ __u32 reclaim; ++ __u32 layout_type; ++ __u32 return_type; ++ 
struct nfs4_pnfs_layout_segment lseg; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_layoutreturn_res { ++ struct nfs4_sequence_res seq_res; ++ u32 lrs_present; ++ nfs4_stateid stateid; ++}; ++ ++struct nfs4_pnfs_layoutreturn { ++ struct nfs4_pnfs_layoutreturn_arg args; ++ struct nfs4_pnfs_layoutreturn_res res; ++ struct rpc_cred *cred; ++ int rpc_status; ++}; ++ ++struct nfs4_pnfs_getdevicelist_arg { ++ const struct nfs_fh *fh; ++ u32 layoutclass; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_getdevicelist_res { ++ struct pnfs_devicelist *devlist; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_pnfs_getdeviceinfo_arg { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_getdeviceinfo_res { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++#endif /* LINUX_PNFS_XDR_H */ +diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h +--- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-23 12:09:03.393501437 -0400 +@@ -8,6 +8,7 @@ + #ifndef __LINUX_POSIX_ACL_H + #define __LINUX_POSIX_ACL_H + ++#include + #include + + #define ACL_UNDEFINED_ID (-1) +diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h +--- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-23 12:09:03.393501437 -0400 +@@ -14,6 +14,8 @@ + /* size of an XDR encoding unit in bytes, i.e. 
32bit */ + #define XDR_UNIT (4) + ++#include ++ + /* spec defines authentication flavor as an unsigned 32 bit integer */ + typedef u32 rpc_authflavor_t; + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-23 12:09:03.394512138 -0400 +@@ -3,6 +3,7 @@ + + #ifdef __KERNEL__ + ++#include + #include + + struct rpc_pipe_msg { +@@ -11,6 +12,10 @@ struct rpc_pipe_msg { + size_t len; + size_t copied; + int errno; ++#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */ ++#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */ ++#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA ++ u8 flags; + }; + + struct rpc_pipe_ops { +diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-23 12:09:03.394512138 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-23 12:09:03.395501822 -0400 +@@ -0,0 +1,111 @@ ++/* ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#ifndef _SIMPLE_RPC_PIPEFS_H_ ++#define _SIMPLE_RPC_PIPEFS_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#define payload_of(headerp) ((void *)(headerp + 1)) ++ ++/* ++ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs. ++ * Messages may simply be the header itself, although having an optional ++ * data payload follow the header allows much more flexibility. ++ * ++ * Messages are created using pipefs_alloc_init_msg() and ++ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an ++ * (optional) data payload. 
++ * ++ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data ++ * can be accessed using: struct foo *foop = payload_of(msg) ++ */ ++struct pipefs_hdr { ++ u32 msgid; ++ u8 type; ++ u8 flags; ++ u16 totallen; /* length of entire message, including hdr itself */ ++ u32 status; ++}; ++ ++/* ++ * struct pipefs_list -- a type of list used for tracking callers who've made an ++ * upcall and are blocked waiting for a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply(). ++ */ ++struct pipefs_list { ++ struct list_head list; ++ spinlock_t list_lock; ++}; ++ ++ ++/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions. */ ++extern struct dentry *pipefs_mkpipe(const char *name, ++ const struct rpc_pipe_ops *ops, ++ int wait_for_open); ++extern void pipefs_closepipe(struct dentry *pipe); ++extern void pipefs_init_list(struct pipefs_list *list); ++extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen); ++extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, ++ u8 flags, void *data, ++ u16 datalen, u16 padlen); ++extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list ++ *uplist, u8 upflags, ++ u32 timeout); ++extern int pipefs_queue_upcall_noreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, u8 upflags); ++extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist); ++extern struct pipefs_hdr *pipefs_readmsg(struct file *filp, ++ const char __user *src, size_t len); ++extern ssize_t pipefs_generic_upcall(struct file *filp, ++ struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen); ++extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg); ++ ++#endif /* _SIMPLE_RPC_PIPEFS_H_ */ +diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h +--- 
linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-23 12:09:03.395501822 -0400 +@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con + + return buf; + } ++ ++/* ++ * Print a network address in a universal format (see rfc1833 and nfsv4.1) ++ */ ++static inline int __svc_print_netaddr(struct sockaddr *addr, ++ struct xdr_netobj *na) ++{ ++ u16 port; ++ ssize_t len; ++ ++ switch (addr->sa_family) { ++ case AF_INET: { ++ struct sockaddr_in *sin = (struct sockaddr_in *)addr; ++ port = ntohs(sin->sin_port); ++ ++ len = snprintf(na->data, na->len, "%pI4.%u.%u", ++ &sin->sin_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ case AF_INET6: { ++ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; ++ port = ntohs(sin6->sin6_port); ++ ++ len = snprintf(na->data, na->len, "%pI6.%u.%u", ++ &sin6->sin6_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ default: ++ snprintf(na->data, na->len, "unknown address type: %d", ++ addr->sa_family); ++ len = -EINVAL; ++ break; ++ } ++ return len; ++} + #endif /* SUNRPC_SVC_XPRT_H */ +diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-23 12:08:29.066475323 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-23 12:09:03.396464612 -0400 +@@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) + return p + 2; + } + ++static inline __be32 * ++xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len) ++{ ++ memcpy(ptr, p, len); ++ return p + XDR_QUADLEN(len); ++} ++ + /* + * Adjust kvec to reflect end of xdr'ed data (RPC client XDR) + */ +@@ -197,6 +204,7 @@ struct xdr_stream { + + extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); + extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); ++extern __be32 
*xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q); + extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, + unsigned int base, unsigned int len); + extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs +--- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-23 12:09:03.396464612 -0400 ++++ linux-2.6.34.noarch/localversion-pnfs 2010-08-23 12:09:03.396464612 -0400 +@@ -0,0 +1 @@ ++-pnfs +diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile +--- linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-23 12:09:03.397501662 -0400 +@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt + svc.o svcsock.o svcauth.o svcauth_unix.o \ + addr.o rpcb_clnt.o timer.o xdr.o \ + sunrpc_syms.o cache.o rpc_pipe.o \ +- svc_xprt.o ++ svc_xprt.o simple_rpc_pipefs.o + sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o + sunrpc-$(CONFIG_PROC_FS) += stats.o + sunrpc-$(CONFIG_SYSCTL) += sysctl.o +diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c +--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-23 12:09:03.398522348 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-23 12:09:03.398522348 -0400 +@@ -0,0 +1,424 @@ ++/* ++ * net/sunrpc/simple_rpc_pipefs.c ++ * ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. 
Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs ++ * filesystem. ++ * ++ * If @wait_for_open is non-zero and an upcall is later queued but the userland ++ * end of the pipe has not yet been opened, the upcall will remain queued until ++ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE. 
++ */ ++struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops, ++ int wait_for_open) ++{ ++ struct dentry *dir, *pipe; ++ struct vfsmount *mnt; ++ ++ mnt = rpc_get_mount(); ++ if (IS_ERR(mnt)) { ++ pipe = ERR_CAST(mnt); ++ goto out; ++ } ++ dir = mnt->mnt_root; ++ if (!dir) { ++ pipe = ERR_PTR(-ENOENT); ++ goto out; ++ } ++ pipe = rpc_mkpipe(dir, name, NULL, ops, ++ wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0); ++out: ++ return pipe; ++} ++EXPORT_SYMBOL(pipefs_mkpipe); ++ ++/* ++ * Shut down a pipe made by pipefs_mkpipe(). ++ * XXX: do we need to retain an extra reference on the mount? ++ */ ++void pipefs_closepipe(struct dentry *pipe) ++{ ++ rpc_unlink(pipe); ++ rpc_put_mount(); ++} ++EXPORT_SYMBOL(pipefs_closepipe); ++ ++/* ++ * Initialize a struct pipefs_list -- which is a way to keep track of callers ++ * who are blocked after making an upcall and are awaiting a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_find_upcall_msgid() for how ++ * to use it. ++ */ ++inline void pipefs_init_list(struct pipefs_list *list) ++{ ++ INIT_LIST_HEAD(&list->list); ++ spin_lock_init(&list->list_lock); ++} ++EXPORT_SYMBOL(pipefs_init_list); ++ ++/* ++ * Alloc/init a generic pipefs message header and copy into its message body ++ * an arbitrary data payload. ++ * ++ * struct pipefs_hdr's are meant to serve as generic, general-purpose message ++ * headers for easy rpc_pipefs I/O. When an upcall is made, the ++ * struct pipefs_hdr is assigned to a struct rpc_pipe_msg and delivered ++ * therein. --And yes, the naming can seem a little confusing at first: ++ * ++ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a ++ * struct pipefs_hdr (possibly with an attached message body). A ++ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real" ++ * message is delivered and processed. 
++ */ ++struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen, u16 padlen) ++{ ++ u16 totallen; ++ struct pipefs_hdr *msg = NULL; ++ ++ totallen = sizeof(*msg) + datalen + padlen; ++ if (totallen > PAGE_SIZE) { ++ msg = ERR_PTR(-E2BIG); ++ goto out; ++ } ++ ++ msg = kzalloc(totallen, GFP_KERNEL); ++ if (!msg) { ++ msg = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ msg->msgid = msgid; ++ msg->type = type; ++ msg->flags = flags; ++ msg->totallen = totallen; ++ memcpy(payload_of(msg), data, datalen); ++out: ++ return msg; ++} ++EXPORT_SYMBOL(pipefs_alloc_init_msg_padded); ++ ++/* ++ * See the description of pipefs_alloc_init_msg_padded(). ++ */ ++struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen) ++{ ++ return pipefs_alloc_init_msg_padded(msgid, type, flags, data, ++ datalen, 0); ++} ++EXPORT_SYMBOL(pipefs_alloc_init_msg); ++ ++ ++static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ memset(rpcmsg, 0, sizeof(*rpcmsg)); ++ rpcmsg->data = msg; ++ rpcmsg->len = msg->totallen; ++ rpcmsg->flags = upflags; ++} ++ ++static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg, ++ u8 upflags) ++{ ++ struct rpc_pipe_msg *rpcmsg; ++ ++ rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL); ++ if (!rpcmsg) ++ return ERR_PTR(-ENOMEM); ++ ++ pipefs_init_rpcmsg(rpcmsg, msg, upflags); ++ return rpcmsg; ++} ++ ++ ++/* represents an upcall that'll block and wait for a reply */ ++struct pipefs_upcall { ++ u32 msgid; ++ struct rpc_pipe_msg rpcmsg; ++ struct list_head list; ++ wait_queue_head_t waitq; ++ struct pipefs_hdr *reply; ++}; ++ ++ ++static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ upcall->reply = NULL; ++ upcall->msgid = msg->msgid; ++ INIT_LIST_HEAD(&upcall->list); ++ init_waitqueue_head(&upcall->waitq); ++ pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags); 
++} ++ ++static int __pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_upcall *upcall, ++ struct pipefs_list *uplist, ++ u32 timeout) ++{ ++ int err = 0; ++ DECLARE_WAITQUEUE(wq, current); ++ ++ add_wait_queue(&upcall->waitq, &wq); ++ spin_lock(&uplist->list_lock); ++ list_add(&upcall->list, &uplist->list); ++ spin_unlock(&uplist->list_lock); ++ ++ err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg); ++ if (err < 0) ++ goto out; ++ ++ if (timeout) { ++ /* retval of 0 means timer expired */ ++ err = schedule_timeout_uninterruptible(timeout); ++ if (err == 0 && upcall->reply == NULL) ++ err = -ETIMEDOUT; ++ } else { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ } ++ ++out: ++ spin_lock(&uplist->list_lock); ++ list_del_init(&upcall->list); ++ spin_unlock(&uplist->list_lock); ++ remove_wait_queue(&upcall->waitq, &wq); ++ return err; ++} ++ ++/* ++ * Queue a pipefs msg for an upcall to userspace, place the calling thread ++ * on @uplist, and block the thread to wait for a reply. If @timeout is ++ * nonzero, the thread will be blocked for at most @timeout jiffies. ++ * ++ * (To convert time units into jiffies, consider the functions ++ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and ++ * timespec_to_jiffies().) ++ * ++ * Once a reply is received by your downcall handler, call ++ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall, ++ * assign the reply, and wake the waiting thread. ++ * ++ * This function's return value pointer may be an error and should be checked ++ * with IS_ERR() before attempting to access the reply message. ++ * ++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() ++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG ++ * flag is set in @upflags. See also rpc_pipe_fs.h. 
++ */ ++struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list *uplist, ++ u8 upflags, u32 timeout) ++{ ++ int err = 0; ++ struct pipefs_upcall upcall; ++ ++ pipefs_init_upcall_waitreply(&upcall, msg, upflags); ++ err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout); ++ if (err < 0) { ++ kfree(upcall.reply); ++ upcall.reply = ERR_PTR(err); ++ } ++ ++ return upcall.reply; ++} ++EXPORT_SYMBOL(pipefs_queue_upcall_waitreply); ++ ++/* ++ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e., ++ * no reply is expected). ++ * ++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() ++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG ++ * flag is set in @upflags. See also rpc_pipe_fs.h. ++ */ ++int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg, ++ u8 upflags) ++{ ++ int err = 0; ++ struct rpc_pipe_msg *rpcmsg; ++ ++ upflags |= PIPEFS_AUTOFREE_RPCMSG; ++ rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags); ++ if (IS_ERR(rpcmsg)) { ++ err = PTR_ERR(rpcmsg); ++ goto out; ++ } ++ err = rpc_queue_upcall(pipe->d_inode, rpcmsg); ++out: ++ return err; ++} ++EXPORT_SYMBOL(pipefs_queue_upcall_noreply); ++ ++ ++static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid, ++ struct pipefs_list *uplist) ++{ ++ struct pipefs_upcall *upcall; ++ ++ spin_lock(&uplist->list_lock); ++ list_for_each_entry(upcall, &uplist->list, list) ++ if (upcall->msgid == msgid) ++ goto out; ++ upcall = NULL; ++out: ++ spin_unlock(&uplist->list_lock); ++ return upcall; ++} ++ ++/* ++ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall ++ * message and have determined that it is a reply to a waiting upcall, ++ * you can use this function to find the appropriate upcall, assign the result, ++ * and wake the upcall thread. ++ * ++ * The reply message must have the same msgid as the original upcall message's. 
++ * ++ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg(). ++ */ ++int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist) ++{ ++ int err = 0; ++ struct pipefs_upcall *upcall; ++ ++ upcall = pipefs_find_upcall_msgid(reply->msgid, uplist); ++ if (!upcall) { ++ printk(KERN_ERR "%s: ERROR: have reply but no matching upcall " ++ "for msgid %d\n", __func__, reply->msgid); ++ err = -ENOENT; ++ goto out; ++ } ++ upcall->reply = reply; ++ wake_up(&upcall->waitq); ++out: ++ return err; ++} ++EXPORT_SYMBOL(pipefs_assign_upcall_reply); ++ ++/* ++ * Generic method to read-in and return a newly-allocated message which begins ++ * with a struct pipefs_hdr. ++ */ ++struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src, ++ size_t len) ++{ ++ int err = 0, hdrsize; ++ struct pipefs_hdr *msg = NULL; ++ ++ hdrsize = sizeof(*msg); ++ if (len < hdrsize) { ++ printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n", ++ __func__, (int) len, hdrsize); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ msg = kzalloc(len, GFP_KERNEL); ++ if (!msg) { ++ err = -ENOMEM; ++ goto out; ++ } ++ if (copy_from_user(msg, src, len)) ++ err = -EFAULT; ++out: ++ if (err) { ++ kfree(msg); ++ msg = ERR_PTR(err); ++ } ++ return msg; ++} ++EXPORT_SYMBOL(pipefs_readmsg); ++ ++/* ++ * Generic rpc_pipe_ops->upcall() handler implementation. ++ * ++ * Don't call this directly: to make an upcall, use ++ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply(). 
++ */ ++ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen) ++{ ++ char *data; ++ ssize_t len, left; ++ ++ data = (char *)rpcmsg->data + rpcmsg->copied; ++ len = rpcmsg->len - rpcmsg->copied; ++ if (len > buflen) ++ len = buflen; ++ ++ left = copy_to_user(dst, data, len); ++ if (left < 0) { ++ rpcmsg->errno = left; ++ return left; ++ } ++ ++ len -= left; ++ rpcmsg->copied += len; ++ rpcmsg->errno = 0; ++ return len; ++} ++EXPORT_SYMBOL(pipefs_generic_upcall); ++ ++/* ++ * Generic rpc_pipe_ops->destroy_msg() handler implementation. ++ * ++ * Items are only freed if @rpcmsg->flags has been set appropriately. ++ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h. ++ */ ++void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg) ++{ ++ if (rpcmsg->flags & PIPEFS_AUTOFREE_UPCALL_MSG) ++ kfree(rpcmsg->data); ++ if (rpcmsg->flags & PIPEFS_AUTOFREE_RPCMSG) ++ kfree(rpcmsg); ++} ++EXPORT_SYMBOL(pipefs_generic_destroy_msg); +diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-23 12:08:29.081501640 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-23 12:09:03.399443371 -0400 +@@ -403,16 +403,14 @@ xdr_shrink_pagelen(struct xdr_buf *buf, + + /* Shift the tail first */ + if (tail->iov_len != 0) { +- p = (char *)tail->iov_base + len; +- if (tail->iov_len > len) { +- copy = tail->iov_len - len; +- memmove(p, tail->iov_base, copy); +- } else +- buf->buflen -= len; +- /* Copy from the inlined pages into the tail */ + copy = len; +- if (copy > tail->iov_len) ++ if (tail->iov_len > len) { ++ p = (char *)tail->iov_base + len; ++ memmove(p, tail->iov_base, tail->iov_len - len); ++ } else { + copy = tail->iov_len; ++ } ++ /* Copy from the inlined pages into the tail */ + _copy_from_pages((char *)tail->iov_base, + buf->pages, buf->page_base + pglen - len, + copy); +@@ -496,6 +494,27 @@ __be32 * xdr_reserve_space(struct xdr_st 
+ EXPORT_SYMBOL_GPL(xdr_reserve_space); + + /** ++ * xdr_rewind_stream - rewind a stream back to some checkpoint ++ * @xdr: pointer to xdr_stream ++ * @q: some checkpoint at historical place of @xdr ++ * ++ * Restores an xdr stream to some historical point. @q must be ++ * a logical xdr point in the past that was sampled by @q = @xdr->p. ++ */ ++__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q) ++{ ++ size_t nbytes = (xdr->p - q) << 2; ++ ++ BUG_ON(xdr->p < q); ++ BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len); ++ xdr->p = q; ++ xdr->iov->iov_len -= nbytes; ++ xdr->buf->len -= nbytes; ++ return q; ++} ++EXPORT_SYMBOL_GPL(xdr_rewind_stream); ++ ++/** + * xdr_write_pages - Insert a list of pages into an XDR buffer for sending + * @xdr: pointer to xdr_stream + * @pages: list of pages From 5c488563eb056aef9c02edc96903714135bf4bd6 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Mon, 23 Aug 2010 14:15:46 -0400 Subject: [PATCH 02/20] Fixed a couple compile errors in the server code. 
Signed-off-by: Steve Dickson --- nfsd-35-fc.patch | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/nfsd-35-fc.patch b/nfsd-35-fc.patch index ef99b4995..2825464af 100644 --- a/nfsd-35-fc.patch +++ b/nfsd-35-fc.patch @@ -1,6 +1,6 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt --- linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 09:57:18.233564439 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 14:12:24.165356789 -0400 @@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | | READ | REQ | | Section 18.22 | | READDIR | REQ | | Section 18.23 | @@ -12,7 +12,7 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig | RENAME | REQ | | Section 18.26 | diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c --- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 09:57:18.234564075 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 14:12:24.519356675 -0400 @@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_ca .alloc = expkey_alloc, }; @@ -108,7 +108,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e out_put_clp: diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c --- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 10:00:37.257414684 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 14:12:52.625429773 -0400 @@ -79,11 +79,6 @@ enum nfs_cb_opnum4 { cb_sequence_dec_sz + \ op_dec_sz) @@ -211,7 +211,7 @@ diff 
-up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ int status; - status = rpc_call_async(cb->cb_client, &msg, -+ status = rpc_call_async(cb->cl_cb_client, &msg, ++ status = rpc_call_async(clp->cl_cb_client, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, &nfsd4_cb_probe_ops, (void *)clp); - if (status) { @@ -402,7 +402,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c --- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 09:57:18.237376763 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 14:12:25.698356909 -0400 @@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[ static const char *nfsd4_op_name(unsigned opnum); @@ -490,7 +490,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c --- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 09:57:18.240356512 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 14:12:25.700356284 -0400 @@ -45,8 +45,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -1280,9 +1280,21 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs -{ - user_lease_time = leasetime; -} +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-23 14:14:22.882428704 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 14:14:33.418376589 -0400 +@@ -1900,7 +1900,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s + if (bmval0 & FATTR4_WORD0_LEASE_TIME) { + if ((buflen -= 4) < 0) + goto out_resource; +- WRITE32(NFSD_LEASE_TIME); 
++ WRITE32(nfsd4_lease); + } + if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { + if ((buflen -= 4) < 0) diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c --- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 09:57:20.629370282 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 14:12:25.821359224 -0400 @@ -46,6 +46,7 @@ enum { */ #ifdef CONFIG_NFSD_V4 @@ -1403,7 +1415,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n /* last one */ {""} diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h --- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 09:57:20.629370282 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 14:12:25.835418441 -0400 @@ -82,7 +82,6 @@ int nfs4_state_init(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); @@ -1440,7 +1452,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs /* diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h --- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 09:57:21.807501619 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 14:12:25.836366516 -0400 @@ -70,6 +70,16 @@ struct nfsd4_cb_sequence { struct nfs4_client *cbs_clp; }; @@ -1558,7 +1570,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st nfs4_put_stateowner(struct nfs4_stateowner *so) diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h --- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 09:57:23.994379831 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 14:12:25.837387292 -0400 @@ -381,6 
+381,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; @@ -1600,7 +1612,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h --- linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 09:57:23.994379831 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 14:12:25.838377224 -0400 @@ -40,12 +40,12 @@ struct nfs_fhbase_old { * This is the new flexible, extensible style NFSv2/v3 file handle. * by Neil Brown - March 2000 @@ -1619,7 +1631,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch * This might allow a file to be confirmed to be in a writable part of a diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sunrpc/cache.c --- linux-2.6.34.noarch/net/sunrpc/cache.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 09:57:23.995376793 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 14:12:25.839376838 -0400 @@ -49,11 +49,17 @@ static void cache_init(struct cache_head h->last_refresh = now; } @@ -1686,7 +1698,7 @@ diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sun /* entry is valid */ diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/sunrpc/svcsock.c --- linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 09:57:23.997368707 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 14:12:25.840384371 -0400 @@ -547,7 +547,6 @@ static int svc_udp_recvfrom(struct svc_r dprintk("svc: recvfrom returned error %d\n", -err); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); @@ -1753,7 +1765,7 
@@ diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/s error: diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/sunrpc/svc_xprt.c --- linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 09:57:23.996377209 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 14:12:25.841371223 -0400 @@ -744,8 +744,10 @@ int svc_recv(struct svc_rqst *rqstp, lon if (rqstp->rq_deferred) { svc_xprt_received(xprt); @@ -1782,7 +1794,7 @@ diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/ void svc_close_xprt(struct svc_xprt *xprt) diff -up linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c --- linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 09:57:23.998377481 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 14:12:25.842376584 -0400 @@ -566,7 +566,6 @@ static int rdma_read_complete(struct svc ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, rqstp->rq_arg.head[0].iov_len); From 5d3f10424d292ac18b38e364e311df274bad5951 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Tue, 24 Aug 2010 14:49:23 -0400 Subject: [PATCH 03/20] Removed the localversion-pnfs file from the pnfs patch Signed-off-by: Steve Dickson --- kernel.spec | 2 +- pnfs-all-2.6.35-2010-08-19-f13.patch | 395 +++++++++++++-------------- 2 files changed, 196 insertions(+), 201 deletions(-) diff --git a/kernel.spec b/kernel.spec index 70b6f45c8..7b72cab84 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) 
# -%define buildid .pnfs_all_2.6.35_2010_08_19 +%define buildid .pnfs34.2010.08.19 ################################################################### # The buildid can also be specified on the rpmbuild command line diff --git a/pnfs-all-2.6.35-2010-08-19-f13.patch b/pnfs-all-2.6.35-2010-08-19-f13.patch index a9d78ba0e..10df9b15c 100644 --- a/pnfs-all-2.6.35-2010-08-19-f13.patch +++ b/pnfs-all-2.6.35-2010-08-19-f13.patch @@ -1,6 +1,6 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c ---- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-23 12:08:27.310584826 -0400 -+++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-23 12:09:03.273553977 -0400 +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-24 14:14:03.643355000 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-24 14:17:48.415730000 -0400 @@ -13,6 +13,7 @@ #include #include @@ -11,7 +11,7 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arc #include "os.h" diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c --- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/block/genhd.c 2010-08-23 12:09:03.273553977 -0400 ++++ linux-2.6.34.noarch/block/genhd.c 2010-08-24 14:17:48.421730000 -0400 @@ -1009,6 +1009,7 @@ static void disk_release(struct device * struct class block_class = { .name = "block", @@ -21,8 +21,8 @@ diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd. 
static char *block_devnode(struct device *dev, mode_t *mode) { diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt ---- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-23 12:09:03.274563927 -0400 -+++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-23 12:09:03.274563927 -0400 +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-24 14:17:48.423729000 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-24 14:17:48.425730000 -0400 @@ -0,0 +1,211 @@ +(c) 2007 Network Appliance Inc. + @@ -237,7 +237,7 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6. + diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c --- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-23 12:09:03.275584050 -0400 ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-24 14:17:48.430730000 -0400 @@ -651,6 +651,12 @@ static int dev_create(struct dm_ioctl *p return r; } @@ -292,7 +292,7 @@ diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/driv int r; diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c --- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-23 12:09:03.276563906 -0400 ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-24 14:17:48.435733000 -0400 @@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct put_device(&class_to_shost(dev)->shost_gendev); } @@ -304,7 +304,7 @@ diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drive }; diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h --- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 
-0400 -+++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-23 12:09:03.277563890 -0400 ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-24 14:17:48.440733000 -0400 @@ -36,13 +36,9 @@ #include #include @@ -360,8 +360,8 @@ diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/ + #endif diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c ---- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-23 12:09:03.278386746 -0400 -+++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-23 12:09:03.278386746 -0400 +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-24 14:17:48.444731000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-24 14:17:48.446730000 -0400 @@ -0,0 +1,396 @@ +/* + * export.c - Implementation of the pnfs_export_operations @@ -761,7 +761,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs +} diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c --- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-23 12:09:03.279502002 -0400 ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-24 14:17:48.452730000 -0400 @@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) if (unlikely(wait_obj_created(oi))) goto fail; @@ -781,7 +781,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/ * Fill in an inode read from the OSD and set it up for use diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild --- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-23 12:09:03.279502002 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-24 14:17:48.457733000 -0400 @@ -13,4 +13,5 @@ # @@ -790,7 +790,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/K obj-$(CONFIG_EXOFS_FS) += exofs.o diff -up 
linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig --- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-23 12:09:03.280553663 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-24 14:17:48.462739000 -0400 @@ -1,6 +1,7 @@ config EXOFS_FS tristate "exofs: OSD based file system support" @@ -801,7 +801,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/ as its backing storage. diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c --- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-23 12:09:03.281511951 -0400 ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-24 14:17:48.468730000 -0400 @@ -621,6 +621,7 @@ static int exofs_fill_super(struct super sb->s_fs_info = sbi; sb->s_op = &exofs_sops; @@ -812,7 +812,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/ EXOFS_ERR("ERROR: exofs_iget failed\n"); diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c --- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-23 12:09:03.282511528 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-24 14:17:48.473730000 -0400 @@ -16,6 +16,13 @@ #include #include @@ -829,7 +829,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exp diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile --- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-23 12:09:03.282511528 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-24 14:17:48.478733000 -0400 @@ -3,4 +3,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs.o @@ -840,8 +840,8 @@ diff 
-up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/ex +exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o +exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c ---- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +--- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-24 14:17:48.482731000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-24 14:17:48.484734000 -0400 @@ -0,0 +1,158 @@ +/* + * linux/fs/nfsd/nfs4blocklayoutxdr.c @@ -1002,8 +1002,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34. +} +EXPORT_SYMBOL_GPL(blocklayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c ---- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-24 14:17:48.487733000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-24 14:17:48.489734000 -0400 @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. 
@@ -1224,8 +1224,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.n +} +EXPORT_SYMBOL(filelayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c ---- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-23 12:09:03.284511493 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-23 12:09:03.284511493 -0400 +--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-24 14:17:48.493729000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-24 14:17:48.494735000 -0400 @@ -0,0 +1,289 @@ +/* + * pnfs_osd_xdr_enc.c @@ -1518,7 +1518,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.no +EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c --- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-23 12:09:03.285539075 -0400 ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-24 14:17:48.499730000 -0400 @@ -19,6 +19,7 @@ #include #include @@ -1539,7 +1539,7 @@ diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gf sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig --- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/Kconfig 2010-08-23 12:09:03.286512316 -0400 ++++ linux-2.6.34.noarch/fs/Kconfig 2010-08-24 14:17:48.505733000 -0400 @@ -224,6 +224,31 @@ config LOCKD_V4 config EXPORTFS tristate @@ -1573,8 +1573,8 @@ diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig tristate select FS_POSIX_ACL diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 
---- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-23 12:09:03.287381619 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-23 12:09:03.287381619 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-24 14:17:48.509734000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-24 14:17:48.511732000 -0400 @@ -0,0 +1,66 @@ +#include +#include @@ -1643,8 +1643,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.or + return; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-23 12:09:03.288501648 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-23 12:09:03.288501648 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-24 14:17:48.514733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-24 14:17:48.516731000 -0400 @@ -0,0 +1,1160 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c @@ -2807,8 +2807,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34. 
+module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-23 12:09:03.289501933 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-23 12:09:03.289501933 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-24 14:17:48.519731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-24 14:17:48.521730000 -0400 @@ -0,0 +1,335 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdev.c @@ -3146,8 +3146,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6. + goto out; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-23 12:09:03.290395707 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-23 12:09:03.290395707 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-24 14:17:48.523733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-24 14:17:48.525730000 -0400 @@ -0,0 +1,120 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c @@ -3270,8 +3270,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.3 + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-23 12:09:03.290395707 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-23 12:09:03.291501560 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-24 14:17:48.528729000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-24 14:17:48.529735000 -0400 @@ -0,0 +1,303 
@@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -3577,8 +3577,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34. + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-23 12:09:03.292511531 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-23 12:09:03.292511531 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-24 14:17:48.532731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-24 14:17:48.534734000 -0400 @@ -0,0 +1,948 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -4529,8 +4529,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noar + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile ---- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-23 12:09:03.292511531 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-23 12:09:03.293491476 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-24 14:17:48.537729000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-24 14:17:48.538739000 -0400 @@ -0,0 +1,6 @@ +# +# Makefile for the pNFS block layout driver kernel module @@ -4540,7 +4540,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarc + extents.o block-device-discovery-pipe.o diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h --- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-23 12:09:03.293491476 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-24 14:17:48.544730000 -0400 @@ -8,6 +8,8 @@ #ifndef __LINUX_FS_NFS_CALLBACK_H #define __LINUX_FS_NFS_CALLBACK_H @@ -4613,7 +4613,7 @@ diff -up 
linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/c extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c --- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-23 12:09:03.294522414 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-24 14:17:48.562731000 -0400 @@ -8,10 +8,15 @@ #include #include @@ -5096,7 +5096,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/ return status; diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c --- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-23 12:09:03.295502055 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-24 14:17:48.568730000 -0400 @@ -22,6 +22,8 @@ #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) @@ -5298,8 +5298,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/n .process_op = (callback_process_op_t)nfs4_callback_sequence, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c ---- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-23 12:08:29.037481540 -0400 -+++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-23 12:09:03.297501650 -0400 +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-24 14:14:13.062705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-24 14:17:48.575730000 -0400 @@ -39,6 +39,7 @@ #include #include @@ -5508,8 +5508,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/cli goto error; diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c ---- 
linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-23 12:09:03.297501650 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-23 12:09:03.298501447 -0400 +--- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-24 14:17:48.578729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-24 14:17:48.579735000 -0400 @@ -0,0 +1,292 @@ +#if defined(CONFIG_SPNFS_BLOCK) + @@ -5804,8 +5804,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/b +} +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c ---- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-23 12:09:03.299501445 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-23 12:09:03.299501445 -0400 +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-24 14:17:48.584729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-24 14:17:48.586730000 -0400 @@ -0,0 +1,1672 @@ +/* + * bl_ops.c @@ -7480,8 +7480,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/b + +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c ---- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-23 12:08:29.037481540 -0400 -+++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-23 12:09:03.300491952 -0400 +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-24 14:14:13.068705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-24 14:17:48.592730000 -0400 @@ -104,7 +104,8 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) @@ -7558,7 +7558,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs rcu_read_unlock(); diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h --- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-23 12:09:03.301431797 -0400 ++++ 
linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-24 14:17:48.597733000 -0400 @@ -34,9 +34,7 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -7571,8 +7571,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c ---- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-23 12:08:29.089481525 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 12:09:03.302511603 -0400 +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-24 14:14:13.612707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-24 14:17:48.604730000 -0400 @@ -17,11 +17,19 @@ #include #include @@ -7750,7 +7750,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e svcauth_unix_purge(); diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/direct.c --- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-23 12:09:03.303491500 -0400 ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-24 14:17:48.610730000 -0400 @@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea .rpc_release = nfs_direct_read_release, }; @@ -7996,7 +7996,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/dir user_addr += bytes; diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig --- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-23 12:09:03.304505472 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-24 14:17:48.616730000 -0400 @@ -79,3 +79,52 @@ config NFSD_V4 available from 
http://linux-nfs.org/. @@ -8052,7 +8052,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kc + If unsure, say N. diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile --- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-23 12:09:03.304505472 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-24 14:17:48.621733000 -0400 @@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ @@ -8062,8 +8062,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/M +nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o +nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-23 12:08:29.090501507 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 12:09:03.306491345 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-24 14:14:13.618705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-24 14:17:48.628730000 -0400 @@ -40,7 +40,6 @@ #define NFSPROC4_CB_NULL 0 @@ -8603,8 +8603,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ +} +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-23 12:09:03.307491492 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-23 12:09:03.308491262 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-24 14:17:48.633729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-24 14:17:48.641730000 -0400 @@ -0,0 +1,1679 @@ +/****************************************************************************** + 
* @@ -10286,8 +10286,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfs + return status; +} diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-23 12:09:03.309501439 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-23 12:09:03.309501439 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-24 14:17:48.645731000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-24 14:17:48.647730000 -0400 @@ -0,0 +1,461 @@ +/****************************************************************************** + * @@ -10751,8 +10751,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/n +}; +EXPORT_SYMBOL(pnfs_dlm_export_ops); diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-23 12:09:03.310501390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-23 12:09:03.310501390 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-24 14:17:48.651729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-24 14:17:48.652735000 -0400 @@ -0,0 +1,620 @@ +/* +* linux/fs/nfsd/nfs4pnfsds.c @@ -11375,8 +11375,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nf + +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-23 12:08:29.091491685 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 12:09:03.311501496 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-24 14:14:13.623707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-24 14:17:48.658733000 -0400 @@ -34,10 +34,14 @@ */ #include @@ -11851,8 +11851,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd 
static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-23 12:08:29.093491375 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 12:09:03.313491310 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-24 14:14:13.632707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-24 14:17:48.667732000 -0400 @@ -42,6 +42,8 @@ #include "xdr4.h" #include "vfs.h" @@ -12368,8 +12368,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 12:09:03.315491356 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-24 14:14:13.639707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-24 14:17:48.675730000 -0400 @@ -47,9 +47,14 @@ #include #include @@ -12988,8 +12988,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/ [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c ---- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-23 12:08:29.094491943 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 12:09:03.317501495 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-24 14:14:13.645705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-24 14:17:48.681730000 -0400 @@ -13,10 +13,15 @@ #include #include @@ -13166,8 +13166,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 
linux-2.6.34.noarch/fs/nfsd/nfsd.h ---- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-23 12:08:29.095491390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 12:09:03.318355741 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-24 14:14:13.651705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-24 14:17:48.687730000 -0400 @@ -285,11 +285,17 @@ extern time_t nfsd4_grace; #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ NFSD4_SUPPORTED_ATTRS_WORD0 @@ -13189,7 +13189,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs { diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c --- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-23 12:09:03.319511586 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-24 14:17:48.693730000 -0400 @@ -10,6 +10,7 @@ #include @@ -13227,7 +13227,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nf __u32 tfh[2]; diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h --- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-23 12:09:03.319511586 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-24 14:17:48.698733000 -0400 @@ -14,6 +14,7 @@ enum nfsd_fsid { FSID_UUID8, FSID_UUID16, @@ -13280,8 +13280,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nf + #endif /* _LINUX_NFSD_FH_INT_H */ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c ---- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-23 12:08:27.631563969 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-23 12:09:03.320416974 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-24 14:14:06.365163000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-24 14:17:48.704731000 -0400 @@ -115,7 +115,7 @@ struct 
svc_program nfsd_program = { }; @@ -13292,8 +13292,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/n int nfsd_vers(int vers, enum vers_op change) { diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h ---- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-23 12:09:03.321376171 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-23 12:09:03.321376171 -0400 +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-24 14:17:48.708729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-24 14:17:48.710730000 -0400 @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2005 The Regents of the University of Michigan. @@ -13439,8 +13439,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pn + +#endif /* LINUX_NFSD_PNFSD_H */ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c ---- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-23 12:09:03.321376171 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-23 12:09:03.322501672 -0400 +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-24 14:17:48.713731000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-24 14:17:48.715730000 -0400 @@ -0,0 +1,225 @@ +/* + * linux/fs/nfsd/pnfs_lexp.c @@ -13668,8 +13668,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nf + inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; +} diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c ---- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-23 12:09:03.322501672 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-23 12:09:03.323511608 -0400 +--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-24 14:17:48.719729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-24 14:17:48.720735000 -0400 @@ -0,0 +1,535 @@ +/* + * fs/nfsd/spnfs_com.c @@ -14207,8 +14207,8 @@ diff -up 
linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfs +} +#endif /* CONFIG_PROC_FS */ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c ---- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-23 12:09:03.324501390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-23 12:09:03.324501390 -0400 +--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-24 14:17:48.724733000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-24 14:17:48.726730000 -0400 @@ -0,0 +1,878 @@ +/* + * fs/nfsd/spnfs_ops.c @@ -15089,8 +15089,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfs + return 0; +} diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h ---- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-23 12:08:29.096512142 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 12:09:03.325501424 -0400 +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-24 14:14:13.656705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-24 14:17:48.731738000 -0400 @@ -242,6 +242,12 @@ struct nfs4_client { u32 cl_cb_seq_nr; struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ @@ -15207,8 +15207,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st + #endif /* NFSD4_STATE_H */ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c ---- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-23 12:08:27.632564132 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-23 12:09:03.326501490 -0400 +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-24 14:14:06.371160000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-24 14:17:48.737742000 -0400 @@ -37,7 +37,12 @@ #ifdef CONFIG_NFSD_V4 #include @@ -15335,8 +15335,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs. 
out_nfserr: err = nfserrno(host_err); diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h ---- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-23 12:08:29.097425997 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 12:09:03.327451643 -0400 +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-24 14:14:13.661705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-24 14:17:48.743747000 -0400 @@ -37,6 +37,8 @@ #ifndef _LINUX_NFSD_XDR4_H #define _LINUX_NFSD_XDR4_H @@ -15413,8 +15413,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr struct nfs4_replay * replay; }; diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c ---- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-23 12:08:29.039491912 -0400 -+++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-23 12:09:03.328501680 -0400 +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-24 14:14:13.079708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-24 14:17:48.749746000 -0400 @@ -28,6 +28,7 @@ #include #include @@ -15540,8 +15540,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file. 
if (!ret) return VM_FAULT_LOCKED; diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c ---- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-23 12:08:29.042511552 -0400 -+++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-23 12:09:03.329501644 -0400 +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-24 14:14:13.095705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-24 14:17:48.757730000 -0400 @@ -48,6 +48,7 @@ #include "internal.h" #include "fscache.h" @@ -15755,8 +15755,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inod nfs_fs_proc_exit(); nfsiod_stop(); diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h ---- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-23 12:08:29.042511552 -0400 -+++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-23 12:09:03.330502148 -0400 +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-24 14:14:13.100708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-24 14:17:48.763734000 -0400 @@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv struct nfs_fattr *); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); @@ -15817,7 +15817,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/i struct page *, struct page *); diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig --- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-23 12:09:03.331395814 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-24 14:17:48.769730000 -0400 @@ -79,10 +79,48 @@ config NFS_V4_1 depends on NFS_V4 && EXPERIMENTAL help @@ -15870,7 +15870,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kcon depends on NFS_FS=y && IP_PNP diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile --- linux-2.6.34.noarch/fs/nfs/Makefile.orig 
2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-23 12:09:03.331395814 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-24 14:17:48.774730000 -0400 @@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ @@ -15885,8 +15885,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Mak +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c ---- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-23 12:08:29.045525837 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-23 12:09:03.332511640 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-24 14:14:13.119708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-24 14:17:48.780730000 -0400 @@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, @@ -15896,8 +15896,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/n .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-23 12:09:03.333512111 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-23 12:09:03.334491472 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-24 14:17:48.784731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-24 14:17:48.786730000 -0400 @@ -0,0 +1,765 @@ +/* + * linux/fs/nfs/nfs4filelayout.c @@ -16665,8 +16665,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 
linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-23 12:09:03.334491472 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-23 12:09:03.335501543 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-24 14:17:48.790731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-24 14:17:48.792730000 -0400 @@ -0,0 +1,636 @@ +/* + * linux/fs/nfs/nfs4filelayoutdev.c @@ -17305,8 +17305,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch +} + diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-23 12:09:03.335501543 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-23 12:09:03.335501543 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-24 14:17:48.795731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-24 14:17:48.796742000 -0400 @@ -0,0 +1,97 @@ +/* + * pnfs_nfs4filelayout.h @@ -17406,8 +17406,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h ---- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-23 12:08:29.047512264 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-23 12:09:03.336490079 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-24 14:14:13.130705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-24 14:17:48.802730000 -0400 @@ -45,8 +45,28 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, @@ -17556,8 +17556,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nf /* nfs4xdr.c */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c ---- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-23 
12:08:29.050481368 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-23 12:09:03.339481253 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-24 14:14:13.143709000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-24 14:17:48.811734000 -0400 @@ -49,12 +49,15 @@ #include #include @@ -19223,7 +19223,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/n .setattr = nfs4_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c --- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-23 12:09:03.341491726 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-24 14:17:48.818733000 -0400 @@ -54,17 +54,17 @@ void nfs4_renew_state(struct work_struct *work) @@ -19246,8 +19246,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c ---- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-23 12:08:29.052491341 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-23 12:09:03.342373443 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-24 14:14:13.150705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-24 14:17:48.825730000 -0400 @@ -53,6 +53,9 @@ #include "callback.h" #include "delegation.h" @@ -19566,8 +19566,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/ test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c ---- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-23 12:08:29.054481400 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-23 12:09:03.346481283 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 
2010-08-24 14:14:13.159705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-24 14:17:48.834738000 -0400 @@ -50,8 +50,11 @@ #include #include @@ -21078,8 +21078,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nf }; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild ---- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-23 12:09:03.348511665 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-23 12:09:03.348511665 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-24 14:17:48.839734000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-24 14:17:48.840742000 -0400 @@ -0,0 +1,11 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module @@ -21093,8 +21093,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs +panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o +obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-23 12:09:03.349501459 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-23 12:09:03.349501459 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-24 14:17:48.843735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-24 14:17:48.845739000 -0400 @@ -0,0 +1,1087 @@ +/* + * objio_osd.c @@ -22184,8 +22184,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noar +module_init(objlayout_init); +module_exit(objlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-23 12:09:03.350491564 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-23 12:09:03.350491564 -0400 
+--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-24 14:17:48.848735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-24 14:17:48.851730000 -0400 @@ -0,0 +1,790 @@ +/* + * objlayout.c @@ -22978,8 +22978,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noar + .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, +}; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-23 12:09:03.351434439 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-23 12:09:03.351434439 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-24 14:17:48.852735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-24 14:17:48.854746000 -0400 @@ -0,0 +1,171 @@ +/* + * objlayout.h @@ -23153,8 +23153,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noar + +#endif /* _OBJLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-23 12:09:03.352501716 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-23 12:09:03.352501716 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-24 14:17:48.857735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-24 14:17:48.860740000 -0400 @@ -0,0 +1,734 @@ +/* + * panfs_shim.c @@ -23891,8 +23891,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noa +module_init(panlayout_init); +module_exit(panlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-23 12:09:03.353501685 -0400 -+++ 
linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-23 12:09:03.353501685 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-24 14:17:48.863734000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-24 14:17:48.864730000 -0400 @@ -0,0 +1,482 @@ +/* + * panfs_shim.h @@ -24377,8 +24377,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noa + +#endif /* _PANLAYOUT_PANFS_SHIM_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-23 12:09:03.354501721 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-23 12:09:03.354501721 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-24 14:17:48.868731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-24 14:17:48.869739000 -0400 @@ -0,0 +1,435 @@ +/* + * pnfs_osd_xdr.c @@ -24816,8 +24816,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6. 
+ return 0; +} diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c ---- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-23 12:08:29.056411363 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-23 12:09:03.355511659 -0400 +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-24 14:14:13.169705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-24 14:17:48.875733000 -0400 @@ -20,6 +20,7 @@ #include @@ -24940,8 +24940,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/p if (res == INT_MAX) goto out; diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c ---- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-23 12:09:03.356501413 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-23 12:09:03.357481204 -0400 +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-24 14:17:48.880733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-24 14:17:48.883730000 -0400 @@ -0,0 +1,2027 @@ +/* + * linux/fs/nfs/pnfs.c @@ -26971,8 +26971,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs. +} +EXPORT_SYMBOL(nfs4_put_deviceid_cache); diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h ---- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-23 12:09:03.358501440 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-23 12:09:03.358501440 -0400 +--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-24 14:17:48.886733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-24 14:17:48.887735000 -0400 @@ -0,0 +1,355 @@ +/* + * fs/nfs/pnfs.h @@ -27330,8 +27330,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs. 
+ +#endif /* FS_NFS_PNFS_H */ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c ---- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-23 12:08:29.057511533 -0400 -+++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-23 12:09:03.359501471 -0400 +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-24 14:14:13.174707000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-24 14:17:48.893730000 -0400 @@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru fattr = nfs_alloc_fattr(); status = -ENOMEM; @@ -27359,8 +27359,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc. .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c ---- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-23 12:08:29.057511533 -0400 -+++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-23 12:09:03.359501471 -0400 +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-24 14:14:13.179708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-24 14:17:48.899733000 -0400 @@ -18,8 +18,12 @@ #include #include @@ -27575,8 +27575,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read. 
nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c ---- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-23 12:08:29.059491391 -0400 -+++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-23 12:09:03.361501458 -0400 +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-24 14:14:13.186707000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-24 14:17:48.907729000 -0400 @@ -64,6 +64,7 @@ #include "iostat.h" #include "internal.h" @@ -27624,8 +27624,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/supe #endif diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c ---- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-23 12:08:29.060501485 -0400 -+++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-23 12:09:03.362419975 -0400 +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-24 14:14:13.192705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-24 14:17:48.913730000 -0400 @@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task struct nfs_unlinkdata *data = calldata; struct nfs_server *server = NFS_SERVER(data->dir); @@ -27636,8 +27636,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unl return; rpc_call_start(task); diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c ---- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-23 12:08:27.630563929 -0400 -+++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-23 12:09:03.364491337 -0400 +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-24 14:14:06.360160000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-24 14:17:48.921712000 -0400 @@ -20,6 +20,7 @@ #include #include @@ -28326,7 +28326,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/writ int nfs_wb_page_cancel(struct inode *inode, struct page *page) diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig 
linux-2.6.34.noarch/include/linux/exportfs.h --- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-23 12:09:03.365501459 -0400 ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-24 14:17:48.933713000 -0400 @@ -2,6 +2,7 @@ #define LINUX_EXPORTFS_H 1 @@ -28399,8 +28399,8 @@ diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/i +#endif /* CONFIG_PNFSD */ #endif /* LINUX_EXPORTFS_H */ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h ---- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-23 12:09:03.367491365 -0400 -+++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-23 12:09:03.367491365 -0400 +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-24 14:17:48.945690000 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-24 14:17:48.946693000 -0400 @@ -0,0 +1,141 @@ +#ifndef _LINUX_EXP_XDR_H +#define _LINUX_EXP_XDR_H @@ -28544,8 +28544,8 @@ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/in +} +#endif /* _LINUX_EXP_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h ---- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-23 12:08:29.021511898 -0400 -+++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-23 12:09:03.369481147 -0400 +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-24 14:14:13.014707000 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-24 14:17:48.961675000 -0400 @@ -387,6 +387,7 @@ struct inodes_stat_t { #include @@ -28564,7 +28564,7 @@ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include struct dentry *s_root; diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h --- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ 
linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-23 12:09:03.371491472 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-24 14:17:48.974681000 -0400 @@ -17,7 +17,10 @@ #define NFS4_BITMAP_SIZE 2 @@ -28694,8 +28694,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/inclu #endif diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h ---- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-23 12:09:03.372501550 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-23 12:09:03.372501550 -0400 +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-24 14:17:48.986670000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-24 14:17:48.989666000 -0400 @@ -0,0 +1,330 @@ +/* + * include/linux/nfs4_pnfs.h @@ -29028,8 +29028,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/ + +#endif /* LINUX_NFS4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h ---- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-23 12:09:03.373491892 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-23 12:09:03.374491393 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-24 14:17:48.998668000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-24 14:17:49.000665000 -0400 @@ -0,0 +1,101 @@ +#ifndef NFSD4_BLOCK +#define NFSD4_BLOCK @@ -29133,8 +29133,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarc +#endif /* NFSD4_BLOCK */ + diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h ---- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-23 12:09:03.375501481 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-23 12:09:03.375501481 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 
2010-08-24 14:17:49.012664000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-24 14:17:49.013671000 -0400 @@ -0,0 +1,345 @@ +/* + * include/linux/nfsd4_spnfs.h @@ -29483,7 +29483,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarc +#endif /* NFS_SPNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h --- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-23 12:09:03.376401789 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-24 14:17:49.018668000 -0400 @@ -29,6 +29,7 @@ #ifdef __KERNEL__ @@ -29494,7 +29494,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch * Largest number of bytes we need to allocate for an NFS diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h --- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-23 12:09:03.376401789 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-24 14:17:49.024673000 -0400 @@ -32,6 +32,8 @@ #define NFSDDBG_REPCACHE 0x0080 #define NFSDDBG_XDR 0x0100 @@ -29506,7 +29506,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h --- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-23 12:09:03.377481954 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-24 14:17:49.030665000 -0400 @@ -100,6 +100,7 @@ struct svc_export { uid_t ex_anon_uid; gid_t ex_anon_gid; @@ -29516,8 +29516,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 
linux-2.6.34.noarc struct nfsd4_fs_locations ex_fslocs; int ex_nflavors; diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-23 12:09:03.377481954 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-23 12:09:03.378501747 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-24 14:17:49.033666000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-24 14:17:49.034665000 -0400 @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29652,8 +29652,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.3 + +#endif /* NFSD_NFS4LAYOUTXDR_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-23 12:09:03.378501747 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-23 12:09:03.378501747 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-24 14:17:49.037666000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-24 14:17:49.039665000 -0400 @@ -0,0 +1,54 @@ +/****************************************************************************** + * @@ -29710,8 +29710,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34. 
+ +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-23 12:09:03.379487099 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-23 12:09:03.379487099 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-24 14:17:49.042666000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-24 14:17:49.044665000 -0400 @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29986,7 +29986,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.n +#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h --- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-23 12:09:03.380502500 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-24 14:17:49.049665000 -0400 @@ -29,6 +29,7 @@ /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ @@ -30024,8 +30024,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noar union nfsctl_res { diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h ---- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-23 12:08:29.061494081 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-23 12:09:03.381511751 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-24 14:14:13.201710000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-24 14:17:49.063666000 -0400 @@ -72,13 +72,20 @@ struct nfs_access_entry { int mask; }; @@ -30124,8 +30124,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 
linux-2.6.34.noarch/inc #ifdef __KERNEL__ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h ---- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-23 12:08:29.062501618 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-23 12:09:03.383491395 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-24 14:14:13.206708000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-24 14:17:49.077665000 -0400 @@ -15,6 +15,7 @@ struct nlm_host; struct nfs4_sequence_args; struct nfs4_sequence_res; @@ -30200,7 +30200,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/ atomic_t active; /* Keep trace of any activity to this server */ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h --- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-23 12:09:03.384501540 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-24 14:17:49.089668000 -0400 @@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { NFSIOS_SHORTREAD, NFSIOS_SHORTWRITE, @@ -30213,7 +30213,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h --- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-23 12:09:03.385491518 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-24 14:17:49.103665000 -0400 @@ -39,6 +39,7 @@ struct nfs_page { struct list_head wb_list; /* Defines state of page: */ struct page *wb_page; /* page to read in/write out */ @@ -30262,8 +30262,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/i struct inode *inode, int (*doio)(struct inode *, 
struct list_head *, unsigned int, size_t, int), diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h ---- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-23 12:08:29.062501618 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-23 12:09:03.387491422 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-24 14:14:13.211708000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-24 14:17:49.116665000 -0400 @@ -3,6 +3,8 @@ #include @@ -30415,8 +30415,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/in extern struct rpc_version nfs_version3; extern struct rpc_version nfs_version4; diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h ---- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-23 12:09:03.388491527 -0400 -+++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-23 12:09:03.388491527 -0400 +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-24 14:17:49.128664000 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-24 14:17:49.129670000 -0400 @@ -0,0 +1,57 @@ +#ifndef _PANFS_SHIM_API_H +#define _PANFS_SHIM_API_H @@ -30476,8 +30476,8 @@ diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.no + +#endif /* _PANFS_SHIM_API_H */ diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h ---- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-23 12:09:03.390501461 -0400 -+++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-23 12:09:03.390501461 -0400 +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-24 14:17:49.141664000 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-24 14:17:49.142670000 -0400 @@ -0,0 +1,440 @@ +/* + * pnfs_osd_xdr.h @@ -30920,8 +30920,8 @@ diff -up 
linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noar + +#endif /* __PNFS_OSD_XDR_H__ */ diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_xdr.h ---- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-23 12:09:03.391491550 -0400 -+++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-23 12:09:03.391491550 -0400 +--- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-24 14:17:49.153666000 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-24 14:17:49.155665000 -0400 @@ -0,0 +1,134 @@ +/* + * include/linux/pnfs_xdr.h @@ -31059,7 +31059,7 @@ diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/i +#endif /* LINUX_PNFS_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h --- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-23 12:09:03.393501437 -0400 ++++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-24 14:17:49.168668000 -0400 @@ -8,6 +8,7 @@ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H @@ -31070,7 +31070,7 @@ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/ #define ACL_UNDEFINED_ID (-1) diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h --- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-23 12:09:03.393501437 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-24 14:17:49.174665000 -0400 @@ -14,6 +14,8 @@ /* size of an XDR encoding unit in bytes, i.e. 
32bit */ #define XDR_UNIT (4) @@ -31082,7 +31082,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.n diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h --- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-23 12:09:03.394512138 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-24 14:17:49.179667000 -0400 @@ -3,6 +3,7 @@ #ifdef __KERNEL__ @@ -31103,8 +31103,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.3 struct rpc_pipe_ops { diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h ---- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-23 12:09:03.394512138 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-23 12:09:03.395501822 -0400 +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-24 14:17:49.183664000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-24 14:17:49.184674000 -0400 @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2008 The Regents of the University of Michigan. 
@@ -31219,7 +31219,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux +#endif /* _SIMPLE_RPC_PIPEFS_H_ */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h --- linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-23 12:09:03.395501822 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-24 14:17:49.190665000 -0400 @@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con return buf; @@ -31263,8 +31263,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.n +} #endif /* SUNRPC_SVC_XPRT_H */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h ---- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-23 12:08:29.066475323 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-23 12:09:03.396464612 -0400 +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-24 14:14:13.258707000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-24 14:17:49.195672000 -0400 @@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) return p + 2; } @@ -31287,14 +31287,9 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); -diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs ---- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-23 12:09:03.396464612 -0400 -+++ linux-2.6.34.noarch/localversion-pnfs 2010-08-23 12:09:03.396464612 -0400 -@@ -0,0 +1 @@ -+-pnfs diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile --- 
linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-23 12:09:03.397501662 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-24 14:17:49.204668000 -0400 @@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ @@ -31305,8 +31300,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/su sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c ---- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-23 12:09:03.398522348 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-23 12:09:03.398522348 -0400 +--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-24 14:17:49.208664000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-24 14:17:49.209670000 -0400 @@ -0,0 +1,424 @@ +/* + * net/sunrpc/simple_rpc_pipefs.c @@ -31733,8 +31728,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.no +} +EXPORT_SYMBOL(pipefs_generic_destroy_msg); diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c ---- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-23 12:08:29.081501640 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-23 12:09:03.399443371 -0400 +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-24 14:14:13.447705000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-24 14:17:49.215665000 -0400 @@ -403,16 +403,14 @@ xdr_shrink_pagelen(struct xdr_buf *buf, /* Shift the tail first */ From af636613e811089e31e967eded054a6bb64b25ca Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Tue, 24 Aug 2010 15:13:05 -0400 Subject: [PATCH 04/20] set the kernel flags --with firmware --with debuginfo --without vdso_install --without 
debug --without headers Signed-off-by: Steve Dickson --- kernel.spec | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel.spec b/kernel.spec index 7b72cab84..24ed4de0b 100644 --- a/kernel.spec +++ b/kernel.spec @@ -101,23 +101,23 @@ Summary: The Linux kernel # kernel-smp (only valid for ppc 32-bit) %define with_smp %{?_without_smp: 0} %{?!_without_smp: 1} # kernel-debug -%define with_debug %{?_without_debug: 0} %{?!_without_debug: 1} +%define with_debug %{?_without_debug: 0} %{?!_without_debug: 0} # kernel-doc -%define with_doc %{?_without_doc: 0} %{?!_without_doc: 1} +%define with_doc %{?_without_doc: 0} %{?!_without_doc: 0} # kernel-headers -%define with_headers %{?_without_headers: 0} %{?!_without_headers: 1} +%define with_headers %{?_without_headers: 0} %{?!_without_headers: 0} # kernel-firmware %define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 1} # tools/perf -%define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 1} +%define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 0} # perf noarch subpkg -%define with_perf %{?_without_perf: 0} %{?!_without_perf: 1} +%define with_perf %{?_without_perf: 0} %{?!_without_perf: 0} # kernel-debuginfo -%define with_debuginfo %{?_without_debuginfo: 0} %{?!_without_debuginfo: 1} +%define with_debuginfo %{?_without_debuginfo: 1} %{?!_without_debuginfo: 1} # kernel-bootwrapper (for creating zImages from kernel + initrd) %define with_bootwrapper %{?_without_bootwrapper: 0} %{?!_without_bootwrapper: 1} # Want to build a the vsdo directories installed -%define with_vdso_install %{?_without_vdso_install: 0} %{?!_without_vdso_install: 1} +%define with_vdso_install %{?_without_vdso_install: 0} %{?!_without_vdso_install: 0} # Build the kernel-doc package, but don't fail the build if it botches. # Here "true" means "continue" and "false" means "fail the build". 
From c7b01347fb165ece597d6a5863d4ea4103aa5dfb Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Mon, 23 Aug 2010 12:20:57 -0400 Subject: [PATCH 05/20] Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 Signed-off-by: Steve Dickson --- config-generic | 12 + kernel.spec | 15 +- linux-2.6-pnfs-compile.patch | 13 + linux-2.6.35-inline.patch | 11 + nfs-35-fc.patch | 7235 ++++++ nfsd-35-fc.patch | 1808 ++ pnfs-all-2.6.35-2010-08-19-f13.patch | 31788 +++++++++++++++++++++++++ 7 files changed, 40880 insertions(+), 2 deletions(-) create mode 100644 linux-2.6-pnfs-compile.patch create mode 100644 linux-2.6.35-inline.patch create mode 100644 nfs-35-fc.patch create mode 100644 nfsd-35-fc.patch create mode 100644 pnfs-all-2.6.35-2010-08-19-f13.patch diff --git a/config-generic b/config-generic index 3b23aabcc..76379c8eb 100644 --- a/config-generic +++ b/config-generic @@ -3322,6 +3322,18 @@ CONFIG_NFSD_V3=y CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFS_FSCACHE=y +# Enable pNFS +CONFIG_PNFS=y +CONFIG_PNFSD=y +CONFIG_PNFSD_LOCAL_EXPORT=y +CONFIG_SPNFS=y +CONFIG_SPNFS_LAYOUTSEGMENTS=y +CONFIG_SPNFS_BLOCK=y +CONFIG_PNFS_OBJLAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_PANLAYOUT=m +CONFIG_PNFS_FILE_LAYOUT=m +# CONFIG_LOCKD=m CONFIG_LOCKD_V4=y CONFIG_EXPORTFS=m diff --git a/kernel.spec b/kernel.spec index 6478f8671..14956777b 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) 
# -# % define buildid .local +%define buildid .pnfs_all_2.6.35_2010_08_19 ################################################################### # The buildid can also be specified on the rpmbuild command line @@ -107,7 +107,7 @@ Summary: The Linux kernel # kernel-headers %define with_headers %{?_without_headers: 0} %{?!_without_headers: 1} # kernel-firmware -%define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 0} +%define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 1} # tools/perf %define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 1} # perf noarch subpkg @@ -766,6 +766,12 @@ Patch12460: xfs-move-aio-completion-after-unwritten-extent-conversion.patch Patch12470: drivers-hwmon-coretemp-c-detect-the-thermal-sensors-by-cpuid.patch Patch12480: kprobes-x86-fix-kprobes-to-skip-prefixes-correctly.patch +Patch30000: nfs-35-fc.patch +Patch30001: nfsd-35-fc.patch +Patch30002: pnfs-all-2.6.35-2010-08-19-f13.patch +Patch30003: linux-2.6-pnfs-compile.patch +Patch30004: linux-2.6.35-inline.patch + %endif BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root @@ -1424,6 +1430,11 @@ ApplyPatch drivers-hwmon-coretemp-c-detect-the-thermal-sensors-by-cpuid.patch # bz #610941 ApplyPatch kprobes-x86-fix-kprobes-to-skip-prefixes-correctly.patch +ApplyPatch nfs-35-fc.patch +ApplyPatch nfsd-35-fc.patch +ApplyPatch pnfs-all-2.6.35-2010-08-19-f13.patch +ApplyPatch linux-2.6-pnfs-compile.patch +ApplyPatch linux-2.6.35-inline.patch # END OF PATCH APPLICATIONS %endif diff --git a/linux-2.6-pnfs-compile.patch b/linux-2.6-pnfs-compile.patch new file mode 100644 index 000000000..7c8cc4248 --- /dev/null +++ b/linux-2.6-pnfs-compile.patch @@ -0,0 +1,13 @@ +diff -up linux-2.6.32.x86_64/fs/nfs/objlayout/pnfs_osd_xdr.h.orig linux-2.6.32.x86_64/fs/nfs/objlayout/pnfs_osd_xdr.h +diff -up linux-2.6.32.x86_64/include/net/inet_connection_sock.h.orig linux-2.6.32.x86_64/include/net/inet_connection_sock.h +--- linux-2.6.32.x86_64/include/net/inet_connection_sock.h.orig 2009-12-02 
22:51:21.000000000 -0500 ++++ linux-2.6.32.x86_64/include/net/inet_connection_sock.h 2010-04-21 14:26:24.475659551 -0400 +@@ -23,7 +23,7 @@ + #include + #include + +-#define INET_CSK_DEBUG 1 ++//#define INET_CSK_DEBUG 1 + + /* Cancel timers, when they are not required. */ + #undef INET_CSK_CLEAR_TIMERS diff --git a/linux-2.6.35-inline.patch b/linux-2.6.35-inline.patch new file mode 100644 index 000000000..c56d8da5e --- /dev/null +++ b/linux-2.6.35-inline.patch @@ -0,0 +1,11 @@ +diff -up linux-2.6.34.noarch/arch/x86/Makefile.orig linux-2.6.34.noarch/arch/x86/Makefile +--- linux-2.6.34.noarch/arch/x86/Makefile.orig 2010-07-01 13:33:21.859627499 -0400 ++++ linux-2.6.34.noarch/arch/x86/Makefile 2010-07-01 13:36:26.751576450 -0400 +@@ -81,6 +81,7 @@ ifdef CONFIG_CC_STACKPROTECTOR + $(warning stack protector enabled but no compiler support) + endif + endif ++KBUILD_CFLAGS += -fno-inline-functions-called-once + + # Don't unroll struct assignments with kmemcheck enabled + ifeq ($(CONFIG_KMEMCHECK),y) diff --git a/nfs-35-fc.patch b/nfs-35-fc.patch new file mode 100644 index 000000000..c3ad25f65 --- /dev/null +++ b/nfs-35-fc.patch @@ -0,0 +1,7235 @@ +diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-23 11:01:00.352376393 -0400 +@@ -934,7 +934,6 @@ static int nfs_probe_fsinfo(struct nfs_s + } + + fsinfo.fattr = fattr; +- nfs_fattr_init(fattr); + error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); + if (error < 0) + goto out_error; +@@ -1047,13 +1046,18 @@ struct nfs_server *nfs_create_server(con + struct nfs_fh *mntfh) + { + struct nfs_server *server; +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + int error; + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto error; ++ + /* Get a client 
representation */ + error = nfs_init_server(server, data); + if (error < 0) +@@ -1064,7 +1068,7 @@ struct nfs_server *nfs_create_server(con + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* Probe the root fh to retrieve its FSID */ +- error = nfs_probe_fsinfo(server, mntfh, &fattr); ++ error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + if (server->nfs_client->rpc_ops->version == 3) { +@@ -1077,14 +1081,14 @@ struct nfs_server *nfs_create_server(con + server->namelen = NFS2_MAXNAMLEN; + } + +- if (!(fattr.valid & NFS_ATTR_FATTR)) { +- error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); ++ if (!(fattr->valid & NFS_ATTR_FATTR)) { ++ error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_create_server: getattr error = %d\n", -error); + goto error; + } + } +- memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); ++ memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, +@@ -1096,9 +1100,11 @@ struct nfs_server *nfs_create_server(con + spin_unlock(&nfs_client_lock); + + server->mount_time = jiffies; ++ nfs_free_fattr(fattr); + return server; + + error: ++ nfs_free_fattr(fattr); + nfs_free_server(server); + return ERR_PTR(error); + } +@@ -1340,7 +1346,7 @@ error: + struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh) + { +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + struct nfs_server *server; + int error; + +@@ -1350,6 +1356,11 @@ struct nfs_server *nfs4_create_server(co + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto error; ++ + /* set up the general RPC client */ + error = nfs4_init_server(server, data); + if (error < 0) +@@ -1364,7 +1375,7 @@ struct nfs_server *nfs4_create_server(co + goto error; + + /* Probe the root fh to retrieve its 
FSID */ +- error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); ++ error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto error; + +@@ -1375,7 +1386,7 @@ struct nfs_server *nfs4_create_server(co + + nfs4_session_set_rwsize(server); + +- error = nfs_probe_fsinfo(server, mntfh, &fattr); ++ error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + +@@ -1389,9 +1400,11 @@ struct nfs_server *nfs4_create_server(co + + server->mount_time = jiffies; + dprintk("<-- nfs4_create_server() = %p\n", server); ++ nfs_free_fattr(fattr); + return server; + + error: ++ nfs_free_fattr(fattr); + nfs_free_server(server); + dprintk("<-- nfs4_create_server() = error %d\n", error); + return ERR_PTR(error); +@@ -1405,7 +1418,7 @@ struct nfs_server *nfs4_create_referral_ + { + struct nfs_client *parent_client; + struct nfs_server *server, *parent_server; +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + int error; + + dprintk("--> nfs4_create_referral_server()\n"); +@@ -1414,6 +1427,11 @@ struct nfs_server *nfs4_create_referral_ + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto error; ++ + parent_server = NFS_SB(data->sb); + parent_client = parent_server->nfs_client; + +@@ -1443,12 +1461,12 @@ struct nfs_server *nfs4_create_referral_ + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* Probe the root fh to retrieve its FSID and filehandle */ +- error = nfs4_path_walk(server, mntfh, data->mnt_path); ++ error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto error; + + /* probe the filesystem info for this server filesystem */ +- error = nfs_probe_fsinfo(server, mntfh, &fattr); ++ error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + +@@ -1466,10 +1484,12 @@ struct nfs_server *nfs4_create_referral_ + + server->mount_time = jiffies; + ++ nfs_free_fattr(fattr); + dprintk("<-- nfs_create_referral_server() = %p\n", server); + 
return server; + + error: ++ nfs_free_fattr(fattr); + nfs_free_server(server); + dprintk("<-- nfs4_create_referral_server() = error %d\n", error); + return ERR_PTR(error); +@@ -1485,7 +1505,7 @@ struct nfs_server *nfs_clone_server(stru + struct nfs_fattr *fattr) + { + struct nfs_server *server; +- struct nfs_fattr fattr_fsinfo; ++ struct nfs_fattr *fattr_fsinfo; + int error; + + dprintk("--> nfs_clone_server(,%llx:%llx,)\n", +@@ -1496,6 +1516,11 @@ struct nfs_server *nfs_clone_server(stru + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr_fsinfo = nfs_alloc_fattr(); ++ if (fattr_fsinfo == NULL) ++ goto out_free_server; ++ + /* Copy data from the source */ + server->nfs_client = source->nfs_client; + atomic_inc(&server->nfs_client->cl_count); +@@ -1512,7 +1537,7 @@ struct nfs_server *nfs_clone_server(stru + nfs_init_server_aclclient(server); + + /* probe the filesystem info for this server filesystem */ +- error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo); ++ error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); + if (error < 0) + goto out_free_server; + +@@ -1534,10 +1559,12 @@ struct nfs_server *nfs_clone_server(stru + + server->mount_time = jiffies; + ++ nfs_free_fattr(fattr_fsinfo); + dprintk("<-- nfs_clone_server() = %p\n", server); + return server; + + out_free_server: ++ nfs_free_fattr(fattr_fsinfo); + nfs_free_server(server); + dprintk("<-- nfs_clone_server() = error %d\n", error); + return ERR_PTR(error); +diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-23 11:01:00.352376393 -0400 +@@ -213,7 +213,7 @@ int nfs_inode_set_delegation(struct inod + struct nfs_delegation *freeme = NULL; + int status = 0; + +- delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); ++ delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return 
-ENOMEM; + memcpy(delegation->stateid.data, res->delegation.data, +diff -up linux-2.6.34.noarch/fs/nfs/dir.c.orig linux-2.6.34.noarch/fs/nfs/dir.c +--- linux-2.6.34.noarch/fs/nfs/dir.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/dir.c 2010-08-23 11:01:00.353376419 -0400 +@@ -530,9 +530,7 @@ static int nfs_readdir(struct file *filp + nfs_readdir_descriptor_t my_desc, + *desc = &my_desc; + struct nfs_entry my_entry; +- struct nfs_fh fh; +- struct nfs_fattr fattr; +- long res; ++ int res = -ENOMEM; + + dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", + dentry->d_parent->d_name.name, dentry->d_name.name, +@@ -554,9 +552,11 @@ static int nfs_readdir(struct file *filp + + my_entry.cookie = my_entry.prev_cookie = 0; + my_entry.eof = 0; +- my_entry.fh = &fh; +- my_entry.fattr = &fattr; +- nfs_fattr_init(&fattr); ++ my_entry.fh = nfs_alloc_fhandle(); ++ my_entry.fattr = nfs_alloc_fattr(); ++ if (my_entry.fh == NULL || my_entry.fattr == NULL) ++ goto out_alloc_failed; ++ + desc->entry = &my_entry; + + nfs_block_sillyrename(dentry); +@@ -598,7 +598,10 @@ out: + nfs_unblock_sillyrename(dentry); + if (res > 0) + res = 0; +- dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", ++out_alloc_failed: ++ nfs_free_fattr(my_entry.fattr); ++ nfs_free_fhandle(my_entry.fh); ++ dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + res); + return res; +@@ -776,9 +779,9 @@ static int nfs_lookup_revalidate(struct + struct inode *dir; + struct inode *inode; + struct dentry *parent; ++ struct nfs_fh *fhandle = NULL; ++ struct nfs_fattr *fattr = NULL; + int error; +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; + + parent = dget_parent(dentry); + dir = parent->d_inode; +@@ -811,14 +814,22 @@ static int nfs_lookup_revalidate(struct + if (NFS_STALE(inode)) + goto out_bad; + +- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); ++ error = -ENOMEM; ++ fhandle = 
nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ if (fhandle == NULL || fattr == NULL) ++ goto out_error; ++ ++ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + if (error) + goto out_bad; +- if (nfs_compare_fh(NFS_FH(inode), &fhandle)) ++ if (nfs_compare_fh(NFS_FH(inode), fhandle)) + goto out_bad; +- if ((error = nfs_refresh_inode(inode, &fattr)) != 0) ++ if ((error = nfs_refresh_inode(inode, fattr)) != 0) + goto out_bad; + ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); + out_set_verifier: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + out_valid: +@@ -842,11 +853,21 @@ out_zap_parent: + shrink_dcache_parent(dentry); + } + d_drop(dentry); ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + return 0; ++out_error: ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); ++ dput(parent); ++ dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", ++ __func__, dentry->d_parent->d_name.name, ++ dentry->d_name.name, error); ++ return error; + } + + /* +@@ -911,9 +932,9 @@ static struct dentry *nfs_lookup(struct + struct dentry *res; + struct dentry *parent; + struct inode *inode = NULL; ++ struct nfs_fh *fhandle = NULL; ++ struct nfs_fattr *fattr = NULL; + int error; +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; + + dfprintk(VFS, "NFS: lookup(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); +@@ -923,7 +944,6 @@ static struct dentry *nfs_lookup(struct + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + goto out; + +- res = ERR_PTR(-ENOMEM); + dentry->d_op = NFS_PROTO(dir)->dentry_ops; + + /* +@@ -936,17 +956,23 @@ static struct dentry *nfs_lookup(struct + goto out; + } + ++ res = ERR_PTR(-ENOMEM); ++ fhandle = nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ if (fhandle == NULL || fattr == NULL) ++ goto out; ++ + parent = dentry->d_parent; 
+ /* Protect against concurrent sillydeletes */ + nfs_block_sillyrename(parent); +- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); ++ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + if (error == -ENOENT) + goto no_entry; + if (error < 0) { + res = ERR_PTR(error); + goto out_unblock_sillyrename; + } +- inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); ++ inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + res = (struct dentry *)inode; + if (IS_ERR(res)) + goto out_unblock_sillyrename; +@@ -962,6 +988,8 @@ no_entry: + out_unblock_sillyrename: + nfs_unblock_sillyrename(parent); + out: ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); + return res; + } + +@@ -1669,28 +1697,33 @@ static void nfs_access_free_entry(struct + smp_mb__after_atomic_dec(); + } + ++static void nfs_access_free_list(struct list_head *head) ++{ ++ struct nfs_access_entry *cache; ++ ++ while (!list_empty(head)) { ++ cache = list_entry(head->next, struct nfs_access_entry, lru); ++ list_del(&cache->lru); ++ nfs_access_free_entry(cache); ++ } ++} ++ + int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) + { + LIST_HEAD(head); + struct nfs_inode *nfsi; + struct nfs_access_entry *cache; + +-restart: ++ if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) ++ return (nr_to_scan == 0) ? 
0 : -1; ++ + spin_lock(&nfs_access_lru_lock); + list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { +- struct rw_semaphore *s_umount; + struct inode *inode; + + if (nr_to_scan-- == 0) + break; +- s_umount = &nfsi->vfs_inode.i_sb->s_umount; +- if (!down_read_trylock(s_umount)) +- continue; +- inode = igrab(&nfsi->vfs_inode); +- if (inode == NULL) { +- up_read(s_umount); +- continue; +- } ++ inode = &nfsi->vfs_inode; + spin_lock(&inode->i_lock); + if (list_empty(&nfsi->access_cache_entry_lru)) + goto remove_lru_entry; +@@ -1704,61 +1737,47 @@ restart: + else { + remove_lru_entry: + list_del_init(&nfsi->access_cache_inode_lru); ++ smp_mb__before_clear_bit(); + clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); ++ smp_mb__after_clear_bit(); + } +- spin_unlock(&inode->i_lock); +- spin_unlock(&nfs_access_lru_lock); +- iput(inode); +- up_read(s_umount); +- goto restart; + } + spin_unlock(&nfs_access_lru_lock); +- while (!list_empty(&head)) { +- cache = list_entry(head.next, struct nfs_access_entry, lru); +- list_del(&cache->lru); +- nfs_access_free_entry(cache); +- } ++ nfs_access_free_list(&head); + return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; + } + +-static void __nfs_access_zap_cache(struct inode *inode) ++static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) + { +- struct nfs_inode *nfsi = NFS_I(inode); + struct rb_root *root_node = &nfsi->access_cache; +- struct rb_node *n, *dispose = NULL; ++ struct rb_node *n; + struct nfs_access_entry *entry; + + /* Unhook entries from the cache */ + while ((n = rb_first(root_node)) != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + rb_erase(n, root_node); +- list_del(&entry->lru); +- n->rb_left = dispose; +- dispose = n; ++ list_move(&entry->lru, head); + } + nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; +- spin_unlock(&inode->i_lock); +- +- /* Now kill them all! 
*/ +- while (dispose != NULL) { +- n = dispose; +- dispose = n->rb_left; +- nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node)); +- } + } + + void nfs_access_zap_cache(struct inode *inode) + { ++ LIST_HEAD(head); ++ ++ if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) ++ return; + /* Remove from global LRU init */ +- if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { +- spin_lock(&nfs_access_lru_lock); ++ spin_lock(&nfs_access_lru_lock); ++ if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_del_init(&NFS_I(inode)->access_cache_inode_lru); +- spin_unlock(&nfs_access_lru_lock); +- } + + spin_lock(&inode->i_lock); +- /* This will release the spinlock */ +- __nfs_access_zap_cache(inode); ++ __nfs_access_zap_cache(NFS_I(inode), &head); ++ spin_unlock(&inode->i_lock); ++ spin_unlock(&nfs_access_lru_lock); ++ nfs_access_free_list(&head); + } + + static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) +@@ -1809,8 +1828,8 @@ out_stale: + nfs_access_free_entry(cache); + return -ENOENT; + out_zap: +- /* This will release the spinlock */ +- __nfs_access_zap_cache(inode); ++ spin_unlock(&inode->i_lock); ++ nfs_access_zap_cache(inode); + return -ENOENT; + } + +@@ -1865,9 +1884,11 @@ static void nfs_access_add_cache(struct + smp_mb__after_atomic_inc(); + + /* Add inode to global LRU list */ +- if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { ++ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { + spin_lock(&nfs_access_lru_lock); +- list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); ++ if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) ++ list_add_tail(&NFS_I(inode)->access_cache_inode_lru, ++ &nfs_access_lru_list); + spin_unlock(&nfs_access_lru_lock); + } + } +diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-23 
11:00:23.790502081 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-23 11:01:00.354376416 -0400 +@@ -162,14 +162,17 @@ static int nfs_revalidate_file_size(stru + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + +- if (server->flags & NFS_MOUNT_NOAC) +- goto force_reval; ++ if (nfs_have_delegated_attributes(inode)) ++ goto out_noreval; ++ + if (filp->f_flags & O_DIRECT) + goto force_reval; +- if (nfsi->npages != 0) +- return 0; +- if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) +- return 0; ++ if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) ++ goto force_reval; ++ if (nfs_attribute_timeout(inode)) ++ goto force_reval; ++out_noreval: ++ return 0; + force_reval: + return __nfs_revalidate_inode(server, inode); + } +diff -up linux-2.6.34.noarch/fs/nfs/fscache.c.orig linux-2.6.34.noarch/fs/nfs/fscache.c +--- linux-2.6.34.noarch/fs/nfs/fscache.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/fscache.c 2010-08-23 11:01:00.355376416 -0400 +@@ -467,7 +467,8 @@ int __nfs_readpages_from_fscache(struct + struct list_head *pages, + unsigned *nr_pages) + { +- int ret, npages = *nr_pages; ++ unsigned npages = *nr_pages; ++ int ret; + + dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", + NFS_I(inode)->fscache, npages, inode); +diff -up linux-2.6.34.noarch/fs/nfs/getroot.c.orig linux-2.6.34.noarch/fs/nfs/getroot.c +--- linux-2.6.34.noarch/fs/nfs/getroot.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/getroot.c 2010-08-23 11:01:00.356376417 -0400 +@@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super + { + struct nfs_server *server = NFS_SB(sb); + struct nfs_fsinfo fsinfo; +- struct nfs_fattr fattr; +- struct dentry *mntroot; ++ struct dentry *ret; + struct inode *inode; + int error; + + /* get the actual root for this mount */ +- fsinfo.fattr = &fattr; ++ fsinfo.fattr = nfs_alloc_fattr(); ++ if (fsinfo.fattr == NULL) 
++ return ERR_PTR(-ENOMEM); + + error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); +- return ERR_PTR(error); ++ ret = ERR_PTR(error); ++ goto out; + } + + inode = nfs_fhget(sb, mntfh, fsinfo.fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); +- return ERR_CAST(inode); ++ ret = ERR_CAST(inode); ++ goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); +- if (error != 0) +- return ERR_PTR(error); ++ if (error != 0) { ++ ret = ERR_PTR(error); ++ goto out; ++ } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ +- mntroot = d_obtain_alias(inode); +- if (IS_ERR(mntroot)) { ++ ret = d_obtain_alias(inode); ++ if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); +- return mntroot; ++ goto out; + } + +- security_d_instantiate(mntroot, inode); +- +- if (!mntroot->d_op) +- mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; ++ security_d_instantiate(ret, inode); + +- return mntroot; ++ if (ret->d_op == NULL) ++ ret->d_op = server->nfs_client->rpc_ops->dentry_ops; ++out: ++ nfs_free_fattr(fsinfo.fattr); ++ return ret; + } + + #ifdef CONFIG_NFS_V4 + +-/* +- * Do a simple pathwalk from the root FH of the server to the nominated target +- * of the mountpoint +- * - give error on symlinks +- * - give error on ".." 
occurring in the path +- * - follow traversals +- */ +-int nfs4_path_walk(struct nfs_server *server, +- struct nfs_fh *mntfh, +- const char *path) ++int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) + { + struct nfs_fsinfo fsinfo; +- struct nfs_fattr fattr; +- struct nfs_fh lastfh; +- struct qstr name; +- int ret; +- +- dprintk("--> nfs4_path_walk(,,%s)\n", path); +- +- fsinfo.fattr = &fattr; +- nfs_fattr_init(&fattr); +- +- /* Eat leading slashes */ +- while (*path == '/') +- path++; ++ int ret = -ENOMEM; ++ ++ dprintk("--> nfs4_get_rootfh()\n"); ++ ++ fsinfo.fattr = nfs_alloc_fattr(); ++ if (fsinfo.fattr == NULL) ++ goto out; + + /* Start by getting the root filehandle from the server */ + ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (ret < 0) { +- dprintk("nfs4_get_root: getroot error = %d\n", -ret); +- return ret; ++ dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); ++ goto out; + } + +- if (!S_ISDIR(fattr.mode)) { +- printk(KERN_ERR "nfs4_get_root:" ++ if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE) ++ || !S_ISDIR(fsinfo.fattr->mode)) { ++ printk(KERN_ERR "nfs4_get_rootfh:" + " getroot encountered non-directory\n"); +- return -ENOTDIR; ++ ret = -ENOTDIR; ++ goto out; + } + +- /* FIXME: It is quite valid for the server to return a referral here */ +- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { +- printk(KERN_ERR "nfs4_get_root:" ++ if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { ++ printk(KERN_ERR "nfs4_get_rootfh:" + " getroot obtained referral\n"); +- return -EREMOTE; ++ ret = -EREMOTE; ++ goto out; + } + +-next_component: +- dprintk("Next: %s\n", path); +- +- /* extract the next bit of the path */ +- if (!*path) +- goto path_walk_complete; +- +- name.name = path; +- while (*path && *path != '/') +- path++; +- name.len = path - (const char *) name.name; +- +- if (name.len > NFS4_MAXNAMLEN) +- return -ENAMETOOLONG; +- +-eat_dot_dir: +- while (*path == '/') +- path++; +- +- if (path[0] == '.' 
&& (path[1] == '/' || !path[1])) { +- path += 2; +- goto eat_dot_dir; +- } +- +- /* FIXME: Why shouldn't the user be able to use ".." in the path? */ +- if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2]) +- ) { +- printk(KERN_ERR "nfs4_get_root:" +- " Mount path contains reference to \"..\"\n"); +- return -EINVAL; +- } +- +- /* lookup the next FH in the sequence */ +- memcpy(&lastfh, mntfh, sizeof(lastfh)); +- +- dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); +- +- ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name, +- mntfh, &fattr); +- if (ret < 0) { +- dprintk("nfs4_get_root: getroot error = %d\n", -ret); +- return ret; +- } +- +- if (!S_ISDIR(fattr.mode)) { +- printk(KERN_ERR "nfs4_get_root:" +- " lookupfh encountered non-directory\n"); +- return -ENOTDIR; +- } +- +- /* FIXME: Referrals are quite valid here too */ +- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { +- printk(KERN_ERR "nfs4_get_root:" +- " lookupfh obtained referral\n"); +- return -EREMOTE; +- } +- +- goto next_component; +- +-path_walk_complete: +- memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); +- dprintk("<-- nfs4_path_walk() = 0\n"); +- return 0; ++ memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); ++out: ++ nfs_free_fattr(fsinfo.fattr); ++ dprintk("<-- nfs4_get_rootfh() = %d\n", ret); ++ return ret; + } + + /* +@@ -239,8 +174,8 @@ path_walk_complete: + struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) + { + struct nfs_server *server = NFS_SB(sb); +- struct nfs_fattr fattr; +- struct dentry *mntroot; ++ struct nfs_fattr *fattr = NULL; ++ struct dentry *ret; + struct inode *inode; + int error; + +@@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct supe + return ERR_PTR(error); + } + ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ return ERR_PTR(-ENOMEM);; ++ + /* get the actual root for this mount */ +- error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 
++ error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); +- return ERR_PTR(error); ++ ret = ERR_PTR(error); ++ goto out; + } + +- inode = nfs_fhget(sb, mntfh, &fattr); ++ inode = nfs_fhget(sb, mntfh, fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); +- return ERR_CAST(inode); ++ ret = ERR_CAST(inode); ++ goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); +- if (error != 0) +- return ERR_PTR(error); ++ if (error != 0) { ++ ret = ERR_PTR(error); ++ goto out; ++ } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ +- mntroot = d_obtain_alias(inode); +- if (IS_ERR(mntroot)) { ++ ret = d_obtain_alias(inode); ++ if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); +- return mntroot; ++ goto out; + } + +- security_d_instantiate(mntroot, inode); ++ security_d_instantiate(ret, inode); + +- if (!mntroot->d_op) +- mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; ++ if (ret->d_op == NULL) ++ ret->d_op = server->nfs_client->rpc_ops->dentry_ops; + ++out: ++ nfs_free_fattr(fattr); + dprintk("<-- nfs4_get_root()\n"); +- return mntroot; ++ return ret; + } + + #endif /* CONFIG_NFS_V4 */ +diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-23 11:01:00.357376378 -0400 +@@ -393,8 +393,8 @@ int + nfs_setattr(struct dentry *dentry, struct iattr *attr) + { + struct inode *inode = dentry->d_inode; +- struct nfs_fattr fattr; +- int error; ++ struct nfs_fattr *fattr; ++ int error = -ENOMEM; + + nfs_inc_stats(inode, NFSIOS_VFSSETATTR); + +@@ -417,14 +417,20 @@ nfs_setattr(struct dentry *dentry, struc + 
filemap_write_and_wait(inode->i_mapping); + nfs_wb_all(inode); + } ++ ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out; + /* + * Return any delegations if we're going to change ACLs + */ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + nfs_inode_return_delegation(inode); +- error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); ++ error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); + if (error == 0) +- nfs_refresh_inode(inode, &fattr); ++ nfs_refresh_inode(inode, fattr); ++ nfs_free_fattr(fattr); ++out: + return error; + } + +@@ -682,7 +688,7 @@ int + __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) + { + int status = -ESTALE; +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr = NULL; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", +@@ -693,8 +699,13 @@ __nfs_revalidate_inode(struct nfs_server + if (NFS_STALE(inode)) + goto out; + ++ status = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out; ++ + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); +- status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); ++ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); + if (status != 0) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + inode->i_sb->s_id, +@@ -707,7 +718,7 @@ __nfs_revalidate_inode(struct nfs_server + goto out; + } + +- status = nfs_refresh_inode(inode, &fattr); ++ status = nfs_refresh_inode(inode, fattr); + if (status) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", + inode->i_sb->s_id, +@@ -723,6 +734,7 @@ __nfs_revalidate_inode(struct nfs_server + (long long)NFS_FILEID(inode)); + + out: ++ nfs_free_fattr(fattr); + return status; + } + +@@ -730,9 +742,14 @@ int nfs_attribute_timeout(struct inode * + { + struct nfs_inode *nfsi = NFS_I(inode); + ++ return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + 
nfsi->attrtimeo); ++} ++ ++static int nfs_attribute_cache_expired(struct inode *inode) ++{ + if (nfs_have_delegated_attributes(inode)) + return 0; +- return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); ++ return nfs_attribute_timeout(inode); + } + + /** +@@ -745,7 +762,7 @@ int nfs_attribute_timeout(struct inode * + int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) + { + if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) +- && !nfs_attribute_timeout(inode)) ++ && !nfs_attribute_cache_expired(inode)) + return NFS_STALE(inode) ? -ESTALE : 0; + return __nfs_revalidate_inode(server, inode); + } +@@ -782,7 +799,8 @@ int nfs_revalidate_mapping(struct inode + int ret = 0; + + if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) +- || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { ++ || nfs_attribute_cache_expired(inode) ++ || NFS_STALE(inode)) { + ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret < 0) + goto out; +@@ -916,6 +934,26 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->gencount = nfs_inc_attr_generation_counter(); + } + ++struct nfs_fattr *nfs_alloc_fattr(void) ++{ ++ struct nfs_fattr *fattr; ++ ++ fattr = kmalloc(sizeof(*fattr), GFP_NOFS); ++ if (fattr != NULL) ++ nfs_fattr_init(fattr); ++ return fattr; ++} ++ ++struct nfs_fh *nfs_alloc_fhandle(void) ++{ ++ struct nfs_fh *fh; ++ ++ fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); ++ if (fh != NULL) ++ fh->size = 0; ++ return fh; ++} ++ + /** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode +diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-23 11:01:00.358564151 -0400 +@@ -244,9 +244,7 @@ extern struct dentry *nfs_get_root(struc + #ifdef CONFIG_NFS_V4 + extern struct dentry 
*nfs4_get_root(struct super_block *, struct nfs_fh *); + +-extern int nfs4_path_walk(struct nfs_server *server, +- struct nfs_fh *mntfh, +- const char *path); ++extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); + #endif + + /* read.c */ +diff -up linux-2.6.34.noarch/fs/nfs/iostat.h.orig linux-2.6.34.noarch/fs/nfs/iostat.h +--- linux-2.6.34.noarch/fs/nfs/iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/iostat.h 2010-08-23 11:01:00.358564151 -0400 +@@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const s + + static inline void nfs_add_server_stats(const struct nfs_server *server, + enum nfs_stat_bytecounters stat, +- unsigned long addend) ++ long addend) + { + this_cpu_add(server->io_stats->bytes[stat], addend); + } + + static inline void nfs_add_stats(const struct inode *inode, + enum nfs_stat_bytecounters stat, +- unsigned long addend) ++ long addend) + { + nfs_add_server_stats(NFS_SERVER(inode), stat, addend); + } +@@ -51,7 +51,7 @@ static inline void nfs_add_stats(const s + #ifdef CONFIG_NFS_FSCACHE + static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, +- unsigned long addend) ++ long addend) + { + this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); + } +diff -up linux-2.6.34.noarch/fs/nfs/namespace.c.orig linux-2.6.34.noarch/fs/nfs/namespace.c +--- linux-2.6.34.noarch/fs/nfs/namespace.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/namespace.c 2010-08-23 11:01:00.359420147 -0400 +@@ -105,8 +105,8 @@ static void * nfs_follow_mountpoint(stru + struct vfsmount *mnt; + struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct dentry *parent; +- struct nfs_fh fh; +- struct nfs_fattr fattr; ++ struct nfs_fh *fh = NULL; ++ struct nfs_fattr *fattr = NULL; + int err; + + dprintk("--> nfs_follow_mountpoint()\n"); +@@ -115,6 +115,12 @@ static void * nfs_follow_mountpoint(stru + if (IS_ROOT(dentry)) + goto out_err; 
+ ++ err = -ENOMEM; ++ fh = nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ if (fh == NULL || fattr == NULL) ++ goto out_err; ++ + dprintk("%s: enter\n", __func__); + dput(nd->path.dentry); + nd->path.dentry = dget(dentry); +@@ -123,16 +129,16 @@ static void * nfs_follow_mountpoint(stru + parent = dget_parent(nd->path.dentry); + err = server->nfs_client->rpc_ops->lookup(parent->d_inode, + &nd->path.dentry->d_name, +- &fh, &fattr); ++ fh, fattr); + dput(parent); + if (err != 0) + goto out_err; + +- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) ++ if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); + else +- mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh, +- &fattr); ++ mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, ++ fattr); + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) + goto out_err; +@@ -151,6 +157,8 @@ static void * nfs_follow_mountpoint(stru + nd->path.dentry = dget(mnt->mnt_root); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + out: ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fh); + dprintk("%s: done, returned %d\n", __func__, err); + + dprintk("<-- nfs_follow_mountpoint() = %d\n", err); +diff -up linux-2.6.34.noarch/fs/nfs/nfs3acl.c.orig linux-2.6.34.noarch/fs/nfs/nfs3acl.c +--- linux-2.6.34.noarch/fs/nfs/nfs3acl.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3acl.c 2010-08-23 11:01:00.359420147 -0400 +@@ -185,7 +185,6 @@ static void nfs3_cache_acls(struct inode + struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) + { + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs_fattr fattr; + struct page *pages[NFSACL_MAXPAGES] = { }; + struct nfs3_getaclargs args = { + .fh = NFS_FH(inode), +@@ -193,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struc + .pages = pages, + }; + struct nfs3_getaclres res = { +- .fattr = &fattr, ++ 0 + }; + struct rpc_message msg = { + .rpc_argp = &args, +@@ -228,7 +227,10 @@ 
struct posix_acl *nfs3_proc_getacl(struc + + dprintk("NFS call getacl\n"); + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; +- nfs_fattr_init(&fattr); ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ return ERR_PTR(-ENOMEM); ++ + status = rpc_call_sync(server->client_acl, &msg, 0); + dprintk("NFS reply getacl: %d\n", status); + +@@ -238,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struc + + switch (status) { + case 0: +- status = nfs_refresh_inode(inode, &fattr); ++ status = nfs_refresh_inode(inode, res.fattr); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: +@@ -278,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struc + getout: + posix_acl_release(res.acl_access); + posix_acl_release(res.acl_default); ++ nfs_free_fattr(res.fattr); + + if (status != 0) { + posix_acl_release(acl); +@@ -290,7 +293,7 @@ static int nfs3_proc_setacls(struct inod + struct posix_acl *dfacl) + { + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + struct page *pages[NFSACL_MAXPAGES]; + struct nfs3_setaclargs args = { + .inode = inode, +@@ -335,8 +338,13 @@ static int nfs3_proc_setacls(struct inod + } + + dprintk("NFS call setacl\n"); ++ status = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out_freepages; ++ + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; +- nfs_fattr_init(&fattr); ++ msg.rpc_resp = fattr; + status = rpc_call_sync(server->client_acl, &msg, 0); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); +@@ -344,7 +352,7 @@ static int nfs3_proc_setacls(struct inod + + switch (status) { + case 0: +- status = nfs_refresh_inode(inode, &fattr); ++ status = nfs_refresh_inode(inode, fattr); + nfs3_cache_acls(inode, acl, dfacl); + break; + case -EPFNOSUPPORT: +@@ -355,6 +363,7 @@ static int nfs3_proc_setacls(struct inod + case -ENOTSUPP: + status = -EOPNOTSUPP; + } ++ nfs_free_fattr(fattr); + out_freepages: + while (args.npages != 0) { + 
args.npages--; +diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-23 11:01:00.360574301 -0400 +@@ -144,14 +144,12 @@ static int + nfs3_proc_lookup(struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) + { +- struct nfs_fattr dir_attr; + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct nfs3_diropres res = { +- .dir_attr = &dir_attr, + .fh = fhandle, + .fattr = fattr + }; +@@ -163,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, stru + int status; + + dprintk("NFS call lookup %s\n", name->name); +- nfs_fattr_init(&dir_attr); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ return -ENOMEM; ++ + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); +- nfs_refresh_inode(dir, &dir_attr); ++ nfs_refresh_inode(dir, res.dir_attr); + if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_argp = fhandle; + msg.rpc_resp = fattr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } ++ nfs_free_fattr(res.dir_attr); + dprintk("NFS reply lookup: %d\n", status); + return status; + } + + static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) + { +- struct nfs_fattr fattr; + struct nfs3_accessargs arg = { + .fh = NFS_FH(inode), + }; +- struct nfs3_accessres res = { +- .fattr = &fattr, +- }; ++ struct nfs3_accessres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], + .rpc_argp = &arg, +@@ -193,7 +192,7 @@ static int nfs3_proc_access(struct inode + .rpc_cred = entry->cred, + }; + int mode = entry->mask; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call access\n"); + +@@ -210,9 +209,13 @@ static int nfs3_proc_access(struct inode + if (mode & 
MAY_EXEC) + arg.access |= NFS3_ACCESS_EXECUTE; + } +- nfs_fattr_init(&fattr); ++ ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); +- nfs_refresh_inode(inode, &fattr); ++ nfs_refresh_inode(inode, res.fattr); + if (status == 0) { + entry->mask = 0; + if (res.access & NFS3_ACCESS_READ) +@@ -222,6 +225,8 @@ static int nfs3_proc_access(struct inode + if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; + } ++ nfs_free_fattr(res.fattr); ++out: + dprintk("NFS reply access: %d\n", status); + return status; + } +@@ -229,7 +234,7 @@ static int nfs3_proc_access(struct inode + static int nfs3_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) + { +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + struct nfs3_readlinkargs args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, +@@ -239,14 +244,19 @@ static int nfs3_proc_readlink(struct ino + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], + .rpc_argp = &args, +- .rpc_resp = &fattr, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call readlink\n"); +- nfs_fattr_init(&fattr); ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out; ++ msg.rpc_resp = fattr; ++ + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); +- nfs_refresh_inode(inode, &fattr); ++ nfs_refresh_inode(inode, fattr); ++ nfs_free_fattr(fattr); ++out: + dprintk("NFS reply readlink: %d\n", status); + return status; + } +@@ -396,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, stru + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call remove %s\n", name->name); +- nfs_fattr_init(&res.dir_attr); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); +- nfs_post_op_update_inode(dir, &res.dir_attr); ++ 
nfs_post_op_update_inode(dir, res.dir_attr); ++ nfs_free_fattr(res.dir_attr); ++out: + dprintk("NFS reply remove: %d\n", status); + return status; + } +@@ -419,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *t + if (nfs3_async_handle_jukebox(task, dir)) + return 0; + res = task->tk_msg.rpc_resp; +- nfs_post_op_update_inode(dir, &res->dir_attr); ++ nfs_post_op_update_inode(dir, res->dir_attr); + return 1; + } + +@@ -427,7 +442,6 @@ static int + nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) + { +- struct nfs_fattr old_dir_attr, new_dir_attr; + struct nfs3_renameargs arg = { + .fromfh = NFS_FH(old_dir), + .fromname = old_name->name, +@@ -436,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, + .toname = new_name->name, + .tolen = new_name->len + }; +- struct nfs3_renameres res = { +- .fromattr = &old_dir_attr, +- .toattr = &new_dir_attr +- }; ++ struct nfs3_renameres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); +- nfs_fattr_init(&old_dir_attr); +- nfs_fattr_init(&new_dir_attr); ++ ++ res.fromattr = nfs_alloc_fattr(); ++ res.toattr = nfs_alloc_fattr(); ++ if (res.fromattr == NULL || res.toattr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); +- nfs_post_op_update_inode(old_dir, &old_dir_attr); +- nfs_post_op_update_inode(new_dir, &new_dir_attr); ++ nfs_post_op_update_inode(old_dir, res.fromattr); ++ nfs_post_op_update_inode(new_dir, res.toattr); ++out: ++ nfs_free_fattr(res.toattr); ++ nfs_free_fattr(res.fromattr); + dprintk("NFS reply rename: %d\n", status); + return status; + } +@@ -460,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, + static int + nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) + { +- struct nfs_fattr dir_attr, fattr; + 
struct nfs3_linkargs arg = { + .fromfh = NFS_FH(inode), + .tofh = NFS_FH(dir), + .toname = name->name, + .tolen = name->len + }; +- struct nfs3_linkres res = { +- .dir_attr = &dir_attr, +- .fattr = &fattr +- }; ++ struct nfs3_linkres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call link %s\n", name->name); +- nfs_fattr_init(&dir_attr); +- nfs_fattr_init(&fattr); ++ res.fattr = nfs_alloc_fattr(); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.fattr == NULL || res.dir_attr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); +- nfs_post_op_update_inode(dir, &dir_attr); +- nfs_post_op_update_inode(inode, &fattr); ++ nfs_post_op_update_inode(dir, res.dir_attr); ++ nfs_post_op_update_inode(inode, res.fattr); ++out: ++ nfs_free_fattr(res.dir_attr); ++ nfs_free_fattr(res.fattr); + dprintk("NFS reply link: %d\n", status); + return status; + } +@@ -554,7 +574,7 @@ out: + static int + nfs3_proc_rmdir(struct inode *dir, struct qstr *name) + { +- struct nfs_fattr dir_attr; ++ struct nfs_fattr *dir_attr; + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, +@@ -563,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struc + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], + .rpc_argp = &arg, +- .rpc_resp = &dir_attr, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call rmdir %s\n", name->name); +- nfs_fattr_init(&dir_attr); ++ dir_attr = nfs_alloc_fattr(); ++ if (dir_attr == NULL) ++ goto out; ++ ++ msg.rpc_resp = dir_attr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); +- nfs_post_op_update_inode(dir, &dir_attr); ++ nfs_post_op_update_inode(dir, dir_attr); ++ nfs_free_fattr(dir_attr); ++out: + dprintk("NFS reply rmdir: %d\n", status); + return status; + } +@@ -589,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, + u64 cookie, struct page *page, 
unsigned int count, int plus) + { + struct inode *dir = dentry->d_inode; +- struct nfs_fattr dir_attr; + __be32 *verf = NFS_COOKIEVERF(dir); + struct nfs3_readdirargs arg = { + .fh = NFS_FH(dir), +@@ -600,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, + .pages = &page + }; + struct nfs3_readdirres res = { +- .dir_attr = &dir_attr, + .verf = verf, + .plus = plus + }; +@@ -610,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, + .rpc_resp = &res, + .rpc_cred = cred + }; +- int status; ++ int status = -ENOMEM; + + if (plus) + msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; +@@ -618,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, + dprintk("NFS call readdir%s %d\n", + plus? "plus" : "", (unsigned int) cookie); + +- nfs_fattr_init(&dir_attr); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + + nfs_invalidate_atime(dir); ++ nfs_refresh_inode(dir, res.dir_attr); + +- nfs_refresh_inode(dir, &dir_attr); ++ nfs_free_fattr(res.dir_attr); ++out: + dprintk("NFS reply readdir: %d\n", status); + return status; + } +diff -up linux-2.6.34.noarch/fs/nfs/nfs3xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs3xdr.c +--- linux-2.6.34.noarch/fs/nfs/nfs3xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3xdr.c 2010-08-23 11:01:00.361593802 -0400 +@@ -762,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, _ + static int + nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) + { +- return nfs3_xdr_wccstat(req, p, &res->dir_attr); ++ return nfs3_xdr_wccstat(req, p, res->dir_attr); + } + + /* +diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-23 11:01:00.362574935 -0400 +@@ -206,14 +206,14 @@ extern ssize_t nfs4_listxattr(struct den + + + /* nfs4proc.c */ +-extern int 
nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); +-extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); ++extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); ++extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); + extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); + extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); + extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); + extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); +-extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); ++extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); + extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); + extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); + extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); +@@ -286,7 +286,7 @@ extern void nfs4_put_lock_state(struct n + extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); + extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); + +-extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); ++extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); + extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); + extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); + extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); +diff -up linux-2.6.34.noarch/fs/nfs/nfs4namespace.c.orig linux-2.6.34.noarch/fs/nfs/nfs4namespace.c +--- 
linux-2.6.34.noarch/fs/nfs/nfs4namespace.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4namespace.c 2010-08-23 11:01:00.363574219 -0400 +@@ -115,6 +115,7 @@ static struct vfsmount *try_location(str + char *page, char *page2, + const struct nfs4_fs_location *location) + { ++ const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + unsigned int maxbuflen; +@@ -126,9 +127,12 @@ static struct vfsmount *try_location(str + mountdata->mnt_path = mnt_path; + maxbuflen = mnt_path - 1 - page2; + ++ mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL); ++ if (mountdata->addr == NULL) ++ return ERR_PTR(-ENOMEM); ++ + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; +- struct sockaddr_storage addr; + + if (buf->len <= 0 || buf->len >= maxbuflen) + continue; +@@ -137,11 +141,10 @@ static struct vfsmount *try_location(str + continue; + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, +- (struct sockaddr *)&addr, sizeof(addr)); ++ mountdata->addr, addr_bufsize); + if (mountdata->addrlen == 0) + continue; + +- mountdata->addr = (struct sockaddr *)&addr; + rpc_set_port(mountdata->addr, NFS_PORT); + + memcpy(page2, buf->data, buf->len); +@@ -156,6 +159,7 @@ static struct vfsmount *try_location(str + if (!IS_ERR(mnt)) + break; + } ++ kfree(mountdata->addr); + return mnt; + } + +@@ -221,8 +225,8 @@ out: + + /* + * nfs_do_refmount - handle crossing a referral on server ++ * @mnt_parent - mountpoint of referral + * @dentry - dentry of referral +- * @nd - nameidata info + * + */ + struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) +diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-23 11:01:00.365544029 -0400 +@@ 
-70,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_ser + static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); ++static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, ++ struct nfs_fattr *fattr, struct iattr *sattr, ++ struct nfs4_state *state); + + /* Prevent leaks of NFSv4 errors into userland */ + static int nfs4_map_errors(int err) +@@ -714,17 +717,18 @@ static void nfs4_init_opendata_res(struc + + static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, + struct nfs4_state_owner *sp, fmode_t fmode, int flags, +- const struct iattr *attrs) ++ const struct iattr *attrs, ++ gfp_t gfp_mask) + { + struct dentry *parent = dget_parent(path->dentry); + struct inode *dir = parent->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *p; + +- p = kzalloc(sizeof(*p), GFP_KERNEL); ++ p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + goto err; +- p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); ++ p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); + if (p->o_arg.seqid == NULL) + goto err_free; + path_get(path); +@@ -1060,7 +1064,7 @@ static struct nfs4_opendata *nfs4_open_r + { + struct nfs4_opendata *opendata; + +- opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL); ++ opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS); + if (opendata == NULL) + return ERR_PTR(-ENOMEM); + opendata->state = state; +@@ -1648,7 +1652,7 @@ static int _nfs4_do_open(struct inode *d + if (path->dentry->d_inode != NULL) + nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); + status = -ENOMEM; +- opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr); ++ opendata = 
nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL); + if (opendata == NULL) + goto err_put_state_owner; + +@@ -1659,15 +1663,24 @@ static int _nfs4_do_open(struct inode *d + if (status != 0) + goto err_opendata_put; + +- if (opendata->o_arg.open_flags & O_EXCL) +- nfs4_exclusive_attrset(opendata, sattr); +- + state = nfs4_opendata_to_nfs4_state(opendata); + status = PTR_ERR(state); + if (IS_ERR(state)) + goto err_opendata_put; + if (server->caps & NFS_CAP_POSIX_LOCK) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); ++ ++ if (opendata->o_arg.open_flags & O_EXCL) { ++ nfs4_exclusive_attrset(opendata, sattr); ++ ++ nfs_fattr_init(opendata->o_res.f_attr); ++ status = nfs4_do_setattr(state->inode, cred, ++ opendata->o_res.f_attr, sattr, ++ state); ++ if (status == 0) ++ nfs_setattr_update_inode(state->inode, sattr); ++ nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); ++ } + nfs4_opendata_put(opendata); + nfs4_put_state_owner(sp); + *res = state; +@@ -1914,7 +1927,7 @@ static const struct rpc_call_ops nfs4_cl + * + * NOTE: Caller must be holding the sp->so_owner semaphore! 
+ */ +-int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) ++int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) + { + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_closedata *calldata; +@@ -1933,7 +1946,7 @@ int nfs4_do_close(struct path *path, str + }; + int status = -ENOMEM; + +- calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); ++ calldata = kzalloc(sizeof(*calldata), gfp_mask); + if (calldata == NULL) + goto out; + calldata->inode = state->inode; +@@ -1941,7 +1954,7 @@ int nfs4_do_close(struct path *path, str + calldata->arg.fh = NFS_FH(state->inode); + calldata->arg.stateid = &state->open_stateid; + /* Serialization for the sequence id */ +- calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); ++ calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); + if (calldata->arg.seqid == NULL) + goto out_free_calldata; + calldata->arg.fmode = 0; +@@ -2404,14 +2417,12 @@ static int nfs4_proc_lookup(struct inode + static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) + { + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs_fattr fattr; + struct nfs4_accessargs args = { + .fh = NFS_FH(inode), + .bitmask = server->attr_bitmask, + }; + struct nfs4_accessres res = { + .server = server, +- .fattr = &fattr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], +@@ -2438,7 +2449,11 @@ static int _nfs4_proc_access(struct inod + if (mode & MAY_EXEC) + args.access |= NFS4_ACCESS_EXECUTE; + } +- nfs_fattr_init(&fattr); ++ ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ return -ENOMEM; ++ + status = nfs4_call_sync(server, &msg, &args, &res, 0); + if (!status) { + entry->mask = 0; +@@ -2448,8 +2463,9 @@ static int _nfs4_proc_access(struct inod + entry->mask |= MAY_WRITE; + if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; +- nfs_refresh_inode(inode, 
&fattr); ++ nfs_refresh_inode(inode, res.fattr); + } ++ nfs_free_fattr(res.fattr); + return status; + } + +@@ -2562,13 +2578,6 @@ nfs4_proc_create(struct inode *dir, stru + } + d_add(dentry, igrab(state->inode)); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +- if (flags & O_EXCL) { +- struct nfs_fattr fattr; +- status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state); +- if (status == 0) +- nfs_setattr_update_inode(state->inode, sattr); +- nfs_post_op_update_inode(state->inode, &fattr); +- } + if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) + status = nfs4_intent_set_file(nd, &path, state, fmode); + else +@@ -2596,14 +2605,19 @@ static int _nfs4_proc_remove(struct inod + .rpc_argp = &args, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; ++ ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ goto out; + +- nfs_fattr_init(&res.dir_attr); + status = nfs4_call_sync(server, &msg, &args, &res, 1); + if (status == 0) { + update_changeattr(dir, &res.cinfo); +- nfs_post_op_update_inode(dir, &res.dir_attr); ++ nfs_post_op_update_inode(dir, res.dir_attr); + } ++ nfs_free_fattr(res.dir_attr); ++out: + return status; + } + +@@ -2638,7 +2652,7 @@ static int nfs4_proc_unlink_done(struct + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); +- nfs_post_op_update_inode(dir, &res->dir_attr); ++ nfs_post_op_update_inode(dir, res->dir_attr); + return 1; + } + +@@ -2653,29 +2667,31 @@ static int _nfs4_proc_rename(struct inod + .new_name = new_name, + .bitmask = server->attr_bitmask, + }; +- struct nfs_fattr old_fattr, new_fattr; + struct nfs4_rename_res res = { + .server = server, +- .old_fattr = &old_fattr, +- .new_fattr = &new_fattr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + +- nfs_fattr_init(res.old_fattr); +- 
nfs_fattr_init(res.new_fattr); +- status = nfs4_call_sync(server, &msg, &arg, &res, 1); ++ res.old_fattr = nfs_alloc_fattr(); ++ res.new_fattr = nfs_alloc_fattr(); ++ if (res.old_fattr == NULL || res.new_fattr == NULL) ++ goto out; + ++ status = nfs4_call_sync(server, &msg, &arg, &res, 1); + if (!status) { + update_changeattr(old_dir, &res.old_cinfo); + nfs_post_op_update_inode(old_dir, res.old_fattr); + update_changeattr(new_dir, &res.new_cinfo); + nfs_post_op_update_inode(new_dir, res.new_fattr); + } ++out: ++ nfs_free_fattr(res.new_fattr); ++ nfs_free_fattr(res.old_fattr); + return status; + } + +@@ -2702,28 +2718,30 @@ static int _nfs4_proc_link(struct inode + .name = name, + .bitmask = server->attr_bitmask, + }; +- struct nfs_fattr fattr, dir_attr; + struct nfs4_link_res res = { + .server = server, +- .fattr = &fattr, +- .dir_attr = &dir_attr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; ++ ++ res.fattr = nfs_alloc_fattr(); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.fattr == NULL || res.dir_attr == NULL) ++ goto out; + +- nfs_fattr_init(res.fattr); +- nfs_fattr_init(res.dir_attr); + status = nfs4_call_sync(server, &msg, &arg, &res, 1); + if (!status) { + update_changeattr(dir, &res.cinfo); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_post_op_update_inode(inode, res.fattr); + } +- ++out: ++ nfs_free_fattr(res.dir_attr); ++ nfs_free_fattr(res.fattr); + return status; + } + +@@ -3146,23 +3164,31 @@ static void nfs4_proc_commit_setup(struc + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + } + ++struct nfs4_renewdata { ++ struct nfs_client *client; ++ unsigned long timestamp; ++}; ++ + /* + * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special + * standalone procedure for queueing an asynchronous RENEW. 
+ */ +-static void nfs4_renew_release(void *data) ++static void nfs4_renew_release(void *calldata) + { +- struct nfs_client *clp = data; ++ struct nfs4_renewdata *data = calldata; ++ struct nfs_client *clp = data->client; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); ++ kfree(data); + } + +-static void nfs4_renew_done(struct rpc_task *task, void *data) ++static void nfs4_renew_done(struct rpc_task *task, void *calldata) + { +- struct nfs_client *clp = data; +- unsigned long timestamp = task->tk_start; ++ struct nfs4_renewdata *data = calldata; ++ struct nfs_client *clp = data->client; ++ unsigned long timestamp = data->timestamp; + + if (task->tk_status < 0) { + /* Unless we're shutting down, schedule state recovery! */ +@@ -3188,11 +3214,17 @@ int nfs4_proc_async_renew(struct nfs_cli + .rpc_argp = clp, + .rpc_cred = cred, + }; ++ struct nfs4_renewdata *data; + + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ if (data == NULL) ++ return -ENOMEM; ++ data->client = clp; ++ data->timestamp = jiffies; + return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, +- &nfs4_renew_ops, clp); ++ &nfs4_renew_ops, data); + } + + int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) +@@ -3494,7 +3526,9 @@ nfs4_async_handle_error(struct rpc_task + return _nfs4_async_handle_error(task, server, server->nfs_client, state); + } + +-int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) ++int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, ++ unsigned short port, struct rpc_cred *cred, ++ struct nfs4_setclientid_res *res) + { + nfs4_verifier sc_verifier; + struct nfs4_setclientid setclientid = { +@@ -3504,7 +3538,7 @@ int nfs4_proc_setclientid(struct nfs_cli + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], + .rpc_argp = &setclientid, +- .rpc_resp = clp, ++ 
.rpc_resp = res, + .rpc_cred = cred, + }; + __be32 *p; +@@ -3547,12 +3581,14 @@ int nfs4_proc_setclientid(struct nfs_cli + return status; + } + +-static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) ++static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, ++ struct nfs4_setclientid_res *arg, ++ struct rpc_cred *cred) + { + struct nfs_fsinfo fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], +- .rpc_argp = clp, ++ .rpc_argp = arg, + .rpc_resp = &fsinfo, + .rpc_cred = cred, + }; +@@ -3570,12 +3606,14 @@ static int _nfs4_proc_setclientid_confir + return status; + } + +-int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) ++int nfs4_proc_setclientid_confirm(struct nfs_client *clp, ++ struct nfs4_setclientid_res *arg, ++ struct rpc_cred *cred) + { + long timeout = 0; + int err; + do { +- err = _nfs4_proc_setclientid_confirm(clp, cred); ++ err = _nfs4_proc_setclientid_confirm(clp, arg, cred); + switch (err) { + case 0: + return err; +@@ -3667,7 +3705,7 @@ static int _nfs4_proc_delegreturn(struct + }; + int status = 0; + +- data = kzalloc(sizeof(*data), GFP_KERNEL); ++ data = kzalloc(sizeof(*data), GFP_NOFS); + if (data == NULL) + return -ENOMEM; + data->args.fhandle = &data->fh; +@@ -3823,7 +3861,7 @@ static struct nfs4_unlockdata *nfs4_allo + struct nfs4_unlockdata *p; + struct inode *inode = lsp->ls_state->inode; + +- p = kzalloc(sizeof(*p), GFP_KERNEL); ++ p = kzalloc(sizeof(*p), GFP_NOFS); + if (p == NULL) + return NULL; + p->arg.fh = NFS_FH(inode); +@@ -3961,7 +3999,7 @@ static int nfs4_proc_unlck(struct nfs4_s + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) + goto out; + lsp = request->fl_u.nfs4_fl.owner; +- seqid = nfs_alloc_seqid(&lsp->ls_seqid); ++ seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); + status = -ENOMEM; + if (seqid == NULL) + goto out; +@@ -3989,22 +4027,23 @@ struct nfs4_lockdata { + }; + + static struct 
nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, +- struct nfs_open_context *ctx, struct nfs4_lock_state *lsp) ++ struct nfs_open_context *ctx, struct nfs4_lock_state *lsp, ++ gfp_t gfp_mask) + { + struct nfs4_lockdata *p; + struct inode *inode = lsp->ls_state->inode; + struct nfs_server *server = NFS_SERVER(inode); + +- p = kzalloc(sizeof(*p), GFP_KERNEL); ++ p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + return NULL; + + p->arg.fh = NFS_FH(inode); + p->arg.fl = &p->fl; +- p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid); ++ p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); + if (p->arg.open_seqid == NULL) + goto out_free; +- p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); ++ p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); + if (p->arg.lock_seqid == NULL) + goto out_free_seqid; + p->arg.lock_stateid = &lsp->ls_stateid; +@@ -4158,7 +4197,8 @@ static int _nfs4_do_setlk(struct nfs4_st + + dprintk("%s: begin!\n", __func__); + data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), +- fl->fl_u.nfs4_fl.owner); ++ fl->fl_u.nfs4_fl.owner, ++ recovery_type == NFS_LOCK_NEW ? 
GFP_KERNEL : GFP_NOFS); + if (data == NULL) + return -ENOMEM; + if (IS_SETLKW(cmd)) +@@ -4647,7 +4687,7 @@ static int nfs4_reset_slot_table(struct + if (max_reqs != tbl->max_slots) { + ret = -ENOMEM; + new = kmalloc(max_reqs * sizeof(struct nfs4_slot), +- GFP_KERNEL); ++ GFP_NOFS); + if (!new) + goto out; + ret = 0; +@@ -4712,7 +4752,7 @@ static int nfs4_init_slot_table(struct n + + dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); + +- slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); ++ slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS); + if (!slot) + goto out; + ret = 0; +@@ -4761,7 +4801,7 @@ struct nfs4_session *nfs4_alloc_session( + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + +- session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL); ++ session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (!session) + return NULL; + +@@ -5105,8 +5145,8 @@ static int nfs41_proc_async_sequence(str + + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; +- args = kzalloc(sizeof(*args), GFP_KERNEL); +- res = kzalloc(sizeof(*res), GFP_KERNEL); ++ args = kzalloc(sizeof(*args), GFP_NOFS); ++ res = kzalloc(sizeof(*res), GFP_NOFS); + if (!args || !res) { + kfree(args); + kfree(res); +@@ -5207,7 +5247,7 @@ static int nfs41_proc_reclaim_complete(s + int status = -ENOMEM; + + dprintk("--> %s\n", __func__); +- calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); ++ calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) + goto out; + calldata->clp = clp; +diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-23 11:01:00.367574218 -0400 +@@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list); + + int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) + { ++ struct nfs4_setclientid_res clid; + unsigned short port; + int 
status; + +@@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client + if (clp->cl_addr.ss_family == AF_INET6) + port = nfs_callback_tcpport6; + +- status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); +- if (status == 0) +- status = nfs4_proc_setclientid_confirm(clp, cred); +- if (status == 0) +- nfs4_schedule_state_renewal(clp); ++ status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); ++ if (status != 0) ++ goto out; ++ status = nfs4_proc_setclientid_confirm(clp, &clid, cred); ++ if (status != 0) ++ goto out; ++ clp->cl_clientid = clid.clientid; ++ nfs4_schedule_state_renewal(clp); ++out: + return status; + } + +@@ -361,7 +366,7 @@ nfs4_alloc_state_owner(void) + { + struct nfs4_state_owner *sp; + +- sp = kzalloc(sizeof(*sp),GFP_KERNEL); ++ sp = kzalloc(sizeof(*sp),GFP_NOFS); + if (!sp) + return NULL; + spin_lock_init(&sp->so_lock); +@@ -435,7 +440,7 @@ nfs4_alloc_open_state(void) + { + struct nfs4_state *state; + +- state = kzalloc(sizeof(*state), GFP_KERNEL); ++ state = kzalloc(sizeof(*state), GFP_NOFS); + if (!state) + return NULL; + atomic_set(&state->count, 1); +@@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_sta + /* + * Close the current file. 
+ */ +-static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait) ++static void __nfs4_close(struct path *path, struct nfs4_state *state, ++ fmode_t fmode, gfp_t gfp_mask, int wait) + { + struct nfs4_state_owner *owner = state->owner; + int call_close = 0; +@@ -578,17 +584,17 @@ static void __nfs4_close(struct path *pa + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); + } else +- nfs4_do_close(path, state, wait); ++ nfs4_do_close(path, state, gfp_mask, wait); + } + + void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) + { +- __nfs4_close(path, state, fmode, 0); ++ __nfs4_close(path, state, fmode, GFP_NOFS, 0); + } + + void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) + { +- __nfs4_close(path, state, fmode, 1); ++ __nfs4_close(path, state, fmode, GFP_KERNEL, 1); + } + + /* +@@ -618,7 +624,7 @@ static struct nfs4_lock_state *nfs4_allo + struct nfs4_lock_state *lsp; + struct nfs_client *clp = state->owner->so_client; + +- lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); ++ lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) + return NULL; + rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); +@@ -754,11 +760,11 @@ void nfs4_copy_stateid(nfs4_stateid *dst + nfs4_put_lock_state(lsp); + } + +-struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) ++struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) + { + struct nfs_seqid *new; + +- new = kmalloc(sizeof(*new), GFP_KERNEL); ++ new = kmalloc(sizeof(*new), gfp_mask); + if (new != NULL) { + new->sequence = counter; + INIT_LIST_HEAD(&new->list); +@@ -1347,7 +1353,7 @@ static int nfs4_recall_slot(struct nfs_c + + nfs4_begin_drain_session(clp); + new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), +- GFP_KERNEL); ++ GFP_NOFS); + if (!new) + return -ENOMEM; + +diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 
+--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-23 11:00:23.792491380 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-23 11:01:00.369544055 -0400 +@@ -1504,14 +1504,14 @@ static void encode_setclientid(struct xd + hdr->replen += decode_setclientid_maxsz; + } + +-static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr) ++static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) + { + __be32 *p; + + p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); +- p = xdr_encode_hyper(p, client_state->cl_clientid); +- xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); ++ p = xdr_encode_hyper(p, arg->clientid); ++ xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE); + hdr->nops++; + hdr->replen += decode_setclientid_confirm_maxsz; + } +@@ -2324,7 +2324,7 @@ static int nfs4_xdr_enc_setclientid(stru + /* + * a SETCLIENTID_CONFIRM request + */ +-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) ++static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg) + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +@@ -2334,7 +2334,7 @@ static int nfs4_xdr_enc_setclientid_conf + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +- encode_setclientid_confirm(&xdr, clp, &hdr); ++ encode_setclientid_confirm(&xdr, arg, &hdr); + encode_putrootfh(&xdr, &hdr); + encode_fsinfo(&xdr, lease_bitmap, &hdr); + encode_nops(&hdr); +@@ -4397,7 +4397,7 @@ out_overflow: + return -EIO; + } + +-static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) ++static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res) + { + __be32 *p; + uint32_t opnum; +@@ -4417,8 +4417,8 @@ static int 
decode_setclientid(struct xdr + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; +- p = xdr_decode_hyper(p, &clp->cl_clientid); +- memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); ++ p = xdr_decode_hyper(p, &res->clientid); ++ memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); + } else if (nfserr == NFSERR_CLID_INUSE) { + uint32_t len; + +@@ -4815,7 +4815,7 @@ static int nfs4_xdr_dec_remove(struct rp + goto out; + if ((status = decode_remove(&xdr, &res->cinfo)) != 0) + goto out; +- decode_getfattr(&xdr, &res->dir_attr, res->server, ++ decode_getfattr(&xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); + out: + return status; +@@ -5498,7 +5498,7 @@ static int nfs4_xdr_dec_renew(struct rpc + * Decode SETCLIENTID response + */ + static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, +- struct nfs_client *clp) ++ struct nfs4_setclientid_res *res) + { + struct xdr_stream xdr; + struct compound_hdr hdr; +@@ -5507,7 +5507,7 @@ static int nfs4_xdr_dec_setclientid(stru + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) +- status = decode_setclientid(&xdr, clp); ++ status = decode_setclientid(&xdr, res); + return status; + } + +diff -up linux-2.6.34.noarch/fs/nfs/nfsroot.c.orig linux-2.6.34.noarch/fs/nfs/nfsroot.c +--- linux-2.6.34.noarch/fs/nfs/nfsroot.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfsroot.c 2010-08-23 11:01:00.371574358 -0400 +@@ -488,7 +488,6 @@ static int __init root_nfs_ports(void) + */ + static int __init root_nfs_get_handle(void) + { +- struct nfs_fh fh; + struct sockaddr_in sin; + unsigned int auth_flav_len = 0; + struct nfs_mount_request request = { +@@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(vo + NFS_MNT3_VERSION : NFS_MNT_VERSION, + .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? 
+ XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP, +- .fh = &fh, + .auth_flav_len = &auth_flav_len, + }; +- int status; ++ int status = -ENOMEM; + ++ request.fh = nfs_alloc_fhandle(); ++ if (!request.fh) ++ goto out; + set_sockaddr(&sin, servaddr, htons(mount_port)); + status = nfs_mount(&request); + if (status < 0) + printk(KERN_ERR "Root-NFS: Server returned error %d " + "while mounting %s\n", status, nfs_export_path); + else { +- nfs_data.root.size = fh.size; +- memcpy(nfs_data.root.data, fh.data, fh.size); ++ nfs_data.root.size = request.fh->size; ++ memcpy(&nfs_data.root.data, request.fh->data, request.fh->size); + } +- ++ nfs_free_fhandle(request.fh); ++out: + return status; + } + +diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-23 11:01:00.371574358 -0400 +@@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_conte + { + struct nfs_page *req; + +- for (;;) { +- /* try to allocate the request struct */ +- req = nfs_page_alloc(); +- if (req != NULL) +- break; +- +- if (fatal_signal_pending(current)) +- return ERR_PTR(-ERESTARTSYS); +- yield(); +- } ++ /* try to allocate the request struct */ ++ req = nfs_page_alloc(); ++ if (req == NULL) ++ return ERR_PTR(-ENOMEM); + + /* Initialize the request struct. Initially, we assume a + * long write-back delay. 
This will be adjusted in +diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-23 11:01:00.372574292 -0400 +@@ -224,35 +224,60 @@ static int nfs_proc_readlink(struct inod + return status; + } + ++struct nfs_createdata { ++ struct nfs_createargs arg; ++ struct nfs_diropok res; ++ struct nfs_fh fhandle; ++ struct nfs_fattr fattr; ++}; ++ ++static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir, ++ struct dentry *dentry, struct iattr *sattr) ++{ ++ struct nfs_createdata *data; ++ ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ ++ if (data != NULL) { ++ data->arg.fh = NFS_FH(dir); ++ data->arg.name = dentry->d_name.name; ++ data->arg.len = dentry->d_name.len; ++ data->arg.sattr = sattr; ++ nfs_fattr_init(&data->fattr); ++ data->fhandle.size = 0; ++ data->res.fh = &data->fhandle; ++ data->res.fattr = &data->fattr; ++ } ++ return data; ++}; ++ ++static void nfs_free_createdata(const struct nfs_createdata *data) ++{ ++ kfree(data); ++} ++ + static int + nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags, struct nameidata *nd) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; +- struct nfs_createargs arg = { +- .fh = NFS_FH(dir), +- .name = dentry->d_name.name, +- .len = dentry->d_name.len, +- .sattr = sattr +- }; +- struct nfs_diropok res = { +- .fh = &fhandle, +- .fattr = &fattr +- }; ++ struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], +- .rpc_argp = &arg, +- .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + +- nfs_fattr_init(&fattr); + dprintk("NFS call create %s\n", dentry->d_name.name); ++ data = nfs_alloc_createdata(dir, dentry, sattr); ++ if (data == NULL) ++ goto out; ++ msg.rpc_argp = &data->arg; ++ msg.rpc_resp = &data->res; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + 
nfs_mark_for_revalidate(dir); + if (status == 0) +- status = nfs_instantiate(dentry, &fhandle, &fattr); ++ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); ++ nfs_free_createdata(data); ++out: + dprintk("NFS reply create: %d\n", status); + return status; + } +@@ -264,24 +289,12 @@ static int + nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + dev_t rdev) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; +- struct nfs_createargs arg = { +- .fh = NFS_FH(dir), +- .name = dentry->d_name.name, +- .len = dentry->d_name.len, +- .sattr = sattr +- }; +- struct nfs_diropok res = { +- .fh = &fhandle, +- .fattr = &fattr +- }; ++ struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], +- .rpc_argp = &arg, +- .rpc_resp = &res, + }; +- int status, mode; ++ umode_t mode; ++ int status = -ENOMEM; + + dprintk("NFS call mknod %s\n", dentry->d_name.name); + +@@ -294,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct + sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ + } + +- nfs_fattr_init(&fattr); ++ data = nfs_alloc_createdata(dir, dentry, sattr); ++ if (data == NULL) ++ goto out; ++ msg.rpc_argp = &data->arg; ++ msg.rpc_resp = &data->res; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + if (status == -EINVAL && S_ISFIFO(mode)) { + sattr->ia_mode = mode; +- nfs_fattr_init(&fattr); ++ nfs_fattr_init(data->res.fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } + if (status == 0) +- status = nfs_instantiate(dentry, &fhandle, &fattr); ++ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); ++ nfs_free_createdata(data); ++out: + dprintk("NFS reply mknod: %d\n", status); + return status; + } +@@ -398,8 +418,8 @@ static int + nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, + unsigned int len, struct iattr *sattr) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; ++ 
struct nfs_fh *fh; ++ struct nfs_fattr *fattr; + struct nfs_symlinkargs arg = { + .fromfh = NFS_FH(dir), + .fromname = dentry->d_name.name, +@@ -412,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, stru + .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK], + .rpc_argp = &arg, + }; +- int status; ++ int status = -ENAMETOOLONG; ++ ++ dprintk("NFS call symlink %s\n", dentry->d_name.name); + + if (len > NFS2_MAXPATHLEN) +- return -ENAMETOOLONG; ++ goto out; + +- dprintk("NFS call symlink %s\n", dentry->d_name.name); ++ fh = nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ status = -ENOMEM; ++ if (fh == NULL || fattr == NULL) ++ goto out; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); +@@ -427,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, stru + * filehandle size to zero indicates to nfs_instantiate that it + * should fill in the data with a LOOKUP call on the wire. + */ +- if (status == 0) { +- nfs_fattr_init(&fattr); +- fhandle.size = 0; +- status = nfs_instantiate(dentry, &fhandle, &fattr); +- } ++ if (status == 0) ++ status = nfs_instantiate(dentry, fh, fattr); + ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fh); ++out: + dprintk("NFS reply symlink: %d\n", status); + return status; + } +@@ -440,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, stru + static int + nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; +- struct nfs_createargs arg = { +- .fh = NFS_FH(dir), +- .name = dentry->d_name.name, +- .len = dentry->d_name.len, +- .sattr = sattr +- }; +- struct nfs_diropok res = { +- .fh = &fhandle, +- .fattr = &fattr +- }; ++ struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], +- .rpc_argp = &arg, +- .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call mkdir %s\n", dentry->d_name.name); +- nfs_fattr_init(&fattr); ++ data = nfs_alloc_createdata(dir, dentry, 
sattr); ++ if (data == NULL) ++ goto out; ++ msg.rpc_argp = &data->arg; ++ msg.rpc_resp = &data->res; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + if (status == 0) +- status = nfs_instantiate(dentry, &fhandle, &fattr); ++ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); ++ nfs_free_createdata(data); ++out: + dprintk("NFS reply mkdir: %d\n", status); + return status; + } +diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-23 11:01:00.373574317 -0400 +@@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool; + + struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) + { +- struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS); ++ struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); + + if (p) { + memset(p, 0, sizeof(*p)); +@@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; + else { +- p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); ++ p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + if (!p->pagevec) { + mempool_free(p, nfs_rdata_mempool); + p = NULL; +diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-23 11:00:23.794511661 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-23 11:01:00.374564179 -0400 +@@ -141,7 +141,6 @@ static const match_table_t nfs_mount_opt + { Opt_resvport, "resvport" }, + { Opt_noresvport, "noresvport" }, + { Opt_fscache, "fsc" }, +- { Opt_fscache_uniq, "fsc=%s" }, + { Opt_nofscache, "nofsc" }, + + { Opt_port, "port=%s" }, +@@ -171,6 +170,7 @@ static const match_table_t nfs_mount_opt + { Opt_mountaddr, "mountaddr=%s" }, + + { Opt_lookupcache, "lookupcache=%s" }, ++ { 
Opt_fscache_uniq, "fsc=%s" }, + + { Opt_err, NULL } + }; +@@ -423,15 +423,19 @@ static int nfs_statfs(struct dentry *den + unsigned char blockbits; + unsigned long blockres; + struct nfs_fh *fh = NFS_FH(dentry->d_inode); +- struct nfs_fattr fattr; +- struct nfs_fsstat res = { +- .fattr = &fattr, +- }; +- int error; ++ struct nfs_fsstat res; ++ int error = -ENOMEM; ++ ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ goto out_err; + + error = server->nfs_client->rpc_ops->statfs(server, fh, &res); ++ ++ nfs_free_fattr(res.fattr); + if (error < 0) + goto out_err; ++ + buf->f_type = NFS_SUPER_MAGIC; + + /* +@@ -1060,14 +1064,6 @@ static int nfs_parse_mount_options(char + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; +- case Opt_fscache_uniq: +- string = match_strdup(args); +- if (!string) +- goto out_nomem; +- kfree(mnt->fscache_uniq); +- mnt->fscache_uniq = string; +- mnt->options |= NFS_OPTION_FSCACHE; +- break; + + /* + * options that take numeric values +@@ -1398,6 +1394,14 @@ static int nfs_parse_mount_options(char + return 0; + }; + break; ++ case Opt_fscache_uniq: ++ string = match_strdup(args); ++ if (string == NULL) ++ goto out_nomem; ++ kfree(mnt->fscache_uniq); ++ mnt->fscache_uniq = string; ++ mnt->options |= NFS_OPTION_FSCACHE; ++ break; + + /* + * Special options +@@ -2186,7 +2190,7 @@ static int nfs_get_sb(struct file_system + int error = -ENOMEM; + + data = nfs_alloc_parsed_mount_data(3); +- mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); ++ mntfh = nfs_alloc_fhandle(); + if (data == NULL || mntfh == NULL) + goto out_free_fh; + +@@ -2261,7 +2265,7 @@ out: + kfree(data->fscache_uniq); + security_free_mnt_opts(&data->lsm_opts); + out_free_fh: +- kfree(mntfh); ++ nfs_free_fhandle(mntfh); + kfree(data); + return error; + +@@ -2570,7 +2574,7 @@ static int nfs4_remote_get_sb(struct fil + }; + int error = -ENOMEM; + +- mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); ++ mntfh = nfs_alloc_fhandle(); + if (data == NULL || mntfh == NULL) + 
goto out_free_fh; + +@@ -2628,7 +2632,7 @@ static int nfs4_remote_get_sb(struct fil + out: + security_free_mnt_opts(&data->lsm_opts); + out_free_fh: +- kfree(mntfh); ++ nfs_free_fhandle(mntfh); + return error; + + out_free: +@@ -2683,41 +2687,120 @@ out_freepage: + free_page((unsigned long)page); + } + ++struct nfs_referral_count { ++ struct list_head list; ++ const struct task_struct *task; ++ unsigned int referral_count; ++}; ++ ++static LIST_HEAD(nfs_referral_count_list); ++static DEFINE_SPINLOCK(nfs_referral_count_list_lock); ++ ++static struct nfs_referral_count *nfs_find_referral_count(void) ++{ ++ struct nfs_referral_count *p; ++ ++ list_for_each_entry(p, &nfs_referral_count_list, list) { ++ if (p->task == current) ++ return p; ++ } ++ return NULL; ++} ++ ++#define NFS_MAX_NESTED_REFERRALS 2 ++ ++static int nfs_referral_loop_protect(void) ++{ ++ struct nfs_referral_count *p, *new; ++ int ret = -ENOMEM; ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ goto out; ++ new->task = current; ++ new->referral_count = 1; ++ ++ ret = 0; ++ spin_lock(&nfs_referral_count_list_lock); ++ p = nfs_find_referral_count(); ++ if (p != NULL) { ++ if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) ++ ret = -ELOOP; ++ else ++ p->referral_count++; ++ } else { ++ list_add(&new->list, &nfs_referral_count_list); ++ new = NULL; ++ } ++ spin_unlock(&nfs_referral_count_list_lock); ++ kfree(new); ++out: ++ return ret; ++} ++ ++static void nfs_referral_loop_unprotect(void) ++{ ++ struct nfs_referral_count *p; ++ ++ spin_lock(&nfs_referral_count_list_lock); ++ p = nfs_find_referral_count(); ++ p->referral_count--; ++ if (p->referral_count == 0) ++ list_del(&p->list); ++ else ++ p = NULL; ++ spin_unlock(&nfs_referral_count_list_lock); ++ kfree(p); ++} ++ + static int nfs_follow_remote_path(struct vfsmount *root_mnt, + const char *export_path, struct vfsmount *mnt_target) + { ++ struct nameidata *nd = NULL; + struct mnt_namespace *ns_private; +- struct nameidata nd; + struct 
super_block *s; + int ret; + ++ nd = kmalloc(sizeof(*nd), GFP_KERNEL); ++ if (nd == NULL) ++ return -ENOMEM; ++ + ns_private = create_mnt_ns(root_mnt); + ret = PTR_ERR(ns_private); + if (IS_ERR(ns_private)) + goto out_mntput; + ++ ret = nfs_referral_loop_protect(); ++ if (ret != 0) ++ goto out_put_mnt_ns; ++ + ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, +- export_path, LOOKUP_FOLLOW, &nd); ++ export_path, LOOKUP_FOLLOW, nd); + ++ nfs_referral_loop_unprotect(); + put_mnt_ns(ns_private); + + if (ret != 0) + goto out_err; + +- s = nd.path.mnt->mnt_sb; ++ s = nd->path.mnt->mnt_sb; + atomic_inc(&s->s_active); + mnt_target->mnt_sb = s; +- mnt_target->mnt_root = dget(nd.path.dentry); ++ mnt_target->mnt_root = dget(nd->path.dentry); + + /* Correct the device pathname */ +- nfs_fix_devname(&nd.path, mnt_target); ++ nfs_fix_devname(&nd->path, mnt_target); + +- path_put(&nd.path); ++ path_put(&nd->path); ++ kfree(nd); + down_write(&s->s_umount); + return 0; ++out_put_mnt_ns: ++ put_mnt_ns(ns_private); + out_mntput: + mntput(root_mnt); + out_err: ++ kfree(nd); + return ret; + } + +@@ -2888,17 +2971,21 @@ static int nfs4_remote_referral_get_sb(s + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; +- struct nfs_fh mntfh; ++ struct nfs_fh *mntfh; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; +- int error; ++ int error = -ENOMEM; + + dprintk("--> nfs4_referral_get_sb()\n"); + ++ mntfh = nfs_alloc_fhandle(); ++ if (mntfh == NULL) ++ goto out_err_nofh; ++ + /* create a new volume representation */ +- server = nfs4_create_referral_server(data, &mntfh); ++ server = nfs4_create_referral_server(data, mntfh); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; +@@ -2930,7 +3017,7 @@ static int nfs4_remote_referral_get_sb(s + nfs_fscache_get_super_cookie(s, NULL, data); + } + +- mntroot = nfs4_get_root(s, &mntfh); ++ mntroot = 
nfs4_get_root(s, mntfh); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; +@@ -2947,12 +3034,15 @@ static int nfs4_remote_referral_get_sb(s + + security_sb_clone_mnt_opts(data->sb, s); + ++ nfs_free_fhandle(mntfh); + dprintk("<-- nfs4_referral_get_sb() = 0\n"); + return 0; + + out_err_nosb: + nfs_free_server(server); + out_err_noserver: ++ nfs_free_fhandle(mntfh); ++out_err_nofh: + dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); + return error; + +@@ -2961,6 +3051,7 @@ error_splat_super: + bdi_unregister(&server->backing_dev_info); + error_splat_bdi: + deactivate_locked_super(s); ++ nfs_free_fhandle(mntfh); + dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); + return error; + } +diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-23 11:01:00.375554592 -0400 +@@ -23,6 +23,7 @@ struct nfs_unlinkdata { + struct nfs_removeres res; + struct inode *dir; + struct rpc_cred *cred; ++ struct nfs_fattr dir_attr; + }; + + /** +@@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct den + } + nfs_sb_active(dir->i_sb); + data->args.fh = NFS_FH(dir); +- nfs_fattr_init(&data->res.dir_attr); ++ nfs_fattr_init(data->res.dir_attr); + + NFS_PROTO(dir)->unlink_setup(&msg, dir); + +@@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, stru + goto out_free; + } + data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ data->res.dir_attr = &data->dir_attr; + + status = -EBUSY; + spin_lock(&dentry->d_lock); +diff -up linux-2.6.34.noarch/include/linux/ktime.h.orig linux-2.6.34.noarch/include/linux/ktime.h +--- linux-2.6.34.noarch/include/linux/ktime.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/ktime.h 2010-08-23 11:01:00.377554285 -0400 +@@ -130,7 +130,7 @@ static inline ktime_t timeval_to_ktime(s + /* Convert ktime_t to nanoseconds - NOP 
in the scalar storage format: */ + #define ktime_to_ns(kt) ((kt).tv64) + +-#else ++#else /* !((BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)) */ + + /* + * Helper macros/inlines to get the ktime_t math right in the timespec +@@ -275,7 +275,7 @@ static inline s64 ktime_to_ns(const ktim + return (s64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec; + } + +-#endif ++#endif /* !((BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)) */ + + /** + * ktime_equal - Compares two ktime_t variables to see if they are equal +@@ -295,6 +295,12 @@ static inline s64 ktime_to_us(const ktim + return (s64) tv.tv_sec * USEC_PER_SEC + tv.tv_usec; + } + ++static inline s64 ktime_to_ms(const ktime_t kt) ++{ ++ struct timeval tv = ktime_to_timeval(kt); ++ return (s64) tv.tv_sec * MSEC_PER_SEC + tv.tv_usec / USEC_PER_MSEC; ++} ++ + static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier) + { + return ktime_to_us(ktime_sub(later, earlier)); +diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-23 11:00:23.822502111 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-23 11:01:00.378563926 -0400 +@@ -356,6 +356,20 @@ extern struct nfs_open_context *nfs_find + extern u64 nfs_compat_user_ino64(u64 fileid); + extern void nfs_fattr_init(struct nfs_fattr *fattr); + ++extern struct nfs_fattr *nfs_alloc_fattr(void); ++ ++static inline void nfs_free_fattr(const struct nfs_fattr *fattr) ++{ ++ kfree(fattr); ++} ++ ++extern struct nfs_fh *nfs_alloc_fhandle(void); ++ ++static inline void nfs_free_fhandle(const struct nfs_fh *fh) ++{ ++ kfree(fh); ++} ++ + /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. 
*/ + extern __be32 root_nfs_parse_addr(char *name); /*__init*/ + extern unsigned long nfs_inc_attr_generation_counter(void); +diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-23 11:01:00.380553887 -0400 +@@ -44,7 +44,6 @@ struct nfs_client { + + #ifdef CONFIG_NFS_V4 + u64 cl_clientid; /* constant */ +- nfs4_verifier cl_confirm; + unsigned long cl_state; + + struct rb_root cl_openowner_id; +diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-23 11:01:00.381564072 -0400 +@@ -386,8 +386,8 @@ struct nfs_removeargs { + + struct nfs_removeres { + const struct nfs_server *server; ++ struct nfs_fattr *dir_attr; + struct nfs4_change_info cinfo; +- struct nfs_fattr dir_attr; + struct nfs4_sequence_res seq_res; + }; + +@@ -824,6 +824,11 @@ struct nfs4_setclientid { + u32 sc_cb_ident; + }; + ++struct nfs4_setclientid_res { ++ u64 clientid; ++ nfs4_verifier confirm; ++}; ++ + struct nfs4_statfs_arg { + const struct nfs_fh * fh; + const u32 * bitmask; +diff -up linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h.orig linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h +--- linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h 2010-08-23 11:01:00.382564026 -0400 +@@ -82,6 +82,7 @@ struct gss_cred { + enum rpc_gss_svc gc_service; + struct gss_cl_ctx *gc_ctx; + struct gss_upcall_msg *gc_upcall; ++ unsigned long gc_upcall_timestamp; + unsigned char gc_machine_cred : 1; + }; + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/auth.h.orig linux-2.6.34.noarch/include/linux/sunrpc/auth.h +--- 
linux-2.6.34.noarch/include/linux/sunrpc/auth.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/auth.h 2010-08-23 11:01:00.382564026 -0400 +@@ -54,6 +54,7 @@ struct rpc_cred { + #define RPCAUTH_CRED_NEW 0 + #define RPCAUTH_CRED_UPTODATE 1 + #define RPCAUTH_CRED_HASHED 2 ++#define RPCAUTH_CRED_NEGATIVE 3 + + #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h.orig linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h +--- linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h 2010-08-23 11:01:00.383574314 -0400 +@@ -35,7 +35,8 @@ int gss_import_sec_context( + const void* input_token, + size_t bufsize, + struct gss_api_mech *mech, +- struct gss_ctx **ctx_id); ++ struct gss_ctx **ctx_id, ++ gfp_t gfp_mask); + u32 gss_get_mic( + struct gss_ctx *ctx_id, + struct xdr_buf *message, +@@ -80,6 +81,8 @@ struct gss_api_mech { + /* pseudoflavors supported by this mechanism: */ + int gm_pf_num; + struct pf_desc * gm_pfs; ++ /* Should the following be a callback operation instead? 
*/ ++ const char *gm_upcall_enctypes; + }; + + /* and must provide the following operations: */ +@@ -87,7 +90,8 @@ struct gss_api_ops { + int (*gss_import_sec_context)( + const void *input_token, + size_t bufsize, +- struct gss_ctx *ctx_id); ++ struct gss_ctx *ctx_id, ++ gfp_t gfp_mask); + u32 (*gss_get_mic)( + struct gss_ctx *ctx_id, + struct xdr_buf *message, +diff -up linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h.orig linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h +--- linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h 2010-08-23 11:01:00.383574314 -0400 +@@ -4,7 +4,7 @@ + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson +@@ -36,17 +36,86 @@ + * + */ + ++#include + #include + #include + #include + ++/* Length of constant used in key derivation */ ++#define GSS_KRB5_K5CLENGTH (5) ++ ++/* Maximum key length (in bytes) for the supported crypto algorithms*/ ++#define GSS_KRB5_MAX_KEYLEN (32) ++ ++/* Maximum checksum function output for the supported crypto algorithms */ ++#define GSS_KRB5_MAX_CKSUM_LEN (20) ++ ++/* Maximum blocksize for the supported crypto algorithms */ ++#define GSS_KRB5_MAX_BLOCKSIZE (16) ++ ++struct krb5_ctx; ++ ++struct gss_krb5_enctype { ++ const u32 etype; /* encryption (key) type */ ++ const u32 ctype; /* checksum type */ ++ const char *name; /* "friendly" name */ ++ const char *encrypt_name; /* crypto encrypt name */ ++ const char *cksum_name; /* crypto checksum name */ ++ const u16 signalg; /* signing algorithm */ ++ const u16 sealalg; /* sealing algorithm */ ++ const u32 blocksize; /* encryption blocksize */ ++ const u32 conflen; /* confounder length ++ (normally the same as ++ the blocksize) 
*/ ++ const u32 cksumlength; /* checksum length */ ++ const u32 keyed_cksum; /* is it a keyed cksum? */ ++ const u32 keybytes; /* raw key len, in bytes */ ++ const u32 keylength; /* final key len, in bytes */ ++ u32 (*encrypt) (struct crypto_blkcipher *tfm, ++ void *iv, void *in, void *out, ++ int length); /* encryption function */ ++ u32 (*decrypt) (struct crypto_blkcipher *tfm, ++ void *iv, void *in, void *out, ++ int length); /* decryption function */ ++ u32 (*mk_key) (const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *in, ++ struct xdr_netobj *out); /* complete key generation */ ++ u32 (*encrypt_v2) (struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, int ec, ++ struct page **pages); /* v2 encryption function */ ++ u32 (*decrypt_v2) (struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, u32 *headskip, ++ u32 *tailskip); /* v2 decryption function */ ++}; ++ ++/* krb5_ctx flags definitions */ ++#define KRB5_CTX_FLAG_INITIATOR 0x00000001 ++#define KRB5_CTX_FLAG_CFX 0x00000002 ++#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY 0x00000004 ++ + struct krb5_ctx { + int initiate; /* 1 = initiating, 0 = accepting */ ++ u32 enctype; ++ u32 flags; ++ const struct gss_krb5_enctype *gk5e; /* enctype-specific info */ + struct crypto_blkcipher *enc; + struct crypto_blkcipher *seq; ++ struct crypto_blkcipher *acceptor_enc; ++ struct crypto_blkcipher *initiator_enc; ++ struct crypto_blkcipher *acceptor_enc_aux; ++ struct crypto_blkcipher *initiator_enc_aux; ++ u8 Ksess[GSS_KRB5_MAX_KEYLEN]; /* session key */ ++ u8 cksum[GSS_KRB5_MAX_KEYLEN]; + s32 endtime; + u32 seq_send; ++ u64 seq_send64; + struct xdr_netobj mech_used; ++ u8 initiator_sign[GSS_KRB5_MAX_KEYLEN]; ++ u8 acceptor_sign[GSS_KRB5_MAX_KEYLEN]; ++ u8 initiator_seal[GSS_KRB5_MAX_KEYLEN]; ++ u8 acceptor_seal[GSS_KRB5_MAX_KEYLEN]; ++ u8 initiator_integ[GSS_KRB5_MAX_KEYLEN]; ++ u8 acceptor_integ[GSS_KRB5_MAX_KEYLEN]; + }; + + extern spinlock_t krb5_seq_lock; +@@ -57,6 +126,18 @@ extern spinlock_t krb5_seq_lock; + 
#define KG_TOK_MIC_MSG 0x0101 + #define KG_TOK_WRAP_MSG 0x0201 + ++#define KG2_TOK_INITIAL 0x0101 ++#define KG2_TOK_RESPONSE 0x0202 ++#define KG2_TOK_MIC 0x0404 ++#define KG2_TOK_WRAP 0x0504 ++ ++#define KG2_TOKEN_FLAG_SENTBYACCEPTOR 0x01 ++#define KG2_TOKEN_FLAG_SEALED 0x02 ++#define KG2_TOKEN_FLAG_ACCEPTORSUBKEY 0x04 ++ ++#define KG2_RESP_FLAG_ERROR 0x0001 ++#define KG2_RESP_FLAG_DELEG_OK 0x0002 ++ + enum sgn_alg { + SGN_ALG_DES_MAC_MD5 = 0x0000, + SGN_ALG_MD2_5 = 0x0001, +@@ -81,6 +162,9 @@ enum seal_alg { + #define CKSUMTYPE_RSA_MD5_DES 0x0008 + #define CKSUMTYPE_NIST_SHA 0x0009 + #define CKSUMTYPE_HMAC_SHA1_DES3 0x000c ++#define CKSUMTYPE_HMAC_SHA1_96_AES128 0x000f ++#define CKSUMTYPE_HMAC_SHA1_96_AES256 0x0010 ++#define CKSUMTYPE_HMAC_MD5_ARCFOUR -138 /* Microsoft md5 hmac cksumtype */ + + /* from gssapi_err_krb5.h */ + #define KG_CCACHE_NOMATCH (39756032L) +@@ -111,11 +195,56 @@ enum seal_alg { + #define ENCTYPE_DES3_CBC_RAW 0x0006 /* DES-3 cbc mode raw */ + #define ENCTYPE_DES_HMAC_SHA1 0x0008 + #define ENCTYPE_DES3_CBC_SHA1 0x0010 ++#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011 ++#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012 ++#define ENCTYPE_ARCFOUR_HMAC 0x0017 ++#define ENCTYPE_ARCFOUR_HMAC_EXP 0x0018 + #define ENCTYPE_UNKNOWN 0x01ff + +-s32 +-make_checksum(char *, char *header, int hdrlen, struct xdr_buf *body, +- int body_offset, struct xdr_netobj *cksum); ++/* ++ * Constants used for key derivation ++ */ ++/* for 3DES */ ++#define KG_USAGE_SEAL (22) ++#define KG_USAGE_SIGN (23) ++#define KG_USAGE_SEQ (24) ++ ++/* from rfc3961 */ ++#define KEY_USAGE_SEED_CHECKSUM (0x99) ++#define KEY_USAGE_SEED_ENCRYPTION (0xAA) ++#define KEY_USAGE_SEED_INTEGRITY (0x55) ++ ++/* from rfc4121 */ ++#define KG_USAGE_ACCEPTOR_SEAL (22) ++#define KG_USAGE_ACCEPTOR_SIGN (23) ++#define KG_USAGE_INITIATOR_SEAL (24) ++#define KG_USAGE_INITIATOR_SIGN (25) ++ ++/* ++ * This compile-time check verifies that we will not exceed the ++ * slack space allotted by the client and server 
auth_gss code ++ * before they call gss_wrap(). ++ */ ++#define GSS_KRB5_MAX_SLACK_NEEDED \ ++ (GSS_KRB5_TOK_HDR_LEN /* gss token header */ \ ++ + GSS_KRB5_MAX_CKSUM_LEN /* gss token checksum */ \ ++ + GSS_KRB5_MAX_BLOCKSIZE /* confounder */ \ ++ + GSS_KRB5_MAX_BLOCKSIZE /* possible padding */ \ ++ + GSS_KRB5_TOK_HDR_LEN /* encrypted hdr in v2 token */\ ++ + GSS_KRB5_MAX_CKSUM_LEN /* encryption hmac */ \ ++ + 4 + 4 /* RPC verifier */ \ ++ + GSS_KRB5_TOK_HDR_LEN \ ++ + GSS_KRB5_MAX_CKSUM_LEN) ++ ++u32 ++make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout); ++ ++u32 ++make_checksum_v2(struct krb5_ctx *, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *key, ++ unsigned int usage, struct xdr_netobj *cksum); + + u32 gss_get_mic_kerberos(struct gss_ctx *, struct xdr_buf *, + struct xdr_netobj *); +@@ -149,11 +278,54 @@ gss_decrypt_xdr_buf(struct crypto_blkcip + int offset); + + s32 +-krb5_make_seq_num(struct crypto_blkcipher *key, ++krb5_make_seq_num(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *key, + int direction, + u32 seqnum, unsigned char *cksum, unsigned char *buf); + + s32 +-krb5_get_seq_num(struct crypto_blkcipher *key, ++krb5_get_seq_num(struct krb5_ctx *kctx, + unsigned char *cksum, + unsigned char *buf, int *direction, u32 *seqnum); ++ ++int ++xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen); ++ ++u32 ++krb5_derive_key(const struct gss_krb5_enctype *gk5e, ++ const struct xdr_netobj *inkey, ++ struct xdr_netobj *outkey, ++ const struct xdr_netobj *in_constant, ++ gfp_t gfp_mask); ++ ++u32 ++gss_krb5_des3_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key); ++ ++u32 ++gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key); ++ ++u32 ++gss_krb5_aes_encrypt(struct krb5_ctx 
*kctx, u32 offset, ++ struct xdr_buf *buf, int ec, ++ struct page **pages); ++ ++u32 ++gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, u32 *plainoffset, ++ u32 *plainlen); ++ ++int ++krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *cipher, ++ unsigned char *cksum); ++ ++int ++krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *cipher, ++ s32 seqnum); ++void ++gss_krb5_make_confounder(char *p, u32 conflen); +diff -up linux-2.6.34.noarch/include/linux/sunrpc/metrics.h.orig linux-2.6.34.noarch/include/linux/sunrpc/metrics.h +--- linux-2.6.34.noarch/include/linux/sunrpc/metrics.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/metrics.h 2010-08-23 11:01:00.384611889 -0400 +@@ -26,6 +26,7 @@ + #define _LINUX_SUNRPC_METRICS_H + + #include ++#include + + #define RPC_IOSTATS_VERS "1.0" + +@@ -58,9 +59,9 @@ struct rpc_iostats { + * and the total time the request spent from init to release + * are measured. 
+ */ +- unsigned long long om_queue, /* jiffies queued for xmit */ +- om_rtt, /* jiffies for RPC RTT */ +- om_execute; /* jiffies for RPC execution */ ++ ktime_t om_queue, /* queued for xmit */ ++ om_rtt, /* RPC RTT */ ++ om_execute; /* RPC execution */ + } ____cacheline_aligned; + + struct rpc_task; +diff -up linux-2.6.34.noarch/include/linux/sunrpc/sched.h.orig linux-2.6.34.noarch/include/linux/sunrpc/sched.h +--- linux-2.6.34.noarch/include/linux/sunrpc/sched.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/sched.h 2010-08-23 11:01:00.385361873 -0400 +@@ -10,6 +10,7 @@ + #define _LINUX_SUNRPC_SCHED_H_ + + #include ++#include + #include + #include + #include +@@ -40,21 +41,15 @@ struct rpc_wait { + * This is the RPC task struct + */ + struct rpc_task { +-#ifdef RPC_DEBUG +- unsigned long tk_magic; /* 0xf00baa */ +-#endif + atomic_t tk_count; /* Reference count */ + struct list_head tk_task; /* global list of tasks */ + struct rpc_clnt * tk_client; /* RPC client */ + struct rpc_rqst * tk_rqstp; /* RPC request */ +- int tk_status; /* result of last operation */ + + /* + * RPC call state + */ + struct rpc_message tk_msg; /* RPC call info */ +- __u8 tk_garb_retry; +- __u8 tk_cred_retry; + + /* + * callback to be executed after waking up +@@ -67,7 +62,6 @@ struct rpc_task { + void * tk_calldata; + + unsigned long tk_timeout; /* timeout for rpc_sleep() */ +- unsigned short tk_flags; /* misc flags */ + unsigned long tk_runstate; /* Task run status */ + struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could + * be any workqueue +@@ -78,17 +72,19 @@ struct rpc_task { + struct rpc_wait tk_wait; /* RPC wait */ + } u; + +- unsigned short tk_timeouts; /* maj timeouts */ +- size_t tk_bytes_sent; /* total bytes sent */ +- unsigned long tk_start; /* RPC task init timestamp */ +- long tk_rtt; /* round-trip time (jiffies) */ ++ ktime_t tk_start; /* RPC task init timestamp */ + + pid_t tk_owner; /* Process id for batching tasks 
*/ +- unsigned char tk_priority : 2;/* Task priority */ ++ int tk_status; /* result of last operation */ ++ unsigned short tk_flags; /* misc flags */ ++ unsigned short tk_timeouts; /* maj timeouts */ + + #ifdef RPC_DEBUG + unsigned short tk_pid; /* debugging aid */ + #endif ++ unsigned char tk_priority : 2,/* Task priority */ ++ tk_garb_retry : 2, ++ tk_cred_retry : 2; + }; + #define tk_xprt tk_client->cl_xprt + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-23 11:01:00.385361873 -0400 +@@ -1,7 +1,10 @@ + /* +- * include/linux/sunrpc/xdr.h ++ * XDR standard data types and function declarations + * + * Copyright (C) 1995-1997 Olaf Kirch ++ * ++ * Based on: ++ * RFC 4506 "XDR: External Data Representation Standard", May 2006 + */ + + #ifndef _SUNRPC_XDR_H_ +@@ -62,7 +65,6 @@ struct xdr_buf { + + unsigned int buflen, /* Total length of storage buffer */ + len; /* Length of XDR encoded message */ +- + }; + + /* +@@ -178,7 +180,7 @@ struct xdr_array2_desc { + }; + + extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base, +- struct xdr_array2_desc *desc); ++ struct xdr_array2_desc *desc); + extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc); + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xprt.h +--- linux-2.6.34.noarch/include/linux/sunrpc/xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xprt.h 2010-08-23 11:01:00.386574704 -0400 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -65,8 +66,6 @@ struct rpc_rqst { + struct rpc_task * rq_task; /* RPC task data */ + __be32 rq_xid; /* request XID */ + int rq_cong; /* has incremented xprt->cong */ +- int 
rq_reply_bytes_recvd; /* number of reply */ +- /* bytes received */ + u32 rq_seqno; /* gss seq no. used on req. */ + int rq_enc_pages_num; + struct page **rq_enc_pages; /* scratch pages for use by +@@ -77,12 +76,16 @@ struct rpc_rqst { + __u32 * rq_buffer; /* XDR encode buffer */ + size_t rq_callsize, + rq_rcvsize; ++ size_t rq_xmit_bytes_sent; /* total bytes sent */ ++ size_t rq_reply_bytes_recvd; /* total reply bytes */ ++ /* received */ + + struct xdr_buf rq_private_buf; /* The receive buffer + * used in the softirq. + */ + unsigned long rq_majortimeo; /* major timeout alarm */ + unsigned long rq_timeout; /* Current timeout value */ ++ ktime_t rq_rtt; /* round-trip time */ + unsigned int rq_retries; /* # of retries */ + unsigned int rq_connect_cookie; + /* A cookie used to track the +@@ -94,7 +97,7 @@ struct rpc_rqst { + */ + u32 rq_bytes_sent; /* Bytes we have sent */ + +- unsigned long rq_xtime; /* when transmitted */ ++ ktime_t rq_xtime; /* transmit time stamp */ + int rq_ntrans; + + #if defined(CONFIG_NFS_V4_1) +@@ -174,8 +177,7 @@ struct rpc_xprt { + /* + * Connection of transports + */ +- unsigned long connect_timeout, +- bind_timeout, ++ unsigned long bind_timeout, + reestablish_timeout; + unsigned int connect_cookie; /* A cookie that gets bumped + every time the transport +@@ -294,7 +296,6 @@ void xprt_set_retrans_timeout_rtt(stru + void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); + void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action); + void xprt_write_space(struct rpc_xprt *xprt); +-void xprt_update_rtt(struct rpc_task *task); + void xprt_adjust_cwnd(struct rpc_task *task, int result); + struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid); + void xprt_complete_rqst(struct rpc_task *task, int copied); +diff -up linux-2.6.34.noarch/net/sunrpc/auth.c.orig linux-2.6.34.noarch/net/sunrpc/auth.c +--- linux-2.6.34.noarch/net/sunrpc/auth.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ 
linux-2.6.34.noarch/net/sunrpc/auth.c 2010-08-23 11:01:00.387574079 -0400 +@@ -236,10 +236,15 @@ rpcauth_prune_expired(struct list_head * + + list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) { + +- /* Enforce a 60 second garbage collection moratorium */ ++ if (nr_to_scan-- == 0) ++ break; ++ /* ++ * Enforce a 60 second garbage collection moratorium ++ * Note that the cred_unused list must be time-ordered. ++ */ + if (time_in_range(cred->cr_expire, expired, jiffies) && + test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) +- continue; ++ return 0; + + list_del_init(&cred->cr_lru); + number_cred_unused--; +@@ -252,13 +257,10 @@ rpcauth_prune_expired(struct list_head * + get_rpccred(cred); + list_add_tail(&cred->cr_lru, free); + rpcauth_unhash_cred_locked(cred); +- nr_to_scan--; + } + spin_unlock(cache_lock); +- if (nr_to_scan == 0) +- break; + } +- return nr_to_scan; ++ return (number_cred_unused / 100) * sysctl_vfs_cache_pressure; + } + + /* +@@ -270,11 +272,12 @@ rpcauth_cache_shrinker(int nr_to_scan, g + LIST_HEAD(free); + int res; + ++ if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) ++ return (nr_to_scan == 0) ? 
0 : -1; + if (list_empty(&cred_unused)) + return 0; + spin_lock(&rpc_credcache_lock); +- nr_to_scan = rpcauth_prune_expired(&free, nr_to_scan); +- res = (number_cred_unused / 100) * sysctl_vfs_cache_pressure; ++ res = rpcauth_prune_expired(&free, nr_to_scan); + spin_unlock(&rpc_credcache_lock); + rpcauth_destroy_credlist(&free); + return res; +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c 2010-08-23 11:01:00.388574680 -0400 +@@ -57,11 +57,14 @@ static const struct rpc_authops authgss_ + static const struct rpc_credops gss_credops; + static const struct rpc_credops gss_nullops; + ++#define GSS_RETRY_EXPIRED 5 ++static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED; ++ + #ifdef RPC_DEBUG + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + +-#define GSS_CRED_SLACK 1024 ++#define GSS_CRED_SLACK (RPC_MAX_AUTH_SIZE * 2) + /* length of a krb5 verifier (48), plus data added before arguments when + * using integrity (two 4-byte integers): */ + #define GSS_VERF_SLACK 100 +@@ -229,7 +232,7 @@ gss_fill_context(const void *p, const vo + p = ERR_PTR(-EFAULT); + goto err; + } +- ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx); ++ ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, GFP_NOFS); + if (ret < 0) { + p = ERR_PTR(ret); + goto err; +@@ -350,6 +353,24 @@ gss_unhash_msg(struct gss_upcall_msg *gs + } + + static void ++gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss_msg) ++{ ++ switch (gss_msg->msg.errno) { ++ case 0: ++ if (gss_msg->ctx == NULL) ++ break; ++ clear_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); ++ gss_cred_set_ctx(&gss_cred->gc_base, gss_msg->ctx); ++ break; ++ case -EKEYEXPIRED: ++ set_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); ++ } ++ 
gss_cred->gc_upcall_timestamp = jiffies; ++ gss_cred->gc_upcall = NULL; ++ rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); ++} ++ ++static void + gss_upcall_callback(struct rpc_task *task) + { + struct gss_cred *gss_cred = container_of(task->tk_msg.rpc_cred, +@@ -358,13 +379,9 @@ gss_upcall_callback(struct rpc_task *tas + struct inode *inode = &gss_msg->inode->vfs_inode; + + spin_lock(&inode->i_lock); +- if (gss_msg->ctx) +- gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_msg->ctx); +- else +- task->tk_status = gss_msg->msg.errno; +- gss_cred->gc_upcall = NULL; +- rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); ++ gss_handle_downcall_result(gss_cred, gss_msg); + spin_unlock(&inode->i_lock); ++ task->tk_status = gss_msg->msg.errno; + gss_release_msg(gss_msg); + } + +@@ -377,11 +394,12 @@ static void gss_encode_v0_msg(struct gss + static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, + struct rpc_clnt *clnt, int machine_cred) + { ++ struct gss_api_mech *mech = gss_msg->auth->mech; + char *p = gss_msg->databuf; + int len = 0; + + gss_msg->msg.len = sprintf(gss_msg->databuf, "mech=%s uid=%d ", +- gss_msg->auth->mech->gm_name, ++ mech->gm_name, + gss_msg->uid); + p += gss_msg->msg.len; + if (clnt->cl_principal) { +@@ -398,6 +416,11 @@ static void gss_encode_v1_msg(struct gss + p += len; + gss_msg->msg.len += len; + } ++ if (mech->gm_upcall_enctypes) { ++ len = sprintf(p, mech->gm_upcall_enctypes); ++ p += len; ++ gss_msg->msg.len += len; ++ } + len = sprintf(p, "\n"); + gss_msg->msg.len += len; + +@@ -507,18 +530,16 @@ gss_refresh_upcall(struct rpc_task *task + spin_lock(&inode->i_lock); + if (gss_cred->gc_upcall != NULL) + rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); +- else if (gss_msg->ctx != NULL) { +- gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_msg->ctx); +- gss_cred->gc_upcall = NULL; +- rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); +- } else if (gss_msg->msg.errno >= 0) { ++ else if 
(gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { + task->tk_timeout = 0; + gss_cred->gc_upcall = gss_msg; + /* gss_upcall_callback will release the reference to gss_upcall_msg */ + atomic_inc(&gss_msg->count); + rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback); +- } else ++ } else { ++ gss_handle_downcall_result(gss_cred, gss_msg); + err = gss_msg->msg.errno; ++ } + spin_unlock(&inode->i_lock); + gss_release_msg(gss_msg); + out: +@@ -1117,6 +1138,23 @@ static int gss_renew_cred(struct rpc_tas + return 0; + } + ++static int gss_cred_is_negative_entry(struct rpc_cred *cred) ++{ ++ if (test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) { ++ unsigned long now = jiffies; ++ unsigned long begin, expire; ++ struct gss_cred *gss_cred; ++ ++ gss_cred = container_of(cred, struct gss_cred, gc_base); ++ begin = gss_cred->gc_upcall_timestamp; ++ expire = begin + gss_expired_cred_retry_delay * HZ; ++ ++ if (time_in_range_open(now, begin, expire)) ++ return 1; ++ } ++ return 0; ++} ++ + /* + * Refresh credentials. XXX - finish + */ +@@ -1126,6 +1164,9 @@ gss_refresh(struct rpc_task *task) + struct rpc_cred *cred = task->tk_msg.rpc_cred; + int ret = 0; + ++ if (gss_cred_is_negative_entry(cred)) ++ return -EKEYEXPIRED; ++ + if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && + !test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) { + ret = gss_renew_cred(task); +@@ -1316,15 +1357,21 @@ gss_wrap_req_priv(struct rpc_cred *cred, + inpages = snd_buf->pages + first; + snd_buf->pages = rqstp->rq_enc_pages; + snd_buf->page_base -= first << PAGE_CACHE_SHIFT; +- /* Give the tail its own page, in case we need extra space in the +- * head when wrapping: */ ++ /* ++ * Give the tail its own page, in case we need extra space in the ++ * head when wrapping: ++ * ++ * call_allocate() allocates twice the slack space required ++ * by the authentication flavor to rq_callsize. ++ * For GSS, slack is GSS_CRED_SLACK. 
++ */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) { + tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); + memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); + snd_buf->tail[0].iov_base = tmp; + } + maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); +- /* RPC_SLACK_SPACE should prevent this ever happening: */ ++ /* slack space should prevent this ever happening: */ + BUG_ON(snd_buf->len > snd_buf->buflen); + status = -EIO; + /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was +@@ -1573,5 +1620,11 @@ static void __exit exit_rpcsec_gss(void) + } + + MODULE_LICENSE("GPL"); ++module_param_named(expired_cred_retry_delay, ++ gss_expired_cred_retry_delay, ++ uint, 0644); ++MODULE_PARM_DESC(expired_cred_retry_delay, "Timeout (in seconds) until " ++ "the RPC engine retries an expired credential"); ++ + module_init(init_rpcsec_gss) + module_exit(exit_rpcsec_gss) +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c 2010-08-23 11:01:00.390553891 -0400 +@@ -1,7 +1,7 @@ + /* + * linux/net/sunrpc/gss_krb5_crypto.c + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson +@@ -41,6 +41,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -58,13 +59,13 @@ krb5_encrypt( + { + u32 ret = -EINVAL; + struct scatterlist sg[1]; +- u8 local_iv[16] = {0}; ++ u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; + struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv }; + + if (length % crypto_blkcipher_blocksize(tfm) != 0) + goto out; + +- if (crypto_blkcipher_ivsize(tfm) > 16) { ++ if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n", + crypto_blkcipher_ivsize(tfm)); + goto out; +@@ -92,13 +93,13 @@ krb5_decrypt( + { + u32 ret = -EINVAL; + struct scatterlist sg[1]; +- u8 local_iv[16] = {0}; ++ u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; + struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv }; + + if (length % crypto_blkcipher_blocksize(tfm) != 0) + goto out; + +- if (crypto_blkcipher_ivsize(tfm) > 16) { ++ if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n", + crypto_blkcipher_ivsize(tfm)); + goto out; +@@ -123,21 +124,155 @@ checksummer(struct scatterlist *sg, void + return crypto_hash_update(desc, sg, sg->length); + } + +-/* checksum the plaintext data and hdrlen bytes of the token header */ +-s32 +-make_checksum(char *cksumname, char *header, int hdrlen, struct xdr_buf *body, +- int body_offset, struct xdr_netobj *cksum) ++static int ++arcfour_hmac_md5_usage_to_salt(unsigned int usage, u8 salt[4]) ++{ ++ unsigned int ms_usage; ++ ++ switch (usage) { ++ case KG_USAGE_SIGN: ++ ms_usage = 15; ++ break; ++ case KG_USAGE_SEAL: ++ ms_usage = 13; ++ break; ++ default: ++ return EINVAL;; ++ } ++ salt[0] = (ms_usage >> 0) & 0xff; ++ salt[1] = (ms_usage >> 8) & 0xff; ++ salt[2] = (ms_usage >> 16) & 0xff; ++ salt[3] = (ms_usage >> 24) & 0xff; ++ ++ return 0; ++} ++ ++static u32 ++make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf 
*body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout) + { +- struct hash_desc desc; /* XXX add to ctx? */ ++ struct hash_desc desc; + struct scatterlist sg[1]; + int err; ++ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ u8 rc4salt[4]; ++ struct crypto_hash *md5; ++ struct crypto_hash *hmac_md5; ++ ++ if (cksumkey == NULL) ++ return GSS_S_FAILURE; ++ ++ if (cksumout->len < kctx->gk5e->cksumlength) { ++ dprintk("%s: checksum buffer length, %u, too small for %s\n", ++ __func__, cksumout->len, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } ++ ++ if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) { ++ dprintk("%s: invalid usage value %u\n", __func__, usage); ++ return GSS_S_FAILURE; ++ } ++ ++ md5 = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(md5)) ++ return GSS_S_FAILURE; ++ ++ hmac_md5 = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac_md5)) { ++ crypto_free_hash(md5); ++ return GSS_S_FAILURE; ++ } ++ ++ desc.tfm = md5; ++ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out; ++ sg_init_one(sg, rc4salt, 4); ++ err = crypto_hash_update(&desc, sg, 4); ++ if (err) ++ goto out; ++ ++ sg_init_one(sg, header, hdrlen); ++ err = crypto_hash_update(&desc, sg, hdrlen); ++ if (err) ++ goto out; ++ err = xdr_process_buf(body, body_offset, body->len - body_offset, ++ checksummer, &desc); ++ if (err) ++ goto out; ++ err = crypto_hash_final(&desc, checksumdata); ++ if (err) ++ goto out; ++ ++ desc.tfm = hmac_md5; ++ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out; ++ err = crypto_hash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength); ++ if (err) ++ goto out; ++ ++ sg_init_one(sg, checksumdata, crypto_hash_digestsize(md5)); ++ err = crypto_hash_digest(&desc, sg, crypto_hash_digestsize(md5), ++ checksumdata); ++ if (err) ++ goto out; ++ ++ memcpy(cksumout->data, checksumdata, 
kctx->gk5e->cksumlength); ++ cksumout->len = kctx->gk5e->cksumlength; ++out: ++ crypto_free_hash(md5); ++ crypto_free_hash(hmac_md5); ++ return err ? GSS_S_FAILURE : 0; ++} ++ ++/* ++ * checksum the plaintext data and hdrlen bytes of the token header ++ * The checksum is performed over the first 8 bytes of the ++ * gss token header and then over the data body ++ */ ++u32 ++make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout) ++{ ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ int err; ++ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ unsigned int checksumlen; ++ ++ if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR) ++ return make_checksum_hmac_md5(kctx, header, hdrlen, ++ body, body_offset, ++ cksumkey, usage, cksumout); ++ ++ if (cksumout->len < kctx->gk5e->cksumlength) { ++ dprintk("%s: checksum buffer length, %u, too small for %s\n", ++ __func__, cksumout->len, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } + +- desc.tfm = crypto_alloc_hash(cksumname, 0, CRYPTO_ALG_ASYNC); ++ desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(desc.tfm)) + return GSS_S_FAILURE; +- cksum->len = crypto_hash_digestsize(desc.tfm); + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + ++ checksumlen = crypto_hash_digestsize(desc.tfm); ++ ++ if (cksumkey != NULL) { ++ err = crypto_hash_setkey(desc.tfm, cksumkey, ++ kctx->gk5e->keylength); ++ if (err) ++ goto out; ++ } ++ + err = crypto_hash_init(&desc); + if (err) + goto out; +@@ -149,15 +284,109 @@ make_checksum(char *cksumname, char *hea + checksummer, &desc); + if (err) + goto out; +- err = crypto_hash_final(&desc, cksum->data); ++ err = crypto_hash_final(&desc, checksumdata); ++ if (err) ++ goto out; + ++ switch (kctx->gk5e->ctype) { ++ case CKSUMTYPE_RSA_MD5: ++ err = kctx->gk5e->encrypt(kctx->seq, NULL, checksumdata, ++ checksumdata, checksumlen); ++ if (err) ++ goto out; ++ 
memcpy(cksumout->data, ++ checksumdata + checksumlen - kctx->gk5e->cksumlength, ++ kctx->gk5e->cksumlength); ++ break; ++ case CKSUMTYPE_HMAC_SHA1_DES3: ++ memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); ++ break; ++ default: ++ BUG(); ++ break; ++ } ++ cksumout->len = kctx->gk5e->cksumlength; ++out: ++ crypto_free_hash(desc.tfm); ++ return err ? GSS_S_FAILURE : 0; ++} ++ ++/* ++ * checksum the plaintext data and hdrlen bytes of the token header ++ * Per rfc4121, sec. 4.2.4, the checksum is performed over the data ++ * body then over the first 16 octets of the MIC token ++ * Inclusion of the header data in the calculation of the ++ * checksum is optional. ++ */ ++u32 ++make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout) ++{ ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ int err; ++ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ unsigned int checksumlen; ++ ++ if (kctx->gk5e->keyed_cksum == 0) { ++ dprintk("%s: expected keyed hash for %s\n", ++ __func__, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } ++ if (cksumkey == NULL) { ++ dprintk("%s: no key supplied for %s\n", ++ __func__, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } ++ ++ desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(desc.tfm)) ++ return GSS_S_FAILURE; ++ checksumlen = crypto_hash_digestsize(desc.tfm); ++ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; ++ ++ err = crypto_hash_setkey(desc.tfm, cksumkey, kctx->gk5e->keylength); ++ if (err) ++ goto out; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out; ++ err = xdr_process_buf(body, body_offset, body->len - body_offset, ++ checksummer, &desc); ++ if (err) ++ goto out; ++ if (header != NULL) { ++ sg_init_one(sg, header, hdrlen); ++ err = crypto_hash_update(&desc, sg, hdrlen); ++ if (err) ++ goto out; ++ } ++ err = crypto_hash_final(&desc, checksumdata); ++ if 
(err) ++ goto out; ++ ++ cksumout->len = kctx->gk5e->cksumlength; ++ ++ switch (kctx->gk5e->ctype) { ++ case CKSUMTYPE_HMAC_SHA1_96_AES128: ++ case CKSUMTYPE_HMAC_SHA1_96_AES256: ++ /* note that this truncates the hash */ ++ memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); ++ break; ++ default: ++ BUG(); ++ break; ++ } + out: + crypto_free_hash(desc.tfm); + return err ? GSS_S_FAILURE : 0; + } + + struct encryptor_desc { +- u8 iv[8]; /* XXX hard-coded blocksize */ ++ u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; + struct blkcipher_desc desc; + int pos; + struct xdr_buf *outbuf; +@@ -198,7 +427,7 @@ encryptor(struct scatterlist *sg, void * + desc->fraglen += sg->length; + desc->pos += sg->length; + +- fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1); + thislen -= fraglen; + + if (thislen == 0) +@@ -256,7 +485,7 @@ gss_encrypt_xdr_buf(struct crypto_blkcip + } + + struct decryptor_desc { +- u8 iv[8]; /* XXX hard-coded blocksize */ ++ u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; + struct blkcipher_desc desc; + struct scatterlist frags[4]; + int fragno; +@@ -278,7 +507,7 @@ decryptor(struct scatterlist *sg, void * + desc->fragno++; + desc->fraglen += sg->length; + +- fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1); + thislen -= fraglen; + + if (thislen == 0) +@@ -325,3 +554,437 @@ gss_decrypt_xdr_buf(struct crypto_blkcip + + return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); + } ++ ++/* ++ * This function makes the assumption that it was ultimately called ++ * from gss_wrap(). ++ * ++ * The client auth_gss code moves any existing tail data into a ++ * separate page before calling gss_wrap. ++ * The server svcauth_gss code ensures that both the head and the ++ * tail have slack space of RPC_MAX_AUTH_SIZE before calling gss_wrap. 
++ * ++ * Even with that guarantee, this function may be called more than ++ * once in the processing of gss_wrap(). The best we can do is ++ * verify at compile-time (see GSS_KRB5_SLACK_CHECK) that the ++ * largest expected shift will fit within RPC_MAX_AUTH_SIZE. ++ * At run-time we can verify that a single invocation of this ++ * function doesn't attempt to use more the RPC_MAX_AUTH_SIZE. ++ */ ++ ++int ++xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen) ++{ ++ u8 *p; ++ ++ if (shiftlen == 0) ++ return 0; ++ ++ BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE); ++ BUG_ON(shiftlen > RPC_MAX_AUTH_SIZE); ++ ++ p = buf->head[0].iov_base + base; ++ ++ memmove(p + shiftlen, p, buf->head[0].iov_len - base); ++ ++ buf->head[0].iov_len += shiftlen; ++ buf->len += shiftlen; ++ ++ return 0; ++} ++ ++static u32 ++gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf, ++ u32 offset, u8 *iv, struct page **pages, int encrypt) ++{ ++ u32 ret; ++ struct scatterlist sg[1]; ++ struct blkcipher_desc desc = { .tfm = cipher, .info = iv }; ++ u8 data[crypto_blkcipher_blocksize(cipher) * 2]; ++ struct page **save_pages; ++ u32 len = buf->len - offset; ++ ++ BUG_ON(len > crypto_blkcipher_blocksize(cipher) * 2); ++ ++ /* ++ * For encryption, we want to read from the cleartext ++ * page cache pages, and write the encrypted data to ++ * the supplied xdr_buf pages. 
++ */ ++ save_pages = buf->pages; ++ if (encrypt) ++ buf->pages = pages; ++ ++ ret = read_bytes_from_xdr_buf(buf, offset, data, len); ++ buf->pages = save_pages; ++ if (ret) ++ goto out; ++ ++ sg_init_one(sg, data, len); ++ ++ if (encrypt) ++ ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len); ++ else ++ ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, len); ++ ++ if (ret) ++ goto out; ++ ++ ret = write_bytes_to_xdr_buf(buf, offset, data, len); ++ ++out: ++ return ret; ++} ++ ++u32 ++gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, int ec, struct page **pages) ++{ ++ u32 err; ++ struct xdr_netobj hmac; ++ u8 *cksumkey; ++ u8 *ecptr; ++ struct crypto_blkcipher *cipher, *aux_cipher; ++ int blocksize; ++ struct page **save_pages; ++ int nblocks, nbytes; ++ struct encryptor_desc desc; ++ u32 cbcbytes; ++ unsigned int usage; ++ ++ if (kctx->initiate) { ++ cipher = kctx->initiator_enc; ++ aux_cipher = kctx->initiator_enc_aux; ++ cksumkey = kctx->initiator_integ; ++ usage = KG_USAGE_INITIATOR_SEAL; ++ } else { ++ cipher = kctx->acceptor_enc; ++ aux_cipher = kctx->acceptor_enc_aux; ++ cksumkey = kctx->acceptor_integ; ++ usage = KG_USAGE_ACCEPTOR_SEAL; ++ } ++ blocksize = crypto_blkcipher_blocksize(cipher); ++ ++ /* hide the gss token header and insert the confounder */ ++ offset += GSS_KRB5_TOK_HDR_LEN; ++ if (xdr_extend_head(buf, offset, kctx->gk5e->conflen)) ++ return GSS_S_FAILURE; ++ gss_krb5_make_confounder(buf->head[0].iov_base + offset, kctx->gk5e->conflen); ++ offset -= GSS_KRB5_TOK_HDR_LEN; ++ ++ if (buf->tail[0].iov_base != NULL) { ++ ecptr = buf->tail[0].iov_base + buf->tail[0].iov_len; ++ } else { ++ buf->tail[0].iov_base = buf->head[0].iov_base ++ + buf->head[0].iov_len; ++ buf->tail[0].iov_len = 0; ++ ecptr = buf->tail[0].iov_base; ++ } ++ ++ memset(ecptr, 'X', ec); ++ buf->tail[0].iov_len += ec; ++ buf->len += ec; ++ ++ /* copy plaintext gss token header after filler (if any) */ ++ memcpy(ecptr + ec, buf->head[0].iov_base + 
offset, ++ GSS_KRB5_TOK_HDR_LEN); ++ buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN; ++ buf->len += GSS_KRB5_TOK_HDR_LEN; ++ ++ /* Do the HMAC */ ++ hmac.len = GSS_KRB5_MAX_CKSUM_LEN; ++ hmac.data = buf->tail[0].iov_base + buf->tail[0].iov_len; ++ ++ /* ++ * When we are called, pages points to the real page cache ++ * data -- which we can't go and encrypt! buf->pages points ++ * to scratch pages which we are going to send off to the ++ * client/server. Swap in the plaintext pages to calculate ++ * the hmac. ++ */ ++ save_pages = buf->pages; ++ buf->pages = pages; ++ ++ err = make_checksum_v2(kctx, NULL, 0, buf, ++ offset + GSS_KRB5_TOK_HDR_LEN, ++ cksumkey, usage, &hmac); ++ buf->pages = save_pages; ++ if (err) ++ return GSS_S_FAILURE; ++ ++ nbytes = buf->len - offset - GSS_KRB5_TOK_HDR_LEN; ++ nblocks = (nbytes + blocksize - 1) / blocksize; ++ cbcbytes = 0; ++ if (nblocks > 2) ++ cbcbytes = (nblocks - 2) * blocksize; ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ ++ if (cbcbytes) { ++ desc.pos = offset + GSS_KRB5_TOK_HDR_LEN; ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ desc.pages = pages; ++ desc.outbuf = buf; ++ desc.desc.info = desc.iv; ++ desc.desc.flags = 0; ++ desc.desc.tfm = aux_cipher; ++ ++ sg_init_table(desc.infrags, 4); ++ sg_init_table(desc.outfrags, 4); ++ ++ err = xdr_process_buf(buf, offset + GSS_KRB5_TOK_HDR_LEN, ++ cbcbytes, encryptor, &desc); ++ if (err) ++ goto out_err; ++ } ++ ++ /* Make sure IV carries forward from any CBC results. 
*/ ++ err = gss_krb5_cts_crypt(cipher, buf, ++ offset + GSS_KRB5_TOK_HDR_LEN + cbcbytes, ++ desc.iv, pages, 1); ++ if (err) { ++ err = GSS_S_FAILURE; ++ goto out_err; ++ } ++ ++ /* Now update buf to account for HMAC */ ++ buf->tail[0].iov_len += kctx->gk5e->cksumlength; ++ buf->len += kctx->gk5e->cksumlength; ++ ++out_err: ++ if (err) ++ err = GSS_S_FAILURE; ++ return err; ++} ++ ++u32 ++gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, ++ u32 *headskip, u32 *tailskip) ++{ ++ struct xdr_buf subbuf; ++ u32 ret = 0; ++ u8 *cksum_key; ++ struct crypto_blkcipher *cipher, *aux_cipher; ++ struct xdr_netobj our_hmac_obj; ++ u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN]; ++ u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN]; ++ int nblocks, blocksize, cbcbytes; ++ struct decryptor_desc desc; ++ unsigned int usage; ++ ++ if (kctx->initiate) { ++ cipher = kctx->acceptor_enc; ++ aux_cipher = kctx->acceptor_enc_aux; ++ cksum_key = kctx->acceptor_integ; ++ usage = KG_USAGE_ACCEPTOR_SEAL; ++ } else { ++ cipher = kctx->initiator_enc; ++ aux_cipher = kctx->initiator_enc_aux; ++ cksum_key = kctx->initiator_integ; ++ usage = KG_USAGE_INITIATOR_SEAL; ++ } ++ blocksize = crypto_blkcipher_blocksize(cipher); ++ ++ ++ /* create a segment skipping the header and leaving out the checksum */ ++ xdr_buf_subsegment(buf, &subbuf, offset + GSS_KRB5_TOK_HDR_LEN, ++ (buf->len - offset - GSS_KRB5_TOK_HDR_LEN - ++ kctx->gk5e->cksumlength)); ++ ++ nblocks = (subbuf.len + blocksize - 1) / blocksize; ++ ++ cbcbytes = 0; ++ if (nblocks > 2) ++ cbcbytes = (nblocks - 2) * blocksize; ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ ++ if (cbcbytes) { ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ desc.desc.info = desc.iv; ++ desc.desc.flags = 0; ++ desc.desc.tfm = aux_cipher; ++ ++ sg_init_table(desc.frags, 4); ++ ++ ret = xdr_process_buf(&subbuf, 0, cbcbytes, decryptor, &desc); ++ if (ret) ++ goto out_err; ++ } ++ ++ /* Make sure IV carries forward from any CBC results. 
*/ ++ ret = gss_krb5_cts_crypt(cipher, &subbuf, cbcbytes, desc.iv, NULL, 0); ++ if (ret) ++ goto out_err; ++ ++ ++ /* Calculate our hmac over the plaintext data */ ++ our_hmac_obj.len = sizeof(our_hmac); ++ our_hmac_obj.data = our_hmac; ++ ++ ret = make_checksum_v2(kctx, NULL, 0, &subbuf, 0, ++ cksum_key, usage, &our_hmac_obj); ++ if (ret) ++ goto out_err; ++ ++ /* Get the packet's hmac value */ ++ ret = read_bytes_from_xdr_buf(buf, buf->len - kctx->gk5e->cksumlength, ++ pkt_hmac, kctx->gk5e->cksumlength); ++ if (ret) ++ goto out_err; ++ ++ if (memcmp(pkt_hmac, our_hmac, kctx->gk5e->cksumlength) != 0) { ++ ret = GSS_S_BAD_SIG; ++ goto out_err; ++ } ++ *headskip = kctx->gk5e->conflen; ++ *tailskip = kctx->gk5e->cksumlength; ++out_err: ++ if (ret && ret != GSS_S_BAD_SIG) ++ ret = GSS_S_FAILURE; ++ return ret; ++} ++ ++/* ++ * Compute Kseq given the initial session key and the checksum. ++ * Set the key of the given cipher. ++ */ ++int ++krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, ++ unsigned char *cksum) ++{ ++ struct crypto_hash *hmac; ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ u8 Kseq[GSS_KRB5_MAX_KEYLEN]; ++ u32 zeroconstant = 0; ++ int err; ++ ++ dprintk("%s: entered\n", __func__); ++ ++ hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac)) { ++ dprintk("%s: error %ld, allocating hash '%s'\n", ++ __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); ++ return PTR_ERR(hmac); ++ } ++ ++ desc.tfm = hmac; ++ desc.flags = 0; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out_err; ++ ++ /* Compute intermediate Kseq from session key */ ++ err = crypto_hash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ sg_init_table(sg, 1); ++ sg_set_buf(sg, &zeroconstant, 4); ++ ++ err = crypto_hash_digest(&desc, sg, 4, Kseq); ++ if (err) ++ goto out_err; ++ ++ /* Compute final Kseq from the checksum and intermediate Kseq */ ++ err = 
crypto_hash_setkey(hmac, Kseq, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ sg_set_buf(sg, cksum, 8); ++ ++ err = crypto_hash_digest(&desc, sg, 8, Kseq); ++ if (err) ++ goto out_err; ++ ++ err = crypto_blkcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ err = 0; ++ ++out_err: ++ crypto_free_hash(hmac); ++ dprintk("%s: returning %d\n", __func__, err); ++ return err; ++} ++ ++/* ++ * Compute Kcrypt given the initial session key and the plaintext seqnum. ++ * Set the key of cipher kctx->enc. ++ */ ++int ++krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, ++ s32 seqnum) ++{ ++ struct crypto_hash *hmac; ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ u8 Kcrypt[GSS_KRB5_MAX_KEYLEN]; ++ u8 zeroconstant[4] = {0}; ++ u8 seqnumarray[4]; ++ int err, i; ++ ++ dprintk("%s: entered, seqnum %u\n", __func__, seqnum); ++ ++ hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac)) { ++ dprintk("%s: error %ld, allocating hash '%s'\n", ++ __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); ++ return PTR_ERR(hmac); ++ } ++ ++ desc.tfm = hmac; ++ desc.flags = 0; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out_err; ++ ++ /* Compute intermediate Kcrypt from session key */ ++ for (i = 0; i < kctx->gk5e->keylength; i++) ++ Kcrypt[i] = kctx->Ksess[i] ^ 0xf0; ++ ++ err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ sg_init_table(sg, 1); ++ sg_set_buf(sg, zeroconstant, 4); ++ ++ err = crypto_hash_digest(&desc, sg, 4, Kcrypt); ++ if (err) ++ goto out_err; ++ ++ /* Compute final Kcrypt from the seqnum and intermediate Kcrypt */ ++ err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ seqnumarray[0] = (unsigned char) ((seqnum >> 24) & 0xff); ++ seqnumarray[1] = (unsigned char) ((seqnum >> 16) & 0xff); ++ seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff); ++ 
seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff); ++ ++ sg_set_buf(sg, seqnumarray, 4); ++ ++ err = crypto_hash_digest(&desc, sg, 4, Kcrypt); ++ if (err) ++ goto out_err; ++ ++ err = crypto_blkcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ err = 0; ++ ++out_err: ++ crypto_free_hash(hmac); ++ dprintk("%s: returning %d\n", __func__, err); ++ return err; ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c.orig 2010-08-23 11:01:00.390553891 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c 2010-08-23 11:01:00.391564137 -0400 +@@ -0,0 +1,336 @@ ++/* ++ * COPYRIGHT (c) 2008 ++ * The Regents of the University of Michigan ++ * ALL RIGHTS RESERVED ++ * ++ * Permission is granted to use, copy, create derivative works ++ * and redistribute this software and such derivative works ++ * for any purpose, so long as the name of The University of ++ * Michigan is not used in any advertising or publicity ++ * pertaining to the use of distribution of this software ++ * without specific, written prior authorization. If the ++ * above copyright notice or any other identification of the ++ * University of Michigan is included in any copy of any ++ * portion of this software, then the disclaimer below must ++ * also be included. ++ * ++ * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION ++ * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY ++ * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF ++ * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ++ * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
THE ++ * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE ++ * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR ++ * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING ++ * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN ++ * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGES. ++ */ ++ ++/* ++ * Copyright (C) 1998 by the FundsXpress, INC. ++ * ++ * All rights reserved. ++ * ++ * Export of this software from the United States of America may require ++ * a specific license from the United States Government. It is the ++ * responsibility of any person or organization contemplating export to ++ * obtain such a license before exporting. ++ * ++ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and ++ * distribute this software and its documentation for any purpose and ++ * without fee is hereby granted, provided that the above copyright ++ * notice appear in all copies and that both that copyright notice and ++ * this permission notice appear in supporting documentation, and that ++ * the name of FundsXpress. not be used in advertising or publicity pertaining ++ * to distribution of the software without specific, written prior ++ * permission. FundsXpress makes no representations about the suitability of ++ * this software for any purpose. It is provided "as is" without express ++ * or implied warranty. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED ++ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. ++ */ ++ ++#include <linux/err.h> ++#include <linux/types.h> ++#include <linux/crypto.h> ++#include <linux/sunrpc/gss_krb5.h> ++#include <linux/sunrpc/xdr.h> ++ ++#ifdef RPC_DEBUG ++# define RPCDBG_FACILITY RPCDBG_AUTH ++#endif ++ ++/* ++ * This is the n-fold function as described in rfc3961, sec 5.1 ++ * Taken from MIT Kerberos and modified.
++ */ ++ ++static void krb5_nfold(u32 inbits, const u8 *in, ++ u32 outbits, u8 *out) ++{ ++ int a, b, c, lcm; ++ int byte, i, msbit; ++ ++ /* the code below is more readable if I make these bytes ++ instead of bits */ ++ ++ inbits >>= 3; ++ outbits >>= 3; ++ ++ /* first compute lcm(n,k) */ ++ ++ a = outbits; ++ b = inbits; ++ ++ while (b != 0) { ++ c = b; ++ b = a%b; ++ a = c; ++ } ++ ++ lcm = outbits*inbits/a; ++ ++ /* now do the real work */ ++ ++ memset(out, 0, outbits); ++ byte = 0; ++ ++ /* this will end up cycling through k lcm(k,n)/k times, which ++ is correct */ ++ for (i = lcm-1; i >= 0; i--) { ++ /* compute the msbit in k which gets added into this byte */ ++ msbit = ( ++ /* first, start with the msbit in the first, ++ * unrotated byte */ ++ ((inbits << 3) - 1) ++ /* then, for each byte, shift to the right ++ * for each repetition */ ++ + (((inbits << 3) + 13) * (i/inbits)) ++ /* last, pick out the correct byte within ++ * that shifted repetition */ ++ + ((inbits - (i % inbits)) << 3) ++ ) % (inbits << 3); ++ ++ /* pull out the byte value itself */ ++ byte += (((in[((inbits - 1) - (msbit >> 3)) % inbits] << 8)| ++ (in[((inbits) - (msbit >> 3)) % inbits])) ++ >> ((msbit & 7) + 1)) & 0xff; ++ ++ /* do the addition */ ++ byte += out[i % outbits]; ++ out[i % outbits] = byte & 0xff; ++ ++ /* keep around the carry bit, if any */ ++ byte >>= 8; ++ ++ } ++ ++ /* if there's a carry bit left over, add it back in */ ++ if (byte) { ++ for (i = outbits - 1; i >= 0; i--) { ++ /* do the addition */ ++ byte += out[i]; ++ out[i] = byte & 0xff; ++ ++ /* keep around the carry bit, if any */ ++ byte >>= 8; ++ } ++ } ++} ++ ++/* ++ * This is the DK (derive_key) function as described in rfc3961, sec 5.1 ++ * Taken from MIT Kerberos and modified. 
++ */ ++ ++u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, ++ const struct xdr_netobj *inkey, ++ struct xdr_netobj *outkey, ++ const struct xdr_netobj *in_constant, ++ gfp_t gfp_mask) ++{ ++ size_t blocksize, keybytes, keylength, n; ++ unsigned char *inblockdata, *outblockdata, *rawkey; ++ struct xdr_netobj inblock, outblock; ++ struct crypto_blkcipher *cipher; ++ u32 ret = EINVAL; /* positive errno on failure, 0 on success */ ++ ++ blocksize = gk5e->blocksize; ++ keybytes = gk5e->keybytes; ++ keylength = gk5e->keylength; ++ ++ if ((inkey->len != keylength) || (outkey->len != keylength)) ++ goto err_return; ++ ++ cipher = crypto_alloc_blkcipher(gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ goto err_return; ++ if (crypto_blkcipher_setkey(cipher, inkey->data, inkey->len)) ++ goto err_free_cipher; /* cipher is allocated by now: release it rather than leak it */ ++ ++ /* allocate and set up buffers */ ++ ++ ret = ENOMEM; ++ inblockdata = kmalloc(blocksize, gfp_mask); ++ if (inblockdata == NULL) ++ goto err_free_cipher; ++ ++ outblockdata = kmalloc(blocksize, gfp_mask); ++ if (outblockdata == NULL) ++ goto err_free_in; ++ ++ rawkey = kmalloc(keybytes, gfp_mask); ++ if (rawkey == NULL) ++ goto err_free_out; ++ ++ inblock.data = (char *) inblockdata; ++ inblock.len = blocksize; ++ ++ outblock.data = (char *) outblockdata; ++ outblock.len = blocksize; ++ ++ /* initialize the input block */ ++ ++ if (in_constant->len == inblock.len) { ++ memcpy(inblock.data, in_constant->data, inblock.len); ++ } else { ++ krb5_nfold(in_constant->len * 8, in_constant->data, ++ inblock.len * 8, inblock.data); ++ } ++ ++ /* loop encrypting the blocks until enough key bytes are generated */ ++ ++ n = 0; ++ while (n < keybytes) { ++ (*(gk5e->encrypt))(cipher, NULL, inblock.data, ++ outblock.data, inblock.len); ++ ++ if ((keybytes - n) <= outblock.len) { ++ memcpy(rawkey + n, outblock.data, (keybytes - n)); ++ break; ++ } ++ ++ memcpy(rawkey + n, outblock.data, outblock.len); ++ memcpy(inblock.data, outblock.data, outblock.len); /* previous output is the next input block */ ++ n += outblock.len; ++ } ++ ++ /* postprocess
the key */ ++ ++ inblock.data = (char *) rawkey; ++ inblock.len = keybytes; ++ ++ BUG_ON(gk5e->mk_key == NULL); ++ ret = (*(gk5e->mk_key))(gk5e, &inblock, outkey); ++ if (ret) { ++ dprintk("%s: got %d from mk_key function for '%s'\n", ++ __func__, ret, gk5e->encrypt_name); ++ goto err_free_raw; ++ } ++ ++ /* clean memory, free resources and exit */ ++ ++ ret = 0; ++ ++err_free_raw: ++ memset(rawkey, 0, keybytes); ++ kfree(rawkey); ++err_free_out: ++ memset(outblockdata, 0, blocksize); ++ kfree(outblockdata); ++err_free_in: ++ memset(inblockdata, 0, blocksize); ++ kfree(inblockdata); ++err_free_cipher: ++ crypto_free_blkcipher(cipher); ++err_return: ++ return ret; ++} ++ ++#define smask(step) ((1<<step)-1) ++#define pstep(x, step) (((x)&smask(step))^(((x)>>step)&smask(step))) ++#define parity_char(x) pstep(pstep(pstep((x), 4), 2), 1) /* XOR-folds a byte down to its single parity bit */ ++ ++static void mit_des_fixup_key_parity(u8 key[8]) ++{ ++ int i; ++ for (i = 0; i < 8; i++) { ++ key[i] &= 0xfe; ++ key[i] |= 1^parity_char(key[i]); /* low bit makes the byte odd parity */ ++ } ++} ++ ++/* ++ * This is the des3 key derivation postprocess function ++ */ ++u32 gss_krb5_des3_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key) ++{ ++ int i; ++ u32 ret = EINVAL; ++ ++ if (key->len != 24) { ++ dprintk("%s: key->len is %d\n", __func__, key->len); ++ goto err_out; ++ } ++ if (randombits->len != 21) { ++ dprintk("%s: randombits->len is %d\n", ++ __func__, randombits->len); ++ goto err_out; ++ } ++ ++ /* take the seven bytes, move them around into the top 7 bits of the ++ 8 key bytes, then compute the parity bits. Do this three times.
*/ ++ ++ for (i = 0; i < 3; i++) { ++ memcpy(key->data + i*8, randombits->data + i*7, 7); ++ key->data[i*8+7] = (((key->data[i*8]&1)<<1) | ++ ((key->data[i*8+1]&1)<<2) | ++ ((key->data[i*8+2]&1)<<3) | ++ ((key->data[i*8+3]&1)<<4) | ++ ((key->data[i*8+4]&1)<<5) | ++ ((key->data[i*8+5]&1)<<6) | ++ ((key->data[i*8+6]&1)<<7)); ++ ++ mit_des_fixup_key_parity(key->data + i*8); ++ } ++ ret = 0; ++err_out: ++ return ret; ++} ++ ++/* ++ * This is the aes key derivation postprocess function ++ */ ++u32 gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key) ++{ ++ u32 ret = EINVAL; ++ ++ if (key->len != 16 && key->len != 32) { ++ dprintk("%s: key->len is %d\n", __func__, key->len); ++ goto err_out; ++ } ++ if (randombits->len != 16 && randombits->len != 32) { ++ dprintk("%s: randombits->len is %d\n", ++ __func__, randombits->len); ++ goto err_out; ++ } ++ if (randombits->len != key->len) { ++ dprintk("%s: randombits->len is %d, key->len is %d\n", ++ __func__, randombits->len, key->len); ++ goto err_out; ++ } ++ memcpy(key->data, randombits->data, key->len); ++ ret = 0; ++err_out: ++ return ret; ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c 2010-08-23 11:01:00.392564136 -0400 +@@ -1,7 +1,7 @@ + /* + * linux/net/sunrpc/gss_krb5_mech.c + * +- * Copyright (c) 2001 The Regents of the University of Michigan. ++ * Copyright (c) 2001-2008 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson +@@ -48,6 +48,143 @@ + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + ++static struct gss_api_mech gss_kerberos_mech; /* forward declaration */ ++ ++static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { ++ /* ++ * DES (All DES enctypes are mapped to the same gss functionality) ++ */ ++ { ++ .etype = ENCTYPE_DES_CBC_RAW, ++ .ctype = CKSUMTYPE_RSA_MD5, ++ .name = "des-cbc-crc", ++ .encrypt_name = "cbc(des)", ++ .cksum_name = "md5", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = NULL, ++ .signalg = SGN_ALG_DES_MAC_MD5, ++ .sealalg = SEAL_ALG_DES, ++ .keybytes = 7, ++ .keylength = 8, ++ .blocksize = 8, ++ .conflen = 8, ++ .cksumlength = 8, ++ .keyed_cksum = 0, ++ }, ++ /* ++ * RC4-HMAC ++ */ ++ { ++ .etype = ENCTYPE_ARCFOUR_HMAC, ++ .ctype = CKSUMTYPE_HMAC_MD5_ARCFOUR, ++ .name = "rc4-hmac", ++ .encrypt_name = "ecb(arc4)", ++ .cksum_name = "hmac(md5)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = NULL, ++ .signalg = SGN_ALG_HMAC_MD5, ++ .sealalg = SEAL_ALG_MICROSOFT_RC4, ++ .keybytes = 16, ++ .keylength = 16, ++ .blocksize = 1, ++ .conflen = 8, ++ .cksumlength = 8, ++ .keyed_cksum = 1, ++ }, ++ /* ++ * 3DES ++ */ ++ { ++ .etype = ENCTYPE_DES3_CBC_RAW, ++ .ctype = CKSUMTYPE_HMAC_SHA1_DES3, ++ .name = "des3-hmac-sha1", ++ .encrypt_name = "cbc(des3_ede)", ++ .cksum_name = "hmac(sha1)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = gss_krb5_des3_make_key, ++ .signalg = SGN_ALG_HMAC_SHA1_DES3_KD, ++ .sealalg = SEAL_ALG_DES3KD, ++ .keybytes = 21, ++ .keylength = 24, ++ .blocksize = 8, ++ .conflen = 8, ++ .cksumlength = 20, ++ .keyed_cksum = 1, ++ }, ++ /* ++ * AES128 ++ */ ++ { ++ .etype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, ++ .ctype = CKSUMTYPE_HMAC_SHA1_96_AES128, ++ .name = "aes128-cts", ++ .encrypt_name = "cts(cbc(aes))", ++ .cksum_name = "hmac(sha1)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = gss_krb5_aes_make_key, ++ .encrypt_v2 = 
gss_krb5_aes_encrypt, ++ .decrypt_v2 = gss_krb5_aes_decrypt, ++ .signalg = -1, ++ .sealalg = -1, ++ .keybytes = 16, ++ .keylength = 16, ++ .blocksize = 16, ++ .conflen = 16, ++ .cksumlength = 12, ++ .keyed_cksum = 1, ++ }, ++ /* ++ * AES256 ++ */ ++ { ++ .etype = ENCTYPE_AES256_CTS_HMAC_SHA1_96, ++ .ctype = CKSUMTYPE_HMAC_SHA1_96_AES256, ++ .name = "aes256-cts", ++ .encrypt_name = "cts(cbc(aes))", ++ .cksum_name = "hmac(sha1)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = gss_krb5_aes_make_key, ++ .encrypt_v2 = gss_krb5_aes_encrypt, ++ .decrypt_v2 = gss_krb5_aes_decrypt, ++ .signalg = -1, ++ .sealalg = -1, ++ .keybytes = 32, ++ .keylength = 32, ++ .blocksize = 16, ++ .conflen = 16, ++ .cksumlength = 12, ++ .keyed_cksum = 1, ++ }, ++}; ++ ++static const int num_supported_enctypes = ++ ARRAY_SIZE(supported_gss_krb5_enctypes); ++ ++static int ++supported_gss_krb5_enctype(int etype) ++{ ++ int i; ++ for (i = 0; i < num_supported_enctypes; i++) ++ if (supported_gss_krb5_enctypes[i].etype == etype) ++ return 1; ++ return 0; ++} ++ ++static const struct gss_krb5_enctype * ++get_gss_krb5_enctype(int etype) ++{ ++ int i; ++ for (i = 0; i < num_supported_enctypes; i++) ++ if (supported_gss_krb5_enctypes[i].etype == etype) ++ return &supported_gss_krb5_enctypes[i]; ++ return NULL; ++} ++ + static const void * + simple_get_bytes(const void *p, const void *end, void *res, int len) + { +@@ -78,35 +215,45 @@ simple_get_netobj(const void *p, const v + } + + static inline const void * +-get_key(const void *p, const void *end, struct crypto_blkcipher **res) ++get_key(const void *p, const void *end, ++ struct krb5_ctx *ctx, struct crypto_blkcipher **res) + { + struct xdr_netobj key; + int alg; +- char *alg_name; + + p = simple_get_bytes(p, end, &alg, sizeof(alg)); + if (IS_ERR(p)) + goto out_err; ++ ++ switch (alg) { ++ case ENCTYPE_DES_CBC_CRC: ++ case ENCTYPE_DES_CBC_MD4: ++ case ENCTYPE_DES_CBC_MD5: ++ /* Map all these key types to ENCTYPE_DES_CBC_RAW */ ++ 
alg = ENCTYPE_DES_CBC_RAW; ++ break; ++ } ++ ++ if (!supported_gss_krb5_enctype(alg)) { ++ printk(KERN_WARNING "gss_kerberos_mech: unsupported " ++ "encryption key algorithm %d\n", alg); ++ goto out_err; ++ } + p = simple_get_netobj(p, end, &key); + if (IS_ERR(p)) + goto out_err; + +- switch (alg) { +- case ENCTYPE_DES_CBC_RAW: +- alg_name = "cbc(des)"; +- break; +- default: +- printk("gss_kerberos_mech: unsupported algorithm %d\n", alg); +- goto out_err_free_key; +- } +- *res = crypto_alloc_blkcipher(alg_name, 0, CRYPTO_ALG_ASYNC); ++ *res = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); + if (IS_ERR(*res)) { +- printk("gss_kerberos_mech: unable to initialize crypto algorithm %s\n", alg_name); ++ printk(KERN_WARNING "gss_kerberos_mech: unable to initialize " ++ "crypto algorithm %s\n", ctx->gk5e->encrypt_name); + *res = NULL; + goto out_err_free_key; + } + if (crypto_blkcipher_setkey(*res, key.data, key.len)) { +- printk("gss_kerberos_mech: error setting key for crypto algorithm %s\n", alg_name); ++ printk(KERN_WARNING "gss_kerberos_mech: error setting key for " ++ "crypto algorithm %s\n", ctx->gk5e->encrypt_name); + goto out_err_free_tfm; + } + +@@ -123,56 +270,55 @@ out_err: + } + + static int +-gss_import_sec_context_kerberos(const void *p, +- size_t len, +- struct gss_ctx *ctx_id) ++gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) + { +- const void *end = (const void *)((const char *)p + len); +- struct krb5_ctx *ctx; + int tmp; + +- if (!(ctx = kzalloc(sizeof(*ctx), GFP_NOFS))) { +- p = ERR_PTR(-ENOMEM); +- goto out_err; +- } +- + p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; ++ ++ /* Old format supports only DES! 
Any other enctype uses new format */ ++ ctx->enctype = ENCTYPE_DES_CBC_RAW; ++ ++ ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); ++ if (ctx->gk5e == NULL) ++ return -EINVAL; /* p is still a valid pointer here, so PTR_ERR(p) at out_err would not be an errno */ ++ + /* The downcall format was designed before we completely understood + * the uses of the context fields; so it includes some stuff we + * just give some minimal sanity-checking, and some we ignore + * completely (like the next twenty bytes): */ + if (unlikely(p + 20 > end || p + 20 < p)) +- goto out_err_free_ctx; ++ return -EFAULT; /* truncated downcall; p is not an ERR_PTR, so do not go through out_err */ + p += 20; + p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + if (tmp != SGN_ALG_DES_MAC_MD5) { + p = ERR_PTR(-ENOSYS); +- goto out_err_free_ctx; ++ goto out_err; + } + p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + if (tmp != SEAL_ALG_DES) { + p = ERR_PTR(-ENOSYS); +- goto out_err_free_ctx; ++ goto out_err; + } + p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + p = simple_get_bytes(p, end, &ctx->seq_send, sizeof(ctx->seq_send)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + p = simple_get_netobj(p, end, &ctx->mech_used); + if (IS_ERR(p)) +- goto out_err_free_ctx; +- p = get_key(p, end, &ctx->enc); ++ goto out_err; ++ p = get_key(p, end, ctx, &ctx->enc); + if (IS_ERR(p)) + goto out_err_free_mech; +- p = get_key(p, end, &ctx->seq); ++ p = get_key(p, end, ctx, &ctx->seq); + if (IS_ERR(p)) + goto out_err_free_key1; + if (p != end) { +@@ -180,9 +326,6 @@ gss_import_sec_context_kerberos(const vo + goto out_err_free_key2; + } + +- ctx_id->internal_ctx_id = ctx; +- +- dprintk("RPC: Successfully imported new context.\n"); + return 0; + + out_err_free_key2: +@@ -191,18 +334,378 @@ out_err_free_key1: + crypto_free_blkcipher(ctx->enc); + out_err_free_mech: + kfree(ctx->mech_used.data); +-out_err_free_ctx: +- kfree(ctx); + out_err: + return PTR_ERR(p); + } +
++struct crypto_blkcipher * ++context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key) ++{ ++ struct crypto_blkcipher *cp; ++ ++ cp = crypto_alloc_blkcipher(cname, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cp)) { ++ dprintk("gss_kerberos_mech: unable to initialize " ++ "crypto algorithm %s\n", cname); ++ return NULL; ++ } ++ if (crypto_blkcipher_setkey(cp, key, ctx->gk5e->keylength)) { ++ dprintk("gss_kerberos_mech: error setting key for " ++ "crypto algorithm %s\n", cname); ++ crypto_free_blkcipher(cp); ++ return NULL; ++ } ++ return cp; ++} ++ ++static inline void ++set_cdata(u8 cdata[GSS_KRB5_K5CLENGTH], u32 usage, u8 seed) ++{ ++ cdata[0] = (usage>>24)&0xff; ++ cdata[1] = (usage>>16)&0xff; ++ cdata[2] = (usage>>8)&0xff; ++ cdata[3] = usage&0xff; ++ cdata[4] = seed; ++} ++ ++static int ++context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask) ++{ ++ struct xdr_netobj c, keyin, keyout; ++ u8 cdata[GSS_KRB5_K5CLENGTH]; ++ u32 err; ++ ++ c.len = GSS_KRB5_K5CLENGTH; ++ c.data = cdata; ++ ++ keyin.data = ctx->Ksess; ++ keyin.len = ctx->gk5e->keylength; ++ keyout.len = ctx->gk5e->keylength; ++ ++ /* seq uses the raw key */ ++ ctx->seq = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name, ++ ctx->Ksess); ++ if (ctx->seq == NULL) ++ goto out_err; ++ ++ ctx->enc = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name, ++ ctx->Ksess); ++ if (ctx->enc == NULL) ++ goto out_free_seq; ++ ++ /* derive cksum */ ++ set_cdata(cdata, KG_USAGE_SIGN, KEY_USAGE_SEED_CHECKSUM); ++ keyout.data = ctx->cksum; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving cksum key\n", ++ __func__, err); ++ goto out_free_enc; ++ } ++ ++ return 0; ++ ++out_free_enc: ++ crypto_free_blkcipher(ctx->enc); ++out_free_seq: ++ crypto_free_blkcipher(ctx->seq); ++out_err: ++ return -EINVAL; ++} ++ ++/* ++ * Note that RC4 depends on deriving keys using the sequence ++ * number or the checksum of a token. 
Therefore, the final keys ++ * cannot be calculated until the token is being constructed! ++ */ ++static int ++context_derive_keys_rc4(struct krb5_ctx *ctx) ++{ ++ struct crypto_hash *hmac; ++ char sigkeyconstant[] = "signaturekey"; ++ int slen = strlen(sigkeyconstant) + 1; /* include null terminator */ ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ int err; ++ ++ dprintk("RPC: %s: entered\n", __func__); ++ /* ++ * derive cksum (aka Ksign) key ++ */ ++ hmac = crypto_alloc_hash(ctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac)) { ++ dprintk("%s: error %ld allocating hash '%s'\n", ++ __func__, PTR_ERR(hmac), ctx->gk5e->cksum_name); ++ err = PTR_ERR(hmac); ++ goto out_err; ++ } ++ ++ err = crypto_hash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength); ++ if (err) ++ goto out_err_free_hmac; ++ ++ sg_init_table(sg, 1); ++ sg_set_buf(sg, sigkeyconstant, slen); ++ ++ desc.tfm = hmac; ++ desc.flags = 0; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out_err_free_hmac; ++ ++ err = crypto_hash_digest(&desc, sg, slen, ctx->cksum); ++ if (err) ++ goto out_err_free_hmac; ++ /* ++ * allocate hash, and blkciphers for data and seqnum encryption ++ */ ++ ctx->enc = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(ctx->enc)) { ++ err = PTR_ERR(ctx->enc); ++ goto out_err_free_hmac; ++ } ++ ++ ctx->seq = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(ctx->seq)) { ++ crypto_free_blkcipher(ctx->enc); ++ err = PTR_ERR(ctx->seq); ++ goto out_err_free_hmac; ++ } ++ ++ dprintk("RPC: %s: returning success\n", __func__); ++ ++ err = 0; ++ ++out_err_free_hmac: ++ crypto_free_hash(hmac); ++out_err: ++ dprintk("RPC: %s: returning %d\n", __func__, err); ++ return err; ++} ++ ++static int ++context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) ++{ ++ struct xdr_netobj c, keyin, keyout; ++ u8 cdata[GSS_KRB5_K5CLENGTH]; ++ u32 err; ++ ++ c.len = GSS_KRB5_K5CLENGTH; ++ c.data = 
cdata; ++ ++ keyin.data = ctx->Ksess; ++ keyin.len = ctx->gk5e->keylength; ++ keyout.len = ctx->gk5e->keylength; ++ ++ /* initiator seal encryption */ ++ set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_ENCRYPTION); ++ keyout.data = ctx->initiator_seal; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving initiator_seal key\n", ++ __func__, err); ++ goto out_err; ++ } ++ ctx->initiator_enc = context_v2_alloc_cipher(ctx, ++ ctx->gk5e->encrypt_name, ++ ctx->initiator_seal); ++ if (ctx->initiator_enc == NULL) ++ goto out_err; ++ ++ /* acceptor seal encryption */ ++ set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, KEY_USAGE_SEED_ENCRYPTION); ++ keyout.data = ctx->acceptor_seal; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving acceptor_seal key\n", ++ __func__, err); ++ goto out_free_initiator_enc; ++ } ++ ctx->acceptor_enc = context_v2_alloc_cipher(ctx, ++ ctx->gk5e->encrypt_name, ++ ctx->acceptor_seal); ++ if (ctx->acceptor_enc == NULL) ++ goto out_free_initiator_enc; ++ ++ /* initiator sign checksum */ ++ set_cdata(cdata, KG_USAGE_INITIATOR_SIGN, KEY_USAGE_SEED_CHECKSUM); ++ keyout.data = ctx->initiator_sign; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving initiator_sign key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ /* acceptor sign checksum */ ++ set_cdata(cdata, KG_USAGE_ACCEPTOR_SIGN, KEY_USAGE_SEED_CHECKSUM); ++ keyout.data = ctx->acceptor_sign; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving acceptor_sign key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ /* initiator seal integrity */ ++ set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_INTEGRITY); ++ keyout.data = ctx->initiator_integ; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); 
++ if (err) { ++ dprintk("%s: Error %d deriving initiator_integ key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ /* acceptor seal integrity */ ++ set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, KEY_USAGE_SEED_INTEGRITY); ++ keyout.data = ctx->acceptor_integ; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving acceptor_integ key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ switch (ctx->enctype) { ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ ctx->initiator_enc_aux = ++ context_v2_alloc_cipher(ctx, "cbc(aes)", ++ ctx->initiator_seal); ++ if (ctx->initiator_enc_aux == NULL) ++ goto out_free_acceptor_enc; ++ ctx->acceptor_enc_aux = ++ context_v2_alloc_cipher(ctx, "cbc(aes)", ++ ctx->acceptor_seal); ++ if (ctx->acceptor_enc_aux == NULL) { ++ crypto_free_blkcipher(ctx->initiator_enc_aux); ++ goto out_free_acceptor_enc; ++ } ++ } ++ ++ return 0; ++ ++out_free_acceptor_enc: ++ crypto_free_blkcipher(ctx->acceptor_enc); ++out_free_initiator_enc: ++ crypto_free_blkcipher(ctx->initiator_enc); ++out_err: ++ return -EINVAL; ++} ++ ++static int ++gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, ++ gfp_t gfp_mask) ++{ ++ int keylen; ++ ++ p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags)); ++ if (IS_ERR(p)) ++ goto out_err; ++ ctx->initiate = ctx->flags & KRB5_CTX_FLAG_INITIATOR; ++ ++ p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); ++ if (IS_ERR(p)) ++ goto out_err; ++ p = simple_get_bytes(p, end, &ctx->seq_send64, sizeof(ctx->seq_send64)); ++ if (IS_ERR(p)) ++ goto out_err; ++ /* set seq_send for use by "older" enctypes */ ++ ctx->seq_send = ctx->seq_send64; ++ if (ctx->seq_send64 != ctx->seq_send) { ++ dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__, ++ (long unsigned)ctx->seq_send64, ctx->seq_send); ++ return -EINVAL; /* p is a valid pointer here; out_err's PTR_ERR(p) would not be an errno */ ++ } ++ p = simple_get_bytes(p, end, &ctx->enctype,
sizeof(ctx->enctype)); ++ if (IS_ERR(p)) ++ goto out_err; ++ /* Map ENCTYPE_DES3_CBC_SHA1 to ENCTYPE_DES3_CBC_RAW */ ++ if (ctx->enctype == ENCTYPE_DES3_CBC_SHA1) ++ ctx->enctype = ENCTYPE_DES3_CBC_RAW; ++ ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); ++ if (ctx->gk5e == NULL) { ++ dprintk("gss_kerberos_mech: unsupported krb5 enctype %u\n", ++ ctx->enctype); ++ p = ERR_PTR(-EINVAL); ++ goto out_err; ++ } ++ keylen = ctx->gk5e->keylength; ++ ++ p = simple_get_bytes(p, end, ctx->Ksess, keylen); ++ if (IS_ERR(p)) ++ goto out_err; ++ ++ if (p != end) { ++ p = ERR_PTR(-EINVAL); ++ goto out_err; ++ } ++ ++ ctx->mech_used.data = kmemdup(gss_kerberos_mech.gm_oid.data, ++ gss_kerberos_mech.gm_oid.len, gfp_mask); ++ if (unlikely(ctx->mech_used.data == NULL)) { ++ p = ERR_PTR(-ENOMEM); ++ goto out_err; ++ } ++ ctx->mech_used.len = gss_kerberos_mech.gm_oid.len; ++ ++ switch (ctx->enctype) { ++ case ENCTYPE_DES3_CBC_RAW: ++ return context_derive_keys_des3(ctx, gfp_mask); ++ case ENCTYPE_ARCFOUR_HMAC: ++ return context_derive_keys_rc4(ctx); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return context_derive_keys_new(ctx, gfp_mask); ++ default: ++ return -EINVAL; ++ } ++ ++out_err: ++ return PTR_ERR(p); ++} ++ ++static int ++gss_import_sec_context_kerberos(const void *p, size_t len, ++ struct gss_ctx *ctx_id, ++ gfp_t gfp_mask) ++{ ++ const void *end = (const void *)((const char *)p + len); ++ struct krb5_ctx *ctx; ++ int ret; ++ ++ ctx = kzalloc(sizeof(*ctx), gfp_mask); ++ if (ctx == NULL) ++ return -ENOMEM; ++ ++ if (len == 85) ++ ret = gss_import_v1_context(p, end, ctx); ++ else ++ ret = gss_import_v2_context(p, end, ctx, gfp_mask); ++ ++ if (ret == 0) ++ ctx_id->internal_ctx_id = ctx; ++ else ++ kfree(ctx); ++ ++ dprintk("RPC: %s: returning %d\n", __func__, ret); ++ return ret; ++} ++ + static void + gss_delete_sec_context_kerberos(void *internal_ctx) { + struct krb5_ctx *kctx = internal_ctx; + + crypto_free_blkcipher(kctx->seq); + 
crypto_free_blkcipher(kctx->enc); ++ crypto_free_blkcipher(kctx->acceptor_enc); ++ crypto_free_blkcipher(kctx->initiator_enc); ++ crypto_free_blkcipher(kctx->acceptor_enc_aux); ++ crypto_free_blkcipher(kctx->initiator_enc_aux); + kfree(kctx->mech_used.data); + kfree(kctx); + } +@@ -241,6 +744,7 @@ static struct gss_api_mech gss_kerberos_ + .gm_ops = &gss_kerberos_ops, + .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), + .gm_pfs = gss_kerberos_pfs, ++ .gm_upcall_enctypes = "enctypes=18,17,16,23,3,1,2 ", + }; + + static int __init init_kerberos_module(void) +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c 2010-08-23 11:01:00.392564136 -0400 +@@ -3,7 +3,7 @@ + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5seal.c + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson +@@ -70,53 +70,154 @@ + + DEFINE_SPINLOCK(krb5_seq_lock); + +-u32 +-gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, ++static char * ++setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) ++{ ++ __be16 *ptr, *krb5_hdr; ++ int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; ++ ++ token->len = g_token_size(&ctx->mech_used, body_size); ++ ++ ptr = (__be16 *)token->data; ++ g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr); ++ ++ /* ptr now at start of header described in rfc 1964, section 1.2.1: */ ++ krb5_hdr = ptr; ++ *ptr++ = KG_TOK_MIC_MSG; ++ *ptr++ = cpu_to_le16(ctx->gk5e->signalg); ++ *ptr++ = SEAL_ALG_NONE; ++ *ptr++ = 0xffff; ++ ++ return (char *)krb5_hdr; ++} ++ ++static void * ++setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) ++{ ++ __be16 *ptr, *krb5_hdr; ++ u8 *p, flags = 0x00; ++ ++ if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0) ++ flags |= 0x01; ++ if (ctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) ++ flags |= 0x04; ++ ++ /* Per rfc 4121, sec 4.2.6.1, there is no header, ++ * just start the token */ ++ krb5_hdr = ptr = (__be16 *)token->data; ++ ++ *ptr++ = KG2_TOK_MIC; ++ p = (u8 *)ptr; ++ *p++ = flags; ++ *p++ = 0xff; ++ ptr = (__be16 *)p; ++ *ptr++ = 0xffff; ++ *ptr++ = 0xffff; ++ ++ token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; ++ return krb5_hdr; ++} ++ ++static u32 ++gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, + struct xdr_netobj *token) + { +- struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; +- unsigned char *ptr, *msg_start; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; ++ void *ptr; + s32 now; + u32 seq_send; ++ u8 *cksumkey; + +- dprintk("RPC: gss_krb5_seal\n"); ++ dprintk("RPC: %s\n", __func__); + BUG_ON(ctx == NULL); + + now = get_seconds(); + +- token->len = 
g_token_size(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8); ++ ptr = setup_token(ctx, token); + +- ptr = token->data; +- g_make_token_header(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8, &ptr); ++ if (ctx->gk5e->keyed_cksum) ++ cksumkey = ctx->cksum; ++ else ++ cksumkey = NULL; + +- /* ptr now at header described in rfc 1964, section 1.2.1: */ +- ptr[0] = (unsigned char) ((KG_TOK_MIC_MSG >> 8) & 0xff); +- ptr[1] = (unsigned char) (KG_TOK_MIC_MSG & 0xff); ++ if (make_checksum(ctx, ptr, 8, text, 0, cksumkey, ++ KG_USAGE_SIGN, &md5cksum)) ++ return GSS_S_FAILURE; + +- msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8; ++ memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); + +- *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); +- memset(ptr + 4, 0xff, 4); ++ spin_lock(&krb5_seq_lock); ++ seq_send = ctx->seq_send++; ++ spin_unlock(&krb5_seq_lock); + +- if (make_checksum("md5", ptr, 8, text, 0, &md5cksum)) ++ if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, ++ seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) + return GSS_S_FAILURE; + +- if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, +- md5cksum.data, md5cksum.len)) +- return GSS_S_FAILURE; ++ return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; ++} ++ ++u32 ++gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, ++ struct xdr_netobj *token) ++{ ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj cksumobj = { .len = sizeof(cksumdata), ++ .data = cksumdata}; ++ void *krb5_hdr; ++ s32 now; ++ u64 seq_send; ++ u8 *cksumkey; ++ unsigned int cksum_usage; ++ ++ dprintk("RPC: %s\n", __func__); + +- memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); ++ krb5_hdr = setup_token_v2(ctx, token); + ++ /* Set up the sequence number. 
Now 64-bits in clear ++ * text and w/o direction indicator */ + spin_lock(&krb5_seq_lock); +- seq_send = ctx->seq_send++; ++ seq_send = ctx->seq_send64++; + spin_unlock(&krb5_seq_lock); ++ *((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send); + +- if (krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, +- seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, +- ptr + 8)) ++ if (ctx->initiate) { ++ cksumkey = ctx->initiator_sign; ++ cksum_usage = KG_USAGE_INITIATOR_SIGN; ++ } else { ++ cksumkey = ctx->acceptor_sign; ++ cksum_usage = KG_USAGE_ACCEPTOR_SIGN; ++ } ++ ++ if (make_checksum_v2(ctx, krb5_hdr, GSS_KRB5_TOK_HDR_LEN, ++ text, 0, cksumkey, cksum_usage, &cksumobj)) + return GSS_S_FAILURE; + ++ memcpy(krb5_hdr + GSS_KRB5_TOK_HDR_LEN, cksumobj.data, cksumobj.len); ++ ++ now = get_seconds(); ++ + return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; + } ++ ++u32 ++gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, ++ struct xdr_netobj *token) ++{ ++ struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; ++ ++ switch (ctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_get_mic_v1(ctx, text, token); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_get_mic_v2(ctx, text, token); ++ } ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c 2010-08-23 11:01:00.393496180 -0400 +@@ -39,14 +39,51 @@ + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + ++static s32 ++krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, ++ unsigned char *cksum, unsigned char *buf) ++{ ++ struct crypto_blkcipher *cipher; ++ unsigned char plain[8]; ++ s32 code; ++ ++ dprintk("RPC: %s:\n", 
__func__); ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return PTR_ERR(cipher); ++ ++ plain[0] = (unsigned char) ((seqnum >> 24) & 0xff); ++ plain[1] = (unsigned char) ((seqnum >> 16) & 0xff); ++ plain[2] = (unsigned char) ((seqnum >> 8) & 0xff); ++ plain[3] = (unsigned char) ((seqnum >> 0) & 0xff); ++ plain[4] = direction; ++ plain[5] = direction; ++ plain[6] = direction; ++ plain[7] = direction; ++ ++ code = krb5_rc4_setup_seq_key(kctx, cipher, cksum); ++ if (code) ++ goto out; ++ ++ code = krb5_encrypt(cipher, cksum, plain, buf, 8); ++out: ++ crypto_free_blkcipher(cipher); ++ return code; ++} + s32 +-krb5_make_seq_num(struct crypto_blkcipher *key, ++krb5_make_seq_num(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *key, + int direction, + u32 seqnum, + unsigned char *cksum, unsigned char *buf) + { + unsigned char plain[8]; + ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) ++ return krb5_make_rc4_seq_num(kctx, direction, seqnum, ++ cksum, buf); ++ + plain[0] = (unsigned char) (seqnum & 0xff); + plain[1] = (unsigned char) ((seqnum >> 8) & 0xff); + plain[2] = (unsigned char) ((seqnum >> 16) & 0xff); +@@ -60,17 +97,59 @@ krb5_make_seq_num(struct crypto_blkciphe + return krb5_encrypt(key, cksum, plain, buf, 8); + } + ++static s32 ++krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, ++ unsigned char *buf, int *direction, s32 *seqnum) ++{ ++ struct crypto_blkcipher *cipher; ++ unsigned char plain[8]; ++ s32 code; ++ ++ dprintk("RPC: %s:\n", __func__); ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return PTR_ERR(cipher); ++ ++ code = krb5_rc4_setup_seq_key(kctx, cipher, cksum); ++ if (code) ++ goto out; ++ ++ code = krb5_decrypt(cipher, cksum, buf, plain, 8); ++ if (code) ++ goto out; ++ ++ if ((plain[4] != plain[5]) || (plain[4] != plain[6]) ++ || (plain[4] != plain[7])) { ++ code = (s32)KG_BAD_SEQ; ++ goto out; ++ } ++ ++ 
*direction = plain[4]; ++ ++ *seqnum = ((plain[0] << 24) | (plain[1] << 16) | ++ (plain[2] << 8) | (plain[3])); ++out: ++ crypto_free_blkcipher(cipher); ++ return code; ++} ++ + s32 +-krb5_get_seq_num(struct crypto_blkcipher *key, ++krb5_get_seq_num(struct krb5_ctx *kctx, + unsigned char *cksum, + unsigned char *buf, + int *direction, u32 *seqnum) + { + s32 code; + unsigned char plain[8]; ++ struct crypto_blkcipher *key = kctx->seq; + + dprintk("RPC: krb5_get_seq_num:\n"); + ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) ++ return krb5_get_rc4_seq_num(kctx, cksum, buf, ++ direction, seqnum); ++ + if ((code = krb5_decrypt(key, cksum, buf, plain, 8))) + return code; + +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c 2010-08-23 11:01:00.393496180 -0400 +@@ -3,7 +3,7 @@ + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5unseal.c + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson +@@ -70,20 +70,21 @@ + /* read_token is a mic token, and message_buffer is the data that the mic was + * supposedly taken over. 
*/ + +-u32 +-gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, ++static u32 ++gss_verify_mic_v1(struct krb5_ctx *ctx, + struct xdr_buf *message_buffer, struct xdr_netobj *read_token) + { +- struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; + int signalg; + int sealalg; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; + s32 now; + int direction; + u32 seqnum; + unsigned char *ptr = (unsigned char *)read_token->data; + int bodysize; ++ u8 *cksumkey; + + dprintk("RPC: krb5_read_token\n"); + +@@ -98,7 +99,7 @@ gss_verify_mic_kerberos(struct gss_ctx * + /* XXX sanity-check bodysize?? */ + + signalg = ptr[2] + (ptr[3] << 8); +- if (signalg != SGN_ALG_DES_MAC_MD5) ++ if (signalg != ctx->gk5e->signalg) + return GSS_S_DEFECTIVE_TOKEN; + + sealalg = ptr[4] + (ptr[5] << 8); +@@ -108,13 +109,17 @@ gss_verify_mic_kerberos(struct gss_ctx * + if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) + return GSS_S_DEFECTIVE_TOKEN; + +- if (make_checksum("md5", ptr, 8, message_buffer, 0, &md5cksum)) +- return GSS_S_FAILURE; ++ if (ctx->gk5e->keyed_cksum) ++ cksumkey = ctx->cksum; ++ else ++ cksumkey = NULL; + +- if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, 16)) ++ if (make_checksum(ctx, ptr, 8, message_buffer, 0, ++ cksumkey, KG_USAGE_SIGN, &md5cksum)) + return GSS_S_FAILURE; + +- if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) ++ if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, ++ ctx->gk5e->cksumlength)) + return GSS_S_BAD_SIG; + + /* it got through unscathed. 
Make sure the context is unexpired */ +@@ -126,7 +131,8 @@ gss_verify_mic_kerberos(struct gss_ctx * + + /* do sequencing checks */ + +- if (krb5_get_seq_num(ctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, &direction, &seqnum)) ++ if (krb5_get_seq_num(ctx, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, ++ &direction, &seqnum)) + return GSS_S_FAILURE; + + if ((ctx->initiate && direction != 0xff) || +@@ -135,3 +141,86 @@ gss_verify_mic_kerberos(struct gss_ctx * + + return GSS_S_COMPLETE; + } ++ ++static u32 ++gss_verify_mic_v2(struct krb5_ctx *ctx, ++ struct xdr_buf *message_buffer, struct xdr_netobj *read_token) ++{ ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj cksumobj = {.len = sizeof(cksumdata), ++ .data = cksumdata}; ++ s32 now; ++ u64 seqnum; ++ u8 *ptr = read_token->data; ++ u8 *cksumkey; ++ u8 flags; ++ int i; ++ unsigned int cksum_usage; ++ ++ dprintk("RPC: %s\n", __func__); ++ ++ if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_MIC) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ flags = ptr[2]; ++ if ((!ctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) || ++ (ctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR))) ++ return GSS_S_BAD_SIG; ++ ++ if (flags & KG2_TOKEN_FLAG_SEALED) { ++ dprintk("%s: token has unexpected sealed flag\n", __func__); ++ return GSS_S_FAILURE; ++ } ++ ++ for (i = 3; i < 8; i++) ++ if (ptr[i] != 0xff) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ if (ctx->initiate) { ++ cksumkey = ctx->acceptor_sign; ++ cksum_usage = KG_USAGE_ACCEPTOR_SIGN; ++ } else { ++ cksumkey = ctx->initiator_sign; ++ cksum_usage = KG_USAGE_INITIATOR_SIGN; ++ } ++ ++ if (make_checksum_v2(ctx, ptr, GSS_KRB5_TOK_HDR_LEN, message_buffer, 0, ++ cksumkey, cksum_usage, &cksumobj)) ++ return GSS_S_FAILURE; ++ ++ if (memcmp(cksumobj.data, ptr + GSS_KRB5_TOK_HDR_LEN, ++ ctx->gk5e->cksumlength)) ++ return GSS_S_BAD_SIG; ++ ++ /* it got through unscathed. 
Make sure the context is unexpired */ ++ now = get_seconds(); ++ if (now > ctx->endtime) ++ return GSS_S_CONTEXT_EXPIRED; ++ ++ /* do sequencing checks */ ++ ++ seqnum = be64_to_cpup((__be64 *)ptr + 8); ++ ++ return GSS_S_COMPLETE; ++} ++ ++u32 ++gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, ++ struct xdr_buf *message_buffer, ++ struct xdr_netobj *read_token) ++{ ++ struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; ++ ++ switch (ctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_verify_mic_v1(ctx, message_buffer, read_token); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_verify_mic_v2(ctx, message_buffer, read_token); ++ } ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c 2010-08-23 11:01:00.394576083 -0400 +@@ -1,3 +1,33 @@ ++/* ++ * COPYRIGHT (c) 2008 ++ * The Regents of the University of Michigan ++ * ALL RIGHTS RESERVED ++ * ++ * Permission is granted to use, copy, create derivative works ++ * and redistribute this software and such derivative works ++ * for any purpose, so long as the name of The University of ++ * Michigan is not used in any advertising or publicity ++ * pertaining to the use of distribution of this software ++ * without specific, written prior authorization. If the ++ * above copyright notice or any other identification of the ++ * University of Michigan is included in any copy of any ++ * portion of this software, then the disclaimer below must ++ * also be included. 
++ * ++ * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION ++ * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY ++ * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF ++ * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ++ * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ++ * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE ++ * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR ++ * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING ++ * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN ++ * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGES. ++ */ ++ + #include + #include + #include +@@ -12,10 +42,7 @@ + static inline int + gss_krb5_padding(int blocksize, int length) + { +- /* Most of the code is block-size independent but currently we +- * use only 8: */ +- BUG_ON(blocksize != 8); +- return 8 - (length & 7); ++ return blocksize - (length % blocksize); + } + + static inline void +@@ -86,8 +113,8 @@ out: + return 0; + } + +-static void +-make_confounder(char *p, u32 conflen) ++void ++gss_krb5_make_confounder(char *p, u32 conflen) + { + static u64 i = 0; + u64 *q = (u64 *)p; +@@ -127,69 +154,73 @@ make_confounder(char *p, u32 conflen) + + /* XXX factor out common code with seal/unseal. 
*/ + +-u32 +-gss_wrap_kerberos(struct gss_ctx *ctx, int offset, ++static u32 ++gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, + struct xdr_buf *buf, struct page **pages) + { +- struct krb5_ctx *kctx = ctx->internal_ctx_id; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; + int blocksize = 0, plainlen; + unsigned char *ptr, *msg_start; + s32 now; + int headlen; + struct page **tmp_pages; + u32 seq_send; ++ u8 *cksumkey; ++ u32 conflen = kctx->gk5e->conflen; + +- dprintk("RPC: gss_wrap_kerberos\n"); ++ dprintk("RPC: %s\n", __func__); + + now = get_seconds(); + + blocksize = crypto_blkcipher_blocksize(kctx->enc); + gss_krb5_add_padding(buf, offset, blocksize); + BUG_ON((buf->len - offset) % blocksize); +- plainlen = blocksize + buf->len - offset; ++ plainlen = conflen + buf->len - offset; + +- headlen = g_token_size(&kctx->mech_used, 24 + plainlen) - +- (buf->len - offset); ++ headlen = g_token_size(&kctx->mech_used, ++ GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength + plainlen) - ++ (buf->len - offset); + + ptr = buf->head[0].iov_base + offset; + /* shift data to make room for header. */ ++ xdr_extend_head(buf, offset, headlen); ++ + /* XXX Would be cleverer to encrypt while copying. */ +- /* XXX bounds checking, slack, etc. 
*/ +- memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset); +- buf->head[0].iov_len += headlen; +- buf->len += headlen; + BUG_ON((buf->len - offset - headlen) % blocksize); + + g_make_token_header(&kctx->mech_used, +- GSS_KRB5_TOK_HDR_LEN + 8 + plainlen, &ptr); ++ GSS_KRB5_TOK_HDR_LEN + ++ kctx->gk5e->cksumlength + plainlen, &ptr); + + + /* ptr now at header described in rfc 1964, section 1.2.1: */ + ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff); + ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff); + +- msg_start = ptr + 24; ++ msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength; + +- *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); ++ *(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); + memset(ptr + 4, 0xff, 4); +- *(__be16 *)(ptr + 4) = htons(SEAL_ALG_DES); ++ *(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); + +- make_confounder(msg_start, blocksize); ++ gss_krb5_make_confounder(msg_start, conflen); ++ ++ if (kctx->gk5e->keyed_cksum) ++ cksumkey = kctx->cksum; ++ else ++ cksumkey = NULL; + + /* XXXJBF: UGH!: */ + tmp_pages = buf->pages; + buf->pages = pages; +- if (make_checksum("md5", ptr, 8, buf, +- offset + headlen - blocksize, &md5cksum)) ++ if (make_checksum(kctx, ptr, 8, buf, offset + headlen - conflen, ++ cksumkey, KG_USAGE_SEAL, &md5cksum)) + return GSS_S_FAILURE; + buf->pages = tmp_pages; + +- if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, +- md5cksum.data, md5cksum.len)) +- return GSS_S_FAILURE; +- memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); ++ memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); + + spin_lock(&krb5_seq_lock); + seq_send = kctx->seq_send++; +@@ -197,25 +228,42 @@ gss_wrap_kerberos(struct gss_ctx *ctx, i + + /* XXX would probably be more efficient to compute checksum + * and encrypt at the same time: */ +- if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, ++ if ((krb5_make_seq_num(kctx, kctx->seq, kctx->initiate ? 
0 : 0xff, + seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))) + return GSS_S_FAILURE; + +- if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, +- pages)) +- return GSS_S_FAILURE; ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { ++ struct crypto_blkcipher *cipher; ++ int err; ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return GSS_S_FAILURE; ++ ++ krb5_rc4_setup_enc_key(kctx, cipher, seq_send); ++ ++ err = gss_encrypt_xdr_buf(cipher, buf, ++ offset + headlen - conflen, pages); ++ crypto_free_blkcipher(cipher); ++ if (err) ++ return GSS_S_FAILURE; ++ } else { ++ if (gss_encrypt_xdr_buf(kctx->enc, buf, ++ offset + headlen - conflen, pages)) ++ return GSS_S_FAILURE; ++ } + + return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; + } + +-u32 +-gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) ++static u32 ++gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) + { +- struct krb5_ctx *kctx = ctx->internal_ctx_id; + int signalg; + int sealalg; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; + s32 now; + int direction; + s32 seqnum; +@@ -224,6 +272,9 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + void *data_start, *orig_start; + int data_len; + int blocksize; ++ u32 conflen = kctx->gk5e->conflen; ++ int crypt_offset; ++ u8 *cksumkey; + + dprintk("RPC: gss_unwrap_kerberos\n"); + +@@ -241,29 +292,65 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + /* get the sign and seal algorithms */ + + signalg = ptr[2] + (ptr[3] << 8); +- if (signalg != SGN_ALG_DES_MAC_MD5) ++ if (signalg != kctx->gk5e->signalg) + return GSS_S_DEFECTIVE_TOKEN; + + sealalg = ptr[4] + (ptr[5] << 8); +- if (sealalg != SEAL_ALG_DES) ++ if (sealalg != kctx->gk5e->sealalg) + return GSS_S_DEFECTIVE_TOKEN; + + 
if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) + return GSS_S_DEFECTIVE_TOKEN; + +- if (gss_decrypt_xdr_buf(kctx->enc, buf, +- ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base)) +- return GSS_S_DEFECTIVE_TOKEN; ++ /* ++ * Data starts after token header and checksum. ptr points ++ * to the beginning of the token header ++ */ ++ crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) - ++ (unsigned char *)buf->head[0].iov_base; ++ ++ /* ++ * Need plaintext seqnum to derive encryption key for arcfour-hmac ++ */ ++ if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN, ++ ptr + 8, &direction, &seqnum)) ++ return GSS_S_BAD_SIG; + +- if (make_checksum("md5", ptr, 8, buf, +- ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base, &md5cksum)) +- return GSS_S_FAILURE; ++ if ((kctx->initiate && direction != 0xff) || ++ (!kctx->initiate && direction != 0)) ++ return GSS_S_BAD_SIG; ++ ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { ++ struct crypto_blkcipher *cipher; ++ int err; ++ ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return GSS_S_FAILURE; ++ ++ krb5_rc4_setup_enc_key(kctx, cipher, seqnum); ++ ++ err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset); ++ crypto_free_blkcipher(cipher); ++ if (err) ++ return GSS_S_DEFECTIVE_TOKEN; ++ } else { ++ if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset)) ++ return GSS_S_DEFECTIVE_TOKEN; ++ } + +- if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, +- md5cksum.data, md5cksum.len)) ++ if (kctx->gk5e->keyed_cksum) ++ cksumkey = kctx->cksum; ++ else ++ cksumkey = NULL; ++ ++ if (make_checksum(kctx, ptr, 8, buf, crypt_offset, ++ cksumkey, KG_USAGE_SEAL, &md5cksum)) + return GSS_S_FAILURE; + +- if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) ++ if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, ++ kctx->gk5e->cksumlength)) + return GSS_S_BAD_SIG; + + /* it got through unscathed. 
Make sure the context is unexpired */ +@@ -275,19 +362,12 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + + /* do sequencing checks */ + +- if (krb5_get_seq_num(kctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, +- &direction, &seqnum)) +- return GSS_S_BAD_SIG; +- +- if ((kctx->initiate && direction != 0xff) || +- (!kctx->initiate && direction != 0)) +- return GSS_S_BAD_SIG; +- + /* Copy the data back to the right position. XXX: Would probably be + * better to copy and encrypt at the same time. */ + + blocksize = crypto_blkcipher_blocksize(kctx->enc); +- data_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8 + blocksize; ++ data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) + ++ conflen; + orig_start = buf->head[0].iov_base + offset; + data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; + memmove(orig_start, data_start, data_len); +@@ -299,3 +379,209 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + + return GSS_S_COMPLETE; + } ++ ++/* ++ * We cannot currently handle tokens with rotated data. We need a ++ * generalized routine to rotate the data in place. It is anticipated ++ * that we won't encounter rotated data in the general case. 
++ */ ++static u32 ++rotate_left(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, u16 rrc) ++{ ++ unsigned int realrrc = rrc % (buf->len - offset - GSS_KRB5_TOK_HDR_LEN); ++ ++ if (realrrc == 0) ++ return 0; ++ ++ dprintk("%s: cannot process token with rotated data: " ++ "rrc %u, realrrc %u\n", __func__, rrc, realrrc); ++ return 1; ++} ++ ++static u32 ++gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, struct page **pages) ++{ ++ int blocksize; ++ u8 *ptr, *plainhdr; ++ s32 now; ++ u8 flags = 0x00; ++ __be16 *be16ptr, ec = 0; ++ __be64 *be64ptr; ++ u32 err; ++ ++ dprintk("RPC: %s\n", __func__); ++ ++ if (kctx->gk5e->encrypt_v2 == NULL) ++ return GSS_S_FAILURE; ++ ++ /* make room for gss token header */ ++ if (xdr_extend_head(buf, offset, GSS_KRB5_TOK_HDR_LEN)) ++ return GSS_S_FAILURE; ++ ++ /* construct gss token header */ ++ ptr = plainhdr = buf->head[0].iov_base + offset; ++ *ptr++ = (unsigned char) ((KG2_TOK_WRAP>>8) & 0xff); ++ *ptr++ = (unsigned char) (KG2_TOK_WRAP & 0xff); ++ ++ if ((kctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0) ++ flags |= KG2_TOKEN_FLAG_SENTBYACCEPTOR; ++ if ((kctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) != 0) ++ flags |= KG2_TOKEN_FLAG_ACCEPTORSUBKEY; ++ /* We always do confidentiality in wrap tokens */ ++ flags |= KG2_TOKEN_FLAG_SEALED; ++ ++ *ptr++ = flags; ++ *ptr++ = 0xff; ++ be16ptr = (__be16 *)ptr; ++ ++ blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc); ++ *be16ptr++ = cpu_to_be16(ec); ++ /* "inner" token header always uses 0 for RRC */ ++ *be16ptr++ = cpu_to_be16(0); ++ ++ be64ptr = (__be64 *)be16ptr; ++ spin_lock(&krb5_seq_lock); ++ *be64ptr = cpu_to_be64(kctx->seq_send64++); ++ spin_unlock(&krb5_seq_lock); ++ ++ err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages); ++ if (err) ++ return err; ++ ++ now = get_seconds(); ++ return (kctx->endtime < now) ? 
GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; ++} ++ ++static u32 ++gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) ++{ ++ s32 now; ++ u64 seqnum; ++ u8 *ptr; ++ u8 flags = 0x00; ++ u16 ec, rrc; ++ int err; ++ u32 headskip, tailskip; ++ u8 decrypted_hdr[GSS_KRB5_TOK_HDR_LEN]; ++ unsigned int movelen; ++ ++ ++ dprintk("RPC: %s\n", __func__); ++ ++ if (kctx->gk5e->decrypt_v2 == NULL) ++ return GSS_S_FAILURE; ++ ++ ptr = buf->head[0].iov_base + offset; ++ ++ if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_WRAP) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ flags = ptr[2]; ++ if ((!kctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) || ++ (kctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR))) ++ return GSS_S_BAD_SIG; ++ ++ if ((flags & KG2_TOKEN_FLAG_SEALED) == 0) { ++ dprintk("%s: token missing expected sealed flag\n", __func__); ++ return GSS_S_DEFECTIVE_TOKEN; ++ } ++ ++ if (ptr[3] != 0xff) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ ec = be16_to_cpup((__be16 *)(ptr + 4)); ++ rrc = be16_to_cpup((__be16 *)(ptr + 6)); ++ ++ seqnum = be64_to_cpup((__be64 *)(ptr + 8)); ++ ++ if (rrc != 0) { ++ err = rotate_left(kctx, offset, buf, rrc); ++ if (err) ++ return GSS_S_FAILURE; ++ } ++ ++ err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf, ++ &headskip, &tailskip); ++ if (err) ++ return GSS_S_FAILURE; ++ ++ /* ++ * Retrieve the decrypted gss token header and verify ++ * it against the original ++ */ ++ err = read_bytes_from_xdr_buf(buf, ++ buf->len - GSS_KRB5_TOK_HDR_LEN - tailskip, ++ decrypted_hdr, GSS_KRB5_TOK_HDR_LEN); ++ if (err) { ++ dprintk("%s: error %u getting decrypted_hdr\n", __func__, err); ++ return GSS_S_FAILURE; ++ } ++ if (memcmp(ptr, decrypted_hdr, 6) ++ || memcmp(ptr + 8, decrypted_hdr + 8, 8)) { ++ dprintk("%s: token hdr, plaintext hdr mismatch!\n", __func__); ++ return GSS_S_FAILURE; ++ } ++ ++ /* do sequencing checks */ ++ ++ /* it got through unscathed. 
Make sure the context is unexpired */ ++ now = get_seconds(); ++ if (now > kctx->endtime) ++ return GSS_S_CONTEXT_EXPIRED; ++ ++ /* ++ * Move the head data back to the right position in xdr_buf. ++ * We ignore any "ec" data since it might be in the head or ++ * the tail, and we really don't need to deal with it. ++ * Note that buf->head[0].iov_len may indicate the available ++ * head buffer space rather than that actually occupied. ++ */ ++ movelen = min_t(unsigned int, buf->head[0].iov_len, buf->len); ++ movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip; ++ BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen > ++ buf->head[0].iov_len); ++ memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen); ++ buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip; ++ buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip; ++ ++ return GSS_S_COMPLETE; ++} ++ ++u32 ++gss_wrap_kerberos(struct gss_ctx *gctx, int offset, ++ struct xdr_buf *buf, struct page **pages) ++{ ++ struct krb5_ctx *kctx = gctx->internal_ctx_id; ++ ++ switch (kctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_wrap_kerberos_v1(kctx, offset, buf, pages); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_wrap_kerberos_v2(kctx, offset, buf, pages); ++ } ++} ++ ++u32 ++gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf) ++{ ++ struct krb5_ctx *kctx = gctx->internal_ctx_id; ++ ++ switch (kctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_unwrap_kerberos_v1(kctx, offset, buf); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_unwrap_kerberos_v2(kctx, offset, buf); ++ } ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c +--- 
linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c 2010-08-23 11:01:00.395574706 -0400 +@@ -249,14 +249,15 @@ EXPORT_SYMBOL_GPL(gss_mech_put); + int + gss_import_sec_context(const void *input_token, size_t bufsize, + struct gss_api_mech *mech, +- struct gss_ctx **ctx_id) ++ struct gss_ctx **ctx_id, ++ gfp_t gfp_mask) + { +- if (!(*ctx_id = kzalloc(sizeof(**ctx_id), GFP_KERNEL))) ++ if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask))) + return -ENOMEM; + (*ctx_id)->mech_type = gss_mech_get(mech); + + return mech->gm_ops +- ->gss_import_sec_context(input_token, bufsize, *ctx_id); ++ ->gss_import_sec_context(input_token, bufsize, *ctx_id, gfp_mask); + } + + /* gss_get_mic: compute a mic over message and return mic_token. */ +@@ -285,6 +286,20 @@ gss_verify_mic(struct gss_ctx *context_ + mic_token); + } + ++/* ++ * This function is called from both the client and server code. ++ * Each makes guarantees about how much "slack" space is available ++ * for the underlying function in "buf"'s head and tail while ++ * performing the wrap. ++ * ++ * The client and server code allocate RPC_MAX_AUTH_SIZE extra ++ * space in both the head and tail which is available for use by ++ * the wrap function. ++ * ++ * Underlying functions should verify they do not use more than ++ * RPC_MAX_AUTH_SIZE of extra space in either the head or tail ++ * when performing the wrap. 
++ */ + u32 + gss_wrap(struct gss_ctx *ctx_id, + int offset, +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c 2010-08-23 11:01:00.396574085 -0400 +@@ -84,13 +84,14 @@ simple_get_netobj(const void *p, const v + + static int + gss_import_sec_context_spkm3(const void *p, size_t len, +- struct gss_ctx *ctx_id) ++ struct gss_ctx *ctx_id, ++ gfp_t gfp_mask) + { + const void *end = (const void *)((const char *)p + len); + struct spkm3_ctx *ctx; + int version; + +- if (!(ctx = kzalloc(sizeof(*ctx), GFP_NOFS))) ++ if (!(ctx = kzalloc(sizeof(*ctx), gfp_mask))) + goto out_err; + + p = simple_get_bytes(p, end, &version, sizeof(version)); +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile 2010-08-23 11:01:00.387574079 -0400 +@@ -10,7 +10,7 @@ auth_rpcgss-objs := auth_gss.o gss_gener + obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o + + rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ +- gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o ++ gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o + + obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o + +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c 2010-08-23 11:01:00.396574085 -0400 +@@ -494,7 +494,7 @@ static int rsc_parse(struct cache_detail + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; 
+- status = gss_import_sec_context(buf, len, gm, &rsci.mechctx); ++ status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, GFP_KERNEL); + if (status) + goto out; + +@@ -1315,6 +1315,14 @@ svcauth_gss_wrap_resp_priv(struct svc_rq + inpages = resbuf->pages; + /* XXX: Would be better to write some xdr helper functions for + * nfs{2,3,4}xdr.c that place the data right, instead of copying: */ ++ ++ /* ++ * If there is currently tail data, make sure there is ++ * room for the head, tail, and 2 * RPC_MAX_AUTH_SIZE in ++ * the page, and move the current tail data such that ++ * there is RPC_MAX_AUTH_SIZE slack space available in ++ * both the head and tail. ++ */ + if (resbuf->tail[0].iov_base) { + BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base + + PAGE_SIZE); +@@ -1327,6 +1335,13 @@ svcauth_gss_wrap_resp_priv(struct svc_rq + resbuf->tail[0].iov_len); + resbuf->tail[0].iov_base += RPC_MAX_AUTH_SIZE; + } ++ /* ++ * If there is no current tail data, make sure there is ++ * room for the head data, and 2 * RPC_MAX_AUTH_SIZE in the ++ * allotted page, and set up tail information such that there ++ * is RPC_MAX_AUTH_SIZE slack space available in both the ++ * head and tail. 
++ */ + if (resbuf->tail[0].iov_base == NULL) { + if (resbuf->head[0].iov_len + 2*RPC_MAX_AUTH_SIZE > PAGE_SIZE) + return -ENOMEM; +diff -up linux-2.6.34.noarch/net/sunrpc/clnt.c.orig linux-2.6.34.noarch/net/sunrpc/clnt.c +--- linux-2.6.34.noarch/net/sunrpc/clnt.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/clnt.c 2010-08-23 11:01:00.397622347 -0400 +@@ -556,26 +556,16 @@ static const struct rpc_call_ops rpc_def + */ + struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) + { +- struct rpc_task *task, *ret; ++ struct rpc_task *task; + + task = rpc_new_task(task_setup_data); +- if (task == NULL) { +- rpc_release_calldata(task_setup_data->callback_ops, +- task_setup_data->callback_data); +- ret = ERR_PTR(-ENOMEM); ++ if (IS_ERR(task)) + goto out; +- } + +- if (task->tk_status != 0) { +- ret = ERR_PTR(task->tk_status); +- rpc_put_task(task); +- goto out; +- } + atomic_inc(&task->tk_count); + rpc_execute(task); +- ret = task; + out: +- return ret; ++ return task; + } + EXPORT_SYMBOL_GPL(rpc_run_task); + +@@ -657,9 +647,8 @@ struct rpc_task *rpc_run_bc_task(struct + * Create an rpc_task to send the data + */ + task = rpc_new_task(&task_setup_data); +- if (!task) { ++ if (IS_ERR(task)) { + xprt_free_bc_request(req); +- task = ERR_PTR(-ENOMEM); + goto out; + } + task->tk_rqstp = req; +diff -up linux-2.6.34.noarch/net/sunrpc/sched.c.orig linux-2.6.34.noarch/net/sunrpc/sched.c +--- linux-2.6.34.noarch/net/sunrpc/sched.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/sched.c 2010-08-23 11:01:00.398564598 -0400 +@@ -25,7 +25,6 @@ + + #ifdef RPC_DEBUG + #define RPCDBG_FACILITY RPCDBG_SCHED +-#define RPC_TASK_MAGIC_ID 0xf00baa + #endif + + /* +@@ -237,7 +236,6 @@ static void rpc_task_set_debuginfo(struc + { + static atomic_t rpc_pid; + +- task->tk_magic = RPC_TASK_MAGIC_ID; + task->tk_pid = atomic_inc_return(&rpc_pid); + } + #else +@@ -360,9 +358,6 @@ static void __rpc_do_wake_up_task(struct + 
dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", + task->tk_pid, jiffies); + +-#ifdef RPC_DEBUG +- BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); +-#endif + /* Has the task been executed yet? If not, we cannot wake it up! */ + if (!RPC_IS_ACTIVATED(task)) { + printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); +@@ -834,7 +829,7 @@ static void rpc_init_task(struct rpc_tas + } + + /* starting timestamp */ +- task->tk_start = jiffies; ++ task->tk_start = ktime_get(); + + dprintk("RPC: new task initialized, procpid %u\n", + task_pid_nr(current)); +@@ -856,16 +851,23 @@ struct rpc_task *rpc_new_task(const stru + + if (task == NULL) { + task = rpc_alloc_task(); +- if (task == NULL) +- goto out; ++ if (task == NULL) { ++ rpc_release_calldata(setup_data->callback_ops, ++ setup_data->callback_data); ++ return ERR_PTR(-ENOMEM); ++ } + flags = RPC_TASK_DYNAMIC; + } + + rpc_init_task(task, setup_data); ++ if (task->tk_status < 0) { ++ int err = task->tk_status; ++ rpc_put_task(task); ++ return ERR_PTR(err); ++ } + + task->tk_flags |= flags; + dprintk("RPC: allocated task %p\n", task); +-out: + return task; + } + +@@ -909,9 +911,6 @@ EXPORT_SYMBOL_GPL(rpc_put_task); + + static void rpc_release_task(struct rpc_task *task) + { +-#ifdef RPC_DEBUG +- BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); +-#endif + dprintk("RPC: %5u release task\n", task->tk_pid); + + if (!list_empty(&task->tk_task)) { +@@ -923,9 +922,6 @@ static void rpc_release_task(struct rpc_ + } + BUG_ON (RPC_IS_QUEUED(task)); + +-#ifdef RPC_DEBUG +- task->tk_magic = 0; +-#endif + /* Wake up anyone who is waiting for task completion */ + rpc_mark_complete_task(task); + +diff -up linux-2.6.34.noarch/net/sunrpc/stats.c.orig linux-2.6.34.noarch/net/sunrpc/stats.c +--- linux-2.6.34.noarch/net/sunrpc/stats.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/stats.c 2010-08-23 11:01:00.399574225 -0400 +@@ -144,7 +144,7 @@ void rpc_count_iostats(struct rpc_task * + struct rpc_rqst 
*req = task->tk_rqstp; + struct rpc_iostats *stats; + struct rpc_iostats *op_metrics; +- long rtt, execute, queue; ++ ktime_t delta; + + if (!task->tk_client || !task->tk_client->cl_metrics || !req) + return; +@@ -156,23 +156,16 @@ void rpc_count_iostats(struct rpc_task * + op_metrics->om_ntrans += req->rq_ntrans; + op_metrics->om_timeouts += task->tk_timeouts; + +- op_metrics->om_bytes_sent += task->tk_bytes_sent; ++ op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; + op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; + +- queue = (long)req->rq_xtime - task->tk_start; +- if (queue < 0) +- queue = -queue; +- op_metrics->om_queue += queue; +- +- rtt = task->tk_rtt; +- if (rtt < 0) +- rtt = -rtt; +- op_metrics->om_rtt += rtt; +- +- execute = (long)jiffies - task->tk_start; +- if (execute < 0) +- execute = -execute; +- op_metrics->om_execute += execute; ++ delta = ktime_sub(req->rq_xtime, task->tk_start); ++ op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta); ++ ++ op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); ++ ++ delta = ktime_sub(ktime_get(), task->tk_start); ++ op_metrics->om_execute = ktime_add(op_metrics->om_execute, delta); + } + + static void _print_name(struct seq_file *seq, unsigned int op, +@@ -186,8 +179,6 @@ static void _print_name(struct seq_file + seq_printf(seq, "\t%12u: ", op); + } + +-#define MILLISECS_PER_JIFFY (1000 / HZ) +- + void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) + { + struct rpc_iostats *stats = clnt->cl_metrics; +@@ -214,9 +205,9 @@ void rpc_print_iostats(struct seq_file * + metrics->om_timeouts, + metrics->om_bytes_sent, + metrics->om_bytes_recv, +- metrics->om_queue * MILLISECS_PER_JIFFY, +- metrics->om_rtt * MILLISECS_PER_JIFFY, +- metrics->om_execute * MILLISECS_PER_JIFFY); ++ ktime_to_ms(metrics->om_queue), ++ ktime_to_ms(metrics->om_rtt), ++ ktime_to_ms(metrics->om_execute)); + } + } + EXPORT_SYMBOL_GPL(rpc_print_iostats); +diff -up 
linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-23 11:01:00.400574086 -0400 +@@ -762,6 +762,7 @@ int write_bytes_to_xdr_buf(struct xdr_bu + __write_bytes_to_xdr_buf(&subbuf, obj, len); + return 0; + } ++EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); + + int + xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +diff -up linux-2.6.34.noarch/net/sunrpc/xprt.c.orig linux-2.6.34.noarch/net/sunrpc/xprt.c +--- linux-2.6.34.noarch/net/sunrpc/xprt.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprt.c 2010-08-23 11:01:00.401372963 -0400 +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -62,7 +63,6 @@ + * Local functions + */ + static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); +-static inline void do_xprt_reserve(struct rpc_task *); + static void xprt_connect_status(struct rpc_task *task); + static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); + +@@ -711,12 +711,16 @@ void xprt_connect(struct rpc_task *task) + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; + +- task->tk_timeout = xprt->connect_timeout; ++ task->tk_timeout = task->tk_rqstp->rq_timeout; + rpc_sleep_on(&xprt->pending, task, xprt_connect_status); ++ ++ if (test_bit(XPRT_CLOSING, &xprt->state)) ++ return; ++ if (xprt_test_and_set_connecting(xprt)) ++ return; + xprt->stat.connect_start = jiffies; + xprt->ops->connect(task); + } +- return; + } + + static void xprt_connect_status(struct rpc_task *task) +@@ -771,25 +775,19 @@ struct rpc_rqst *xprt_lookup_rqst(struct + } + EXPORT_SYMBOL_GPL(xprt_lookup_rqst); + +-/** +- * xprt_update_rtt - update an RPC client's RTT state after receiving a reply +- * @task: RPC request that recently completed +- * +- */ +-void xprt_update_rtt(struct rpc_task *task) ++static void xprt_update_rtt(struct 
rpc_task *task) + { + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + unsigned timer = task->tk_msg.rpc_proc->p_timer; ++ long m = usecs_to_jiffies(ktime_to_us(req->rq_rtt)); + + if (timer) { + if (req->rq_ntrans == 1) +- rpc_update_rtt(rtt, timer, +- (long)jiffies - req->rq_xtime); ++ rpc_update_rtt(rtt, timer, m); + rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); + } + } +-EXPORT_SYMBOL_GPL(xprt_update_rtt); + + /** + * xprt_complete_rqst - called when reply processing is complete +@@ -807,7 +805,9 @@ void xprt_complete_rqst(struct rpc_task + task->tk_pid, ntohl(req->rq_xid), copied); + + xprt->stat.recvs++; +- task->tk_rtt = (long)jiffies - req->rq_xtime; ++ req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime); ++ if (xprt->ops->timer != NULL) ++ xprt_update_rtt(task); + + list_del_init(&req->rq_list); + req->rq_private_buf.len = copied; +@@ -906,7 +906,7 @@ void xprt_transmit(struct rpc_task *task + return; + + req->rq_connect_cookie = xprt->connect_cookie; +- req->rq_xtime = jiffies; ++ req->rq_xtime = ktime_get(); + status = xprt->ops->send_request(task); + if (status != 0) { + task->tk_status = status; +@@ -935,7 +935,7 @@ void xprt_transmit(struct rpc_task *task + spin_unlock_bh(&xprt->transport_lock); + } + +-static inline void do_xprt_reserve(struct rpc_task *task) ++static void xprt_alloc_slot(struct rpc_task *task) + { + struct rpc_xprt *xprt = task->tk_xprt; + +@@ -955,6 +955,16 @@ static inline void do_xprt_reserve(struc + rpc_sleep_on(&xprt->backlog, task, NULL); + } + ++static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) ++{ ++ memset(req, 0, sizeof(*req)); /* mark unused */ ++ ++ spin_lock(&xprt->reserve_lock); ++ list_add(&req->rq_list, &xprt->free); ++ rpc_wake_up_next(&xprt->backlog); ++ spin_unlock(&xprt->reserve_lock); ++} ++ + /** + * xprt_reserve - allocate an RPC request slot + * @task: RPC task requesting a slot allocation +@@ -968,7 +978,7 @@ void xprt_reserve(struct rpc_task 
*task) + + task->tk_status = -EIO; + spin_lock(&xprt->reserve_lock); +- do_xprt_reserve(task); ++ xprt_alloc_slot(task); + spin_unlock(&xprt->reserve_lock); + } + +@@ -1006,14 +1016,10 @@ void xprt_release(struct rpc_task *task) + { + struct rpc_xprt *xprt; + struct rpc_rqst *req; +- int is_bc_request; + + if (!(req = task->tk_rqstp)) + return; + +- /* Preallocated backchannel request? */ +- is_bc_request = bc_prealloc(req); +- + xprt = req->rq_xprt; + rpc_count_iostats(task); + spin_lock_bh(&xprt->transport_lock); +@@ -1027,21 +1033,16 @@ void xprt_release(struct rpc_task *task) + mod_timer(&xprt->timer, + xprt->last_used + xprt->idle_timeout); + spin_unlock_bh(&xprt->transport_lock); +- if (!bc_prealloc(req)) ++ if (req->rq_buffer) + xprt->ops->buf_free(req->rq_buffer); + task->tk_rqstp = NULL; + if (req->rq_release_snd_buf) + req->rq_release_snd_buf(req); + + dprintk("RPC: %5u release request %p\n", task->tk_pid, req); +- if (likely(!is_bc_request)) { +- memset(req, 0, sizeof(*req)); /* mark unused */ +- +- spin_lock(&xprt->reserve_lock); +- list_add(&req->rq_list, &xprt->free); +- rpc_wake_up_next(&xprt->backlog); +- spin_unlock(&xprt->reserve_lock); +- } else ++ if (likely(!bc_prealloc(req))) ++ xprt_free_slot(xprt, req); ++ else + xprt_free_bc_request(req); + } + +diff -up linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c.orig linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c +--- linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c 2010-08-23 11:01:00.402563985 -0400 +@@ -305,7 +305,6 @@ xprt_setup_rdma(struct xprt_create *args + /* 60 second timeout, no retries */ + xprt->timeout = &xprt_rdma_default_timeout; + xprt->bind_timeout = (60U * HZ); +- xprt->connect_timeout = (60U * HZ); + xprt->reestablish_timeout = (5U * HZ); + xprt->idle_timeout = (5U * 60 * HZ); + +@@ -449,21 +448,19 @@ xprt_rdma_connect(struct rpc_task *task) + struct rpc_xprt *xprt = (struct 
rpc_xprt *)task->tk_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + +- if (!xprt_test_and_set_connecting(xprt)) { +- if (r_xprt->rx_ep.rep_connected != 0) { +- /* Reconnect */ +- schedule_delayed_work(&r_xprt->rdma_connect, +- xprt->reestablish_timeout); +- xprt->reestablish_timeout <<= 1; +- if (xprt->reestablish_timeout > (30 * HZ)) +- xprt->reestablish_timeout = (30 * HZ); +- else if (xprt->reestablish_timeout < (5 * HZ)) +- xprt->reestablish_timeout = (5 * HZ); +- } else { +- schedule_delayed_work(&r_xprt->rdma_connect, 0); +- if (!RPC_IS_ASYNC(task)) +- flush_scheduled_work(); +- } ++ if (r_xprt->rx_ep.rep_connected != 0) { ++ /* Reconnect */ ++ schedule_delayed_work(&r_xprt->rdma_connect, ++ xprt->reestablish_timeout); ++ xprt->reestablish_timeout <<= 1; ++ if (xprt->reestablish_timeout > (30 * HZ)) ++ xprt->reestablish_timeout = (30 * HZ); ++ else if (xprt->reestablish_timeout < (5 * HZ)) ++ xprt->reestablish_timeout = (5 * HZ); ++ } else { ++ schedule_delayed_work(&r_xprt->rdma_connect, 0); ++ if (!RPC_IS_ASYNC(task)) ++ flush_scheduled_work(); + } + } + +@@ -677,7 +674,7 @@ xprt_rdma_send_request(struct rpc_task * + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + +- task->tk_bytes_sent += rqst->rq_snd_buf.len; ++ rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; + rqst->rq_bytes_sent = 0; + return 0; + +diff -up linux-2.6.34.noarch/net/sunrpc/xprtsock.c.orig linux-2.6.34.noarch/net/sunrpc/xprtsock.c +--- linux-2.6.34.noarch/net/sunrpc/xprtsock.c.orig 2010-08-23 11:00:23.890501549 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtsock.c 2010-08-23 11:01:00.403564023 -0400 +@@ -138,20 +138,6 @@ static ctl_table sunrpc_table[] = { + #endif + + /* +- * Time out for an RPC UDP socket connect. UDP socket connects are +- * synchronous, but we set a timeout anyway in case of resource +- * exhaustion on the local host. 
+- */ +-#define XS_UDP_CONN_TO (5U * HZ) +- +-/* +- * Wait duration for an RPC TCP connection to be established. Solaris +- * NFS over TCP uses 60 seconds, for example, which is in line with how +- * long a server takes to reboot. +- */ +-#define XS_TCP_CONN_TO (60U * HZ) +- +-/* + * Wait duration for a reply from the RPC portmapper. + */ + #define XS_BIND_TO (60U * HZ) +@@ -543,7 +529,7 @@ static int xs_udp_send_request(struct rp + xdr->len - req->rq_bytes_sent, status); + + if (status >= 0) { +- task->tk_bytes_sent += status; ++ req->rq_xmit_bytes_sent += status; + if (status >= req->rq_slen) + return 0; + /* Still some bytes left; set up for a retry later. */ +@@ -639,7 +625,7 @@ static int xs_tcp_send_request(struct rp + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. */ + req->rq_bytes_sent += status; +- task->tk_bytes_sent += status; ++ req->rq_xmit_bytes_sent += status; + if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_bytes_sent = 0; + return 0; +@@ -859,7 +845,6 @@ static void xs_udp_data_ready(struct soc + dst_confirm(skb_dst(skb)); + + xprt_adjust_cwnd(task, copied); +- xprt_update_rtt(task); + xprt_complete_rqst(task, copied); + + out_unlock: +@@ -2022,9 +2007,6 @@ static void xs_connect(struct rpc_task * + struct rpc_xprt *xprt = task->tk_xprt; + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + +- if (xprt_test_and_set_connecting(xprt)) +- return; +- + if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { + dprintk("RPC: xs_connect delayed xprt %p for %lu " + "seconds\n", +@@ -2044,16 +2026,6 @@ static void xs_connect(struct rpc_task * + } + } + +-static void xs_tcp_connect(struct rpc_task *task) +-{ +- struct rpc_xprt *xprt = task->tk_xprt; +- +- /* Exit if we need to wait for socket shutdown to complete */ +- if (test_bit(XPRT_CLOSING, &xprt->state)) +- return; +- xs_connect(task); +-} +- + /** + * xs_udp_print_stats - display UDP socket-specifc stats + * @xprt: rpc_xprt 
struct containing statistics +@@ -2252,7 +2224,7 @@ static struct rpc_xprt_ops xs_tcp_ops = + .release_xprt = xs_tcp_release_xprt, + .rpcbind = rpcb_getport_async, + .set_port = xs_set_port, +- .connect = xs_tcp_connect, ++ .connect = xs_connect, + .buf_alloc = rpc_malloc, + .buf_free = rpc_free, + .send_request = xs_tcp_send_request, +@@ -2343,7 +2315,6 @@ static struct rpc_xprt *xs_setup_udp(str + xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + + xprt->bind_timeout = XS_BIND_TO; +- xprt->connect_timeout = XS_UDP_CONN_TO; + xprt->reestablish_timeout = XS_UDP_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + +@@ -2418,7 +2389,6 @@ static struct rpc_xprt *xs_setup_tcp(str + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; + + xprt->bind_timeout = XS_BIND_TO; +- xprt->connect_timeout = XS_TCP_CONN_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + +@@ -2478,9 +2448,6 @@ static struct rpc_xprt *xs_setup_bc_tcp( + struct sock_xprt *transport; + struct svc_sock *bc_sock; + +- if (!args->bc_xprt) +- ERR_PTR(-EINVAL); +- + xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries); + if (IS_ERR(xprt)) + return xprt; +@@ -2494,7 +2461,6 @@ static struct rpc_xprt *xs_setup_bc_tcp( + /* backchannel */ + xprt_set_bound(xprt); + xprt->bind_timeout = 0; +- xprt->connect_timeout = 0; + xprt->reestablish_timeout = 0; + xprt->idle_timeout = 0; + diff --git a/nfsd-35-fc.patch b/nfsd-35-fc.patch new file mode 100644 index 000000000..ef99b4995 --- /dev/null +++ b/nfsd-35-fc.patch @@ -0,0 +1,1808 @@ +diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt +--- linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 09:57:18.233564439 -0400 +@@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | + | READ | REQ | | Section 18.22 | 
+ | READDIR | REQ | | Section 18.23 | + | READLINK | OPT | | Section 18.24 | +-NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | ++ | RECLAIM_COMPLETE | REQ | | Section 18.51 | + | RELEASE_LOCKOWNER | MNI | | N/A | + | REMOVE | REQ | | Section 18.25 | + | RENAME | REQ | | Section 18.26 | +diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 09:57:18.234564075 -0400 +@@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_ca + .alloc = expkey_alloc, + }; + +-static struct svc_expkey * +-svc_expkey_lookup(struct svc_expkey *item) ++static int ++svc_expkey_hash(struct svc_expkey *item) + { +- struct cache_head *ch; + int hash = item->ek_fsidtype; + char * cp = (char*)item->ek_fsid; + int len = key_len(item->ek_fsidtype); +@@ -270,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *ite + hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); + hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); + hash &= EXPKEY_HASHMASK; ++ return hash; ++} ++ ++static struct svc_expkey * ++svc_expkey_lookup(struct svc_expkey *item) ++{ ++ struct cache_head *ch; ++ int hash = svc_expkey_hash(item); + + ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h, + hash); +@@ -283,13 +290,7 @@ static struct svc_expkey * + svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) + { + struct cache_head *ch; +- int hash = new->ek_fsidtype; +- char * cp = (char*)new->ek_fsid; +- int len = key_len(new->ek_fsidtype); +- +- hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); +- hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS); +- hash &= EXPKEY_HASHMASK; ++ int hash = svc_expkey_hash(new); + + ch = sunrpc_cache_update(&svc_expkey_cache, &new->h, + &old->h, hash); +@@ -738,14 +739,22 @@ struct cache_detail svc_export_cache = { + .alloc = svc_export_alloc, + }; + +-static struct svc_export * +-svc_export_lookup(struct svc_export *exp) ++static 
int ++svc_export_hash(struct svc_export *exp) + { +- struct cache_head *ch; + int hash; ++ + hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); ++ return hash; ++} ++ ++static struct svc_export * ++svc_export_lookup(struct svc_export *exp) ++{ ++ struct cache_head *ch; ++ int hash = svc_export_hash(exp); + + ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h, + hash); +@@ -759,10 +768,7 @@ static struct svc_export * + svc_export_update(struct svc_export *new, struct svc_export *old) + { + struct cache_head *ch; +- int hash; +- hash = hash_ptr(old->ex_client, EXPORT_HASHBITS); +- hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS); +- hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS); ++ int hash = svc_export_hash(old); + + ch = sunrpc_cache_update(&svc_export_cache, &new->h, + &old->h, +@@ -1071,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp) + err = 0; + finish: + kfree(new.ex_pathname); +- if (exp) ++ if (!IS_ERR_OR_NULL(exp)) + exp_put(exp); +- if (fsid_key && !IS_ERR(fsid_key)) ++ if (!IS_ERR_OR_NULL(fsid_key)) + cache_put(&fsid_key->h, &svc_expkey_cache); + path_put(&path); + out_put_clp: +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 10:00:37.257414684 -0400 +@@ -79,11 +79,6 @@ enum nfs_cb_opnum4 { + cb_sequence_dec_sz + \ + op_dec_sz) + +-struct nfs4_rpc_args { +- void *args_op; +- struct nfsd4_cb_sequence args_seq; +-}; +- + /* + * Generic encode routines from fs/nfs/nfs4xdr.c + */ +@@ -428,13 +423,19 @@ static struct rpc_procinfo nfs4_cb_p + }; + + static struct rpc_version nfs_cb_version4 = { ++/* ++ * Note on the callback rpc program version number: despite language in rfc ++ * 5661 section 18.36.3 requiring servers to use 4 in this 
field, the ++ * official xdr descriptions for both 4.0 and 4.1 specify version 1, and ++ * in practice that appears to be what implementations use. The section ++ * 18.36.3 language is expected to be fixed in an erratum. ++ */ + .number = 1, + .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), + .procs = nfs4_cb_procedures + }; + + static struct rpc_version * nfs_cb_version[] = { +- NULL, + &nfs_cb_version4, + }; + +@@ -456,15 +457,14 @@ static struct rpc_program cb_program = { + + static int max_cb_time(void) + { +- return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ; ++ return max(nfsd4_lease/10, (time_t)1) * HZ; + } + + /* Reference counting, callback cleanup, etc., all look racy as heck. +- * And why is cb_set an atomic? */ ++ * And why is cl_cb_set an atomic? */ + +-int setup_callback_client(struct nfs4_client *clp) ++int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb) + { +- struct nfs4_cb_conn *cb = &clp->cl_cb_conn; + struct rpc_timeout timeparms = { + .to_initval = max_cb_time(), + .to_retries = 0, +@@ -476,7 +476,7 @@ int setup_callback_client(struct nfs4_cl + .timeout = &timeparms, + .program = &cb_program, + .prognumber = cb->cb_prog, +- .version = nfs_cb_version[1]->number, ++ .version = 0, + .authflavor = clp->cl_flavor, + .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), + .client_name = clp->cl_principal, +@@ -486,7 +486,7 @@ int setup_callback_client(struct nfs4_cl + if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) + return -EINVAL; + if (cb->cb_minorversion) { +- args.bc_xprt = clp->cl_cb_xprt; ++ args.bc_xprt = cb->cb_xprt; + args.protocol = XPRT_TRANSPORT_BC_TCP; + } + /* Create RPC client */ +@@ -496,7 +496,7 @@ int setup_callback_client(struct nfs4_cl + PTR_ERR(client)); + return PTR_ERR(client); + } +- cb->cb_client = client; ++ nfsd4_set_callback_client(clp, client); + return 0; + + } +@@ -514,8 +514,7 @@ static void nfsd4_cb_probe_done(struct r + if (task->tk_status) + warn_no_callback_path(clp, 
task->tk_status); + else +- atomic_set(&clp->cl_cb_conn.cb_set, 1); +- put_nfs4_client(clp); ++ atomic_set(&clp->cl_cb_set, 1); + } + + static const struct rpc_call_ops nfsd4_cb_probe_ops = { +@@ -537,7 +536,6 @@ int set_callback_cred(void) + + void do_probe_callback(struct nfs4_client *clp) + { +- struct nfs4_cb_conn *cb = &clp->cl_cb_conn; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], + .rpc_argp = clp, +@@ -545,34 +543,28 @@ void do_probe_callback(struct nfs4_clien + }; + int status; + +- status = rpc_call_async(cb->cb_client, &msg, ++ status = rpc_call_async(clp->cl_cb_client, &msg, + RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + &nfsd4_cb_probe_ops, (void *)clp); +- if (status) { ++ if (status) + warn_no_callback_path(clp, status); +- put_nfs4_client(clp); +- } + } + + /* + * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... + */ +-void +-nfsd4_probe_callback(struct nfs4_client *clp) ++void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb) + { + int status; + +- BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set)); ++ BUG_ON(atomic_read(&clp->cl_cb_set)); + +- status = setup_callback_client(clp); ++ status = setup_callback_client(clp, cb); + if (status) { + warn_no_callback_path(clp, status); + return; + } + +- /* the task holds a reference to the nfs4_client struct */ +- atomic_inc(&clp->cl_count); +- + do_probe_callback(clp); + } + +@@ -658,18 +650,32 @@ static void nfsd4_cb_done(struct rpc_tas + } + } + ++ + static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) + { + struct nfs4_delegation *dp = calldata; + struct nfs4_client *clp = dp->dl_client; ++ struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + + nfsd4_cb_done(task, calldata); + ++ if (current_rpc_client == NULL) { ++ /* We're shutting down; give up. */ ++ /* XXX: err, or is it ok just to fall through ++ * and rpc_restart_call? */ ++ return; ++ } ++ + switch (task->tk_status) { + case -EIO: + /* Network partition? 
*/ +- atomic_set(&clp->cl_cb_conn.cb_set, 0); ++ atomic_set(&clp->cl_cb_set, 0); + warn_no_callback_path(clp, task->tk_status); ++ if (current_rpc_client != task->tk_client) { ++ /* queue a callback on the new connection: */ ++ nfsd4_cb_recall(dp); ++ return; ++ } + case -EBADHANDLE: + case -NFS4ERR_BAD_STATEID: + /* Race: client probably got cb_recall +@@ -677,7 +683,7 @@ static void nfsd4_cb_recall_done(struct + break; + default: + /* success, or error we can't handle */ +- goto done; ++ return; + } + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); +@@ -685,20 +691,16 @@ static void nfsd4_cb_recall_done(struct + rpc_restart_call(task); + return; + } else { +- atomic_set(&clp->cl_cb_conn.cb_set, 0); ++ atomic_set(&clp->cl_cb_set, 0); + warn_no_callback_path(clp, task->tk_status); + } +-done: +- kfree(task->tk_msg.rpc_argp); + } + + static void nfsd4_cb_recall_release(void *calldata) + { + struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; + + nfs4_put_delegation(dp); +- put_nfs4_client(clp); + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +@@ -707,33 +709,75 @@ static const struct rpc_call_ops nfsd4_c + .rpc_release = nfsd4_cb_recall_release, + }; + ++static struct workqueue_struct *callback_wq; ++ ++int nfsd4_create_callback_queue(void) ++{ ++ callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); ++ if (!callback_wq) ++ return -ENOMEM; ++ return 0; ++} ++ ++void nfsd4_destroy_callback_queue(void) ++{ ++ destroy_workqueue(callback_wq); ++} ++ ++/* must be called under the state lock */ ++void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new) ++{ ++ struct rpc_clnt *old = clp->cl_cb_client; ++ ++ clp->cl_cb_client = new; ++ /* ++ * After this, any work that saw the old value of cl_cb_client will ++ * be gone: ++ */ ++ flush_workqueue(callback_wq); ++ /* So we can safely shut it down: */ ++ if (old) ++ rpc_shutdown_client(old); ++} ++ + /* + * called with dp->dl_count inc'ed. 
+ */ +-void +-nfsd4_cb_recall(struct nfs4_delegation *dp) ++static void _nfsd4_cb_recall(struct nfs4_delegation *dp) + { + struct nfs4_client *clp = dp->dl_client; +- struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; +- struct nfs4_rpc_args *args; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args = &dp->dl_recall.cb_args; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], + .rpc_cred = callback_cred + }; +- int status = -ENOMEM; ++ int status; ++ ++ if (clnt == NULL) ++ return; /* Client is shutting down; give up. */ + +- args = kzalloc(sizeof(*args), GFP_KERNEL); +- if (!args) +- goto out; + args->args_op = dp; + msg.rpc_argp = args; + dp->dl_retries = 1; + status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, + &nfsd4_cb_recall_ops, dp); +-out: +- if (status) { +- kfree(args); +- put_nfs4_client(clp); ++ if (status) + nfs4_put_delegation(dp); +- } ++} ++ ++void nfsd4_do_callback_rpc(struct work_struct *w) ++{ ++ /* XXX: for now, just send off delegation recall. */ ++ /* In future, generalize to handle any sort of callback. */ ++ struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work); ++ struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall); ++ ++ _nfsd4_cb_recall(dp); ++} ++ ++ ++void nfsd4_cb_recall(struct nfs4_delegation *dp) ++{ ++ queue_work(callback_wq, &dp->dl_recall.cb_work); + } +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 09:57:18.237376763 -0400 +@@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[ + static const char *nfsd4_op_name(unsigned opnum); + + /* +- * Enforce NFSv4.1 COMPOUND ordering rules. 
++ * Enforce NFSv4.1 COMPOUND ordering rules: + * +- * TODO: +- * - enforce NFS4ERR_NOT_ONLY_OP, +- * - DESTROY_SESSION MUST be the final operation in the COMPOUND request. ++ * Also note, enforced elsewhere: ++ * - SEQUENCE other than as first op results in ++ * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) ++ * - BIND_CONN_TO_SESSION must be the only op in its compound ++ * (Will be enforced in nfsd4_bind_conn_to_session().) ++ * - DESTROY_SESSION must be the final operation in a compound, if ++ * sessionid's in SEQUENCE and DESTROY_SESSION are the same. ++ * (Enforced in nfsd4_destroy_session().) + */ +-static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args) ++static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) + { +- if (args->minorversion && args->opcnt > 0) { +- struct nfsd4_op *op = &args->ops[0]; +- return (op->status == nfserr_op_illegal) || +- (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP); +- } +- return true; ++ struct nfsd4_op *op = &args->ops[0]; ++ ++ /* These ordering requirements don't apply to NFSv4.0: */ ++ if (args->minorversion == 0) ++ return nfs_ok; ++ /* This is weird, but OK, not our problem: */ ++ if (args->opcnt == 0) ++ return nfs_ok; ++ if (op->status == nfserr_op_illegal) ++ return nfs_ok; ++ if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP)) ++ return nfserr_op_not_in_session; ++ if (op->opnum == OP_SEQUENCE) ++ return nfs_ok; ++ if (args->opcnt != 1) ++ return nfserr_not_only_op; ++ return nfs_ok; + } + + /* +@@ -1012,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqs + resp->rqstp = rqstp; + resp->cstate.minorversion = args->minorversion; + resp->cstate.replay_owner = NULL; ++ resp->cstate.session = NULL; + fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); + fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + /* Use the deferral mechanism only for NFSv4.0 compounds */ +@@ -1024,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqs + if (args->minorversion > 
nfsd_supported_minorversion) + goto out; + +- if (!nfs41_op_ordering_ok(args)) { ++ status = nfs41_check_op_ordering(args); ++ if (status) { + op = &args->ops[0]; +- op->status = nfserr_sequence_pos; ++ op->status = status; + goto encode_op; + } + +- status = nfs_ok; + while (!status && resp->opcnt < args->opcnt) { + op = &args->ops[resp->opcnt++]; + +@@ -1295,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[ + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_SEQUENCE", + }, ++ [OP_RECLAIM_COMPLETE] = { ++ .op_func = (nfsd4op_func)nfsd4_reclaim_complete, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_RECLAIM_COMPLETE", ++ }, + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 09:57:18.240356512 -0400 +@@ -45,8 +45,8 @@ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +-static time_t lease_time = 90; /* default lease time */ +-static time_t user_lease_time = 90; ++time_t nfsd4_lease = 90; /* default lease time */ ++time_t nfsd4_grace = 90; + static time_t boot_time; + static u32 current_ownerid = 1; + static u32 current_fileid = 1; +@@ -190,7 +190,7 @@ alloc_init_deleg(struct nfs4_client *clp + dp->dl_vfs_file = stp->st_vfs_file; + dp->dl_type = type; + dp->dl_ident = cb->cb_ident; +- dp->dl_stateid.si_boot = get_seconds(); ++ dp->dl_stateid.si_boot = boot_time; + dp->dl_stateid.si_stateownerid = current_delegid++; + dp->dl_stateid.si_fileid = 0; + dp->dl_stateid.si_generation = 0; +@@ -199,6 +199,7 @@ alloc_init_deleg(struct nfs4_client *clp + atomic_set(&dp->dl_count, 1); + list_add(&dp->dl_perfile, &fp->fi_delegations); + list_add(&dp->dl_perclnt, &clp->cl_delegations); ++ INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); + return dp; + } + +@@ -249,6 +250,9 @@ 
unhash_delegation(struct nfs4_delegation + * SETCLIENTID state + */ + ++/* client_lock protects the client lru list and session hash table */ ++static DEFINE_SPINLOCK(client_lock); ++ + /* Hash tables for nfs4_clientid state */ + #define CLIENT_HASH_BITS 4 + #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) +@@ -367,7 +371,6 @@ static void release_openowner(struct nfs + nfs4_put_stateowner(sop); + } + +-static DEFINE_SPINLOCK(sessionid_lock); + #define SESSION_HASH_SIZE 512 + static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; + +@@ -565,10 +568,10 @@ alloc_init_session(struct svc_rqst *rqst + + new->se_flags = cses->flags; + kref_init(&new->se_ref); +- spin_lock(&sessionid_lock); ++ spin_lock(&client_lock); + list_add(&new->se_hash, &sessionid_hashtbl[idx]); + list_add(&new->se_perclnt, &clp->cl_sessions); +- spin_unlock(&sessionid_lock); ++ spin_unlock(&client_lock); + + status = nfs_ok; + out: +@@ -579,7 +582,7 @@ out_free: + goto out; + } + +-/* caller must hold sessionid_lock */ ++/* caller must hold client_lock */ + static struct nfsd4_session * + find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) + { +@@ -602,7 +605,7 @@ find_in_sessionid_hashtbl(struct nfs4_se + return NULL; + } + +-/* caller must hold sessionid_lock */ ++/* caller must hold client_lock */ + static void + unhash_session(struct nfsd4_session *ses) + { +@@ -610,15 +613,6 @@ unhash_session(struct nfsd4_session *ses + list_del(&ses->se_perclnt); + } + +-static void +-release_session(struct nfsd4_session *ses) +-{ +- spin_lock(&sessionid_lock); +- unhash_session(ses); +- spin_unlock(&sessionid_lock); +- nfsd4_put_session(ses); +-} +- + void + free_session(struct kref *kref) + { +@@ -634,9 +628,18 @@ free_session(struct kref *kref) + kfree(ses); + } + ++/* must be called under the client_lock */ + static inline void +-renew_client(struct nfs4_client *clp) ++renew_client_locked(struct nfs4_client *clp) + { ++ if (is_client_expired(clp)) { ++ dprintk("%s: client (clientid 
%08x/%08x) already expired\n", ++ __func__, ++ clp->cl_clientid.cl_boot, ++ clp->cl_clientid.cl_id); ++ return; ++ } ++ + /* + * Move client to the end to the LRU list. + */ +@@ -647,6 +650,14 @@ renew_client(struct nfs4_client *clp) + clp->cl_time = get_seconds(); + } + ++static inline void ++renew_client(struct nfs4_client *clp) ++{ ++ spin_lock(&client_lock); ++ renew_client_locked(clp); ++ spin_unlock(&client_lock); ++} ++ + /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ + static int + STALE_CLIENTID(clientid_t *clid) +@@ -680,27 +691,9 @@ static struct nfs4_client *alloc_client( + return clp; + } + +-static void +-shutdown_callback_client(struct nfs4_client *clp) +-{ +- struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; +- +- if (clnt) { +- /* +- * Callback threads take a reference on the client, so there +- * should be no outstanding callbacks at this point. +- */ +- clp->cl_cb_conn.cb_client = NULL; +- rpc_shutdown_client(clnt); +- } +-} +- + static inline void + free_client(struct nfs4_client *clp) + { +- shutdown_callback_client(clp); +- if (clp->cl_cb_xprt) +- svc_xprt_put(clp->cl_cb_xprt); + if (clp->cl_cred.cr_group_info) + put_group_info(clp->cl_cred.cr_group_info); + kfree(clp->cl_principal); +@@ -709,10 +702,34 @@ free_client(struct nfs4_client *clp) + } + + void +-put_nfs4_client(struct nfs4_client *clp) ++release_session_client(struct nfsd4_session *session) + { +- if (atomic_dec_and_test(&clp->cl_count)) ++ struct nfs4_client *clp = session->se_client; ++ ++ if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock)) ++ return; ++ if (is_client_expired(clp)) { + free_client(clp); ++ session->se_client = NULL; ++ } else ++ renew_client_locked(clp); ++ spin_unlock(&client_lock); ++ nfsd4_put_session(session); ++} ++ ++/* must be called under the client_lock */ ++static inline void ++unhash_client_locked(struct nfs4_client *clp) ++{ ++ mark_client_expired(clp); ++ list_del(&clp->cl_lru); ++ while (!list_empty(&clp->cl_sessions)) { ++ 
struct nfsd4_session *ses; ++ ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, ++ se_perclnt); ++ unhash_session(ses); ++ nfsd4_put_session(ses); ++ } + } + + static void +@@ -722,9 +739,6 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + +- dprintk("NFSD: expire_client cl_count %d\n", +- atomic_read(&clp->cl_count)); +- + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -740,20 +754,20 @@ expire_client(struct nfs4_client *clp) + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } +- list_del(&clp->cl_idhash); +- list_del(&clp->cl_strhash); +- list_del(&clp->cl_lru); + while (!list_empty(&clp->cl_openowners)) { + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } +- while (!list_empty(&clp->cl_sessions)) { +- struct nfsd4_session *ses; +- ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, +- se_perclnt); +- release_session(ses); +- } +- put_nfs4_client(clp); ++ nfsd4_set_callback_client(clp, NULL); ++ if (clp->cl_cb_conn.cb_xprt) ++ svc_xprt_put(clp->cl_cb_conn.cb_xprt); ++ list_del(&clp->cl_idhash); ++ list_del(&clp->cl_strhash); ++ spin_lock(&client_lock); ++ unhash_client_locked(clp); ++ if (atomic_read(&clp->cl_refcount) == 0) ++ free_client(clp); ++ spin_unlock(&client_lock); + } + + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) +@@ -839,14 +853,15 @@ static struct nfs4_client *create_client + } + + memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); +- atomic_set(&clp->cl_count, 1); +- atomic_set(&clp->cl_cb_conn.cb_set, 0); ++ atomic_set(&clp->cl_refcount, 0); ++ atomic_set(&clp->cl_cb_set, 0); + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_sessions); + INIT_LIST_HEAD(&clp->cl_lru); ++ clp->cl_time = 
get_seconds(); + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); + copy_verf(clp, verf); +@@ -877,8 +892,7 @@ add_to_unconfirmed(struct nfs4_client *c + list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]); + idhashval = clientid_hashval(clp->cl_clientid.cl_id); + list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]); +- list_add_tail(&clp->cl_lru, &client_lru); +- clp->cl_time = get_seconds(); ++ renew_client(clp); + } + + static void +@@ -888,10 +902,9 @@ move_to_confirmed(struct nfs4_client *cl + unsigned int strhashval; + + dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); +- list_del_init(&clp->cl_strhash); + list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); + strhashval = clientstr_hashval(clp->cl_recdir); +- list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); ++ list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); + renew_client(clp); + } + +@@ -1327,15 +1340,9 @@ nfsd4_create_session(struct svc_rqst *rq + cs_slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + +- /* +- * We do not support RDMA or persistent sessions +- */ +- cr_ses->flags &= ~SESSION4_PERSIST; +- cr_ses->flags &= ~SESSION4_RDMA; +- + if (cr_ses->flags & SESSION4_BACK_CHAN) { +- unconf->cl_cb_xprt = rqstp->rq_xprt; +- svc_xprt_get(unconf->cl_cb_xprt); ++ unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; ++ svc_xprt_get(rqstp->rq_xprt); + rpc_copy_addr( + (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, + sa); +@@ -1344,7 +1351,7 @@ nfsd4_create_session(struct svc_rqst *rq + cstate->minorversion; + unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; + unconf->cl_cb_seq_nr = 1; +- nfsd4_probe_callback(unconf); ++ nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); + } + conf = unconf; + } else { +@@ -1352,6 +1359,12 @@ nfsd4_create_session(struct svc_rqst *rq + goto out; + } + ++ /* ++ * We do not support RDMA or persistent sessions ++ */ ++ cr_ses->flags &= ~SESSION4_PERSIST; ++ cr_ses->flags 
&= ~SESSION4_RDMA; ++ + status = alloc_init_session(rqstp, conf, cr_ses); + if (status) + goto out; +@@ -1369,6 +1382,21 @@ out: + return status; + } + ++static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) ++{ ++ struct nfsd4_compoundres *resp = rqstp->rq_resp; ++ struct nfsd4_compoundargs *argp = rqstp->rq_argp; ++ ++ return argp->opcnt == resp->opcnt; ++} ++ ++static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) ++{ ++ if (!session) ++ return 0; ++ return !memcmp(sid, &session->se_sessionid, sizeof(*sid)); ++} ++ + __be32 + nfsd4_destroy_session(struct svc_rqst *r, + struct nfsd4_compound_state *cstate, +@@ -1384,19 +1412,25 @@ nfsd4_destroy_session(struct svc_rqst *r + * - Do we need to clear any callback info from previous session? + */ + ++ if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { ++ if (!nfsd4_last_compound_op(r)) ++ return nfserr_not_only_op; ++ } + dump_sessionid(__func__, &sessionid->sessionid); +- spin_lock(&sessionid_lock); ++ spin_lock(&client_lock); + ses = find_in_sessionid_hashtbl(&sessionid->sessionid); + if (!ses) { +- spin_unlock(&sessionid_lock); ++ spin_unlock(&client_lock); + goto out; + } + + unhash_session(ses); +- spin_unlock(&sessionid_lock); ++ spin_unlock(&client_lock); + ++ nfs4_lock_state(); + /* wait for callbacks */ +- shutdown_callback_client(ses->se_client); ++ nfsd4_set_callback_client(ses->se_client, NULL); ++ nfs4_unlock_state(); + nfsd4_put_session(ses); + status = nfs_ok; + out: +@@ -1417,7 +1451,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, + if (resp->opcnt != 1) + return nfserr_sequence_pos; + +- spin_lock(&sessionid_lock); ++ spin_lock(&client_lock); + status = nfserr_badsession; + session = find_in_sessionid_hashtbl(&seq->sessionid); + if (!session) +@@ -1456,23 +1490,47 @@ nfsd4_sequence(struct svc_rqst *rqstp, + cstate->slot = slot; + cstate->session = session; + +- /* Hold a session reference until done processing the compound: +- * 
nfsd4_put_session called only if the cstate slot is set. +- */ +- nfsd4_get_session(session); + out: +- spin_unlock(&sessionid_lock); +- /* Renew the clientid on success and on replay */ ++ /* Hold a session reference until done processing the compound. */ + if (cstate->session) { +- nfs4_lock_state(); +- renew_client(session->se_client); +- nfs4_unlock_state(); ++ nfsd4_get_session(cstate->session); ++ atomic_inc(&session->se_client->cl_refcount); + } ++ spin_unlock(&client_lock); + dprintk("%s: return %d\n", __func__, ntohl(status)); + return status; + } + + __be32 ++nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) ++{ ++ if (rc->rca_one_fs) { ++ if (!cstate->current_fh.fh_dentry) ++ return nfserr_nofilehandle; ++ /* ++ * We don't take advantage of the rca_one_fs case. ++ * That's OK, it's optional, we can safely ignore it. ++ */ ++ return nfs_ok; ++ } ++ nfs4_lock_state(); ++ if (is_client_expired(cstate->session->se_client)) { ++ nfs4_unlock_state(); ++ /* ++ * The following error isn't really legal. ++ * But we only get here if the client just explicitly ++ * destroyed the client. Surely it no longer cares what ++ * error it gets back on an operation for the dead ++ * client. ++ */ ++ return nfserr_stale_clientid; ++ } ++ nfsd4_create_clid_dir(cstate->session->se_client); ++ nfs4_unlock_state(); ++ return nfs_ok; ++} ++ ++__be32 + nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_setclientid *setclid) + { +@@ -1631,9 +1689,8 @@ nfsd4_setclientid_confirm(struct svc_rqs + if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) + status = nfserr_clid_inuse; + else { +- /* XXX: We just turn off callbacks until we can handle +- * change request correctly. 
*/ +- atomic_set(&conf->cl_cb_conn.cb_set, 0); ++ atomic_set(&conf->cl_cb_set, 0); ++ nfsd4_probe_callback(conf, &unconf->cl_cb_conn); + expire_client(unconf); + status = nfs_ok; + +@@ -1667,7 +1724,7 @@ nfsd4_setclientid_confirm(struct svc_rqs + } + move_to_confirmed(unconf); + conf = unconf; +- nfsd4_probe_callback(conf); ++ nfsd4_probe_callback(conf, &conf->cl_cb_conn); + status = nfs_ok; + } + } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) +@@ -1700,12 +1757,12 @@ alloc_init_file(struct inode *ino) + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); +- spin_lock(&recall_lock); +- list_add(&fp->fi_hash, &file_hashtbl[hashval]); +- spin_unlock(&recall_lock); + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; ++ spin_lock(&recall_lock); ++ list_add(&fp->fi_hash, &file_hashtbl[hashval]); ++ spin_unlock(&recall_lock); + return fp; + } + return NULL; +@@ -1827,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, s + stp->st_stateowner = sop; + get_nfs4_file(fp); + stp->st_file = fp; +- stp->st_stateid.si_boot = get_seconds(); ++ stp->st_stateid.si_boot = boot_time; + stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stateid.si_generation = 0; +@@ -2028,7 +2085,6 @@ void nfsd_break_deleg_cb(struct file_loc + * lock) we know the server hasn't removed the lease yet, we know + * it's safe to take a reference: */ + atomic_inc(&dp->dl_count); +- atomic_inc(&dp->dl_client->cl_count); + + spin_lock(&recall_lock); + list_add_tail(&dp->dl_recall_lru, &del_recall_lru); +@@ -2347,7 +2403,7 @@ nfs4_open_delegation(struct svc_fh *fh, + { + struct nfs4_delegation *dp; + struct nfs4_stateowner *sop = stp->st_stateowner; +- struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn; ++ int cb_up = atomic_read(&sop->so_client->cl_cb_set); + struct file_lock fl, *flp = &fl; + int status, flag = 0; + +@@ -2355,7 +2411,7 @@ 
nfs4_open_delegation(struct svc_fh *fh, + open->op_recall = 0; + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_PREVIOUS: +- if (!atomic_read(&cb->cb_set)) ++ if (!cb_up) + open->op_recall = 1; + flag = open->op_delegate_type; + if (flag == NFS4_OPEN_DELEGATE_NONE) +@@ -2366,7 +2422,7 @@ nfs4_open_delegation(struct svc_fh *fh, + * had the chance to reclaim theirs.... */ + if (locks_in_grace()) + goto out; +- if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) ++ if (!cb_up || !sop->so_confirmed) + goto out; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + flag = NFS4_OPEN_DELEGATE_WRITE; +@@ -2483,10 +2539,8 @@ nfsd4_process_open2(struct svc_rqst *rqs + } + memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + +- if (nfsd4_has_session(&resp->cstate)) { ++ if (nfsd4_has_session(&resp->cstate)) + open->op_stateowner->so_confirmed = 1; +- nfsd4_create_clid_dir(open->op_stateowner->so_client); +- } + + /* + * Attempt to hand out a delegation. No error return, because the +@@ -2537,7 +2591,7 @@ nfsd4_renew(struct svc_rqst *rqstp, stru + renew_client(clp); + status = nfserr_cb_path_down; + if (!list_empty(&clp->cl_delegations) +- && !atomic_read(&clp->cl_cb_conn.cb_set)) ++ && !atomic_read(&clp->cl_cb_set)) + goto out; + status = nfs_ok; + out: +@@ -2554,6 +2608,12 @@ nfsd4_end_grace(void) + dprintk("NFSD: end of grace period\n"); + nfsd4_recdir_purge_old(); + locks_end_grace(&nfsd4_manager); ++ /* ++ * Now that every NFSv4 client has had the chance to recover and ++ * to see the (possibly new, possibly shorter) lease time, we ++ * can safely set the next grace time to the current lease time: ++ */ ++ nfsd4_grace = nfsd4_lease; + } + + static time_t +@@ -2563,15 +2623,17 @@ nfs4_laundromat(void) + struct nfs4_stateowner *sop; + struct nfs4_delegation *dp; + struct list_head *pos, *next, reaplist; +- time_t cutoff = get_seconds() - NFSD_LEASE_TIME; +- time_t t, clientid_val = NFSD_LEASE_TIME; +- time_t u, test_val = NFSD_LEASE_TIME; ++ time_t 
cutoff = get_seconds() - nfsd4_lease; ++ time_t t, clientid_val = nfsd4_lease; ++ time_t u, test_val = nfsd4_lease; + + nfs4_lock_state(); + + dprintk("NFSD: laundromat service - starting\n"); + if (locks_in_grace()) + nfsd4_end_grace(); ++ INIT_LIST_HEAD(&reaplist); ++ spin_lock(&client_lock); + list_for_each_safe(pos, next, &client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { +@@ -2580,12 +2642,22 @@ nfs4_laundromat(void) + clientid_val = t; + break; + } ++ if (atomic_read(&clp->cl_refcount)) { ++ dprintk("NFSD: client in use (clientid %08x)\n", ++ clp->cl_clientid.cl_id); ++ continue; ++ } ++ unhash_client_locked(clp); ++ list_add(&clp->cl_lru, &reaplist); ++ } ++ spin_unlock(&client_lock); ++ list_for_each_safe(pos, next, &reaplist) { ++ clp = list_entry(pos, struct nfs4_client, cl_lru); + dprintk("NFSD: purging unused client (clientid %08x)\n", + clp->cl_clientid.cl_id); + nfsd4_remove_clid_dir(clp); + expire_client(clp); + } +- INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + list_for_each_safe(pos, next, &del_recall_lru) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); +@@ -2605,7 +2677,7 @@ nfs4_laundromat(void) + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } +- test_val = NFSD_LEASE_TIME; ++ test_val = nfsd4_lease; + list_for_each_safe(pos, next, &close_lru) { + sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); + if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { +@@ -2661,39 +2733,11 @@ nfs4_check_fh(struct svc_fh *fhp, struct + static int + STALE_STATEID(stateid_t *stateid) + { +- if (time_after((unsigned long)boot_time, +- (unsigned long)stateid->si_boot)) { +- dprintk("NFSD: stale stateid " STATEID_FMT "!\n", +- STATEID_VAL(stateid)); +- return 1; +- } +- return 0; +-} +- +-static int +-EXPIRED_STATEID(stateid_t *stateid) +-{ +- if (time_before((unsigned long)boot_time, +- ((unsigned 
long)stateid->si_boot)) && +- time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) { +- dprintk("NFSD: expired stateid " STATEID_FMT "!\n", +- STATEID_VAL(stateid)); +- return 1; +- } +- return 0; +-} +- +-static __be32 +-stateid_error_map(stateid_t *stateid) +-{ +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; +- if (EXPIRED_STATEID(stateid)) +- return nfserr_expired; +- +- dprintk("NFSD: bad stateid " STATEID_FMT "!\n", ++ if (stateid->si_boot == boot_time) ++ return 0; ++ dprintk("NFSD: stale stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); +- return nfserr_bad_stateid; ++ return 1; + } + + static inline int +@@ -2817,10 +2861,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + status = nfserr_bad_stateid; + if (is_delegation_stateid(stateid)) { + dp = find_delegation_stateid(ino, stateid); +- if (!dp) { +- status = stateid_error_map(stateid); ++ if (!dp) + goto out; +- } + status = check_stateid_generation(stateid, &dp->dl_stateid, + flags); + if (status) +@@ -2833,10 +2875,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + *filpp = dp->dl_vfs_file; + } else { /* open or lock stateid */ + stp = find_stateid(stateid, flags); +- if (!stp) { +- status = stateid_error_map(stateid); ++ if (!stp) + goto out; +- } + if (nfs4_check_fh(current_fh, stp)) + goto out; + if (!stp->st_stateowner->so_confirmed) +@@ -2908,7 +2948,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + */ + sop = search_close_lru(stateid->si_stateownerid, flags); + if (sop == NULL) +- return stateid_error_map(stateid); ++ return nfserr_bad_stateid; + *sopp = sop; + goto check_replay; + } +@@ -3175,10 +3215,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (!is_delegation_stateid(stateid)) + goto out; + dp = find_delegation_stateid(inode, stateid); +- if (!dp) { +- status = stateid_error_map(stateid); ++ if (!dp) + goto out; +- } + status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + if (status) + goto out; +@@ -3404,7 +3442,7 @@ 
alloc_init_lock_stateid(struct nfs4_stat + stp->st_stateowner = sop; + get_nfs4_file(fp); + stp->st_file = fp; +- stp->st_stateid.si_boot = get_seconds(); ++ stp->st_stateid.si_boot = boot_time; + stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stateid.si_generation = 0; +@@ -3976,12 +4014,6 @@ nfsd4_load_reboot_recovery_data(void) + printk("NFSD: Failure reading reboot recovery data\n"); + } + +-unsigned long +-get_nfs4_grace_period(void) +-{ +- return max(user_lease_time, lease_time) * HZ; +-} +- + /* + * Since the lifetime of a delegation isn't limited to that of an open, a + * client may quite reasonably hang on to a delegation as long as it has +@@ -4008,20 +4040,27 @@ set_max_delegations(void) + static int + __nfs4_state_start(void) + { +- unsigned long grace_time; ++ int ret; + + boot_time = get_seconds(); +- grace_time = get_nfs4_grace_period(); +- lease_time = user_lease_time; + locks_start_grace(&nfsd4_manager); + printk(KERN_INFO "NFSD: starting %ld-second grace period\n", +- grace_time/HZ); ++ nfsd4_grace); ++ ret = set_callback_cred(); ++ if (ret) ++ return -ENOMEM; + laundry_wq = create_singlethread_workqueue("nfsd4"); + if (laundry_wq == NULL) + return -ENOMEM; +- queue_delayed_work(laundry_wq, &laundromat_work, grace_time); ++ ret = nfsd4_create_callback_queue(); ++ if (ret) ++ goto out_free_laundry; ++ queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ); + set_max_delegations(); +- return set_callback_cred(); ++ return 0; ++out_free_laundry: ++ destroy_workqueue(laundry_wq); ++ return ret; + } + + int +@@ -4039,12 +4078,6 @@ nfs4_state_start(void) + return 0; + } + +-time_t +-nfs4_lease_time(void) +-{ +- return lease_time; +-} +- + static void + __nfs4_state_shutdown(void) + { +@@ -4089,6 +4122,7 @@ nfs4_state_shutdown(void) + nfs4_lock_state(); + nfs4_release_reclaim(); + __nfs4_state_shutdown(); ++ nfsd4_destroy_callback_queue(); + nfs4_unlock_state(); + } + +@@ -4128,21 +4162,3 @@ 
nfs4_recoverydir(void) + { + return user_recovery_dirname; + } +- +-/* +- * Called when leasetime is changed. +- * +- * The only way the protocol gives us to handle on-the-fly lease changes is to +- * simulate a reboot. Instead of doing that, we just wait till the next time +- * we start to register any changes in lease time. If the administrator +- * really wants to change the lease time *now*, they can go ahead and bring +- * nfsd down and then back up again after changing the lease time. +- * +- * user_lease_time is protected by nfsd_mutex since it's only really accessed +- * when nfsd is starting +- */ +-void +-nfs4_reset_lease(time_t leasetime) +-{ +- user_lease_time = leasetime; +-} +diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 09:57:20.629370282 -0400 +@@ -46,6 +46,7 @@ enum { + */ + #ifdef CONFIG_NFSD_V4 + NFSD_Leasetime, ++ NFSD_Gracetime, + NFSD_RecoveryDir, + #endif + }; +@@ -70,6 +71,7 @@ static ssize_t write_ports(struct file * + static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); + #ifdef CONFIG_NFSD_V4 + static ssize_t write_leasetime(struct file *file, char *buf, size_t size); ++static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif + +@@ -91,6 +93,7 @@ static ssize_t (*write_op[])(struct file + [NFSD_MaxBlkSize] = write_maxblksize, + #ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = write_leasetime, ++ [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif + }; +@@ -1204,29 +1207,45 @@ static ssize_t write_maxblksize(struct f + } + + #ifdef CONFIG_NFSD_V4 +-extern time_t nfs4_leasetime(void); +- +-static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) ++static ssize_t __nfsd4_write_time(struct file 
*file, char *buf, size_t size, time_t *time) + { +- /* if size > 10 seconds, call +- * nfs4_reset_lease() then write out the new lease (seconds) as reply +- */ + char *mesg = buf; +- int rv, lease; ++ int rv, i; + + if (size > 0) { + if (nfsd_serv) + return -EBUSY; +- rv = get_int(&mesg, &lease); ++ rv = get_int(&mesg, &i); + if (rv) + return rv; +- if (lease < 10 || lease > 3600) ++ /* ++ * Some sanity checking. We don't have a reason for ++ * these particular numbers, but problems with the ++ * extremes are: ++ * - Too short: the briefest network outage may ++ * cause clients to lose all their locks. Also, ++ * the frequent polling may be wasteful. ++ * - Too long: do you really want reboot recovery ++ * to take more than an hour? Or to make other ++ * clients wait an hour before being able to ++ * revoke a dead client's locks? ++ */ ++ if (i < 10 || i > 3600) + return -EINVAL; +- nfs4_reset_lease(lease); ++ *time = i; + } + +- return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", +- nfs4_lease_time()); ++ return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time); ++} ++ ++static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __nfsd4_write_time(file, buf, size, time); ++ mutex_unlock(&nfsd_mutex); ++ return rv; + } + + /** +@@ -1252,12 +1271,22 @@ static ssize_t __write_leasetime(struct + */ + static ssize_t write_leasetime(struct file *file, char *buf, size_t size) + { +- ssize_t rv; ++ return nfsd4_write_time(file, buf, size, &nfsd4_lease); ++} + +- mutex_lock(&nfsd_mutex); +- rv = __write_leasetime(file, buf, size); +- mutex_unlock(&nfsd_mutex); +- return rv; ++/** ++ * write_gracetime - Set or report current NFSv4 grace period time ++ * ++ * As above, but sets the time of the NFSv4 grace period. ++ * ++ * Note this should never be set to less than the *previous* ++ * lease-period time, but we don't try to enforce this. 
(In the common ++ * case (a new boot), we don't know what the previous lease time was ++ * anyway.) ++ */ ++static ssize_t write_gracetime(struct file *file, char *buf, size_t size) ++{ ++ return nfsd4_write_time(file, buf, size, &nfsd4_grace); + } + + extern char *nfs4_recoverydir(void); +@@ -1351,6 +1380,7 @@ static int nfsd_fill_super(struct super_ + [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, + #ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, ++ [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + #endif + /* last one */ {""} +diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 09:57:20.629370282 -0400 +@@ -82,7 +82,6 @@ int nfs4_state_init(void); + void nfsd4_free_slabs(void); + int nfs4_state_start(void); + void nfs4_state_shutdown(void); +-time_t nfs4_lease_time(void); + void nfs4_reset_lease(time_t leasetime); + int nfs4_reset_recoverydir(char *recdir); + #else +@@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) + static inline void nfsd4_free_slabs(void) { } + static inline int nfs4_state_start(void) { return 0; } + static inline void nfs4_state_shutdown(void) { } +-static inline time_t nfs4_lease_time(void) { return 0; } + static inline void nfs4_reset_lease(time_t leasetime) { } + static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } + #endif +@@ -229,6 +227,9 @@ extern struct timeval nfssvc_boot; + + #ifdef CONFIG_NFSD_V4 + ++extern time_t nfsd4_lease; ++extern time_t nfsd4_grace; ++ + /* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. 
otherwise, + * we might process an operation with side effects, and be unable to +@@ -247,7 +248,6 @@ extern struct timeval nfssvc_boot; + #define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ + #define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ + +-#define NFSD_LEASE_TIME (nfs4_lease_time()) + #define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ + + /* +diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 09:57:21.807501619 -0400 +@@ -70,6 +70,16 @@ struct nfsd4_cb_sequence { + struct nfs4_client *cbs_clp; + }; + ++struct nfs4_rpc_args { ++ void *args_op; ++ struct nfsd4_cb_sequence args_seq; ++}; ++ ++struct nfsd4_callback { ++ struct nfs4_rpc_args cb_args; ++ struct work_struct cb_work; ++}; ++ + struct nfs4_delegation { + struct list_head dl_perfile; + struct list_head dl_perclnt; +@@ -86,6 +96,7 @@ struct nfs4_delegation { + stateid_t dl_stateid; + struct knfsd_fh dl_fh; + int dl_retries; ++ struct nfsd4_callback dl_recall; + }; + + /* client delegation callback info */ +@@ -96,9 +107,7 @@ struct nfs4_cb_conn { + u32 cb_prog; + u32 cb_minorversion; + u32 cb_ident; /* minorversion 0 only */ +- /* RPC client info */ +- atomic_t cb_set; /* successful CB_NULL call */ +- struct rpc_clnt * cb_client; ++ struct svc_xprt *cb_xprt; /* minorversion 1 only */ + }; + + /* Maximum number of slots per session. 
160 is useful for long haul TCP */ +@@ -157,7 +166,7 @@ struct nfsd4_session { + struct list_head se_hash; /* hash by sessionid */ + struct list_head se_perclnt; + u32 se_flags; +- struct nfs4_client *se_client; /* for expire_client */ ++ struct nfs4_client *se_client; + struct nfs4_sessionid se_sessionid; + struct nfsd4_channel_attrs se_fchannel; + struct nfsd4_channel_attrs se_bchannel; +@@ -212,25 +221,41 @@ struct nfs4_client { + struct svc_cred cl_cred; /* setclientid principal */ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ +- struct nfs4_cb_conn cl_cb_conn; /* callback info */ +- atomic_t cl_count; /* ref count */ + u32 cl_firststate; /* recovery dir creation */ + ++ /* for v4.0 and v4.1 callbacks: */ ++ struct nfs4_cb_conn cl_cb_conn; ++ struct rpc_clnt *cl_cb_client; ++ atomic_t cl_cb_set; ++ + /* for nfs41 */ + struct list_head cl_sessions; + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ + u32 cl_exchange_flags; + struct nfs4_sessionid cl_sessionid; ++ /* number of rpc's in progress over an associated session: */ ++ atomic_t cl_refcount; + + /* for nfs41 callbacks */ + /* We currently support a single back channel with a single slot */ + unsigned long cl_cb_slot_busy; + u32 cl_cb_seq_nr; +- struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */ + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ + }; + ++static inline void ++mark_client_expired(struct nfs4_client *clp) ++{ ++ clp->cl_time = 0; ++} ++ ++static inline bool ++is_client_expired(struct nfs4_client *clp) ++{ ++ return clp->cl_time == 0; ++} ++ + /* struct nfs4_client_reset + * one per old client. Populates reset_str_hashtbl. 
Filled from conf_id_hashtbl + * upon lease reset, or from upcall to state_daemon (to read in state +@@ -377,11 +402,14 @@ extern void nfs4_lock_state(void); + extern void nfs4_unlock_state(void); + extern int nfs4_in_grace(void); + extern __be32 nfs4_check_open_reclaim(clientid_t *clid); +-extern void put_nfs4_client(struct nfs4_client *clp); + extern void nfs4_free_stateowner(struct kref *kref); + extern int set_callback_cred(void); +-extern void nfsd4_probe_callback(struct nfs4_client *clp); ++extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); ++extern void nfsd4_do_callback_rpc(struct work_struct *); + extern void nfsd4_cb_recall(struct nfs4_delegation *dp); ++extern int nfsd4_create_callback_queue(void); ++extern void nfsd4_destroy_callback_queue(void); ++extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *); + extern void nfs4_put_delegation(struct nfs4_delegation *dp); + extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); + extern void nfsd4_init_recdir(char *recdir_name); +@@ -392,6 +420,7 @@ extern int nfs4_has_reclaimed_state(cons + extern void nfsd4_recdir_purge_old(void); + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); ++extern void release_session_client(struct nfsd4_session *); + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 09:57:23.994379831 -0400 +@@ -381,6 +381,10 @@ struct nfsd4_destroy_session { + struct nfs4_sessionid sessionid; + }; + ++struct nfsd4_reclaim_complete { ++ u32 rca_one_fs; ++}; ++ + struct nfsd4_op { + int opnum; + __be32 status; +@@ -421,6 +425,7 @@ struct nfsd4_op { + struct nfsd4_create_session create_session; + struct 
nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; ++ struct nfsd4_reclaim_complete reclaim_complete; + } u; + struct nfs4_replay * replay; + }; +@@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(stru + extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq); + extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, +- struct nfsd4_compound_state *, +-struct nfsd4_exchange_id *); +- extern __be32 nfsd4_create_session(struct svc_rqst *, ++ struct nfsd4_compound_state *, struct nfsd4_exchange_id *); ++extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_create_session *); + extern __be32 nfsd4_sequence(struct svc_rqst *, +@@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_ + extern __be32 nfsd4_destroy_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_destroy_session *); ++__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); + extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, + struct nfsd4_open *open); + extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 09:57:23.994379831 -0400 +@@ -40,12 +40,12 @@ struct nfs_fhbase_old { + * This is the new flexible, extensible style NFSv2/v3 file handle. + * by Neil Brown - March 2000 + * +- * The file handle is seens as a list of 4byte words. +- * The first word contains a version number (1) and four descriptor bytes ++ * The file handle starts with a sequence of four-byte words. ++ * The first word contains a version number (1) and three descriptor bytes + * that tell how the remaining 3 variable length fields should be handled. 
+ * These three bytes are auth_type, fsid_type and fileid_type. + * +- * All 4byte values are in host-byte-order. ++ * All four-byte values are in host-byte-order. + * + * The auth_type field specifies how the filehandle can be authenticated + * This might allow a file to be confirmed to be in a writable part of a +diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sunrpc/cache.c +--- linux-2.6.34.noarch/net/sunrpc/cache.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 09:57:23.995376793 -0400 +@@ -49,11 +49,17 @@ static void cache_init(struct cache_head + h->last_refresh = now; + } + ++static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h) ++{ ++ return (h->expiry_time < get_seconds()) || ++ (detail->flush_time > h->last_refresh); ++} ++ + struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, + struct cache_head *key, int hash) + { + struct cache_head **head, **hp; +- struct cache_head *new = NULL; ++ struct cache_head *new = NULL, *freeme = NULL; + + head = &detail->hash_table[hash]; + +@@ -62,6 +68,9 @@ struct cache_head *sunrpc_cache_lookup(s + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { ++ if (cache_is_expired(detail, tmp)) ++ /* This entry is expired, we will discard it. 
*/ ++ break; + cache_get(tmp); + read_unlock(&detail->hash_lock); + return tmp; +@@ -86,6 +95,13 @@ struct cache_head *sunrpc_cache_lookup(s + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { ++ if (cache_is_expired(detail, tmp)) { ++ *hp = tmp->next; ++ tmp->next = NULL; ++ detail->entries --; ++ freeme = tmp; ++ break; ++ } + cache_get(tmp); + write_unlock(&detail->hash_lock); + cache_put(new, detail); +@@ -98,6 +114,8 @@ struct cache_head *sunrpc_cache_lookup(s + cache_get(new); + write_unlock(&detail->hash_lock); + ++ if (freeme) ++ cache_put(freeme, detail); + return new; + } + EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); +@@ -183,10 +201,7 @@ static int cache_make_upcall(struct cach + + static inline int cache_is_valid(struct cache_detail *detail, struct cache_head *h) + { +- if (!test_bit(CACHE_VALID, &h->flags) || +- h->expiry_time < get_seconds()) +- return -EAGAIN; +- else if (detail->flush_time > h->last_refresh) ++ if (!test_bit(CACHE_VALID, &h->flags)) + return -EAGAIN; + else { + /* entry is valid */ +diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/sunrpc/svcsock.c +--- linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 09:57:23.997368707 -0400 +@@ -547,7 +547,6 @@ static int svc_udp_recvfrom(struct svc_r + dprintk("svc: recvfrom returned error %d\n", -err); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + } +- svc_xprt_received(&svsk->sk_xprt); + return -EAGAIN; + } + len = svc_addr_len(svc_addr(rqstp)); +@@ -562,11 +561,6 @@ static int svc_udp_recvfrom(struct svc_r + svsk->sk_sk->sk_stamp = skb->tstamp; + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ + +- /* +- * Maybe more packets - kick another thread ASAP. 
+- */ +- svc_xprt_received(&svsk->sk_xprt); +- + len = skb->len - sizeof(struct udphdr); + rqstp->rq_arg.len = len; + +@@ -917,7 +911,6 @@ static int svc_tcp_recv_record(struct sv + if (len < want) { + dprintk("svc: short recvfrom while reading record " + "length (%d of %d)\n", len, want); +- svc_xprt_received(&svsk->sk_xprt); + goto err_again; /* record header not complete */ + } + +@@ -953,7 +946,6 @@ static int svc_tcp_recv_record(struct sv + if (len < svsk->sk_reclen) { + dprintk("svc: incomplete TCP record (%d of %d)\n", + len, svsk->sk_reclen); +- svc_xprt_received(&svsk->sk_xprt); + goto err_again; /* record not complete */ + } + len = svsk->sk_reclen; +@@ -961,10 +953,8 @@ static int svc_tcp_recv_record(struct sv + + return len; + error: +- if (len == -EAGAIN) { ++ if (len == -EAGAIN) + dprintk("RPC: TCP recv_record got EAGAIN\n"); +- svc_xprt_received(&svsk->sk_xprt); +- } + return len; + err_delete: + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); +@@ -1110,7 +1100,6 @@ out: + svsk->sk_tcplen = 0; + + svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt); +- svc_xprt_received(&svsk->sk_xprt); + if (serv->sv_stats) + serv->sv_stats->nettcpcnt++; + +@@ -1119,7 +1108,6 @@ out: + err_again: + if (len == -EAGAIN) { + dprintk("RPC: TCP recvfrom got EAGAIN\n"); +- svc_xprt_received(&svsk->sk_xprt); + return len; + } + error: +diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/sunrpc/svc_xprt.c +--- linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 09:57:23.996377209 -0400 +@@ -744,8 +744,10 @@ int svc_recv(struct svc_rqst *rqstp, lon + if (rqstp->rq_deferred) { + svc_xprt_received(xprt); + len = svc_deferred_recv(rqstp); +- } else ++ } else { + len = xprt->xpt_ops->xpo_recvfrom(rqstp); ++ svc_xprt_received(xprt); ++ } + dprintk("svc: got len=%d\n", len); + } + +@@ -893,12 +895,12 @@ void svc_delete_xprt(struct svc_xprt *xp + */ + if (test_bit(XPT_TEMP, 
&xprt->xpt_flags)) + serv->sv_tmpcnt--; ++ spin_unlock_bh(&serv->sv_lock); + + while ((dr = svc_deferred_dequeue(xprt)) != NULL) + kfree(dr); + + svc_xprt_put(xprt); +- spin_unlock_bh(&serv->sv_lock); + } + + void svc_close_xprt(struct svc_xprt *xprt) +diff -up linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +--- linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 09:57:23.998377481 -0400 +@@ -566,7 +566,6 @@ static int rdma_read_complete(struct svc + ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, + rqstp->rq_arg.head[0].iov_len); + +- svc_xprt_received(rqstp->rq_xprt); + return ret; + } + +@@ -665,7 +664,6 @@ int svc_rdma_recvfrom(struct svc_rqst *r + rqstp->rq_arg.head[0].iov_len); + rqstp->rq_prot = IPPROTO_MAX; + svc_xprt_copy_addrs(rqstp, xprt); +- svc_xprt_received(xprt); + return ret; + + close_out: +@@ -678,6 +676,5 @@ int svc_rdma_recvfrom(struct svc_rqst *r + */ + set_bit(XPT_CLOSE, &xprt->xpt_flags); + defer: +- svc_xprt_received(xprt); + return 0; + } diff --git a/pnfs-all-2.6.35-2010-08-19-f13.patch b/pnfs-all-2.6.35-2010-08-19-f13.patch new file mode 100644 index 000000000..a9d78ba0e --- /dev/null +++ b/pnfs-all-2.6.35-2010-08-19-f13.patch @@ -0,0 +1,31788 @@ +diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-23 12:08:27.310584826 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-23 12:09:03.273553977 -0400 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include "init.h" + #include "kern_constants.h" + #include "os.h" +diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c +--- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ 
linux-2.6.34.noarch/block/genhd.c 2010-08-23 12:09:03.273553977 -0400 +@@ -1009,6 +1009,7 @@ static void disk_release(struct device * + struct class block_class = { + .name = "block", + }; ++EXPORT_SYMBOL(block_class); + + static char *block_devnode(struct device *dev, mode_t *mode) + { +diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-23 12:09:03.274563927 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-23 12:09:03.274563927 -0400 +@@ -0,0 +1,211 @@ ++(c) 2007 Network Appliance Inc. ++ ++spNFS ++----- ++ ++An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS). ++ ++A file system is mounted by the clients from the MDS, and all file data ++is striped across the DSs. ++ ++Identify the machines that will be filling each of these roles. ++ ++The spnfs kernel will be installed on all machines: clients, the MDS and DSs. ++ ++ ++Building and installing the spNFS kernel ++---------------------------------------- ++ ++Get the spNFS kernel from: ++ ++ git://linux-nfs.org/~bhalevy/linux-pnfs.git ++ ++Use the pnfs-all-latest branch and add these options to your .config file ++ ++ CONFIG_NETWORK_FILESYSTEMS=y ++ CONFIG_NFS_FS=m ++ CONFIG_NFS_V4=y ++ CONFIG_NFS_V4_1=y ++ CONFIG_PNFS=y ++ CONFIG_NFSD=m ++ CONFIG_PNFSD=y ++ # CONFIG_PNFSD_LOCAL_EXPORT is not set ++ CONFIG_SPNFS=y ++ ++By default, spNFS uses whole-file layouts. Layout segments can be enabled ++by adding: ++ ++ CONFIG_SPNFS_LAYOUTSEGMENTS=y ++ ++to your .config file. ++ ++Building and installation of kernel+modules is as usual. ++This kernel should be installed and booted on the client, MDS and DSs. ++ ++Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it ++takes over the pnfs export interface. 
++ ++Building nfs-utils ++------------------ ++ ++Get the nfs-utils package containing spnfsd from: ++ ++ git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git ++ ++Follow the standard instructions for building nfs-utils. ++ ++After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd ++daemon will only be needed on the MDS. ++ ++ ++Installation ++------------ ++ ++The nfs-utils package contains a default spnfsd.conf file in ++utils/spnfsd/spnfsd.conf. Copy this file to /etc/spnfsd.conf. ++ ++By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under ++this directory, mount points must be created for each DS to ++be used for pNFS data stripes. These mount points are named by the IP address ++of the corresponding DS. In the sample spnfsd.conf, there are two ++DSs defined (172.16.28.134 and 172.16.28.141). ++ ++Following the sample spnfsd.conf, ++ ++ mkdir /spnfs ++ ++on the MDS (corresponding to DS-Mount-Directory). Then ++ ++ mkdir /spnfs/172.16.28.134 ++ mkdir /spnfs/172.16.28.141 ++ ++to create the mount points for the DSs. ++ ++On the DSs, choose a directory where data stripes will be created by the MDS. ++For the sample file, this directory is /pnfs, so on each DS execute: ++ ++ mkdir /pnfs ++ ++This directory is specified in the spnfsd.conf file by the DS*_ROOT option ++(where * is replaced by the DS number). DS_ROOT is specified relative to ++the directory being exported by the DSs. In our example, our DSs are exporting ++the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have ++the following entry in /etc/exports: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check) ++ ++N.B. If we had created a /exports directory and a /pnfs directory under ++/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs ++(not /exports/pnfs). ++ ++It may be useful to add entries to /etc/fstab on the MDS to automatically ++mount the DS_ROOT file systems.
For this example, our MDS fstab would ++contain: ++ ++ 172.16.28.134:/pnfs /spnfs/172.16.28.134 nfs defaults 1 2 ++ 172.16.28.141:/pnfs /spnfs/172.16.28.141 nfs defaults 1 2 ++ ++The DS mounts must be performed manually or via fstab at this time (automatic ++mounting, directory creation, etc. are on the todo list). To perform I/O ++through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction ++will eventually be removed). ++ ++ ++On the MDS, choose a file system to use with spNFS and export it, e.g.: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs) ++ ++Make sure nfsd and all supporting processes are running on the MDS and DSs. ++ ++ ++Running ++------- ++ ++If rpc_pipefs is not already mounted (if you're running idmapd it probably is), ++you may want to add the following line to /etc/fstab: ++ ++ rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0 ++ ++to automatically mount rpc_pipefs. ++ ++With spnfsd.conf configured for your environment and the mounts mounted as ++described above, spnfsd can now be started. ++ ++On the MDS, execute spnfsd: ++ ++ spnfsd ++ ++The executable is located in the directory where it was built, and ++may also have been installed elsewhere depending on how you built nfs-utils. ++It will run in the foreground by default, and in fact will do so despite ++any options suggesting the contrary (it's still a debugging build). ++ ++On the client, make sure the nfslayoutdriver module is loaded: ++ ++ modprobe nfslayoutdriver ++ ++Then mount the file system from the MDS: ++ ++ mount -t nfs4 -o minorversion=1 mds:/ /mnt ++ ++I/O through the MDS is now supported. To use it, do not load the ++nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1 ++(NFSv2 and v3 are not yet supported). ++ ++You may now use spNFS by performing file system activities in /mnt. ++If you create files in /mnt, you should see stripe files corresponding to ++new files being created on the DSs.
The current implementation names the ++stripe files based on the inode number of the file on the MDS. For example, ++if you create a file foo in /mnt and do an 'ls -li /mnt/foo': ++ ++ # ls -li foo ++ 1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo ++ ++You should see stripe files on each DS under /pnfs (per the sample) named ++1233. The file /pnfs/1233 on DS1 will contain the first bytes ++of data written to foo, DS2 will contain the next bytes, etc. ++Removing /mnt/foo will remove the corresponding stripe files on the DSs. ++Other file system operations should behave (mostly :-) as expected. ++ ++ ++Layout Segments ++--------------- ++ ++If the kernel is compiled to support layout segments, there will ++be two files created under /proc/fs/spnfs for controlling layout ++segment functionality. ++ ++To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.: ++ ++ echo 1 > /proc/fs/spnfs/layoutseg ++ ++Layout segments can be disabled (returning to whole-file layouts) by ++writing a '0' to /proc/fs/spnfs/layoutseg: ++ ++ echo 0 > /proc/fs/spnfs/layoutseg ++ ++When layout segments are enabled, the size of the layouts returned can ++be specified by writing a decimal number (ascii representation) to ++/proc/fs/spnfs/layoutsegsize: ++ ++ echo 1024 > /proc/fs/spnfs/layoutsegsize ++ ++The value '0' has a special meaning--it causes the server to return a ++layout that is exactly the size requested by the client: ++ ++ echo 0 > /proc/fs/spnfs/layoutsegsize ++ ++ ++Troubleshooting ++--------------- ++ ++If you see data being written to the files on the MDS rather than ++the stripe files, make sure the nfslayoutdriver is loaded on the client ++(see above). ++ ++If you get a "permission denied" error, make sure mountd is running on the MDS ++(it occasionally fails to start).
++ ++Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com ++ ++ +diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c +--- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-23 12:09:03.275584050 -0400 +@@ -651,6 +651,12 @@ static int dev_create(struct dm_ioctl *p + return r; + } + ++int dm_dev_create(struct dm_ioctl *param) ++{ ++ return dev_create(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_create); ++ + /* + * Always use UUID for lookups if it's present, otherwise use name or dev. + */ +@@ -745,6 +751,12 @@ static int dev_remove(struct dm_ioctl *p + return 0; + } + ++int dm_dev_remove(struct dm_ioctl *param) ++{ ++ return dev_remove(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_remove); ++ + /* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. +@@ -917,6 +929,12 @@ static int do_resume(struct dm_ioctl *pa + return r; + } + ++int dm_do_resume(struct dm_ioctl *param) ++{ ++ return do_resume(param); ++} ++EXPORT_SYMBOL(dm_do_resume); ++ + /* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. 
+@@ -1194,6 +1212,12 @@ out: + return r; + } + ++int dm_table_load(struct dm_ioctl *param, size_t param_size) ++{ ++ return table_load(param, param_size); ++} ++EXPORT_SYMBOL(dm_table_load); ++ + static int table_clear(struct dm_ioctl *param, size_t param_size) + { + int r; +diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c +--- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-23 12:09:03.276563906 -0400 +@@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct + put_device(&class_to_shost(dev)->shost_gendev); + } + +-static struct class shost_class = { ++struct class shost_class = { + .name = "scsi_host", + .dev_release = scsi_host_cls_release, + }; +diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h +--- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-23 12:09:03.277563890 -0400 +@@ -36,13 +36,9 @@ + #include + #include + #include ++#include + #include "common.h" + +-/* FIXME: Remove once pnfs hits mainline +- * #include +- */ +-#include "pnfs.h" +- + #define EXOFS_ERR(fmt, a...) 
printk(KERN_ERR "exofs: " fmt, ##a) + + #ifdef CONFIG_EXOFS_DEBUG +@@ -103,6 +99,7 @@ struct exofs_sb_info { + struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ ++ spinlock_t i_layout_lock; /* lock for layout/return/recall */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ +@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si + */ + #define OBJ_2BCREATED 0 /* object will be created soon*/ + #define OBJ_CREATED 1 /* object has been created on the osd*/ ++/* Below are not used atomic but reuse the same i_flags */ ++#define OBJ_LAYOUT_IS_GIVEN 2 /* inode has given layouts to clients*/ ++#define OBJ_IN_LAYOUT_RECALL 3 /* inode is in the middle of a layout recall*/ + + static inline int obj_2bcreated(struct exofs_i_info *oi) + { +@@ -304,4 +304,20 @@ extern const struct inode_operations exo + extern const struct inode_operations exofs_symlink_inode_operations; + extern const struct inode_operations exofs_fast_symlink_inode_operations; + ++/* export.c */ ++typedef int (exofs_recall_fn)(struct inode *inode); ++#ifdef CONFIG_PNFSD ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo); ++void exofs_init_export(struct super_block *sb); ++#else ++static inline int exofs_inode_recall_layout(struct inode *inode, ++ enum pnfs_iomode iomode, exofs_recall_fn todo) ++{ ++ return todo(inode); ++} ++ ++static inline void exofs_init_export(struct super_block *sb) {} ++#endif ++ + #endif +diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-23 12:09:03.278386746 -0400 ++++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-23 12:09:03.278386746 -0400 +@@ -0,0 +1,396 @@ ++/* ++ * export.c - Implementation of the 
pnfs_export_operations ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Boaz Harrosh ++ * ++ * This file is part of exofs. ++ * ++ * exofs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation. Since it is based on ext2, and the only ++ * valid version of GPL for the Linux kernel is version 2, the only valid ++ * version of GPL for exofs is version 2. ++ * ++ * exofs is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with exofs; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include "exofs.h" ++ ++static int exofs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_OSD2_OBJECTS; ++} ++ ++static void set_dev_id(struct pnfs_deviceid *pnfs_devid, u64 sbid, u64 devid) ++{ ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)pnfs_devid; ++ ++ dev_id->sbid = sbid; ++ dev_id->devid = devid; ++} ++ ++static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode, ++ u64 offset, u64 length, void *cookie) ++{ ++ struct nfsd4_pnfs_cb_layout cbl; ++ struct pnfsd_cb_ctl cb_ctl; ++ int status; ++ ++ memset(&cb_ctl, 0, sizeof(cb_ctl)); ++ status = pnfsd_get_cb_op(&cb_ctl); ++ if (unlikely(status)) { ++ EXOFS_ERR("%s: nfsd unloaded!! 
inode (0x%lx) status=%d\n", ++ __func__, inode->i_ino, status); ++ goto err; ++ } ++ ++ memset(&cbl, 0, sizeof(cbl)); ++ cbl.cbl_recall_type = RETURN_FILE; ++ cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS; ++ cbl.cbl_seg.iomode = iomode; ++ cbl.cbl_seg.offset = offset; ++ cbl.cbl_seg.length = length; ++ cbl.cbl_cookie = cookie; ++ ++ status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl); ++ pnfsd_put_cb_op(&cb_ctl); ++ ++err: ++ return status; ++} ++ ++static enum nfsstat4 exofs_layout_get( ++ struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; ++ struct exofs_layout *el = &sbi->layout; ++ struct pnfs_osd_object_cred *creds = NULL; ++ struct pnfs_osd_layout layout; ++ __be32 *start; ++ bool in_recall; ++ int i, err; ++ enum nfsstat4 nfserr; ++ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ res->lg_seg.iomode = IOMODE_RW; ++ res->lg_return_on_close = true; /* TODO: unused but will be soon */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL); ++ if (!creds) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto out; ++ } ++ ++ /* Fill in a pnfs_osd_layout struct */ ++ layout.olo_map = sbi->data_map; ++ ++ for (i = 0; i < el->s_numdevs; i++) { ++ struct pnfs_osd_object_cred *cred = &creds[i]; ++ osd_id id = exofs_oi_objno(oi); ++ unsigned dev = exofs_layout_od_id(el, id, i); ++ ++ set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid, ++ dev); ++ cred->oc_object_id.oid_partition_id = el->s_pid; ++ cred->oc_object_id.oid_object_id = id; ++ cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ? 
++ PNFS_OSD_VERSION_1 : ++ PNFS_OSD_VERSION_2; ++ cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE; ++ ++ cred->oc_cap_key.cred_len = 0; ++ cred->oc_cap_key.cred = NULL; ++ ++ cred->oc_cap.cred_len = OSD_CAP_LEN; ++ cred->oc_cap.cred = oi->i_cred; ++ } ++ ++ layout.olo_comps_index = 0; ++ layout.olo_num_comps = el->s_numdevs; ++ layout.olo_comps = creds; ++ ++ err = pnfs_osd_xdr_encode_layout(xdr, &layout); ++ if (err) { ++ nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */ ++ goto out; ++ } ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ if (!in_recall) { ++ __set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ nfserr = NFS4_OK; ++ } else { ++ nfserr = NFS4ERR_RECALLCONFLICT; ++ } ++ spin_unlock(&oi->i_layout_lock); ++ ++out: ++ kfree(creds); ++ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n", ++ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start)); ++ return nfserr; ++} ++ ++/* NOTE: inode mutex must NOT be held */ ++static int exofs_layout_commit( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct timespec mtime; ++ loff_t i_size; ++ int in_recall; ++ ++ /* In case of a recall we ignore the new size and mtime since they ++ * are going to be changed again by truncate, and since we cannot take ++ * the inode lock in that case. ++ */ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ if (in_recall) { ++ EXOFS_DBGMSG("(0x%lx) commit was called during recall\n", ++ inode->i_ino); ++ return 0; ++ } ++ ++ /* NOTE: I would love to call inode_setattr here ++ * but i cannot since this will cause an eventual vmtruncate, ++ * which will cause a layout_recall. So open code the i_size ++ * and mtime/atime changes under i_mutex. 
++ */ ++ mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL); ++ ++ if (args->lc_mtime.seconds) { ++ mtime.tv_sec = args->lc_mtime.seconds; ++ mtime.tv_nsec = args->lc_mtime.nseconds; ++ ++ /* layout commit may only make time bigger, since there might ++ * be reordering of the notifications and it might arrive after ++ * A local change. ++ * TODO: if mtime > ctime then we know set_attr did an mtime ++ * in the future. and we can let this update through ++ */ ++ if (0 <= timespec_compare(&mtime, &inode->i_mtime)) ++ mtime = inode->i_mtime; ++ } else { ++ mtime = current_fs_time(inode->i_sb); ++ } ++ ++ /* TODO: Will below work? since mark_inode_dirty has it's own ++ * Time handling ++ */ ++ inode->i_atime = inode->i_mtime = mtime; ++ ++ i_size = i_size_read(inode); ++ if (args->lc_newoffset) { ++ loff_t new_size = args->lc_last_wr + 1; ++ ++ if (i_size < new_size) { ++ i_size_write(inode, i_size = new_size); ++ res->lc_size_chg = 1; ++ res->lc_newsize = new_size; ++ } ++ } ++ /* TODO: else { i_size = osd_get_object_length() } */ ++ ++/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */ ++ ++ mark_inode_dirty_sync(inode); ++ ++ mutex_unlock(&inode->i_mutex); ++ EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n", ++ inode->i_ino, i_size, args->lc_last_wr); ++ return 0; ++} ++ ++static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr) ++{ ++ EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ ioerr->oer_errno, ioerr->oer_iswrite, ++ _LLU(ioerr->oer_component.oid_object_id), ++ _LLU(ioerr->oer_comp_offset), ++ _LLU(ioerr->oer_comp_length)); ++} ++ ++static int exofs_layout_return( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ __be32 *p = args->lrf_body; ++ unsigned len = exp_xdr_qwords(args->lrf_body_len); ++ ++ EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n", ++ inode->i_ino, args->lr_cookie, len); ++ ++ while (len >= pnfs_osd_ioerr_xdr_sz()) { ++ 
struct pnfs_osd_ioerr ioerr; ++ ++ p = pnfs_osd_xdr_decode_ioerr(&ioerr, p); ++ len -= pnfs_osd_ioerr_xdr_sz(); ++ exofs_handle_error(&ioerr); ++ } ++ ++ if (args->lr_cookie) { ++ struct exofs_i_info *oi = exofs_i(inode); ++ bool in_recall; ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ __clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ /* TODO: how to communicate cookie with the waiter */ ++ if (in_recall) ++ wake_up(&oi->i_wq); /* wakeup any recalls */ ++ } ++ ++ return 0; ++} ++ ++int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct exofs_sb_info *sbi = sb->s_fs_info; ++ struct pnfs_osd_deviceaddr devaddr; ++ const struct osd_dev_info *odi; ++ u64 devno = devid->devid; ++ __be32 *start; ++ int err; ++ ++ memset(&devaddr, 0, sizeof(devaddr)); ++ ++ if (unlikely(devno >= sbi->layout.s_numdevs)) ++ return -ENODEV; ++ ++ odi = osduld_device_info(sbi->layout.s_ods[devno]); ++ ++ devaddr.oda_systemid.len = odi->systemid_len; ++ devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */ ++ ++ devaddr.oda_osdname.len = odi->osdname_len ; ++ devaddr.oda_osdname.data = (void *)odi->osdname;/* !const cast */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ err = -E2BIG; ++ goto err; ++ } ++ ++ err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr); ++ if (err) ++ goto err; ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n", ++ exp_xdr_qbytes(xdr->p - start), devno, odi->osdname); ++ return 0; ++ ++err: ++ EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n", ++ err, exp_xdr_qbytes(xdr->p - start)); ++ return err; ++} ++ ++struct pnfs_export_operations exofs_pnfs_ops = { ++ .layout_type = exofs_layout_type, ++ .layout_get = exofs_layout_get, ++ .layout_commit = 
exofs_layout_commit, ++ .layout_return = exofs_layout_return, ++ .get_device_info = exofs_get_device_info, ++}; ++ ++static bool is_layout_returned(struct exofs_i_info *oi) ++{ ++ bool layout_given; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ return !layout_given; ++} ++ ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ int layout_given; ++ int error = 0; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ __set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ if (!layout_given) ++ goto exec; ++ ++ for (;;) { ++ EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n", ++ inode->i_ino); ++ error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64, ++ &oi->i_wq); ++ switch (error) { ++ case 0: ++ case -EAGAIN: ++ break; ++ case -ENOENT: ++ goto exec; ++ default: ++ goto err; ++ } ++ ++ error = wait_event_interruptible(oi->i_wq, ++ is_layout_returned(oi)); ++ if (error) ++ goto err; ++ } ++ ++exec: ++ error = todo(inode); ++ ++err: ++ spin_lock(&oi->i_layout_lock); ++ __clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error); ++ return error; ++} ++ ++void exofs_init_export(struct super_block *sb) ++{ ++ sb->s_pnfs_op = &exofs_pnfs_ops; ++} +diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c +--- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-23 12:09:03.279502002 -0400 +@@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) + if (unlikely(wait_obj_created(oi))) + goto fail; + +- ret = _do_truncate(inode); ++ ret = exofs_inode_recall_layout(inode, IOMODE_ANY, _do_truncate); + if 
(ret) + goto fail; + +@@ -964,6 +964,7 @@ static void __oi_init(struct exofs_i_inf + { + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; ++ spin_lock_init(&oi->i_layout_lock); + } + /* + * Fill in an inode read from the OSD and set it up for use +diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild +--- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-23 12:09:03.279502002 -0400 +@@ -13,4 +13,5 @@ + # + + exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o ++exofs-$(CONFIG_PNFSD) += export.o + obj-$(CONFIG_EXOFS_FS) += exofs.o +diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig +--- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-23 12:09:03.280553663 -0400 +@@ -1,6 +1,7 @@ + config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD ++ select EXPORTFS_OSD_LAYOUT if PNFSD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. 
+diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c +--- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-23 12:09:03.281511951 -0400 +@@ -621,6 +621,7 @@ static int exofs_fill_super(struct super + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; ++ exofs_init_export(sb); + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); +diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c +--- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-23 12:09:03.282511528 -0400 +@@ -16,6 +16,13 @@ + #include + #include + ++#if defined(CONFIG_PNFSD) ++struct pnfsd_cb_ctl pnfsd_cb_ctl = { ++ .lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock) ++}; ++EXPORT_SYMBOL(pnfsd_cb_ctl); ++#endif /* CONFIG_PNFSD */ ++ + #define dprintk(fmt, args...) 
do{}while(0) + + +diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile +--- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-23 12:09:03.282511528 -0400 +@@ -3,4 +3,7 @@ + + obj-$(CONFIG_EXPORTFS) += exportfs.o + +-exportfs-objs := expfs.o ++exportfs-y := expfs.o ++exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o ++exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o ++exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +@@ -0,0 +1,158 @@ ++/* ++ * linux/fs/nfsd/nfs4blocklayoutxdr.c ++ * ++ * ++ * Created by Rick McNeal on 3/31/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. 
++ * ++ */ ++#include ++#include ++#include ++#include ++ ++static int ++bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 12 + 4 + bld->u.simple.bld_sig_len); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u32(p, 1); ++ p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset); ++ exp_xdr_encode_opaque(p, bld->u.simple.bld_sig, ++ bld->u.simple.bld_sig_len); ++ ++ return 0; ++} ++ ++static int ++bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_start); ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_len); ++ exp_xdr_encode_u32(p, bld->u.slice.bld_index); ++ ++ return 0; ++} ++ ++static int ++bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ return -ENOTSUPP; ++} ++ ++static int ++bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ int i; ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 2 + 1 + bld->u.stripe.bld_stripes); ++ ++ p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size); ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes); ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]); ++ ++ return 0; ++} ++ ++int ++blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes) ++{ ++ u32 num_vols = 0, ++ *layoutlen_p = xdr->p; ++ pnfs_blocklayout_devinfo_t *bld; ++ int status = 0; ++ __be32 *p; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -ETOOSMALL; ++ p += 2; ++ ++ /* ++ * All simple volumes with their signature are required to be listed ++ * first. 
++ */ ++ list_for_each_entry(bld, volumes, bld_list) { ++ num_vols++; ++ p = exp_xdr_reserve_qwords(xdr, 1); ++ if (!p) ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u32(p, bld->bld_type); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ status = bl_encode_simple(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_SLICE: ++ status = bl_encode_slice(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ status = bl_encode_concat(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ status = bl_encode_stripe(xdr, bld); ++ break; ++ default: ++ BUG(); ++ } ++ if (status) ++ goto error; ++ } ++ ++ /* ---- Fill in the overall length and number of volumes ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, num_vols); ++ ++error: ++ return status; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo); ++ ++enum nfsstat4 ++blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *bl_head) ++{ ++ struct pnfs_blocklayout_layout *b; ++ u32 *layoutlen_p = xdr->p, ++ extents = 0; ++ __be32 *p; ++ ++ /* ++ * Save spot for opaque block layout length and number of extents, ++ * fill-in later. 
++ */ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p += 2; ++ ++ list_for_each_entry(b, bl_head, bll_list) { ++ extents++; ++ p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid); ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.devid); ++ p = exp_xdr_encode_u64(p, b->bll_foff); ++ p = exp_xdr_encode_u64(p, b->bll_len); ++ p = exp_xdr_encode_u64(p, b->bll_soff); ++ p = exp_xdr_encode_u32(p, b->bll_es); ++ } ++ ++ /* ---- Fill in the overall length and number of extents ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, extents); ++ ++ return NFS4_OK; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_layout); +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +@@ -0,0 +1,218 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++/* We do our-own dprintk so filesystems are not dependent on sunrpc */ ++#ifdef dprintk ++#undef dprintk ++#endif ++#define dprintk(fmt, args, ...) do { } while (0) ++ ++/* Calculate the XDR length of the GETDEVICEINFO4resok structure ++ * excluding the gdir_notification and the gdir_device_addr da_layout_type. ++ */ ++static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev) ++{ ++ struct pnfs_filelayout_devaddr *fl_addr; ++ struct pnfs_filelayout_multipath *mp; ++ int i, j, nwords; ++ ++ /* da_addr_body length, indice length, indices, ++ * multipath_list4 length */ ++ nwords = 1 + 1 + fdev->fl_stripeindices_length + 1; ++ for (i = 0; i < fdev->fl_device_length; i++) { ++ mp = &fdev->fl_device_list[i]; ++ nwords++; /* multipath list length */ ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ fl_addr = mp->fl_multipath_list; ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len); ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len); ++ } ++ } ++ dprintk("<-- %s nwords %d\n", __func__, nwords); ++ return nwords; ++} ++ ++/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13 ++ * on the response stream. 
++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. ++ */ ++int ++filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ unsigned int i, j, len = 0, opaque_words; ++ u32 *p_in; ++ u32 index_count = fdev->fl_stripeindices_length; ++ u32 dev_count = fdev->fl_device_length; ++ int error = 0; ++ __be32 *p; ++ ++ opaque_words = fl_devinfo_xdr_words(fdev); ++ dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n", ++ __func__, ++ index_count, ++ dev_count, ++ opaque_words*4); ++ ++ /* check space for opaque length */ ++ p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words); ++ if (!p) { ++ error = -ETOOSMALL; ++ goto out; ++ } ++ ++ /* Fill in length later */ ++ p++; ++ ++ /* encode device list indices */ ++ p = exp_xdr_encode_u32(p, index_count); ++ for (i = 0; i < index_count; i++) ++ p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]); ++ ++ /* encode device list */ ++ p = exp_xdr_encode_u32(p, dev_count); ++ for (i = 0; i < dev_count; i++) { ++ struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i]; ++ ++ p = exp_xdr_encode_u32(p, mp->fl_multipath_length); ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ struct pnfs_filelayout_devaddr *da = ++ &mp->fl_multipath_list[j]; ++ ++ /* Encode device info */ ++ p = exp_xdr_encode_opaque(p, da->r_netid.data, ++ da->r_netid.len); ++ p = exp_xdr_encode_opaque(p, da->r_addr.data, ++ da->r_addr.len); ++ } ++ } ++ ++ /* backfill in length. Subtract 4 for da_addr_body size */ ++ len = (char *)p - (char *)p_in; ++ exp_xdr_encode_u32(p_in, len - 4); ++ ++ error = 0; ++out: ++ dprintk("%s: End err %d xdrlen %d\n", ++ __func__, error, len); ++ return error; ++} ++EXPORT_SYMBOL(filelayout_encode_devinfo); ++ ++/* Encodes the loc_body structure from draft 13 ++ * on the response stream. ++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. 
++ */ ++enum nfsstat4 ++filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp) ++{ ++ u32 len = 0, nfl_util, fhlen, i; ++ u32 *layoutlen_p; ++ enum nfsstat4 nfserr; ++ __be32 *p; ++ ++ dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n", ++ __func__, ++ flp->device_id.pnfs_fsid, ++ flp->device_id.pnfs_devid, ++ flp->lg_first_stripe_index, ++ flp->lg_fh_length); ++ ++ /* Ensure file system added at least one file handle */ ++ if (flp->lg_fh_length <= 0) { ++ dprintk("%s: File Layout has no file handles!!\n", __func__); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto out; ++ } ++ ++ /* Ensure room for len, devid, util, first_stripe_index, ++ * pattern_offset, number of filehandles */ ++ p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ /* save spot for opaque file layout length, fill-in later*/ ++ p++; ++ ++ /* encode device id */ ++ p = exp_xdr_encode_u64(p, flp->device_id.sbid); ++ p = exp_xdr_encode_u64(p, flp->device_id.devid); ++ ++ /* set and encode flags */ ++ nfl_util = flp->lg_stripe_unit; ++ if (flp->lg_commit_through_mds) ++ nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS; ++ if (flp->lg_stripe_type == STRIPE_DENSE) ++ nfl_util |= NFL4_UFLG_DENSE; ++ p = exp_xdr_encode_u32(p, nfl_util); ++ ++ /* encode first stripe index */ ++ p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index); ++ ++ /* encode striping pattern start */ ++ p = exp_xdr_encode_u64(p, flp->lg_pattern_offset); ++ ++ /* encode number of file handles */ ++ p = exp_xdr_encode_u32(p, flp->lg_fh_length); ++ ++ /* encode file handles */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ fhlen = flp->lg_fh_list[i].fh_size; ++ p = exp_xdr_reserve_space(xdr, 4 + fhlen); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen); ++ } ++ ++ /* Set number of bytes encoded = total_bytes_encoded - length var */ ++ len = (char 
*)p - (char *)layoutlen_p; ++ exp_xdr_encode_u32(layoutlen_p, len - 4); ++ ++ nfserr = NFS4_OK; ++out: ++ dprintk("%s: End err %u xdrlen %d\n", ++ __func__, nfserr, len); ++ return nfserr; ++} ++EXPORT_SYMBOL(filelayout_encode_layout); +diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c +--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-23 12:09:03.284511493 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-23 12:09:03.284511493 -0400 +@@ -0,0 +1,289 @@ ++/* ++ * pnfs_osd_xdr_enc.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_data_map( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_data_map *data_map) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1); ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, data_map->odm_num_comps); ++ p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_width); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_depth); ++ p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt); ++ p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline int pnfs_osd_xdr_encode_objid( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2); ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id; ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u64(p, dev_id->sbid); ++ p = exp_xdr_encode_u64(p, dev_id->devid); ++ p = exp_xdr_encode_u64(p, object_id->oid_partition_id); ++ p = exp_xdr_encode_u64(p, 
object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * enum pnfs_osd_cap_key_sec4 { ++ * PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ * PNFS_OSD_CAP_KEY_SEC_SSV = 1 ++ * }; ++ * ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_object_cred( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_object_cred *olo_comp) ++{ ++ __be32 *p; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version); ++ ++ /* No sec for now */ ++ p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE); ++ p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */ ++ ++ exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred, ++ olo_comp->oc_cap.cred_len); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_layout { ++ * struct pnfs_osd_data_map olo_map; ++ * u32 olo_comps_index; ++ * u32 olo_num_comps; ++ * struct pnfs_osd_object_cred *olo_comps; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *pol) ++{ ++ __be32 *p; ++ u32 i; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, pol->olo_comps_index); ++ p = exp_xdr_encode_u32(p, pol->olo_num_comps); ++ ++ for (i = 0; i < pol->olo_num_comps; i++) { ++ err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout); ++ ++static int _encode_string(struct exp_xdr_stream *xdr, ++ const struct nfs4_string *str) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, 4 
+ str->len); ++ ++ if (!p) ++ return -E2BIG; ++ exp_xdr_encode_opaque(p, str->data, str->len); ++ return 0; ++} ++ ++/* struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr) ++{ ++ __be32 *p; ++ int err; ++ ++ p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun)); ++ if (!p) ++ return -E2BIG; ++ ++ /* Empty oda_targetid */ ++ p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON); ++ ++ /* Empty oda_targetaddr for now */ ++ p = exp_xdr_encode_u32(p, 0); ++ ++ /* oda_lun */ ++ exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun)); ++ ++ err = _encode_string(xdr, &devaddr->oda_systemid); ++ if (err) ++ return err; ++ ++ err = pnfs_osd_xdr_encode_object_cred(xdr, ++ &devaddr->oda_root_obj_cred); ++ if (err) ++ return err; ++ ++ err = _encode_string(xdr, &devaddr->oda_osdname); ++ if (err) ++ return err; ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr); ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p) ++{ ++ lou->dsu_valid = be32_to_cpu(*p++); ++ if (lou->dsu_valid) ++ p = xdr_decode_hyper(p, &lou->dsu_delta); ++ lou->olu_ioerr_flag = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate); ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline __be32 * ++pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) ++{ ++ /* FIXME: p = xdr_decode_fixed(...) 
*/ ++ memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data)); ++ p += XDR_QUADLEN(sizeof(objid->oid_device_id.data)); ++ ++ p = xdr_decode_hyper(p, &objid->oid_partition_id); ++ p = xdr_decode_hyper(p, &objid->oid_object_id); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p) ++{ ++ p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_offset); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_length); ++ ioerr->oer_iswrite = be32_to_cpu(*p++); ++ ioerr->oer_errno = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); +diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c +--- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-23 12:09:03.285539075 -0400 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include "gfs2.h" + #include "incore.h" +@@ -1146,6 +1147,9 @@ static int fill_super(struct super_block + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_export_op = &gfs2_export_ops; ++#if defined(CONFIG_PNFSD) ++ sb->s_pnfs_op = &pnfs_dlm_export_ops; ++#endif /* CONFIG_PNFSD */ + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; +diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig +--- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/Kconfig 2010-08-23 12:09:03.286512316 -0400 +@@ -224,6 +224,31 @@ config LOCKD_V4 + config EXPORTFS + tristate + ++config EXPORTFS_FILE_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 
files layout type. ++ Must be automatically selected by supporting filesystems. ++ ++config EXPORTFS_OSD_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 objects layout type. ++ Must be automatically selected by supporting osd ++ filesystems. ++ ++ If unsure, say N. ++ ++config EXPORTFS_BLOCK_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 blocks layout type. ++ Must be automatically selected by supporting filesystems. ++ ++ + config NFS_ACL_SUPPORT + tristate + select FS_POSIX_ACL +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-23 12:09:03.287381619 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-23 12:09:03.287381619 -0400 +@@ -0,0 +1,66 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pipefs_list bl_device_list; ++struct dentry *bl_device_pipe; ++ ++ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len) ++{ ++ int err; ++ struct pipefs_hdr *msg; ++ ++ dprintk("Entering %s...\n", __func__); ++ ++ msg = pipefs_readmsg(filp, src, len); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: unable to read pipefs message.\n"); ++ return PTR_ERR(msg); ++ } ++ ++ /* now assign the result, which wakes the blocked thread */ ++ err = pipefs_assign_upcall_reply(msg, &bl_device_list); ++ if (err) { ++ dprintk("ERROR: failed to assign upcall with id %u\n", ++ msg->msgid); ++ kfree(msg); ++ } ++ return len; ++} ++ ++static const struct rpc_pipe_ops bl_pipe_ops = { ++ .upcall = pipefs_generic_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = pipefs_generic_destroy_msg, ++}; ++ ++int bl_pipe_init(void) ++{ ++ dprintk("%s: block_device 
pipefs registering...\n", __func__); ++ bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1); ++ if (IS_ERR(bl_device_pipe)) ++ dprintk("ERROR, unable to make block_device pipe\n"); ++ ++ if (!bl_device_pipe) ++ dprintk("bl_device_pipe is NULL!\n"); ++ else ++ dprintk("bl_device_pipe created!\n"); ++ pipefs_init_list(&bl_device_list); ++ return 0; ++} ++ ++void bl_pipe_exit(void) ++{ ++ dprintk("%s: block_device pipefs unregistering...\n", __func__); ++ if (IS_ERR(bl_device_pipe)) ++ return ; ++ pipefs_closepipe(bl_device_pipe); ++ return; ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-23 12:09:03.288501648 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-23 12:09:03.288501648 -0400 +@@ -0,0 +1,1160 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. 
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#include ++#include ++ ++#include /* various write calls */ ++#include /* struct bio */ ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Andy Adamson "); ++MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); ++ ++/* Callback operations to the pNFS client */ ++static struct pnfs_client_operations *pnfs_block_callback_ops; ++ ++static void print_page(struct page *page) ++{ ++ dprintk("PRINTPAGE page %p\n", page); ++ dprintk(" PagePrivate %d\n", PagePrivate(page)); ++ dprintk(" PageUptodate %d\n", PageUptodate(page)); ++ dprintk(" PageError %d\n", PageError(page)); ++ dprintk(" PageDirty %d\n", PageDirty(page)); ++ dprintk(" PageReferenced %d\n", PageReferenced(page)); ++ dprintk(" PageLocked %d\n", PageLocked(page)); ++ dprintk(" PageWriteback %d\n", PageWriteback(page)); ++ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); ++ dprintk("\n"); ++} ++ ++/* Given the be associated with isect, determine if page data needs to be ++ * initialized. 
++ */ ++static int is_hole(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return !is_sector_initialized(be->be_inval, isect); ++} ++ ++/* Given the be associated with isect, determine if page data can be ++ * written to disk. ++ */ ++static int is_writable(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return is_sector_initialized(be->be_inval, isect); ++} ++ ++static int ++dont_like_caller(struct nfs_page *req) ++{ ++ if (atomic_read(&req->wb_complete)) { ++ /* Called by _multi */ ++ return 1; ++ } else { ++ /* Called by _one */ ++ return 0; ++ } ++} ++ ++static enum pnfs_try_status ++bl_commit(struct nfs_write_data *nfs_data, ++ int sync) ++{ ++ dprintk("%s enter\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++/* The data we are handed might be spread across several bios. We need ++ * to track when the last one is finished. 
++ */ ++struct parallel_io { ++ struct kref refcnt; ++ struct rpc_call_ops call_ops; ++ void (*pnfs_callback) (void *data); ++ void *data; ++}; ++ ++static inline struct parallel_io *alloc_parallel(void *data) ++{ ++ struct parallel_io *rv; ++ ++ rv = kmalloc(sizeof(*rv), GFP_KERNEL); ++ if (rv) { ++ rv->data = data; ++ kref_init(&rv->refcnt); ++ } ++ return rv; ++} ++ ++static inline void get_parallel(struct parallel_io *p) ++{ ++ kref_get(&p->refcnt); ++} ++ ++static void destroy_parallel(struct kref *kref) ++{ ++ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); ++ ++ dprintk("%s enter\n", __func__); ++ p->pnfs_callback(p->data); ++ kfree(p); ++} ++ ++static inline void put_parallel(struct parallel_io *p) ++{ ++ kref_put(&p->refcnt, destroy_parallel); ++} ++ ++static struct bio * ++bl_submit_bio(int rw, struct bio *bio) ++{ ++ if (bio) { ++ get_parallel(bio->bi_private); ++ dprintk("%s submitting %s bio %u@%llu\n", __func__, ++ rw == READ ? "read" : "write", ++ bio->bi_size, (u64)bio->bi_sector); ++ submit_bio(rw, bio); ++ } ++ return NULL; ++} ++ ++static inline void ++bl_done_with_rpage(struct page *page, const int ok) ++{ ++ if (ok) { ++ ClearPagePnfsErr(page); ++ SetPageUptodate(page); ++ } else { ++ ClearPageUptodate(page); ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ } ++ /* Page is unlocked via rpc_release. Should really be done here. 
*/ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_read(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_rpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++static void bl_read_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ pnfs_block_callback_ops->nfs_readlist_complete(rdata); ++} ++ ++static void ++bl_end_par_io_read(void *data) ++{ ++ struct nfs_read_data *rdata = data; ++ ++ INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); ++ schedule_work(&rdata->task.u.tk_work); ++} ++ ++/* We don't want normal .rpc_call_done callback used, so we replace it ++ * with this stub. 
++ */ ++static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) ++{ ++ return; ++} ++ ++static enum pnfs_try_status ++bl_read_pagelist(struct nfs_read_data *rdata, ++ unsigned nr_pages) ++{ ++ int i, hole; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t f_offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct page **pages = rdata->args.pages; ++ int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, ++ nr_pages, f_offset, count); ++ ++ if (dont_like_caller(rdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ goto use_mds; ++ } ++ if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) { ++ /* We want to fall back to mds in case of read_page ++ * after error on read_pages. ++ */ ++ dprintk("%s PG_pnfserr set\n", __func__); ++ goto use_mds; ++ } ++ par = alloc_parallel(rdata); ++ if (!par) ++ goto use_mds; ++ par->call_ops = *rdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_read; ++ /* At this point, we can no longer jump to use_mds */ ++ ++ isect = (sector_t) (f_offset >> 9); ++ /* Code assumes extents are page-aligned */ ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ put_extent(cow_read); ++ bio = bl_submit_bio(READ, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg), ++ isect, &cow_read); ++ if (!be) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ if (cow_read) { ++ sector_t cow_length = cow_read->be_length - ++ (isect - cow_read->be_f_offset); ++ extent_length = min(extent_length, cow_length); ++ } ++ } ++ hole = is_hole(be, isect); ++ if (hole 
&& !cow_read) { ++ bio = bl_submit_bio(READ, bio); ++ /* Fill hole w/ zeroes w/o accessing device */ ++ dprintk("%s Zeroing page for hole\n", __func__); ++ zero_user(pages[i], 0, ++ min_t(int, PAGE_CACHE_SIZE, count)); ++ print_page(pages[i]); ++ bl_done_with_rpage(pages[i], 1); ++ } else { ++ struct pnfs_block_extent *be_read; ++ ++ be_read = (hole && cow_read) ? cow_read : be; ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - ++ be_read->be_f_offset + ++ be_read->be_v_offset; ++ bio->bi_bdev = be_read->be_mdev; ++ bio->bi_end_io = bl_end_io_read; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(READ, bio); ++ } ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ if ((isect << 9) >= rdata->inode->i_size) { ++ rdata->res.eof = 1; ++ rdata->res.count = rdata->inode->i_size - f_offset; ++ } else { ++ rdata->res.count = (isect << 9) - f_offset; ++ } ++ put_extent(be); ++ put_extent(cow_read); ++ bl_submit_bio(READ, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++ ++ use_mds: ++ dprintk("Giving up and using normal NFS\n"); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static void mark_extents_written(struct pnfs_block_layout *bl, ++ __u64 offset, __u32 count) ++{ ++ sector_t isect, end; ++ struct pnfs_block_extent *be; ++ ++ dprintk("%s(%llu, %u)\n", __func__, offset, count); ++ if (count == 0) ++ return; ++ isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9; ++ end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); ++ end >>= 9; ++ while (isect < end) { ++ sector_t len; ++ be = find_get_extent(bl, isect, NULL); ++ BUG_ON(!be); /* FIXME */ ++ len = min(end, be->be_f_offset + be->be_length) - isect; ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ mark_for_commit(be, isect, len); /* What if fails? 
*/ ++ isect += len; ++ put_extent(be); ++ } ++} ++ ++/* STUB - this needs thought */ ++static inline void ++bl_done_with_wpage(struct page *page, const int ok) ++{ ++ if (!ok) { ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ /* This is an inline copy of nfs_zap_mapping */ ++ /* This is oh so fishy, and needs deep thought */ ++ if (page->mapping->nrpages != 0) { ++ struct inode *inode = page->mapping->host; ++ spin_lock(&inode->i_lock); ++ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; ++ spin_unlock(&inode->i_lock); ++ } ++ } ++ /* end_page_writeback called in rpc_release. Should be done here. */ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_write(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_wpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++/* Function scheduled for call during bl_end_par_io_write, ++ * it marks sectors as written and extends the commitlist. 
++ */ ++static void bl_write_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ if (!wdata->task.tk_status) { ++ /* Marks for LAYOUTCOMMIT */ ++ /* BUG - this should be called after each bio, not after ++ * all finish, unless have some way of storing success/failure ++ */ ++ mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg), ++ wdata->args.offset, wdata->args.count); ++ } ++ pnfs_block_callback_ops->nfs_writelist_complete(wdata); ++} ++ ++/* Called when last of bios associated with a bl_write_pagelist call finishes */ ++static void ++bl_end_par_io_write(void *data) ++{ ++ struct nfs_write_data *wdata = data; ++ ++ /* STUB - ignoring error handling */ ++ wdata->task.tk_status = 0; ++ wdata->verf.committed = NFS_FILE_SYNC; ++ INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); ++ schedule_work(&wdata->task.u.tk_work); ++} ++ ++static enum pnfs_try_status ++bl_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int sync) ++{ ++ int i; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t offset = wdata->args.offset; ++ size_t count = wdata->args.count; ++ struct page **pages = wdata->args.pages; ++ int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); ++ if (!wdata->req->wb_lseg) { ++ dprintk("%s no lseg, falling back to MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ if (dont_like_caller(wdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ /* At this point, wdata->pages is a (sequential) list of nfs_pages. 
++ * We want to write each, and if there is an error remove it from ++ * list and call ++ * nfs_retry_request(req) to have it redone using nfs. ++ * QUEST? Do as block or per req? Think have to do per block ++ * as part of end_bio ++ */ ++ par = alloc_parallel(wdata); ++ if (!par) ++ return PNFS_NOT_ATTEMPTED; ++ par->call_ops = *wdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_write; ++ /* At this point, have to be more careful with error handling */ ++ ++ isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9); ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ bio = bl_submit_bio(WRITE, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg), ++ isect, NULL); ++ if (!be || !is_writable(be, isect)) { ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ } ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - be->be_f_offset + ++ be->be_v_offset; ++ bio->bi_bdev = be->be_mdev; ++ bio->bi_end_io = bl_end_io_write; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(WRITE, bio); ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK); ++ put_extent(be); ++ bl_submit_bio(WRITE, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++} ++ ++/* FIXME - range ignored */ ++static void ++release_extents(struct pnfs_block_layout *bl, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ int i; ++ struct pnfs_block_extent *be; ++ ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ while 
(!list_empty(&bl->bl_extents[i])) { ++ be = list_first_entry(&bl->bl_extents[i], ++ struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ put_extent(be); ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++} ++ ++static void ++release_inval_marks(struct pnfs_inval_markings *marks) ++{ ++ struct pnfs_inval_tracking *pos, *temp; ++ ++ list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { ++ list_del(&pos->it_link); ++ kfree(pos); ++ } ++ return; ++} ++ ++/* Note we are relying on caller locking to prevent nasty races. */ ++static void ++bl_free_layout(struct pnfs_layout_type *lo) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ ++ dprintk("%s enter\n", __func__); ++ release_extents(bl, NULL); ++ release_inval_marks(&bl->bl_inval); ++ kfree(bl); ++} ++ ++static struct pnfs_layout_type * ++bl_alloc_layout(struct inode *inode) ++{ ++ struct pnfs_block_layout *bl; ++ ++ dprintk("%s enter\n", __func__); ++ bl = kzalloc(sizeof(*bl), GFP_KERNEL); ++ if (!bl) ++ return NULL; ++ spin_lock_init(&bl->bl_ext_lock); ++ INIT_LIST_HEAD(&bl->bl_extents[0]); ++ INIT_LIST_HEAD(&bl->bl_extents[1]); ++ INIT_LIST_HEAD(&bl->bl_commit); ++ bl->bl_count = 0; ++ bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9; ++ INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); ++ return &bl->bl_layout; ++} ++ ++static void ++bl_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter\n", __func__); ++ kfree(lseg); ++} ++ ++/* Because the generic infrastructure does not correctly merge layouts, ++ * we pretty much ignore lseg, and store all data layout wide, so we ++ * can correctly merge. Eventually we should push some correct merge ++ * behavior up to the generic code, as the current behavior tends to ++ * cause lots of unnecessary overlapping LAYOUTGET requests. 
++ */ ++static struct pnfs_layout_segment * ++bl_alloc_lseg(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layoutget_res *lgr) ++{ ++ struct pnfs_layout_segment *lseg; ++ int status; ++ ++ dprintk("%s enter\n", __func__); ++ lseg = kzalloc(sizeof(*lseg) + 0, GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ status = nfs4_blk_process_layoutget(lo, lgr); ++ if (status) { ++ /* We don't want to call the full-blown bl_free_lseg, ++ * since on error extents were not touched. ++ */ ++ /* STUB - we really want to distinguish between 2 error ++ * conditions here. This lseg failed, but lo data structures ++ * are OK, or we hosed the lo data structures. The calling ++ * code probably needs to distinguish this too. ++ */ ++ kfree(lseg); ++ return ERR_PTR(status); ++ } ++ return lseg; ++} ++ ++static int ++bl_setup_layoutcommit(struct pnfs_layout_type *lo, ++ struct pnfs_layoutcommit_arg *arg) ++{ ++ struct nfs_server *nfss = PNFS_NFS_SERVER(lo); ++ struct bl_layoutupdate_data *layoutupdate_data; ++ ++ dprintk("%s enter\n", __func__); ++ /* Need to ensure commit is block-size aligned */ ++ if (nfss->pnfs_blksize) { ++ u64 mask = nfss->pnfs_blksize - 1; ++ u64 offset = arg->lseg.offset & mask; ++ ++ arg->lseg.offset -= offset; ++ arg->lseg.length += offset + mask; ++ arg->lseg.length &= ~mask; ++ } ++ ++ layoutupdate_data = kmalloc(sizeof(struct bl_layoutupdate_data), ++ GFP_KERNEL); ++ if (unlikely(!layoutupdate_data)) ++ return -ENOMEM; ++ INIT_LIST_HEAD(&layoutupdate_data->ranges); ++ arg->layoutdriver_data = layoutupdate_data; ++ ++ return 0; ++} ++ ++static void ++bl_encode_layoutcommit(struct pnfs_layout_type *lo, struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *arg) ++{ ++ dprintk("%s enter\n", __func__); ++ encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); ++} ++ ++static void ++bl_cleanup_layoutcommit(struct pnfs_layout_type *lo, ++ struct pnfs_layoutcommit_arg *arg, int status) ++{ ++ dprintk("%s enter\n", __func__); ++ 
clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), arg, status); ++ kfree(arg->layoutdriver_data); ++} ++ ++static void free_blk_mountid(struct block_mount_id *mid) ++{ ++ if (mid) { ++ struct pnfs_block_dev *dev; ++ spin_lock(&mid->bm_lock); ++ while (!list_empty(&mid->bm_devlist)) { ++ dev = list_first_entry(&mid->bm_devlist, ++ struct pnfs_block_dev, ++ bm_node); ++ list_del(&dev->bm_node); ++ free_block_dev(dev); ++ } ++ spin_unlock(&mid->bm_lock); ++ kfree(mid); ++ } ++} ++ ++/* This is mostly copied from the filelayout's get_device_info function. ++ * It seems much of this should be at the generic pnfs level. ++ */ ++static struct pnfs_block_dev * ++nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, ++ struct pnfs_deviceid *d_id, ++ struct list_head *sdlist) ++{ ++ struct pnfs_device *dev; ++ struct pnfs_block_dev *rv = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ int i, rc; ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s max_resp_sz %u max_pages %d\n", ++ __func__, max_resp_sz, max_pages); ++ ++ dev = kmalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) { ++ dprintk("%s kmalloc failed\n", __func__); ++ return NULL; ++ } ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(dev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set dev->area */ ++ dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!dev->area) ++ goto out_free; ++ ++ memcpy(&dev->dev_id, d_id, sizeof(*d_id)); ++ dev->layout_type = LAYOUT_BLOCK_VOLUME; ++ dev->dev_notify_types = 0; ++ dev->pages = pages; ++ dev->pgbase = 0; ++ dev->pglen = PAGE_SIZE * max_pages; ++ dev->mincount = 0; ++ ++ dprintk("%s:
dev_id: %s\n", __func__, dev->dev_id.data); ++ rc = pnfs_block_callback_ops->nfs_getdeviceinfo(server, dev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ rv = nfs4_blk_decode_device(server, dev, sdlist); ++ out_free: ++ if (dev->area != NULL) ++ vunmap(dev->area); ++ for (i = 0; i < max_pages; i++) ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(dev); ++ return rv; ++} ++ ++ ++/* ++ * Retrieve the list of available devices for the mountpoint. ++ */ ++static int ++bl_initialize_mountpoint(struct nfs_server *server, const struct nfs_fh *fh) ++{ ++ struct block_mount_id *b_mt_id = NULL; ++ struct pnfs_mount_type *mtype = NULL; ++ struct pnfs_devicelist *dlist = NULL; ++ struct pnfs_block_dev *bdev; ++ LIST_HEAD(block_disklist); ++ int status = 0, i; ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (server->pnfs_blksize == 0) { ++ dprintk("%s Server did not return blksize\n", __func__); ++ return -EINVAL; ++ } ++ b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL); ++ if (!b_mt_id) { ++ status = -ENOMEM; ++ goto out_error; ++ } ++ /* Initialize nfs4 block layout mount id */ ++ spin_lock_init(&b_mt_id->bm_lock); ++ INIT_LIST_HEAD(&b_mt_id->bm_devlist); ++ ++ dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL); ++ if (!dlist) ++ goto out_error; ++ dlist->eof = 0; ++ while (!dlist->eof) { ++ status = pnfs_block_callback_ops->nfs_getdevicelist( ++ server, fh, dlist); ++ if (status) ++ goto out_error; ++ dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", ++ __func__, dlist->num_devs, dlist->eof); ++ /* For each device returned in dlist, call GETDEVICEINFO, and ++ * decode the opaque topology encoding to create a flat ++ * volume topology, matching VOLUME_SIMPLE disk signatures ++ * to disks in the visible block disk list. ++ * Construct an LVM meta device from the flat volume topology. 
++ */ ++ for (i = 0; i < dlist->num_devs; i++) { ++ bdev = nfs4_blk_get_deviceinfo(server, fh, ++ &dlist->dev_id[i], ++ &block_disklist); ++ if (!bdev) ++ goto out_error; ++ spin_lock(&b_mt_id->bm_lock); ++ list_add(&bdev->bm_node, &b_mt_id->bm_devlist); ++ spin_unlock(&b_mt_id->bm_lock); ++ } ++ } ++ dprintk("%s SUCCESS\n", __func__); ++ server->pnfs_ld_data = b_mt_id; ++ ++ out_return: ++ kfree(dlist); ++ return status; ++ ++ out_error: ++ free_blk_mountid(b_mt_id); ++ kfree(mtype); ++ goto out_return; ++} ++ ++static int ++bl_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ struct block_mount_id *b_mt_id = server->pnfs_ld_data; ++ ++ dprintk("%s enter\n", __func__); ++ free_blk_mountid(b_mt_id); ++ dprintk("%s RETURNS\n", __func__); ++ return 0; ++} ++ ++/* STUB - mark intersection of layout and page as bad, so is not ++ * used again. ++ */ ++static void mark_bad_read(void) ++{ ++ return; ++} ++ ++/* Copied from buffer.c */ ++static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) ++{ ++ if (uptodate) { ++ set_buffer_uptodate(bh); ++ } else { ++ /* This happens, due to failed READA attempts. 
*/ ++ clear_buffer_uptodate(bh); ++ } ++ unlock_buffer(bh); ++} ++ ++/* Copied from buffer.c */ ++static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) ++{ ++ __end_buffer_read_notouch(bh, uptodate); ++} ++ ++/* ++ * map_block: map a requested I/O block (isect) into an offset in the LVM ++ * meta block_device ++ */ ++static void ++map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh) ++{ ++ dprintk("%s enter be=%p\n", __func__, be); ++ ++ set_buffer_mapped(bh); ++ bh->b_bdev = be->be_mdev; ++ bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> ++ (be->be_mdev->bd_inode->i_blkbits - 9); ++ ++ dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n", ++ __func__, (long)isect, ++ (long)bh->b_blocknr, ++ bh->b_size); ++ return; ++} ++ ++/* Given an unmapped page, zero it (or read in page for COW), ++ * and set appropriate flags/markings, but it is safe to not initialize ++ * the range given in [from, to). ++ */ ++/* This is loosely based on nobh_write_begin */ ++static int ++init_page_for_write(struct pnfs_block_layout *bl, struct page *page, ++ unsigned from, unsigned to, sector_t **pages_to_mark) ++{ ++ struct buffer_head *bh; ++ int inval, ret = -EIO; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect; ++ ++ dprintk("%s enter, %p\n", __func__, page); ++ bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); ++ if (!bh) { ++ ret = -ENOMEM; ++ goto cleanup; ++ } ++ ++ isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9); ++ be = find_get_extent(bl, isect, &cow_read); ++ if (!be) ++ goto cleanup; ++ inval = is_hole(be, isect); ++ dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to); ++ if (inval) { ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) { ++ dprintk("%s PANIC - got NONE_DATA extent %p\n", ++ __func__, be); ++ goto cleanup; ++ } ++ map_block(isect, be, bh); ++ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); ++ } ++ if (PageUptodate(page)) { ++ /* Do nothing */ ++
} else if (inval & !cow_read) { ++ zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE); ++ } else if (0 < from || PAGE_CACHE_SIZE > to) { ++ struct pnfs_block_extent *read_extent; ++ ++ read_extent = (inval && cow_read) ? cow_read : be; ++ map_block(isect, read_extent, bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_read_nobh; ++ submit_bh(READ, bh); ++ dprintk("%s: Waiting for buffer read\n", __func__); ++ /* XXX Don't really want to hold layout lock here */ ++ wait_on_buffer(bh); ++ if (!buffer_uptodate(bh)) ++ goto cleanup; ++ } ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ /* There is a BUG here if there is a short copy after write_begin, ++ * but I think this is a generic fs bug. The problem is that ++ * we have marked the page as initialized, but it is possible ++ * that the section not copied may never get copied. ++ */ ++ ret = mark_initialized_sectors(be->be_inval, isect, ++ PAGE_CACHE_SECTORS, ++ pages_to_mark); ++ /* Want to preallocate mem so above can't fail */ ++ if (ret) ++ goto cleanup; ++ } ++ SetPageMappedToDisk(page); ++ ret = 0; ++ ++cleanup: ++ free_buffer_head(bh); ++ put_extent(be); ++ put_extent(cow_read); ++ if (ret) { ++ /* Need to mark layout with bad read...should now ++ * just use nfs4 for reads and writes.
++ */ ++ mark_bad_read(); ++ } ++ return ret; ++} ++ ++static int ++bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos, ++ unsigned count, struct pnfs_fsdata *fsdata) ++{ ++ unsigned from, to; ++ int ret; ++ sector_t *pages_to_mark = NULL; ++ struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg); ++ ++ dprintk("%s enter, %u@%lld\n", __func__, count, pos); ++ print_page(page); ++ /* The following code assumes blocksize >= PAGE_CACHE_SIZE */ ++ if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) { ++ dprintk("%s Can't handle blocksize %llu\n", __func__, ++ (u64)bl->bl_blocksize); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ return 0; ++ } ++ if (PageMappedToDisk(page)) { ++ /* Basically, this is a flag that says we have ++ * successfully called write_begin already on this page. ++ */ ++ /* NOTE - there are cache consistency issues here. ++ * For example, what if the layout is recalled, then regained? ++ * If the file is closed and reopened, will the page flags ++ * be reset? If not, we'll have to use layout info instead of ++ * the page flag. ++ */ ++ return 0; ++ } ++ from = pos & (PAGE_CACHE_SIZE - 1); ++ to = from + count; ++ ret = init_page_for_write(bl, page, from, to, &pages_to_mark); ++ if (ret) { ++ dprintk("%s init page failed with %i", __func__, ret); ++ /* Revert back to plain NFS and just continue on with ++ * write. This assumes there is no request attached, which ++ * should be true if we get here. ++ */ ++ BUG_ON(PagePrivate(page)); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ kfree(pages_to_mark); ++ ret = 0; ++ } else { ++ fsdata->private = pages_to_mark; ++ } ++ return ret; ++} ++ ++/* CAREFUL - what happens if copied < count??? 
*/ ++static int ++bl_write_end(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg); ++ print_page(page); ++ if (lseg) ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* Return any memory allocated to fsdata->private, and take advantage ++ * of no page locks to mark pages noted in write_begin as needing ++ * initialization. ++ */ ++static void ++bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata) ++{ ++ struct page *page; ++ pgoff_t index; ++ sector_t *pos; ++ struct address_space *mapping = filp->f_mapping; ++ struct pnfs_fsdata *fake_data; ++ struct pnfs_layout_segment *lseg; ++ ++ if (!fsdata) ++ return; ++ lseg = fsdata->lseg; ++ if (!lseg) ++ return; ++ pos = fsdata->private; ++ if (!pos) ++ return; ++ dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos)); ++ for (; *pos != ~0; pos++) { ++ index = *pos >> (PAGE_CACHE_SHIFT - 9); ++ /* XXX How do we properly deal with failures here??? */ ++ page = grab_cache_page_write_begin(mapping, index, 0); ++ if (!page) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__); ++ continue; ++ } ++ dprintk("%s: Examining block page\n", __func__); ++ print_page(page); ++ if (!PageMappedToDisk(page)) { ++ /* XXX How do we properly deal with failures here??? 
*/ ++ dprintk("%s Marking block page\n", __func__); ++ init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page, ++ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, ++ NULL); ++ print_page(page); ++ fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL); ++ if (!fake_data) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", ++ __func__); ++ unlock_page(page); ++ continue; ++ } ++ get_lseg(lseg); ++ fake_data->lseg = lseg; ++ fake_data->bypass_eof = 1; ++ mapping->a_ops->write_end(filp, mapping, ++ index << PAGE_CACHE_SHIFT, ++ PAGE_CACHE_SIZE, ++ PAGE_CACHE_SIZE, ++ page, fake_data); ++ /* Note fake_data is freed by nfs_write_end */ ++ } else ++ unlock_page(page); ++ } ++ kfree(fsdata->private); ++ fsdata->private = NULL; ++} ++ ++static ssize_t ++bl_get_stripesize(struct pnfs_layout_type *lo) ++{ ++ dprintk("%s enter\n", __func__); ++ return 0; ++} ++ ++/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request. ++ * Should return False if there is a reason requests can not be coalesced, ++ * otherwise, should default to returning True. 
++ */ ++static int ++bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ dprintk("%s enter\n", __func__); ++ if (pgio->pg_iswrite) ++ return prev->wb_lseg == req->wb_lseg; ++ else ++ return 1; ++} ++ ++static struct layoutdriver_io_operations blocklayout_io_operations = { ++ .commit = bl_commit, ++ .read_pagelist = bl_read_pagelist, ++ .write_pagelist = bl_write_pagelist, ++ .write_begin = bl_write_begin, ++ .write_end = bl_write_end, ++ .write_end_cleanup = bl_write_end_cleanup, ++ .alloc_layout = bl_alloc_layout, ++ .free_layout = bl_free_layout, ++ .alloc_lseg = bl_alloc_lseg, ++ .free_lseg = bl_free_lseg, ++ .setup_layoutcommit = bl_setup_layoutcommit, ++ .encode_layoutcommit = bl_encode_layoutcommit, ++ .cleanup_layoutcommit = bl_cleanup_layoutcommit, ++ .initialize_mountpoint = bl_initialize_mountpoint, ++ .uninitialize_mountpoint = bl_uninitialize_mountpoint, ++}; ++ ++static struct layoutdriver_policy_operations blocklayout_policy_operations = { ++ .get_stripesize = bl_get_stripesize, ++ .pg_test = bl_pg_test, ++}; ++ ++static struct pnfs_layoutdriver_type blocklayout_type = { ++ .id = LAYOUT_BLOCK_VOLUME, ++ .name = "LAYOUT_BLOCK_VOLUME", ++ .ld_io_ops = &blocklayout_io_operations, ++ .ld_policy_ops = &blocklayout_policy_operations, ++}; ++ ++static int __init nfs4blocklayout_init(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); ++ ++ pnfs_block_callback_ops = pnfs_register_layoutdriver(&blocklayout_type); ++ bl_pipe_init(); ++ return 0; ++} ++ ++static void __exit nfs4blocklayout_exit(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", ++ __func__); ++ ++ pnfs_unregister_layoutdriver(&blocklayout_type); ++ bl_pipe_exit(); ++} ++ ++module_init(nfs4blocklayout_init); ++module_exit(nfs4blocklayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c +--- 
linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-23 12:09:03.289501933 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-23 12:09:03.289501933 -0400 +@@ -0,0 +1,335 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages.
++ */
++#include
++#include /* __bread */
++
++#include
++#include
++#include
++
++#include "blocklayout.h"
++
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++
++/* Check that nbytes of XDR data, rounded up to whole 32-bit words, fit
++ * between p and end.
++ *
++ * Returns p unchanged when the data fits, NULL otherwise.
++ */
++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes)
++{
++	/* Compare word counts instead of forming p + nwords: for a hostile
++	 * length that pointer would lie far outside the buffer (or wrap),
++	 * and a "q < p" test on it cannot be relied upon.
++	 */
++	if (unlikely(p > end || nbytes + 3 < nbytes ||
++		     XDR_QUADLEN(nbytes) > (size_t)(end - p)))
++		return NULL;
++	return p;
++}
++EXPORT_SYMBOL(blk_overflow);
++
++/* Open a block_device by device number.
++ *
++ * Returns the opened device, or NULL (not an ERR_PTR) on failure.
++ */
++struct block_device *nfs4_blkdev_get(dev_t dev)
++{
++	struct block_device *bd;
++
++	dprintk("%s enter\n", __func__);
++	bd = open_by_devnum(dev, FMODE_READ);
++	if (IS_ERR(bd))
++		goto fail;
++	return bd;
++fail:
++	dprintk("%s failed to open device : %ld\n",
++			__func__, PTR_ERR(bd));
++	return NULL;
++}
++
++/*
++ * Release the block device
++ *
++ * Returns whatever blkdev_put() returns.  bdev must not be used by the
++ * caller afterwards.
++ */
++int nfs4_blkdev_put(struct block_device *bdev)
++{
++	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
++			MINOR(bdev->bd_dev));
++	bd_release(bdev);
++	return blkdev_put(bdev, FMODE_READ);
++}
++
++/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded
++ * in dev->dev_addr_buf.
++ */
++/* Sends the device description to the userspace daemon over
++ * bl_device_pipe, opens the block device whose major/minor numbers come
++ * back in the reply, and wraps it in a newly allocated pnfs_block_dev.
++ *
++ * Returns the new pnfs_block_dev, or NULL on any failure.  On failure no
++ * block device reference is left held.
++ */
++struct pnfs_block_dev *
++nfs4_blk_decode_device(struct nfs_server *server,
++		       struct pnfs_device *dev,
++		       struct list_head *sdlist)
++{
++	struct pnfs_block_dev *rv = NULL;
++	struct block_device *bd = NULL;
++	struct pipefs_hdr *msg = NULL, *reply = NULL;
++	uint32_t major, minor;
++
++	dprintk("%s enter\n", __func__);
++
++	if (IS_ERR(bl_device_pipe))
++		return NULL;
++	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
++	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
++		dev->mincount);
++	msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area,
++				    dev->mincount);
++	if (IS_ERR(msg)) {
++		dprintk("ERROR: couldn't make pipefs message.\n");
++		goto out_err;
++	}
++	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
++	msg->status = BL_DEVICE_REQUEST_INIT;
++
++	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
++	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
++					      &bl_device_list, 0, 0);
++
++	if (IS_ERR(reply)) {
++		dprintk("ERROR: upcall_waitreply failed\n");
++		goto out_err;
++	}
++	if (reply->status != BL_DEVICE_REQUEST_PROC) {
++		/* No device has been opened yet, so there is no error
++		 * pointer to report here.
++		 */
++		dprintk("%s failed to open device\n", __func__);
++		goto out_err;
++	}
++	/* The reply payload carries the major then the minor number. */
++	memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t));
++	memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)),
++		sizeof(uint32_t));
++	bd = nfs4_blkdev_get(MKDEV(major, minor));
++	/* nfs4_blkdev_get() reports failure as NULL, never as an ERR_PTR */
++	if (!bd) {
++		dprintk("%s failed to open device : %u:%u\n",
++			__func__, major, minor);
++		goto out_err;
++	}
++
++	rv = kzalloc(sizeof(*rv), GFP_KERNEL);
++	if (!rv) {
++		/* Do not leak the device we just opened */
++		nfs4_blkdev_put(bd);
++		goto out_err;
++	}
++
++	rv->bm_mdev = bd;
++	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct pnfs_deviceid));
++	dprintk("%s Created device %s with bd_block_size %u\n",
++		__func__,
++		bd->bd_disk->disk_name,
++		bd->bd_block_size);
++	kfree(reply);
++	kfree(msg);
++	return rv;
++
++out_err:
++	kfree(rv);
++	if (!IS_ERR(reply))
++		kfree(reply);
++	if (!IS_ERR(msg))
++		kfree(msg);
++	return NULL;
++}
++
++/* Map
deviceid returned by the server to constructed block_device */
++static struct block_device *translate_devid(struct pnfs_layout_type *lo,
++					    struct pnfs_deviceid *id)
++{
++	struct block_device *found = NULL;
++	struct block_mount_id *mount;
++	struct pnfs_block_dev *cur;
++
++	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
++	mount = BLK_ID(lo);
++
++	/* Walk the per-mount device list under its lock; first match wins. */
++	spin_lock(&mount->bm_lock);
++	list_for_each_entry(cur, &mount->bm_devlist, bm_node) {
++		if (memcmp(id->data, cur->bm_mdevid.data,
++			   NFS4_PNFS_DEVICEID4_SIZE) != 0)
++			continue;
++		found = cur->bm_mdev;
++		break;
++	}
++	spin_unlock(&mount->bm_lock);
++
++	dprintk("%s returning %p\n", __func__, found);
++	return found;
++}
++
++/* Running state kept while checking that the extents of one layout obey
++ * the constraints of the spec.
++ */
++struct layout_verification {
++	u32 mode;	/* R or RW */
++	u64 start;	/* Expected start of next non-COW extent */
++	u64 inval;	/* Start of INVAL coverage */
++	u64 cowread;	/* End of COW read coverage */
++};
++
++/* Verify the extent meets the layout requirements of the pnfs-block draft,
++ * section 2.3.1.
++ */
++/* Extents must be presented in file order.  lv carries the running state
++ * from one extent to the next and is updated on success.
++ *
++ * Returns 0 if the extent is acceptable, -EIO otherwise.
++ */
++static int verify_extent(struct pnfs_block_extent *be,
++			 struct layout_verification *lv)
++{
++	if (lv->mode == IOMODE_READ) {
++		/* A read layout may only hold READ_DATA and NONE_DATA */
++		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
++		    be->be_state == PNFS_BLOCK_INVALID_DATA)
++			return -EIO;
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		return 0;
++	}
++	/* lv->mode == IOMODE_RW */
++	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		if (lv->cowread > lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		lv->inval = lv->start;
++		return 0;
++	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		return 0;
++	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
++		if (be->be_f_offset > lv->start)
++			return -EIO;
++		if (be->be_f_offset < lv->inval)
++			return -EIO;
++		if (be->be_f_offset < lv->cowread)
++			return -EIO;
++		/* It looks like you might want to min this with lv->start,
++		 * but you really don't.
++		 */
++		lv->inval = lv->inval + be->be_length;
++		lv->cowread = be->be_f_offset + be->be_length;
++		return 0;
++	} else
++		return -EIO;
++}
++
++/* XDR decode pnfs_block_layout4 structure
++ *
++ * Decodes every extent in lgr->layout, verifies the sequence with
++ * verify_extent(), and only then merges the extents into the layout.
++ *
++ * Returns 0 on success, -ENOMEM if an extent cannot be allocated, -EIO
++ * for a malformed or inconsistent layout, or the error reported by
++ * add_and_merge_extent().
++ */
++int
++nfs4_blk_process_layoutget(struct pnfs_layout_type *lo,
++			   struct nfs4_pnfs_layoutget_res *lgr)
++{
++	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
++	uint32_t *p = (uint32_t *)lgr->layout.buf;
++	uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len);
++	int status = -EIO;
++	uint32_t i, count;
++	/* Encoded size of one extent: deviceid, three 64-bit values and
++	 * a 32-bit state.
++	 */
++	const size_t ext_size = 28 + NFS4_PNFS_DEVICEID4_SIZE;
++	struct pnfs_block_extent *be = NULL, *save;
++	uint64_t tmp; /* Used by READSECTOR */
++	struct layout_verification lv = {
++		.mode = lgr->lseg.iomode,
++		.start = lgr->lseg.offset >> 9,
++		.inval = lgr->lseg.offset >> 9,
++		.cowread = lgr->lseg.offset >> 9,
++	};
++
++	LIST_HEAD(extents);
++
++	BLK_READBUF(p, end, 4);
++	READ32(count);
++
++	dprintk("%s enter, number of extents %i\n", __func__, count);
++	/* count comes off the wire.  Bound it by what is left in the buffer
++	 * before multiplying, so the size handed to BLK_READBUF cannot wrap
++	 * around and slip past the overflow check.
++	 */
++	if (count > (size_t)((char *)end - (char *)p) / ext_size) {
++		printk(KERN_WARNING
++		       "%s: reply buffer overflowed in line %d.\n",
++		       __func__, __LINE__);
++		goto out_err;
++	}
++	BLK_READBUF(p, end, ext_size * count);
++
++	/* Decode individual extents, putting them in temporary
++	 * staging area until whole layout is decoded to make error
++	 * recovery easier.
++	 */
++	for (i = 0; i < count; i++) {
++		be = alloc_extent();
++		if (!be) {
++			status = -ENOMEM;
++			goto out_err;
++		}
++		READ_DEVID(&be->be_devid);
++		be->be_mdev = translate_devid(lo, &be->be_devid);
++		if (!be->be_mdev)
++			goto out_err;
++		/* The next three values are read in as bytes,
++		 * but stored as 512-byte sector lengths
++		 */
++		READ_SECTOR(be->be_f_offset);
++		READ_SECTOR(be->be_length);
++		READ_SECTOR(be->be_v_offset);
++		READ32(be->be_state);
++		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
++			be->be_inval = &bl->bl_inval;
++		if (verify_extent(be, &lv)) {
++			dprintk("%s verify failed\n", __func__);
++			goto out_err;
++		}
++		list_add_tail(&be->be_node, &extents);
++	}
++	/* From here on every decoded extent is on the staging list, so be
++	 * is cleared before jumping to out_err to avoid a double put.
++	 */
++	if (p != end) {
++		dprintk("%s Undecoded cruft at end of opaque\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	if (lgr->lseg.offset + lgr->lseg.length != lv.start << 9) {
++		dprintk("%s Final length mismatch\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	if (lv.start < lv.cowread) {
++		dprintk("%s Final uncovered COW extent\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	/* Extents decoded properly, now try to merge them in to
++	 * existing layout extents.
++	 */
++	spin_lock(&bl->bl_ext_lock);
++	list_for_each_entry_safe(be, save, &extents, be_node) {
++		list_del(&be->be_node);
++		status = add_and_merge_extent(bl, be);
++		if (status) {
++			spin_unlock(&bl->bl_ext_lock);
++			/* This is a fairly catastrophic error, as the
++			 * entire layout extent lists are now corrupted.
++			 * We should have some way to distinguish this.
++			 */
++			be = NULL;
++			goto out_err;
++		}
++	}
++	spin_unlock(&bl->bl_ext_lock);
++	status = 0;
++ out:
++	dprintk("%s returns %i\n", __func__, status);
++	return status;
++
++ out_err:
++	/* Drop the extent being decoded (if any), then everything staged */
++	put_extent(be);
++	while (!list_empty(&extents)) {
++		be = list_first_entry(&extents, struct pnfs_block_extent,
++				      be_node);
++		list_del(&be->be_node);
++		put_extent(be);
++	}
++	goto out;
++}
+diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c
+--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-23 12:09:03.290395707 -0400
++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-23 12:09:03.290395707 -0400
+@@ -0,0 +1,120 @@
++/*
++ * linux/fs/nfs/blocklayout/blocklayoutdm.c
++ *
++ * Module for the NFSv4.1 pNFS block layout driver.
++ *
++ * Copyright (c) 2007 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Fred Isaman
++ * Andy Adamson
++ *
++ * permission is granted to use, copy, create derivative works and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the university of michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. if
++ * the above copyright notice or any other identification of the
++ * university of michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ *
++ * this software is provided as is, without representation from the
++ * university of michigan as to its fitness for any purpose, and without
++ * warranty by the university of michigan of any kind, either express
++ * or implied, including without limitation the implied warranties of
++ * merchantability and fitness for a particular purpose.
the regents
++ * of the university of michigan shall not be liable for any damages,
++ * including special, indirect, incidental, or consequential damages,
++ * with respect to any claim arising out or in connection with the use
++ * of the software, even if it has been or is hereafter advised of the
++ * possibility of such damages.
++ */
++
++#include /* gendisk - used in a dprintk*/
++#include
++#include
++
++#include "blocklayout.h"
++
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++
++/* Defines used for calculating memory usage in nfs4_blk_flatten() */
++#define ARGSIZE 24 /* Max bytes needed for linear target arg string */
++#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE)
++#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE)
++#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \
++	(PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE)
++#define roundup8(x) (((x)+7) & ~7)
++#define sizeof8(x) roundup8(sizeof(x))
++
++/* Ask the userspace daemon, through bl_device_pipe, to remove the device
++ * identified by dev.
++ *
++ * Returns 0 when the daemon answers BL_DEVICE_REQUEST_PROC, 1 otherwise.
++ */
++static int dev_remove(dev_t dev)
++{
++	struct pipefs_hdr *msg = NULL, *reply = NULL;
++	/* Payload of the upcall: major number followed by minor number */
++	uint32_t devnum[2] = { MAJOR(dev), MINOR(dev) };
++	int ret = 1;
++
++	dprintk("Entering %s\n", __func__);
++
++	if (IS_ERR(bl_device_pipe))
++		return ret;
++
++	msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)devnum,
++				    sizeof(devnum));
++	if (IS_ERR(msg)) {
++		dprintk("ERROR: couldn't make pipefs message.\n");
++		goto out;
++	}
++	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
++	msg->status = BL_DEVICE_REQUEST_INIT;
++
++	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
++					      &bl_device_list, 0, 0);
++	if (IS_ERR(reply)) {
++		dprintk("ERROR: upcall_waitreply failed\n");
++		goto out;
++	}
++
++	if (reply->status == BL_DEVICE_REQUEST_PROC)
++		ret = 0; /*TODO: what to return*/
++out:
++	/* msg and reply are each NULL, an ERR_PTR, or a live buffer here */
++	if (!IS_ERR(reply))
++		kfree(reply);
++	if (!IS_ERR(msg))
++		kfree(msg);
++	return ret;
++}
++
++/*
++ *
Release meta device
++ *
++ * Drops the reference on bdev->bm_mdev, then asks userspace to remove
++ * the device.  Returns the result of dev_remove().
++ */
++static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
++{
++	/* Read the device number first: the block_device must not be
++	 * dereferenced once our reference has been dropped.
++	 */
++	dev_t dev = bdev->bm_mdev->bd_dev;
++	int rv;
++
++	dprintk("%s Releasing\n", __func__);
++	/* XXX Check return? */
++	rv = nfs4_blkdev_put(bdev->bm_mdev);
++	dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv);
++
++	rv = dev_remove(dev);
++	dprintk("%s Returns %d\n", __func__, rv);
++	return rv;
++}
++
++/* Release the meta device held by bdev (if any) and free bdev itself.
++ * A NULL bdev is accepted and ignored.
++ */
++void free_block_dev(struct pnfs_block_dev *bdev)
++{
++	if (bdev) {
++		if (bdev->bm_mdev) {
++			dprintk("%s Removing DM device: %d:%d\n",
++				__func__,
++				MAJOR(bdev->bm_mdev->bd_dev),
++				MINOR(bdev->bm_mdev->bd_dev));
++			/* XXX Check status ?? */
++			nfs4_blk_metadev_release(bdev);
++		}
++		kfree(bdev);
++	}
++}
+diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h
+--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-23 12:09:03.290395707 -0400
++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-23 12:09:03.291501560 -0400
+@@ -0,0 +1,303 @@
++/*
++ * linux/fs/nfs/blocklayout/blocklayout.h
++ *
++ * Module for the NFSv4.1 pNFS block layout driver.
++ *
++ * Copyright (c) 2006 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Andy Adamson
++ * Fred Isaman
++ *
++ * permission is granted to use, copy, create derivative works and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the university of michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. if
++ * the above copyright notice or any other identification of the
++ * university of michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#ifndef FS_NFS_NFS4BLOCKLAYOUT_H ++#define FS_NFS_NFS4BLOCKLAYOUT_H ++ ++#include ++#include /* Needed by nfs4_pnfs.h */ ++#include ++#include /* Needed for struct dm_ioctl*/ ++ ++#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9) ++ ++#define PG_pnfserr PG_owner_priv_1 ++#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags) ++#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags) ++#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags) ++ ++extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_do_resume(struct dm_ioctl *param); ++extern int dm_table_load(struct dm_ioctl *param, size_t param_size); ++ ++struct block_mount_id { ++ spinlock_t bm_lock; /* protects list */ ++ struct list_head bm_devlist; /* holds pnfs_block_dev */ ++}; ++ ++struct pnfs_block_dev { ++ struct list_head bm_node; ++ struct pnfs_deviceid bm_mdevid; /* associated devid */ ++ struct block_device *bm_mdev; /* meta device itself */ ++}; ++ ++/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */ ++struct visible_block_device { ++ struct list_head vi_node; ++ struct block_device *vi_bdev; ++ int vi_mapped; ++ int vi_put_done; ++}; ++ ++enum 
blk_vol_type { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, /* maps to a single LU */ ++ PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */ ++ PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */ ++ PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */ ++}; ++ ++/* All disk offset/lengths are stored in 512-byte sectors */ ++struct pnfs_blk_volume { ++ uint32_t bv_type; ++ sector_t bv_size; ++ struct pnfs_blk_volume **bv_vols; ++ int bv_vol_n; ++ union { ++ dev_t bv_dev; ++ sector_t bv_stripe_unit; ++ sector_t bv_offset; ++ }; ++}; ++ ++/* Since components need not be aligned, cannot use sector_t */ ++struct pnfs_blk_sig_comp { ++ int64_t bs_offset; /* In bytes */ ++ uint32_t bs_length; /* In bytes */ ++ char *bs_string; ++}; ++ ++/* Maximum number of signatures components in a simple volume */ ++# define PNFS_BLOCK_MAX_SIG_COMP 16 ++ ++struct pnfs_blk_sig { ++ int si_num_comps; ++ struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP]; ++}; ++ ++enum exstate4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ ++ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ ++}; ++ ++#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ ++ ++struct my_tree_t { ++ sector_t mtt_step_size; /* Internal sector alignment */ ++ struct list_head mtt_stub; /* Should be a radix tree */ ++}; ++ ++struct pnfs_inval_markings { ++ spinlock_t im_lock; ++ struct my_tree_t im_tree; /* Sectors that need LAYOUTCOMMIT */ ++ sector_t im_block_size; /* Server blocksize in sectors */ ++}; ++ ++struct pnfs_inval_tracking { ++ struct list_head it_link; ++ int it_sector; ++ int it_tags; ++}; ++ ++/* sector_t fields are all in 512-byte sectors */ ++struct pnfs_block_extent { ++ struct kref be_refcnt; ++ struct list_head be_node; /* link into lseg list */ ++ struct pnfs_deviceid be_devid; /* STUB - remevable??? 
*/ ++ struct block_device *be_mdev; ++ sector_t be_f_offset; /* the starting offset in the file */ ++ sector_t be_length; /* the size of the extent */ ++ sector_t be_v_offset; /* the starting offset in the volume */ ++ enum exstate4 be_state; /* the state of this extent */ ++ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ ++}; ++ ++/* Shortened extent used by LAYOUTCOMMIT */ ++struct pnfs_block_short_extent { ++ struct list_head bse_node; ++ struct pnfs_deviceid bse_devid; /* STUB - removable??? */ ++ struct block_device *bse_mdev; ++ sector_t bse_f_offset; /* the starting offset in the file */ ++ sector_t bse_length; /* the size of the extent */ ++}; ++ ++static inline void ++INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) ++{ ++ spin_lock_init(&marks->im_lock); ++ INIT_LIST_HEAD(&marks->im_tree.mtt_stub); ++ marks->im_block_size = blocksize; ++ marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, ++ blocksize); ++} ++ ++enum extentclass4 { ++ RW_EXTENT = 0, /* READWRTE and INVAL */ ++ RO_EXTENT = 1, /* READ and NONE */ ++ EXTENT_LISTS = 2, ++}; ++ ++static inline int choose_list(enum exstate4 state) ++{ ++ if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) ++ return RO_EXTENT; ++ else ++ return RW_EXTENT; ++} ++ ++struct pnfs_block_layout { ++ struct pnfs_layout_type bl_layout; ++ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ ++ spinlock_t bl_ext_lock; /* Protects list manipulation */ ++ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ ++ struct list_head bl_commit; /* Needs layout commit */ ++ unsigned int bl_count; /* entries in bl_commit */ ++ sector_t bl_blocksize; /* Server blocksize in sectors */ ++}; ++ ++/* this struct is comunicated between: ++ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit ++ */ ++struct bl_layoutupdate_data { ++ struct list_head ranges; ++}; ++ ++#define BLK_ID(lo) ((struct block_mount_id 
*)(PNFS_NFS_SERVER(lo)->pnfs_ld_data)) ++ ++static inline struct pnfs_block_layout * ++BLK_LO2EXT(struct pnfs_layout_type *lo) ++{ ++ return container_of(lo, struct pnfs_block_layout, bl_layout); ++} ++ ++static inline struct pnfs_block_layout * ++BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) ++{ ++ return BLK_LO2EXT(lseg->layout); ++} ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes); ++ ++#define BLK_READBUF(p, e, nbytes) do { \ ++ p = blk_overflow(p, e, nbytes); \ ++ if (!p) { \ ++ printk(KERN_WARNING \ ++ "%s: reply buffer overflowed in line %d.\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++} while (0) ++ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (uint64_t)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++#define READ_DEVID(x) COPYMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++#define READ_SECTOR(x) do { \ ++ READ64(tmp); \ ++ if (tmp & 0x1ff) { \ ++ printk(KERN_WARNING \ ++ "%s Value not 512-byte aligned at line %d\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++ (x) = tmp >> 9; \ ++} while (0) ++ ++#define WRITE32(n) do { \ ++ *p++ = htonl(n); \ ++ } while (0) ++#define WRITE64(n) do { \ ++ *p++ = htonl((uint32_t)((n) >> 32)); \ ++ *p++ = htonl((uint32_t)(n)); \ ++} while (0) ++#define WRITEMEM(ptr, nbytes) do { \ ++ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ ++} while (0) ++#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++ ++/* blocklayoutdev.c */ ++struct block_device *nfs4_blkdev_get(dev_t dev); ++int nfs4_blkdev_put(struct block_device *bdev); ++struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist); ++int nfs4_blk_process_layoutget(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layoutget_res *lgr); ++int nfs4_blk_create_block_disk_list(struct list_head *); 
++void nfs4_blk_destroy_disk_list(struct list_head *); ++/* blocklayoutdm.c */ ++int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *); ++void free_block_dev(struct pnfs_block_dev *bdev); ++/* extents.c */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read); ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages); ++void put_extent(struct pnfs_block_extent *be); ++struct pnfs_block_extent *alloc_extent(void); ++struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be); ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect); ++int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *arg); ++void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct pnfs_layoutcommit_arg *arg, ++ int status); ++int add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new); ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length); ++ ++#include ++ ++extern struct pipefs_list bl_device_list; ++extern struct dentry *bl_device_pipe; ++ ++int bl_pipe_init(void); ++void bl_pipe_exit(void); ++ ++#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ ++#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ ++#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ ++#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ ++#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ ++ ++#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-23 12:09:03.292511531 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-23 12:09:03.292511531 -0400 +@@ 
-0,0 +1,948 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.h ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include "blocklayout.h" ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Bit numbers */ ++#define EXTENT_INITIALIZED 0 ++#define EXTENT_WRITTEN 1 ++#define EXTENT_IN_COMMIT 2 ++#define INTERNAL_EXISTS MY_MAX_TAGS ++#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) ++ ++/* Returns largest t<=s s.t. 
t%base==0 */
++static inline sector_t normalize(sector_t s, int base)
++{
++	sector_t quot = s;	/* do_div() overwrites its first argument */
++	sector_t rem = do_div(quot, base);
++
++	return s - rem;
++}
++
++/* Returns smallest t>=s s.t. t%base==0 */
++static inline sector_t normalize_up(sector_t s, int base)
++{
++	sector_t bumped = s + base - 1;
++
++	return normalize(bumped, base);
++}
++
++/* Complete stub using list while determine API wanted */
++
++/* Look up the entry for sector s, scanning the list from its tail.
++ * Returns the entry's tags with the internal bit masked off, or -ENOENT
++ * if no entry exists for s.
++ */
++static int32_t _find_entry(struct my_tree_t *tree, u64 s)
++{
++	struct pnfs_inval_tracking *pos;
++
++	dprintk("%s(%llu) enter\n", __func__, s);
++	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
++		if (pos->it_sector > s)
++			continue;
++		/* First entry at or below s: it either matches or s is absent */
++		if (pos->it_sector != s)
++			break;
++		return pos->it_tags & INTERNAL_MASK;
++	}
++	return -ENOENT;
++}
++
++/* Returns 1 if the step containing sector s has an entry carrying tag,
++ * 0 otherwise.
++ */
++static inline
++int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag)
++{
++	int32_t tags;
++
++	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
++	tags = _find_entry(tree, normalize(s, tree->mtt_step_size));
++	if (tags < 0)
++		return 0;
++	return (tags & (1 << tag)) ? 1 : 0;
++}
++
++/* Creates entry with tag, or if entry already exists, unions tag to it.
++ * If storage is not NULL, newly created entry will use it.
++ * Returns number of entries added, or negative on error.
++ */
++static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag,
++		      struct pnfs_inval_tracking *storage)
++{
++	int found = 0;
++	struct pnfs_inval_tracking *pos;
++
++	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
++	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
++		if (pos->it_sector > s)
++			continue;
++		else if (pos->it_sector == s) {
++			found = 1;
++			break;
++		} else
++			break;
++	}
++	if (found) {
++		pos->it_tags |= (1 << tag);
++		return 0;
++	} else {
++		struct pnfs_inval_tracking *new;
++		if (storage)
++			new = storage;
++		else {
++			/* The storage-less callers in this file run under
++			 * im_lock, so this allocation must not sleep.
++			 */
++			new = kmalloc(sizeof(*new), GFP_ATOMIC);
++			if (!new)
++				return -ENOMEM;
++		}
++		new->it_sector = s;
++		new->it_tags = (1 << tag);
++		/* pos is the last entry below s, or the list head itself */
++		list_add(&new->it_link, &pos->it_link);
++		return 1;
++	}
++}
++
++/* XXXX Really want option to not create */
++/* Over range, unions tag with existing entries, else creates entry with tag.
++ * Returns 0 on success, or the negative error from _add_entry().
++ */
++static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length)
++{
++	u64 i;
++	int rv;
++
++	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
++	for (i = normalize(s, tree->mtt_step_size); i < s + length;
++	     i += tree->mtt_step_size) {
++		/* _add_entry() returns 1 when it created an entry; only a
++		 * negative value is a failure.
++		 */
++		rv = _add_entry(tree, i, tag, NULL);
++		if (rv < 0)
++			return rv;
++	}
++	return 0;
++}
++
++/* Ensure that future operations on given range of tree will not malloc.
++ * Returns 0 on success, -ENOMEM if the entries cannot be allocated.
++ */
++static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length)
++{
++	u64 start, end, s;
++	int count, i, used = 0, status = -ENOMEM;
++	struct pnfs_inval_tracking **storage;
++
++	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
++	start = normalize(offset, tree->mtt_step_size);
++	end = normalize_up(offset + length, tree->mtt_step_size);
++	count = (int)(end - start) / (int)tree->mtt_step_size;
++
++	/* Pre-malloc what memory we might need.  kcalloc checks the
++	 * count * size multiplication and zeroes the slots, so the cleanup
++	 * loop below never sees an uninitialised pointer.
++	 */
++	storage = kcalloc(count, sizeof(*storage), GFP_KERNEL);
++	if (!storage)
++		return -ENOMEM;
++	for (i = 0; i < count; i++) {
++		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
++				     GFP_KERNEL);
++		if (!storage[i])
++			goto out_cleanup;
++	}
++
++	/* Now need lock - HOW??? */
++
++	/* A slot is consumed only when _add_entry() created a new entry */
++	for (s = start; s < end; s += tree->mtt_step_size)
++		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
++
++	/* Unlock - HOW??? */
++	status = 0;
++
++ out_cleanup:
++	/* Free the pre-allocated entries that were not consumed */
++	for (i = used; i < count; i++) {
++		if (!storage[i])
++			break;
++		kfree(storage[i]);
++	}
++	kfree(storage);
++	return status;
++}
++
++/* Insert offset into array, a sorted list of sectors terminated by ~0,
++ * unless it is already present.  A NULL array is ignored.  The caller
++ * must have sized array to hold the extra element.
++ */
++static void set_needs_init(sector_t *array, sector_t offset)
++{
++	sector_t *p = array;
++
++	dprintk("%s enter\n", __func__);
++	if (!p)
++		return;
++	while (*p < offset)
++		p++;
++	if (*p == offset)
++		return;
++	else if (*p == ~0) {
++		*p++ = offset;
++		*p = ~0;
++		return;
++	} else {
++		sector_t *save = p;
++		dprintk("%s Adding %llu\n", __func__, (u64)offset);
++		while (*p != ~0)
++			p++;
++		p++;
++		/* Shift the tail, terminator included, up by one slot */
++		memmove(save + 1, save, (char *)p - (char *)save);
++		*save = offset;
++		return;
++	}
++}
++
++/* We are relying on page lock to serialize this */
++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect)
++{
++	int rv;
++
++	spin_lock(&marks->im_lock);
++	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
++	spin_unlock(&marks->im_lock);
++	return rv;
++}
++
++/* Assume start, end already sector aligned.
++ * Returns 1 if every step in [start, end) has an entry carrying tag,
++ * 0 otherwise.
++ */
++static int
++_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag)
++{
++	struct pnfs_inval_tracking *pos;
++	u64 expect = 0;
++	int started = 0;
++
++	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
++	/* Walk backwards from the last step of the range, requiring one
++	 * tagged entry per step with no gaps.
++	 */
++	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
++		if (pos->it_sector >= end)
++			continue;
++		if (!started) {
++			if (pos->it_sector != end - tree->mtt_step_size)
++				return 0;
++			started = 1;
++		} else if (pos->it_sector != expect) {
++			return 0;
++		}
++		if (!(pos->it_tags & (1 << tag)))
++			return 0;
++		/* Reached the first step of the range.  Testing this before
++		 * subtracting keeps a range that begins at sector 0 from
++		 * underflowing, and lets 0 be a legitimate expected sector.
++		 */
++		if (pos->it_sector < start + tree->mtt_step_size)
++			return 1;
++		expect = pos->it_sector - tree->mtt_step_size;
++	}
++	return 0;
++}
++ ++static int is_range_written(struct pnfs_inval_markings *marks, ++ sector_t start, sector_t end) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Marks sectors in [offest, offset_length) as having been initialized. ++ * All lengths are step-aligned, where step is min(pagesize, blocksize). ++ * Notes where partial block is initialized, and helps prepare it for ++ * complete initialization later. ++ */ ++/* Currently assumes offset is page-aligned */ ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages) ++{ ++ sector_t s, start, end; ++ sector_t *array = NULL; /* Pages to mark */ ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", ++ __func__, (u64)offset, (u64)length); ++ s = max((sector_t) 3, ++ 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); ++ dprintk("%s set max=%llu\n", __func__, (u64)s); ++ if (pages) { ++ array = kmalloc(s * sizeof(sector_t), GFP_KERNEL); ++ if (!array) ++ goto outerr; ++ array[0] = ~0; ++ } ++ ++ start = normalize(offset, marks->im_block_size); ++ end = normalize_up(offset + length, marks->im_block_size); ++ if (_preload_range(&marks->im_tree, start, end - start)) ++ goto outerr; ++ ++ spin_lock(&marks->im_lock); ++ ++ for (s = normalize_up(start, PAGE_CACHE_SECTORS); ++ s < offset; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s pre-area pages\n", __func__); ++ /* Portion of used block is not initialized */ ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) ++ goto out_unlock; ++ for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); ++ s < end; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s post-area pages\n", __func__); ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ ++ 
spin_unlock(&marks->im_lock); ++ ++ if (pages) { ++ if (array[0] == ~0) { ++ kfree(array); ++ *pages = NULL; ++ } else ++ *pages = array; ++ } ++ return 0; ++ ++ out_unlock: ++ spin_unlock(&marks->im_lock); ++ outerr: ++ if (pages) { ++ kfree(array); ++ *pages = NULL; ++ } ++ return -ENOMEM; ++} ++ ++/* Marks sectors in [offset, offset+length) as having been written to disk. ++ * All lengths should be block aligned. ++ */ ++int mark_written_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length) ++{ ++ int status; ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, ++ (u64)offset, (u64)length); ++ spin_lock(&marks->im_lock); ++ status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); ++ spin_unlock(&marks->im_lock); ++ return status; ++} ++ ++static void print_short_extent(struct pnfs_block_short_extent *be) ++{ ++ dprintk("PRINT SHORT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->bse_length); ++ } ++} ++ ++void print_clist(struct list_head *list, unsigned int count) ++{ ++ struct pnfs_block_short_extent *be; ++ unsigned int i = 0; ++ ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, bse_node) { ++ i++; ++ print_short_extent(be); ++ } ++ if (i != count) ++ dprintk("\n\nExpected %u entries\n\n\n", count); ++ dprintk("****************\n"); ++} ++ ++/* Note: In theory, we should do more checking that devid's match between ++ * old and new, but if they don't, the lists are too corrupt to salvage anyway. 
++ */ ++/* Note this is very similar to add_and_merge_extent */ ++static void add_to_commitlist(struct pnfs_block_layout *bl, ++ struct pnfs_block_short_extent *new) ++{ ++ struct list_head *clist = &bl->bl_commit; ++ struct pnfs_block_short_extent *old, *save; ++ sector_t end = new->bse_f_offset + new->bse_length; ++ ++ dprintk("%s enter\n", __func__); ++ print_short_extent(new); ++ print_clist(clist, bl->bl_count); ++ bl->bl_count++; ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. ++ */ ++ list_for_each_entry_safe(old, save, clist, bse_node) { ++ if (new->bse_f_offset < old->bse_f_offset) ++ break; ++ if (end <= old->bse_f_offset + old->bse_length) { ++ /* Range is already in list */ ++ bl->bl_count--; ++ kfree(new); ++ return; ++ } else if (new->bse_f_offset <= ++ old->bse_f_offset + old->bse_length) { ++ /* new overlaps or abuts existing be */ ++ if (new->bse_mdev == old->bse_mdev) { ++ /* extend new to fully replace old */ ++ new->bse_length += new->bse_f_offset - ++ old->bse_f_offset; ++ new->bse_f_offset = old->bse_f_offset; ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ } ++ /* Note that if we never hit the above break, old will not point to a ++ * valid extent. However, in that case &old->bse_node==list. ++ */ ++ list_add_tail(&new->bse_node, &old->bse_node); ++ /* Scan forward for overlaps. If we find any, extend new and ++ * remove the overlapped extent. 
++ */ ++ old = list_prepare_entry(new, clist, bse_node); ++ list_for_each_entry_safe_continue(old, save, clist, bse_node) { ++ if (end < old->bse_f_offset) ++ break; ++ /* new overlaps or abuts old */ ++ if (new->bse_mdev == old->bse_mdev) { ++ if (end < old->bse_f_offset + old->bse_length) { ++ /* extend new to fully cover old */ ++ end = old->bse_f_offset + old->bse_length; ++ new->bse_length = end - new->bse_f_offset; ++ } ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ dprintk("%s: after merging\n", __func__); ++ print_clist(clist, bl->bl_count); ++} ++ ++/* Note the range described by offset, length is guaranteed to be contained ++ * within be. ++ */ ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length) ++{ ++ sector_t new_end, end = offset + length; ++ struct pnfs_block_short_extent *new; ++ struct pnfs_block_layout *bl = container_of(be->be_inval, ++ struct pnfs_block_layout, ++ bl_inval); ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ mark_written_sectors(be->be_inval, offset, length); ++ /* We want to add the range to commit list, but it must be ++ * block-normalized, and verified that the normalized range has ++ * been entirely written to disk. 
++ */ ++ new->bse_f_offset = offset; ++ offset = normalize(offset, bl->bl_blocksize); ++ if (offset < new->bse_f_offset) { ++ if (is_range_written(be->be_inval, offset, new->bse_f_offset)) ++ new->bse_f_offset = offset; ++ else ++ new->bse_f_offset = offset + bl->bl_blocksize; ++ } ++ new_end = normalize_up(end, bl->bl_blocksize); ++ if (end < new_end) { ++ if (is_range_written(be->be_inval, end, new_end)) ++ end = new_end; ++ else ++ end = new_end - bl->bl_blocksize; ++ } ++ if (end <= new->bse_f_offset) { ++ kfree(new); ++ return 0; ++ } ++ new->bse_length = end - new->bse_f_offset; ++ new->bse_devid = be->be_devid; ++ new->bse_mdev = be->be_mdev; ++ ++ spin_lock(&bl->bl_ext_lock); ++ /* new will be freed, either by add_to_commitlist if it decides not ++ * to use it, or after LAYOUTCOMMIT uses it in the commitlist. ++ */ ++ add_to_commitlist(bl, new); ++ spin_unlock(&bl->bl_ext_lock); ++ return 0; ++} ++ ++static void print_bl_extent(struct pnfs_block_extent *be) ++{ ++ dprintk("PRINT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->be_length); ++ dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); ++ dprintk(" be_state %d\n", be->be_state); ++ } ++} ++ ++static void ++destroy_extent(struct kref *kref) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = container_of(kref, struct pnfs_block_extent, be_refcnt); ++ dprintk("%s be=%p\n", __func__, be); ++ kfree(be); ++} ++ ++void ++put_extent(struct pnfs_block_extent *be) ++{ ++ if (be) { ++ dprintk("%s enter %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_put(&be->be_refcnt, destroy_extent); ++ } ++} ++ ++struct pnfs_block_extent *alloc_extent(void) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL); ++ if (!be) ++ return NULL; ++ INIT_LIST_HEAD(&be->be_node); ++ kref_init(&be->be_refcnt); ++ be->be_inval = NULL; ++ return be; ++} ++ ++struct 
pnfs_block_extent * ++get_extent(struct pnfs_block_extent *be) ++{ ++ if (be) ++ kref_get(&be->be_refcnt); ++ return be; ++} ++ ++void print_elist(struct list_head *list) ++{ ++ struct pnfs_block_extent *be; ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, be_node) { ++ print_bl_extent(be); ++ } ++ dprintk("****************\n"); ++} ++ ++static inline int ++extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) ++{ ++ /* Note this assumes new->be_f_offset >= old->be_f_offset */ ++ return (new->be_state == old->be_state) && ++ ((new->be_state == PNFS_BLOCK_NONE_DATA) || ++ ((new->be_v_offset - old->be_v_offset == ++ new->be_f_offset - old->be_f_offset) && ++ new->be_mdev == old->be_mdev)); ++} ++ ++/* Adds new to appropriate list in bl, modifying new and removing existing ++ * extents as appropriate to deal with overlaps. ++ * ++ * See find_get_extent for list constraints. ++ * ++ * Refcount on new is already set. If end up not using it, or error out, ++ * need to put the reference. ++ * ++ * Lock is held by caller. ++ */ ++int ++add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new) ++{ ++ struct pnfs_block_extent *be, *tmp; ++ sector_t end = new->be_f_offset + new->be_length; ++ struct list_head *list; ++ ++ dprintk("%s enter with be=%p\n", __func__, new); ++ print_bl_extent(new); ++ list = &bl->bl_extents[choose_list(new->be_state)]; ++ print_elist(list); ++ ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. 
++ */ ++ list_for_each_entry_safe_reverse(be, tmp, list, be_node) { ++ if (new->be_f_offset >= be->be_f_offset + be->be_length) ++ break; ++ if (new->be_f_offset >= be->be_f_offset) { ++ if (end <= be->be_f_offset + be->be_length) { ++ /* new is a subset of existing be*/ ++ if (extents_consistent(be, new)) { ++ dprintk("%s: new is subset, ignoring\n", ++ __func__); ++ put_extent(new); ++ return 0; ++ } else { ++ goto out_err; ++ } ++ } else { ++ /* |<-- be -->| ++ * |<-- new -->| */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ new->be_length += new->be_f_offset - ++ be->be_f_offset; ++ new->be_f_offset = be->be_f_offset; ++ new->be_v_offset = be->be_v_offset; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } else if (end >= be->be_f_offset + be->be_length) { ++ /* new extent overlap existing be */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } else if (end > be->be_f_offset) { ++ /* |<-- be -->| ++ *|<-- new -->| */ ++ if (extents_consistent(new, be)) { ++ /* extend new to fully replace be */ ++ new->be_length += be->be_f_offset + be->be_length - ++ new->be_f_offset - new->be_length; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } ++ /* Note that if we never hit the above break, be will not point to a ++ * valid extent. However, in that case &be->be_node==list. ++ */ ++ list_add(&new->be_node, &be->be_node); ++ dprintk("%s: inserting new\n", __func__); ++ print_elist(list); ++ /* STUB - The per-list consistency checks have all been done, ++ * should now check cross-list consistency. ++ */ ++ return 0; ++ ++ out_err: ++ put_extent(new); ++ return -EIO; ++} ++ ++/* Returns extent, or NULL. 
If a second READ extent exists, it is returned ++ * in cow_read, if given. ++ * ++ * The extents are kept in two separate ordered lists, one for READ and NONE, ++ * one for READWRITE and INVALID. Within each list, we assume: ++ * 1. Extents are ordered by file offset. ++ * 2. For any given isect, there is at most one extent that matches. ++ */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read) ++{ ++ struct pnfs_block_extent *be, *cow, *ret; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ cow = ret = NULL; ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret && ++ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ if (!ret) ++ ret = be; ++ else if (be->be_state != PNFS_BLOCK_READ_DATA) ++ put_extent(be); ++ else ++ cow = be; ++ break; ++ } ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ if (cow_read) ++ *cow_read = cow; ++ print_bl_extent(ret); ++ return ret; ++} ++ ++/* Similar to find_get_extent, but called with lock held, and ignores cow */ ++static struct pnfs_block_extent * ++find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) ++{ ++ struct pnfs_block_extent *be, *ret = NULL; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ 
atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ ret = be; ++ break; ++ } ++ } ++ } ++ print_bl_extent(ret); ++ return ret; ++} ++ ++int ++encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *arg) ++{ ++ sector_t start, end; ++ struct pnfs_block_short_extent *lce, *save; ++ unsigned int count = 0; ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct list_head *ranges = &bld->ranges; ++ __be32 *p, *xdr_start; ++ ++ dprintk("%s enter\n", __func__); ++ start = arg->lseg.offset >> 9; ++ end = start + (arg->lseg.length >> 9); ++ dprintk("%s set start=%llu, end=%llu\n", ++ __func__, (u64)start, (u64)end); ++ ++ /* BUG - creation of bl_commit is buggy - need to wait for ++ * entire block to be marked WRITTEN before it can be added. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ /* Want to adjust for possible truncate */ ++ /* We now want to adjust argument range */ ++ ++ /* XDR encode the ranges found */ ++ xdr_start = xdr_reserve_space(xdr, 8); ++ if (!xdr_start) ++ goto out; ++ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { ++ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); ++ if (!p) ++ break; ++ WRITE_DEVID(&lce->bse_devid); ++ WRITE64(lce->bse_f_offset << 9); ++ WRITE64(lce->bse_length << 9); ++ WRITE64(0LL); ++ WRITE32(PNFS_BLOCK_READWRITE_DATA); ++ list_del(&lce->bse_node); ++ list_add_tail(&lce->bse_node, ranges); ++ bl->bl_count--; ++ count++; ++ } ++ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); ++ xdr_start[1] = cpu_to_be32(count); ++out: ++ spin_unlock(&bl->bl_ext_lock); ++ dprintk("%s found %i ranges\n", __func__, count); ++ return 0; ++} ++ ++/* Helper function to set_to_rw that initialize a new extent */ ++static void ++_prep_new_extent(struct pnfs_block_extent *new, ++ struct pnfs_block_extent *orig, ++ sector_t offset, sector_t length, int state) ++{ ++ kref_init(&new->be_refcnt); ++ /* don't need to 
INIT_LIST_HEAD(&new->be_node) */ ++ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct pnfs_deviceid)); ++ new->be_mdev = orig->be_mdev; ++ new->be_f_offset = offset; ++ new->be_length = length; ++ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; ++ new->be_state = state; ++ new->be_inval = orig->be_inval; ++} ++ ++/* Tries to merge be with extent in front of it in list. ++ * Frees storage if not used. ++ */ ++static struct pnfs_block_extent * ++_front_merge(struct pnfs_block_extent *be, struct list_head *head, ++ struct pnfs_block_extent *storage) ++{ ++ struct pnfs_block_extent *prev; ++ ++ if (!storage) ++ goto no_merge; ++ if (&be->be_node == head || be->be_node.prev == head) ++ goto no_merge; ++ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); ++ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || ++ !extents_consistent(prev, be)) ++ goto no_merge; ++ _prep_new_extent(storage, prev, prev->be_f_offset, ++ prev->be_length + be->be_length, prev->be_state); ++ list_replace(&prev->be_node, &storage->be_node); ++ put_extent(prev); ++ list_del(&be->be_node); ++ put_extent(be); ++ return storage; ++ ++ no_merge: ++ kfree(storage); ++ return be; ++} ++ ++/* Converts the part of the INVALID_DATA extent containing offset that ++ * overlaps [offset, offset + length) to READWRITE_DATA, splitting the ++ * extent into up to three pieces. ++ * Returns the end of the extent found, or offset + length when no extent ++ * contains offset or the allocations fail, so callers that loop on the ++ * return value always make progress. ++ */ ++static u64 ++set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) ++{ ++ u64 rv = offset + length; ++ struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; ++ struct pnfs_block_extent *children[3]; ++ struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; ++ int i = 0, j; ++ ++ dprintk("%s(%llu, %llu)\n", __func__, offset, length); ++ /* Create storage for up to three new extents e1, e2, e3 */ ++ e1 = kmalloc(sizeof(*e1), GFP_KERNEL); ++ e2 = kmalloc(sizeof(*e2), GFP_KERNEL); ++ e3 = kmalloc(sizeof(*e3), GFP_KERNEL); ++ /* BUG - we are ignoring any failure */ ++ if (!e1 || !e2 || !e3) ++ goto out_nosplit; ++ ++ spin_lock(&bl->bl_ext_lock); ++ be = find_get_extent_locked(bl, offset); ++ if (!be) { ++ /* No extent contains offset: nothing to split, rv stays ++ * offset + length. ++ */ ++ spin_unlock(&bl->bl_ext_lock); ++ goto out_nosplit; ++ } ++ rv = be->be_f_offset + be->be_length; ++ if (be->be_state != 
PNFS_BLOCK_INVALID_DATA) { ++ spin_unlock(&bl->bl_ext_lock); ++ /* Drop the reference taken by find_get_extent_locked() */ ++ put_extent(be); ++ goto out_nosplit; ++ } ++ /* Add e* to children, bumping e*'s krefs */ ++ if (be->be_f_offset != offset) { ++ _prep_new_extent(e1, be, be->be_f_offset, ++ offset - be->be_f_offset, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e1; ++ print_bl_extent(e1); ++ } else ++ merge1 = e1; ++ _prep_new_extent(e2, be, offset, ++ min(length, be->be_f_offset + be->be_length - offset), ++ PNFS_BLOCK_READWRITE_DATA); ++ children[i++] = e2; ++ print_bl_extent(e2); ++ if (offset + length < be->be_f_offset + be->be_length) { ++ _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, ++ be->be_f_offset + be->be_length - ++ offset - length, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e3; ++ print_bl_extent(e3); ++ } else ++ merge2 = e3; ++ ++ /* Remove be from list, and insert the e* */ ++ /* We don't get refs on e*, since this list is the base reference ++ * set when init'ed. ++ */ ++ if (i < 3) ++ children[i] = NULL; ++ new = children[0]; ++ list_replace(&be->be_node, &new->be_node); ++ put_extent(be); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); ++ for (j = 1; j < i; j++) { ++ old = new; ++ new = children[j]; ++ list_add(&new->be_node, &old->be_node); ++ } ++ if (merge2) { ++ /* This is a HACK, should just create a _back_merge function */ ++ new = list_entry(new->be_node.next, ++ struct pnfs_block_extent, be_node); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ ++ /* Since we removed the base reference above, be is now scheduled for ++ * destruction. 
++ */ ++ put_extent(be); ++ dprintk("%s returns %llu after split\n", __func__, rv); ++ return rv; ++ ++ out_nosplit: ++ kfree(e1); ++ kfree(e2); ++ kfree(e3); ++ dprintk("%s returns %llu without splitting\n", __func__, rv); ++ return rv; ++} ++ ++void ++clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct pnfs_layoutcommit_arg *arg, ++ int status) ++{ ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct pnfs_block_short_extent *lce, *save; ++ ++ dprintk("%s status %d\n", __func__, status); ++ list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) { ++ if (likely(!status)) { ++ u64 offset = lce->bse_f_offset; ++ u64 end = offset + lce->bse_length; ++ ++ do { ++ offset = set_to_rw(bl, offset, end - offset); ++ } while (offset < end); ++ ++ kfree(lce); ++ } else { ++ spin_lock(&bl->bl_ext_lock); ++ add_to_commitlist(bl, lce); ++ spin_unlock(&bl->bl_ext_lock); ++ } ++ } ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-23 12:09:03.292511531 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-23 12:09:03.293491476 -0400 +@@ -0,0 +1,6 @@ ++# ++# Makefile for the pNFS block layout driver kernel module ++# ++obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o ++blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \ ++ extents.o block-device-discovery-pipe.o +diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h +--- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-23 12:09:03.293491476 -0400 +@@ -8,6 +8,8 @@ + #ifndef __LINUX_FS_NFS_CALLBACK_H + #define __LINUX_FS_NFS_CALLBACK_H + ++#include ++ + #define NFS4_CALLBACK 0x40000000 + #define NFS4_CALLBACK_XDRSIZE 2048 + #define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE) +@@ -72,6 +74,8 @@ 
struct cb_recallargs { + + #if defined(CONFIG_NFS_V4_1) + ++#include ++ + struct referring_call { + uint32_t rc_sequenceid; + uint32_t rc_slotid; +@@ -111,6 +115,13 @@ extern int nfs41_validate_delegation_sta + + #define RCA4_TYPE_MASK_RDATA_DLG 0 + #define RCA4_TYPE_MASK_WDATA_DLG 1 ++#define RCA4_TYPE_MASK_DIR_DLG 2 ++#define RCA4_TYPE_MASK_FILE_LAYOUT 3 ++#define RCA4_TYPE_MASK_BLK_LAYOUT 4 ++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 ++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 + + struct cb_recallanyargs { + struct sockaddr *craa_addr; +@@ -127,6 +138,37 @@ struct cb_recallslotargs { + extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy); + ++struct cb_pnfs_layoutrecallargs { ++ struct sockaddr *cbl_addr; ++ struct nfs_fh cbl_fh; ++ struct nfs4_pnfs_layout_segment cbl_seg; ++ struct nfs_fsid cbl_fsid; ++ uint32_t cbl_recall_type; ++ uint32_t cbl_layout_type; ++ uint32_t cbl_layoutchanged; ++ nfs4_stateid cbl_stateid; ++}; ++ ++extern unsigned pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args, ++ void *dummy); ++ ++struct cb_pnfs_devicenotifyitem { ++ uint32_t cbd_notify_type; ++ uint32_t cbd_layout_type; ++ struct pnfs_deviceid cbd_dev_id; ++ uint32_t cbd_immediate; ++}; ++ ++/* XXX: Should be dynamic up to max compound size */ ++#define NFS4_DEV_NOTIFY_MAXENTRIES 10 ++struct cb_pnfs_devicenotifyargs { ++ struct sockaddr *addr; ++ int ndevs; ++ struct cb_pnfs_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES]; ++}; ++ ++extern unsigned pnfs_cb_devicenotify(struct cb_pnfs_devicenotifyargs *args, ++ void *dummy); + #endif /* CONFIG_NFS_V4_1 */ + + extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); +diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c +--- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ 
linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-23 12:09:03.294522414 -0400 +@@ -8,10 +8,15 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + #include "nfs4_fs.h" + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include "pnfs.h" + + #ifdef NFS_DEBUG + #define NFSDBG_FACILITY NFSDBG_CALLBACK +@@ -62,16 +67,6 @@ out: + return res->status; + } + +-static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) +-{ +-#if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion > 0) +- return nfs41_validate_delegation_stateid; +-#endif +- return nfs4_validate_delegation_stateid; +-} +- +- + __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) + { + struct nfs_client *clp; +@@ -92,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_re + inode = nfs_delegation_find_inode(clp, &args->fh); + if (inode != NULL) { + /* Set up a helper thread to actually return the delegation */ +- switch (nfs_async_inode_return_delegation(inode, &args->stateid, +- nfs_validate_delegation_stateid(clp))) { ++ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; +@@ -116,24 +110,364 @@ out: + + int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { +- if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (delegation == NULL || memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data))) + return 0; + return 1; + } + + #if defined(CONFIG_NFS_V4_1) + ++static bool ++pnfs_is_next_layout_stateid(const struct pnfs_layout_type *lo, ++ const nfs4_stateid stateid) ++{ ++ int seqlock; ++ bool res; ++ u32 oldseqid, newseqid; ++ ++ do { ++ seqlock = read_seqbegin(&lo->seqlock); ++ oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid); ++ newseqid = be32_to_cpu(stateid.u.stateid.seqid); ++ 
res = !memcmp(lo->stateid.u.stateid.other, ++ stateid.u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE); ++ if (res) { /* comparing layout stateids */ ++ if (oldseqid == ~0) ++ res = (newseqid == 1); ++ else ++ res = (newseqid == oldseqid + 1); ++ } else { /* open stateid */ ++ res = !memcmp(lo->stateid.u.data, ++ &zero_stateid, ++ NFS4_STATEID_SIZE); ++ if (res) ++ res = (newseqid == 1); ++ } ++ } while (read_seqretry(&lo->seqlock, seqlock)); ++ ++ return res; ++} ++ ++/* ++ * Retrieve an inode based on layout recall parameters ++ * ++ * Note: caller must iput(inode) to dereference the inode. ++ */ ++static struct inode * ++nfs_layoutrecall_find_inode(struct nfs_client *clp, ++ const struct cb_pnfs_layoutrecallargs *args) ++{ ++ struct nfs_inode *nfsi; ++ struct pnfs_layout_type *layout; ++ struct nfs_server *server; ++ struct inode *ino = NULL; ++ ++ dprintk("%s: Begin recall_type=%d clp %p\n", ++ __func__, args->cbl_recall_type, clp); ++ ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry(layout, &clp->cl_layouts, lo_layouts) { ++ nfsi = PNFS_NFS_INODE(layout); ++ if (!nfsi) ++ continue; ++ ++ dprintk("%s: Searching inode=%lu\n", ++ __func__, nfsi->vfs_inode.i_ino); ++ ++ if (args->cbl_recall_type == RETURN_FILE) { ++ if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh)) ++ continue; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ server = NFS_SERVER(&nfsi->vfs_inode); ++ if (server->fsid.major != args->cbl_fsid.major || ++ server->fsid.minor != args->cbl_fsid.minor) ++ continue; ++ } ++ ++ /* Make sure client didn't clean up layout without ++ * telling the server */ ++ if (!has_layout(nfsi)) ++ continue; ++ ++ ino = igrab(&nfsi->vfs_inode); ++ dprintk("%s: Found inode=%p\n", __func__, ino); ++ break; ++ } ++ spin_unlock(&clp->cl_lock); ++ return ino; ++} ++ ++struct recall_layout_threadargs { ++ struct inode *inode; ++ struct nfs_client *clp; ++ struct completion started; ++ struct cb_pnfs_layoutrecallargs *rl; ++ int result; ++}; ++ ++static int 
pnfs_recall_layout(void *data) ++{ ++ struct inode *inode, *ino; ++ struct nfs_client *clp; ++ struct cb_pnfs_layoutrecallargs rl; ++ struct nfs4_pnfs_layoutreturn *lrp; ++ struct recall_layout_threadargs *args = ++ (struct recall_layout_threadargs *)data; ++ int status = 0; ++ ++ daemonize("nfsv4-layoutreturn"); ++ ++ dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n", ++ __func__, args->rl->cbl_recall_type, ++ args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor); ++ ++ clp = args->clp; ++ inode = args->inode; ++ rl = *args->rl; ++ ++ /* support whole file layouts only */ ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ ++ if (rl.cbl_recall_type == RETURN_FILE) { ++ if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout, ++ rl.cbl_stateid)) ++ status = pnfs_return_layout(inode, &rl.cbl_seg, ++ &rl.cbl_stateid, RETURN_FILE, ++ false); ++ else ++ status = cpu_to_be32(NFS4ERR_DELAY); ++ if (status) ++ dprintk("%s RETURN_FILE error: %d\n", __func__, status); ++ else ++ status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ args->result = status; ++ complete(&args->started); ++ goto out; ++ } ++ ++ status = cpu_to_be32(NFS4_OK); ++ args->result = status; ++ complete(&args->started); ++ args = NULL; ++ ++ /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */ ++ while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) { ++ /* FIXME: need to check status on pnfs_return_layout */ ++ pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false); ++ iput(ino); ++ } ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (!lrp) { ++ dprintk("%s: allocation failed. 
Cannot send last LAYOUTRETURN\n", ++ __func__); ++ goto out; ++ } ++ ++ /* send final layoutreturn */ ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = rl.cbl_layout_type; ++ lrp->args.return_type = rl.cbl_recall_type; ++ lrp->args.lseg = rl.cbl_seg; ++ lrp->args.inode = inode; ++ pnfs4_proc_layoutreturn(lrp, true); ++ ++out: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ nfs_put_client(clp); ++ module_put_and_exit(0); ++ dprintk("%s: exit status %d\n", __func__, 0); ++ return 0; ++} ++ ++/* ++ * Asynchronous layout recall! ++ */ ++static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, ++ struct cb_pnfs_layoutrecallargs *rl) ++{ ++ struct recall_layout_threadargs data = { ++ .clp = clp, ++ .inode = inode, ++ .rl = rl, ++ }; ++ struct task_struct *t; ++ int status = -EAGAIN; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* FIXME: do not allow two concurrent layout recalls */ ++ if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) ++ return status; ++ ++ init_completion(&data.started); ++ __module_get(THIS_MODULE); ++ if (!atomic_inc_not_zero(&clp->cl_count)) ++ goto out_put_no_client; ++ ++ t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout"); ++ if (IS_ERR(t)) { ++ printk(KERN_INFO "NFS: Layout recall callback thread failed " ++ "for client (clientid %08x/%08x)\n", ++ (unsigned)(clp->cl_clientid >> 32), ++ (unsigned)(clp->cl_clientid)); ++ status = PTR_ERR(t); ++ goto out_module_put; ++ } ++ wait_for_completion(&data.started); ++ return data.result; ++out_module_put: ++ nfs_put_client(clp); ++out_put_no_client: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ module_put(THIS_MODULE); ++ return status; ++} ++ ++static int pnfs_recall_all_layouts(struct nfs_client *clp) ++{ ++ struct cb_pnfs_layoutrecallargs rl; ++ struct inode *inode; ++ int status = 0; ++ ++ rl.cbl_recall_type = RETURN_ALL; ++ rl.cbl_seg.iomode = IOMODE_ANY; ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ 
++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, &rl); ++ if (!inode) ++ return status; ++ status = pnfs_async_return_layout(clp, inode, &rl); ++ iput(inode); ++ ++ return status; ++} ++ ++__be32 pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ struct inode *inode = NULL; ++ __be32 res; ++ int status; ++ unsigned int num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); ++ clp = nfs_find_client(args->cbl_addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->cbl_addr)); ++ goto out; ++ } ++ ++ res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ /* the callback must come from the MDS personality */ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) ++ goto loop; ++ if (args->cbl_recall_type == RETURN_FILE) { ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (inode != NULL) { ++ status = pnfs_async_return_layout(clp, inode, ++ args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++ } else { /* _ALL or _FSID */ ++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (!inode) ++ goto loop; ++ status = pnfs_async_return_layout(clp, inode, args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++loop: ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ ++/* Remove the deviceid(s) from the nfs_client deviceid cache */ ++static __be32 pnfs_devicenotify_client(struct nfs_client *clp, ++ struct cb_pnfs_devicenotifyargs *args) ++{ ++ uint32_t type; ++ int i; ++ ++ dprintk("%s: --> clp %p\n", __func__, clp); ++ ++ for (i = 
0; i < args->ndevs; i++) { ++ struct cb_pnfs_devicenotifyitem *dev = &args->devs[i]; ++ type = dev->cbd_notify_type; ++ if (type == NOTIFY_DEVICEID4_DELETE && clp->cl_devid_cache) ++ nfs4_delete_device(clp->cl_devid_cache, ++ &dev->cbd_dev_id); ++ else if (type == NOTIFY_DEVICEID4_CHANGE) ++ printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE " ++ "not supported\n", __func__); ++ } ++ return 0; ++} ++ ++__be32 pnfs_cb_devicenotify(struct cb_pnfs_devicenotifyargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ __be32 res = 0; ++ unsigned int num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = __constant_htonl(NFS4ERR_INVAL); ++ clp = nfs_find_client(args->addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->addr)); ++ goto out; ++ } ++ ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ res = pnfs_devicenotify_client(clp, args); ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { + if (delegation == NULL) + return 0; + +- /* seqid is 4-bytes long */ +- if (((u32 *) &stateid->data)[0] != 0) ++ if (stateid->u.stateid.seqid != 0) + return 0; +- if (memcmp(&delegation->stateid.data[4], &stateid->data[4], +- sizeof(stateid->data)-4)) ++ if (memcmp(&delegation->stateid.u.stateid.other, ++ &stateid->u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE)) + return 0; + + return 1; +@@ -335,13 +669,37 @@ out: + return status; + } + ++static inline bool ++validate_bitmap_values(const unsigned long *mask) ++{ ++ int i; ++ ++ if (*mask == 0) ++ return true; ++ if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_WDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_DIR_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, mask) 
|| ++ test_bit(RCA4_TYPE_MASK_BLK_LAYOUT, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OBJ_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OBJ_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OTHER_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OTHER_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ return false; ++} ++ + __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) + { + struct nfs_client *clp; + __be32 status; + fmode_t flags = 0; + +- status = htonl(NFS4ERR_OP_NOT_IN_SESSION); ++ status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->craa_addr, 4); + if (clp == NULL) + goto out; +@@ -349,16 +707,25 @@ __be32 nfs4_callback_recallany(struct cb + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + ++ status = cpu_to_be32(NFS4ERR_INVAL); ++ if (!validate_bitmap_values((const unsigned long *) ++ &args->craa_type_mask)) ++ return status; ++ ++ status = cpu_to_be32(NFS4_OK); + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; ++ if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) ++ &args->craa_type_mask)) ++ if (pnfs_recall_all_layouts(clp) == -EAGAIN) ++ status = cpu_to_be32(NFS4ERR_DELAY); + + if (flags) + nfs_expire_all_delegation_types(clp, flags); +- status = htonl(NFS4_OK); + out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c +--- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-23 12:09:03.295502055 -0400 +@@ -22,6 +22,8 @@ + #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + + #if defined(CONFIG_NFS_V4_1) 
++#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) ++#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) + #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +@@ -136,7 +138,7 @@ static __be32 decode_stateid(struct xdr_ + p = read_buf(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); +- memcpy(stateid->data, p, 16); ++ memcpy(stateid->u.data, p, 16); + return 0; + } + +@@ -220,6 +222,148 @@ out: + + #if defined(CONFIG_NFS_V4_1) + ++static __be32 decode_pnfs_layoutrecall_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_pnfs_layoutrecallargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ ++ args->cbl_addr = svc_addr(rqstp); ++ p = read_buf(xdr, 4 * sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ ++ args->cbl_layout_type = ntohl(*p++); ++ args->cbl_seg.iomode = ntohl(*p++); ++ args->cbl_layoutchanged = ntohl(*p++); ++ args->cbl_recall_type = ntohl(*p++); ++ ++ if (likely(args->cbl_recall_type == RETURN_FILE)) { ++ status = decode_fh(xdr, &args->cbl_fh); ++ if (unlikely(status != 0)) ++ goto out; ++ ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_seg.offset); ++ p = xdr_decode_hyper(p, &args->cbl_seg.length); ++ status = decode_stateid(xdr, &args->cbl_stateid); ++ if (unlikely(status != 0)) ++ goto out; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_fsid.major); ++ p = xdr_decode_hyper(p, &args->cbl_fsid.minor); ++ } ++ dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d " ++ "fsid %llx-%llx fhsize %d\n", __func__, ++ args->cbl_layout_type, args->cbl_seg.iomode, ++ 
args->cbl_layoutchanged, args->cbl_recall_type, ++ args->cbl_fsid.major, args->cbl_fsid.minor, ++ args->cbl_fh.size); ++out: ++ dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); ++ return status; ++} ++ ++static ++__be32 decode_pnfs_devicenotify_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_pnfs_devicenotifyargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ u32 tmp; ++ int n, i; ++ args->ndevs = 0; ++ ++ args->addr = svc_addr(rqstp); ++ ++ /* Num of device notifications */ ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ n = ntohl(*p++); ++ if (n <= 0) ++ goto out; ++ ++ /* XXX: need to possibly return error in this case */ ++ if (n > NFS4_DEV_NOTIFY_MAXENTRIES) { ++ dprintk("%s: Processing (%d) notifications out of (%d)\n", ++ __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n); ++ n = NFS4_DEV_NOTIFY_MAXENTRIES; ++ } ++ ++ /* Decode each dev notification */ ++ for (i = 0; i < n; i++) { ++ struct cb_pnfs_devicenotifyitem *dev = &args->devs[i]; ++ ++ p = read_buf(xdr, (4 * sizeof(uint32_t)) ++ + NFS4_PNFS_DEVICEID4_SIZE); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* bitmap size */ ++ if (tmp != 1) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_notify_type = ntohl(*p++); ++ if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* opaque size */ ++ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 8)) || ++ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 4))) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_layout_type = ntohl(*p++); ++ memcpy(dev->cbd_dev_id.data, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += 
XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ ++ if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_DELAY); ++ goto out; ++ } ++ dev->cbd_immediate = ntohl(*p++); ++ } else { ++ dev->cbd_immediate = 0; ++ } ++ ++ args->ndevs++; ++ ++ dprintk("%s: type %d layout 0x%x immediate %d\n", ++ __func__, dev->cbd_notify_type, dev->cbd_layout_type, ++ dev->cbd_immediate); ++ } ++out: ++ dprintk("%s: status %d ndevs %d\n", ++ __func__, ntohl(status), args->ndevs); ++ return status; ++} ++ + static __be32 decode_sessionid(struct xdr_stream *xdr, + struct nfs4_sessionid *sid) + { +@@ -574,11 +718,11 @@ preprocess_nfs41_op(int nop, unsigned in + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: ++ case OP_CB_LAYOUTRECALL: ++ case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + +- case OP_CB_LAYOUTRECALL: +- case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: +@@ -739,6 +883,18 @@ static struct callback_op callback_ops[] + .res_maxsize = CB_OP_RECALL_RES_MAXSZ, + }, + #if defined(CONFIG_NFS_V4_1) ++ [OP_CB_LAYOUTRECALL] = { ++ .process_op = (callback_process_op_t)pnfs_cb_layoutrecall, ++ .decode_args = ++ (callback_decode_arg_t)decode_pnfs_layoutrecall_args, ++ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, ++ }, ++ [OP_CB_NOTIFY_DEVICEID] = { ++ .process_op = (callback_process_op_t)pnfs_cb_devicenotify, ++ .decode_args = ++ (callback_decode_arg_t)decode_pnfs_devicenotify_args, ++ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, ++ }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, +diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-23 12:08:29.037481540 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 
2010-08-23 12:09:03.297501650 -0400 +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + + #include + +@@ -48,6 +49,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_CLIENT + +@@ -150,11 +152,14 @@ static struct nfs_client *nfs_alloc_clie + clp->cl_boot_time = CURRENT_TIME; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; ++ clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + #endif + cred = rpc_lookup_machine_cred(); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; +- ++#if defined(CONFIG_NFS_V4_1) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++#endif + nfs_fscache_get_client_cookie(clp); + + return clp; +@@ -178,7 +183,7 @@ static void nfs4_clear_client_minor_vers + clp->cl_session = NULL; + } + +- clp->cl_call_sync = _nfs4_call_sync; ++ clp->cl_mvops = nfs_v4_minor_ops[0]; + #endif /* CONFIG_NFS_V4_1 */ + } + +@@ -188,7 +193,7 @@ static void nfs4_clear_client_minor_vers + static void nfs4_destroy_callback(struct nfs_client *clp) + { + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) +- nfs_callback_down(clp->cl_minorversion); ++ nfs_callback_down(clp->cl_mvops->minor_version); + } + + static void nfs4_shutdown_client(struct nfs_client *clp) +@@ -251,6 +256,7 @@ void nfs_put_client(struct nfs_client *c + nfs_free_client(clp); + } + } ++EXPORT_SYMBOL(nfs_put_client); + + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + /* +@@ -343,7 +349,7 @@ static int nfs_sockaddr_match_ipaddr(con + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. 
+ */ +-static int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) + { + if (sa1->sa_family != sa2->sa_family) +@@ -357,6 +363,7 @@ static int nfs_sockaddr_cmp(const struct + } + return 0; + } ++EXPORT_SYMBOL(nfs_sockaddr_cmp); + + /* + * Find a client by IP address and protocol version +@@ -548,6 +555,7 @@ int nfs4_check_client_ready(struct nfs_c + return -EPROTONOSUPPORT; + return 0; + } ++EXPORT_SYMBOL(nfs4_check_client_ready); + + /* + * Initialise the timeout values for a connection +@@ -865,9 +873,34 @@ error: + } + + /* ++ * Initialize the pNFS layout driver and setup pNFS related parameters ++ */ ++static void nfs4_init_pnfs(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ struct nfs_client *clp = server->nfs_client; ++ ++ if (nfs4_has_session(clp) && ++ (clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) { ++ server->pnfs_blksize = fsinfo->blksize; ++ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); ++ pnfs_set_ds_iosize(server); ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++static void nfs4_uninit_pnfs(struct nfs_server *server) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ if (server->nfs_client && nfs4_has_session(server->nfs_client)) ++ unmount_pnfs_layoutdriver(server); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++/* + * Load up the server record from information gained in an fsinfo record + */ +-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) ++static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) + { + unsigned long max_rpc_payload; + +@@ -897,6 +930,8 @@ static void nfs_server_set_fsinfo(struct + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; ++ nfs4_init_pnfs(server, mntfh, fsinfo); ++ + server->wtmult = 
nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); +@@ -938,7 +973,7 @@ static int nfs_probe_fsinfo(struct nfs_s + if (error < 0) + goto out_error; + +- nfs_server_set_fsinfo(server, &fsinfo); ++ nfs_server_set_fsinfo(server, mntfh, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { +@@ -1016,6 +1051,7 @@ void nfs_free_server(struct nfs_server * + { + dprintk("--> nfs_free_server()\n"); + ++ nfs4_uninit_pnfs(server); + spin_lock(&nfs_client_lock); + list_del(&server->client_link); + list_del(&server->master_link); +@@ -1126,7 +1162,7 @@ static int nfs4_init_callback(struct nfs + return error; + } + +- error = nfs_callback_up(clp->cl_minorversion, ++ error = nfs_callback_up(clp->cl_mvops->minor_version, + clp->cl_rpcclient->cl_xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", +@@ -1143,10 +1179,8 @@ static int nfs4_init_callback(struct nfs + */ + static int nfs4_init_client_minor_version(struct nfs_client *clp) + { +- clp->cl_call_sync = _nfs4_call_sync; +- + #if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion) { ++ if (clp->cl_mvops->minor_version) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. +@@ -1158,7 +1192,13 @@ static int nfs4_init_client_minor_versio + return -ENOMEM; + + clp->cl_session = session; +- clp->cl_call_sync = _nfs4_call_sync_session; ++ /* ++ * The create session reply races with the server back ++ * channel probe. 
Mark the client NFS_CS_SESSION_INITING ++ * so that the client back channel can find the ++ * nfs_client struct ++ */ ++ clp->cl_cons_state = NFS_CS_SESSION_INITING; + } + #endif /* CONFIG_NFS_V4_1 */ + +@@ -1216,7 +1256,7 @@ error: + /* + * Set up an NFS4 client + */ +-static int nfs4_set_client(struct nfs_server *server, ++int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, +@@ -1259,6 +1299,7 @@ error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; + } ++EXPORT_SYMBOL(nfs4_set_client); + + + /* +@@ -1448,7 +1489,7 @@ struct nfs_server *nfs4_create_referral_ + data->authflavor, + parent_server->client->cl_xprt->prot, + parent_server->client->cl_timeout, +- parent_client->cl_minorversion); ++ parent_client->cl_mvops->minor_version); + if (error < 0) + goto error; + +diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c +--- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-23 12:09:03.297501650 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-23 12:09:03.298501447 -0400 +@@ -0,0 +1,292 @@ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); ++static void bl_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops bl_upcall_ops = { ++ .upcall = bl_pipe_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = bl_pipe_destroy_msg, ++}; ++ ++bl_comm_t *bl_comm_global; ++ ++int ++nfsd_bl_start(void) ++{ ++ bl_comm_t *bl_comm = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ 
dprintk("%s: starting pipe\n", __func__); ++ if (bl_comm_global) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL); ++ if (!bl_comm) { ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ /* FIXME: rename to "spnfs_block" */ ++ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm, ++ &bl_upcall_ops, 0); ++ if (IS_ERR(bl_comm->pipe_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ mutex_init(&bl_comm->lock); ++ mutex_init(&bl_comm->pipe_lock); ++ init_waitqueue_head(&bl_comm->pipe_wq); ++ ++ bl_comm_global = bl_comm; ++ return 0; ++err: ++ rpc_put_mount(); ++ kfree(bl_comm); ++ return rc; ++} ++ ++void ++nfsd_bl_stop(void) ++{ ++ bl_comm_t *c = bl_comm_global; ++ ++ dprintk("%s: stopping pipe\n", __func__); ++ if (!c) ++ return; ++ rpc_unlink(c->pipe_dentry); ++ rpc_put_mount(); ++ bl_comm_global = NULL; ++ kfree(c); ++} ++ ++static ssize_t ++bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst, ++ size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied, ++ left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ ++ return mlen; ++} ++ ++static ssize_t ++bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ bl_comm_t *bc = (bl_comm_t *)rpci->private; ++ bl_comm_msg_t *im = &bc->msg; ++ int ret; ++ bl_comm_res_t *res; ++ ++ ++ if (mlen == 0) { ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ im->msg_res = NULL; ++ wake_up(&bc->pipe_wq); ++ return -EFAULT; ++ } ++ ++ if ((res = kmalloc(mlen, 
GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(res, src, mlen)) { ++ kfree(res); ++ return -EFAULT; ++ } ++ ++ mutex_lock(&bc->pipe_lock); ++ ++ ret = mlen; ++ im->msg_status = res->res_status; ++ im->msg_res = res; ++ ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++ return ret; ++} ++ ++static void ++bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ bl_comm_msg_t *im = msg->data; ++ bl_comm_t *bc = container_of(im, struct bl_comm, msg); ++ ++ if (msg->errno >= 0) ++ return; ++ ++ mutex_lock(&bc->pipe_lock); ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++} ++ ++int ++bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res) ++{ ++ struct rpc_pipe_msg msg; ++ DECLARE_WAITQUEUE(wq, current); ++ int rval = 1; ++ bl_comm_msg_t *m = &bc->msg; ++ ++ if (bc == NULL) { ++ dprintk("%s: No pNFS block daemon available\n", __func__); ++ return 1; ++ } ++ ++ mutex_lock(&bc->lock); ++ mutex_lock(&bc->pipe_lock); ++ ++ memcpy(m, upmsg, sizeof (*m)); ++ ++ memset(&msg, 0, sizeof (msg)); ++ msg.data = m; ++ msg.len = sizeof (*m); ++ ++ add_wait_queue(&bc->pipe_wq, &wq); ++ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&bc->pipe_lock); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ mutex_lock(&bc->pipe_lock); ++ ++ if (m->msg_status == PNFS_BLOCK_SUCCESS) { ++ *res = m->msg_res; ++ rval = 0; ++ } else ++ rval = 1; ++ ++out: ++ mutex_unlock(&bc->pipe_lock); ++ mutex_unlock(&bc->lock); ++ return rval; ++} ++ ++static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len, ++ loff_t *offset) ++{ ++ int cmd, ++ rc; ++ bl_comm_t *bc = bl_comm_global; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int))) ++ return 
-EFAULT; ++ switch (cmd) { ++ case PNFS_BLOCK_CTL_STOP: ++ msg.msg_type = PNFS_UPCALL_MSG_STOP; ++ (void) bl_upcall(bc, &msg, &res); ++ kfree(res); ++ nfsd_bl_stop(); ++ break; ++ ++ case PNFS_BLOCK_CTL_START: ++ rc = nfsd_bl_start(); ++ if (rc != 0) ++ return rc; ++ break; ++ ++ case PNFS_BLOCK_CTL_VERS: ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bc, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ kfree(res); ++ break; ++ ++ default: ++ dprintk("%s: unknown ctl command %d\n", __func__, cmd); ++ break; ++ } ++ return len; ++} ++ ++static struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++ ++/* ++ * bl_init_proc -- set up proc interfaces ++ * ++ * Creating a pnfs_block directory isn't really required at this point ++ * since we've only got a single node in that directory. If the need for ++ * more nodes doesn't present itself shortly this code should revert ++ * to a single top level node. McNeal 11-Aug-2008. ++ */ ++int ++bl_init_proc(void) ++{ ++ struct proc_dir_entry *e; ++ ++ e = proc_mkdir("fs/pnfs_block", NULL); ++ if (!e) ++ return -ENOMEM; ++ ++ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL); ++ if (!e) ++ return -ENOMEM; ++ e->proc_fops = &ctl_ops; ++ ++ return 0; ++} ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-23 12:09:03.299501445 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-23 12:09:03.299501445 -0400 +@@ -0,0 +1,1672 @@ ++/* ++ * bl_ops.c ++ * spNFS ++ * ++ * Created by Rick McNeal on 4/1/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. ++ * ++ */ ++ ++/* ++ * Block layout operations. ++ * ++ * These functions, with the exception of pnfs_block_enabled, are assigned to ++ * the super block s_export_op structure. 
++ */ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) ++ ++#define BL_LAYOUT_HASH_BITS 4 ++#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS) ++#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1) ++#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256) ++ ++#define bl_layout_hashval(id) \ ++ ((id) & BL_LAYOUT_HASH_MASK) ++ ++#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len) ++#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len) ++#define _2SECTS(v) ((v) >> 9) ++ ++#ifndef READ32 ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++(x) = (u64)ntohl(*p++) << 32; \ ++(x) |= ntohl(*p++); \ ++} while (0) ++#endif ++ ++ ++typedef enum {True, False} boolean_t; ++/* ---- block layoutget and commit structure ---- */ ++typedef struct bl_layout_rec { ++ struct list_head blr_hash, ++ blr_layouts; ++ dev_t blr_rdev; ++ struct inode *blr_inode; ++ int blr_recalled; // debug ++ u64 blr_orig_size, ++ blr_commit_size, ++ blr_ext_size; ++ spinlock_t blr_lock; // Protects blr_layouts ++} bl_layout_rec_t; ++ ++static struct list_head layout_hash; ++static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE]; ++static spinlock_t layout_hashtbl_lock; ++ ++/* ---- prototypes ---- */ ++static boolean_t device_slice(dev_t devid); ++static boolean_t device_dm(dev_t devid); ++static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **); ++static bl_layout_rec_t *layout_inode_find(struct inode *i); ++static void layout_inode_del(struct inode *i); ++static char *map_state2name(enum pnfs_block_extent_state4 s); ++static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type); ++static void bld_free(pnfs_blocklayout_devinfo_t *bld); ++static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head 
*volumes, ++ dev_t devid, int local_index); ++static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes, ++ dev_t devid, int my_loc, int idx); ++static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg); ++struct list_head *layout_cache_iter(bl_layout_rec_t *r, ++ struct list_head *bl_possible, struct nfsd4_layout_seg *seg); ++static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h); ++static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h); ++static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg); ++static void print_bll(pnfs_blocklayout_layout_t *b, char *); ++static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r, ++ struct list_head *h, struct nfsd4_layout_seg *seg); ++static inline void bll_collapse(bl_layout_rec_t *r, ++ pnfs_blocklayout_layout_t *c); ++static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len, ++ enum bl_cache_state state, struct list_head *h); ++static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b, ++ enum bl_cache_state c, struct list_head *h); ++static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s); ++static void extents_setup(struct fiemap_extent_info *fei); ++static void extents_count(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_process(struct fiemap_extent_info *fei, ++ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev, ++ pnfs_blocklayout_layout_t *b); ++static void extents_cleanup(struct fiemap_extent_info *fei); ++ ++void ++nfsd_bl_init(void) ++{ ++ int i; ++ dprintk("%s loaded\n", __func__); ++ ++ spin_lock_init(&layout_hashtbl_lock); ++ INIT_LIST_HEAD(&layout_hash); ++ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++) ++ 
INIT_LIST_HEAD(&layout_hashtbl[i]); ++ bl_init_proc(); ++} ++ ++/* ++ * pnfs_block_enabled -- check to see if this file system should be export as ++ * block pnfs ++ */ ++int ++pnfs_block_enabled(struct inode *inode, int ex_flags) ++{ ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ static int bl_comm_once = 0; ++ ++ dprintk("--> %s\n", __func__); ++ /* ++ * FIXME: Figure out method to determine if this file system should ++ * be exported. The following areas need to be checked. ++ * (1) Validate that this file system was exported as a pNFS ++ * block-layout ++ * (2) Has there been successful communication with the ++ * volume daemon? ++ */ ++ /* Check #1 */ ++#ifdef notyet ++ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) { ++ dprintk("%s: pnfs_block not set in export\n", __func__); ++ return 0; ++ } ++#endif ++ ++ /* Check #1 */ ++ if (!bl_comm_once) { ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ if (msg.u.msg_vers != res->u.vers) { ++ dprintk("%s: vers mismatch, kernel != daemon\n", ++ __func__); ++ kfree(res); ++ return 0; ++ } ++ } ++ bl_comm_once = 1; ++ ++ kfree(res); ++ ++ dprintk("<-- %s okay\n", __func__); ++ return 1; ++} ++ ++int ++bl_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_BLOCK_VOLUME; ++} ++ ++int ++bl_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_devid = sb->s_dev; ++ res->gd_verf = 1; ++ res->gd_cookie = 1; ++ return 0; ++} ++ ++static int ++bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld_slice_p, ++ *bld_simple_p, ++ *bld; ++ int status = -EIO, ++ location = 0; ++ struct list_head volumes; ++ ++ dprintk("--> %s\n", __func__); ++ 
INIT_LIST_HEAD(&volumes); ++ ++ bld_simple_p = bld_simple(&volumes, devid->devid, ++ location++); ++ if (!bld_simple_p) ++ goto out; ++ bld_slice_p = bld_slice(&volumes, devid->devid, location++, ++ bld_simple_p->bld_index_loc); ++ ++ if (!bld_slice_p) ++ goto out; ++ ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE) ++ kfree(bld->u.simple.bld_sig); ++ bld_free(bld); ++ } ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++static int ++bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ int status = -EIO, // default to error ++ i, ++ location = 0; ++ struct list_head volumes; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ INIT_LIST_HEAD(&volumes); ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMGET; ++ msg.u.msg_dev = devid->devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: upcall for DMGET failed\n", __func__); ++ goto out; ++ } ++ ++ /* ++ * Don't use bld_alloc() here. If used this will be the first volume ++ * type added to the list whereas the protocol requires it to be the ++ * last. 
++ */ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ goto out; ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE; ++ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes; ++ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL; ++ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__, ++ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL); ++ ++ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes * ++ sizeof (int), GFP_KERNEL); ++ if (!bld->u.stripe.bld_stripe_indexs) ++ goto out; ++ ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) { ++ dev_t dev; ++ pnfs_blocklayout_devinfo_t *bldp; ++ ++ dev = MKDEV(res->u.stripe.devs[i].major, ++ res->u.stripe.devs[i].minor); ++ if (dev == 0) ++ goto out; ++ ++ bldp = bld_simple(&volumes, dev, location++); ++ if (!bldp) { ++ dprintk("%s: bld_simple failed\n", __func__); ++ goto out; ++ } ++ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc); ++ ++ if (!bldp) { ++ dprintk("%s: bld_slice failed\n", __func__); ++ goto out; ++ } ++ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc; ++ ++ } ++ list_add_tail(&bld->bld_list, &volumes); ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SLICE: ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ // No memory to release for these ++ break; ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ kfree(bld->u.simple.bld_sig); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ kfree(bld->u.stripe.bld_stripe_indexs); ++ break; ++ } ++ bld_free(bld); ++ } ++ kfree(res); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++/* ++ * bl_getdeviceinfo -- determine device tree for requested devid ++ */ ++int ++bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct 
nfsd4_pnfs_deviceid *devid) ++{ ++ if (device_slice(devid->devid) == True) ++ return bl_getdeviceinfo_slice(sb, xdr, devid); ++ else if (device_dm(devid->devid) == True) ++ return bl_getdeviceinfo_dm(sb, xdr, devid); ++ return -EINVAL; ++} ++ ++enum nfsstat4 ++bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ pnfs_blocklayout_layout_t *b; ++ bl_layout_rec_t *r; ++ struct list_head bl_possible, ++ *bl_candidates = NULL; ++ boolean_t del_on_error = False; ++ int adj; ++ enum nfsstat4 nfserr = NFS4_OK; ++ ++ dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n", ++ __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset), ++ _2SECTS(res->lg_seg.length), res->lg_seg.iomode); ++ ++ if (res->lg_seg.length == 0) { ++ printk("%s: request length of 0, error condition\n", __func__); ++ return NFS4ERR_BADLAYOUT; ++ } ++ ++ /* ++ * Adjust the length as required per spec. ++ * - First case is were the length is set to (u64)-1. Cheap means to ++ * define the end of the file. ++ * - Second case is were the I/O mode is read-only, but the request is ++ * past the end of the file so the request needs to be trimed. ++ */ ++ if ((res->lg_seg.length == NFS4_MAX_UINT64) || ++ (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) && ++ (res->lg_seg.iomode == IOMODE_READ))) ++ res->lg_seg.length = i->i_size - res->lg_seg.offset; ++ ++ adj = (res->lg_seg.offset & 511) ? 
res->lg_seg.offset & 511 : 0; ++ res->lg_seg.offset -= adj; ++ res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511; ++ ++ if (res->lg_seg.iomode != IOMODE_READ) ++ if (i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE, ++ res->lg_seg.offset, res->lg_seg.length)) ++ return NFS4ERR_IO; ++ ++ INIT_LIST_HEAD(&bl_possible); ++ ++ if ((r = layout_inode_find(i)) == NULL) { ++ if (layout_inode_add(i, &r) == False) { ++ printk("%s: layout_inode_add failed\n", __func__); ++ return NFS4ERR_IO; ++ } ++ del_on_error = True; ++ } ++ BUG_ON(!r); ++ ++ spin_lock(&r->blr_lock); ++ ++ if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) { ++ /* ++ * This will send LAYOUTTRYAGAIN error to the client. ++ */ ++ dprintk("%s: layout_cache_fill_from() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res->lg_return_on_close = 1; ++ res->lg_seg.length = 0; ++ ++ bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg); ++ if (!bl_candidates) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ layout_cache_merge(r, bl_candidates); ++ if (layout_cache_update(r, bl_candidates)) { ++ /* ---- Failed to allocate memory. 
---- */ ++ dprintk("%s: layout_cache_update() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ nfserr = blocklayout_encode_layout(xdr, bl_candidates); ++ if (nfserr) ++ dprintk("%s: layoutget xdr routine failed\n", __func__); ++ ++layoutget_cleanup: ++ if (bl_candidates) { ++ while (!list_empty(bl_candidates)) { ++ b = list_entry(bl_candidates->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ } ++ ++ spin_unlock(&r->blr_lock); ++ if (unlikely(nfserr)) { ++ if (del_on_error == True) ++ layout_inode_del(i); ++ res->lg_seg.length = 0; ++ res->lg_seg.offset = 0; ++ } ++ ++ dprintk("<-- %s (rval %u)\n", __func__, nfserr); ++ return nfserr; ++} ++ ++/* ++ * bl_layoutcommit -- commit changes, especially size, to file system ++ * ++ * Currently this routine isn't called and everything is handled within ++ * nfsd4_layoutcommit(). By not calling this routine the server doesn't ++ * handle a partial return, a set of extents, of the layout. The extents ++ * are decoded here, but nothing is done with them. If this routine is ++ * to be called the interface must change to pass the 'dentry' pointer such ++ * that notify_change() can be called. 
++ */ ++int ++bl_layoutcommit(struct inode *i, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ bl_layout_rec_t *r; ++ int status = 0; ++ u64 lw_plus; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ r = layout_inode_find(i); ++ if (r) { ++ lw_plus = args->lc_last_wr + 1; ++ if (args->lc_newoffset) { ++ dprintk(" lc_last_wr %Lu\n", lw_plus); ++ if (r->blr_orig_size < lw_plus) { ++ r->blr_orig_size = lw_plus; ++ res->lc_size_chg = 1; ++ res->lc_newsize = lw_plus; ++ } ++ } ++ ++ if (args->lc_up_len) { ++ int extents, ++ i; ++ struct pnfs_blocklayout_layout *b; ++ __be32 *p = args->lc_up_layout; ++ ++ /* ++ * Client is returning a set of extents which ++ * should/could be used to update the file system. ++ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08 ++ */ ++ READ32(extents); ++ dprintk(" Client returning %d extents: data size %d\n", ++ extents, args->lc_up_len); ++ b = kmalloc(sizeof (struct pnfs_blocklayout_layout) * ++ extents, GFP_KERNEL); ++ if (b) { ++ for (i = 0; i < extents; i++) { ++ READ64(b[i].bll_vol_id.sbid); ++ READ64(b[i].bll_vol_id.devid); ++ READ64(b[i].bll_foff); ++ READ64(b[i].bll_len); ++ READ64(b[i].bll_soff); ++ READ32(b[i].bll_es); ++ dprintk(" %d: foff %Lu, len %Lu, soff %Lu " ++ "state %s\n", ++ i, _2SECTS(b[i].bll_foff), ++ _2SECTS(b[i].bll_len), ++ _2SECTS(b[i].bll_soff), ++ map_state2name(b[i].bll_es)); ++ } ++ kfree(b); ++ } else { ++ status = -ENOMEM; ++ } ++ } ++ } else ++ dprintk("%s: Unexpected commit to inode %p\n", __func__, i); ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutreturn(struct inode *i, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ int status = 0; ++ bl_layout_rec_t *r; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ ++ r = layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); ++ layout_cache_del(r, &args->lr_seg); ++ 
spin_unlock(&r->blr_lock); ++ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n", ++ r->blr_ext_size, i->i_size, r->blr_orig_size); ++ } ++ ++ layout_inode_del(i); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ bl_layout_rec_t *r; ++ pnfs_blocklayout_layout_t *b; ++ u64 adj; ++ ++ dprintk("--> %s\n", __func__); ++ BUG_ON(!len); ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n", ++ inode->i_sb->s_dev, inode->i_ino, ++ _2SECTS(offset), _2SECTS(len)); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* ++ * XXX figure out how to get a sb since there's no ++ * inode ptr ++ */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++restart: ++ r = layout_inode_find(inode); ++ if (r && len && !r->blr_recalled) { ++ spin_lock(&r->blr_lock); ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (!r->blr_recalled && !b->bll_recalled && ++ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) { ++ b->bll_recalled = 1; ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = 0; ++ lr.cbl_seg.length = NFS4_MAX_UINT64; ++ r->blr_recalled = 1; ++ dprintk(" FULL LAYOUTRECALL\n"); ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ ++ /* ++ * Currently there are only two cases where the ++ * layout is being returned. ++ * (1) Someone is issuing a NFS_WRITE operation ++ * to this layout. ++ * (2) The file has been truncated which means ++ * the layout is immediately made invalid. ++ * In both cases the client must write any ++ * uncommitted modifications to the server via ++ * NFS_WRITE. 
++ */ ++ lr.cbl_layoutchanged = 1; ++ ++ /* ++ * Need to drop the lock because we'll get a ++ * layoutreturn which will block waiting for ++ * the lock. The request will come in on the ++ * same thread which will cause a deadlock. ++ */ ++ spin_unlock(&r->blr_lock); ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ adj = MIN(b->bll_len - (offset - b->bll_foff), ++ len); ++ offset += adj; ++ len -= adj; ++ if (!len) { ++ spin_lock(&r->blr_lock); ++ break; ++ } ++ /* ++ * Since layoutreturn will have been called we ++ * can't assume blr_layouts is still valid, ++ * so restart. ++ */ ++ goto restart; ++ } ++ } ++ spin_unlock(&r->blr_lock); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++/* ++ * []------------------------------------------------------------------[] ++ * | Support functions from here on down. | ++ * []------------------------------------------------------------------[] ++ */ ++ ++/* ++ * bld_simple -- given a dev_t build a simple volume structure ++ * ++ * Simple volume contains the device signature and offset to that data in ++ * the storage volume. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_simple(struct list_head *volumes, dev_t devid, int local_index) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSIG; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to get signature information\n", __func__); ++ goto error; ++ } ++ ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE); ++ if (!bld) ++ return NULL; ++ ++ bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset; ++ bld->u.simple.bld_sig_len = res->u.sig.len; ++ bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL); ++ if (!bld->u.simple.bld_sig) ++ goto error; ++ ++ memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len); ++ kfree(res); ++ return bld; ++ ++error: ++ if (bld) ++ bld_free(bld); ++ if (res) ++ kfree(res); ++ dprintk("%s: error in bld_simple\n", __func__); ++ return NULL; ++} ++ ++/* ++ * bld_slice -- given a dev_t build a slice volume structure ++ * ++ * A slice volume contains the length of the slice/partition and its offset ++ * from the beginning of the storage volume. There's also a reference to ++ * the "simple" volume which contains this slice. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE); ++ if (!bld) ++ return NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Upcall to get slice info failed\n"); ++ bld_free(bld); ++ return NULL; ++ } ++ ++ bld->bld_devid.devid = devid; ++ bld->bld_index_loc = my_loc; ++ bld->u.slice.bld_start = res->u.slice.start * 512LL; ++ bld->u.slice.bld_len = res->u.slice.length * 512LL; ++ bld->u.slice.bld_index = simple_loc; ++ ++ dprintk("%s: start %Lu, len %Lu\n", __func__, ++ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL); ++ ++ kfree(res); ++ dprintk("<-- %s (rval %p)\n", __func__, bld); ++ return bld; ++} ++ ++static int ++layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!list_empty(&r->blr_layouts)) ++ if (layout_cache_fill_from_list(r, h, seg) == False) ++ return -EIO; ++ ++ /* ++ * This deals with two conditions. ++ * (1) When blr_layouts is empty we need to create the first entry ++ * (2) When the range requested falls past the end of any current ++ * layout the residual must be taken care of. 
++ */ ++ if (seg->length) { ++ n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h); ++ if (!n) ++ return -ENOMEM; ++ dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len)); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++struct list_head * ++layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n = NULL; ++ struct list_head *bl_candidates = NULL; ++ struct fiemap_extent_info fei; ++ struct inode *i; ++ dev_t dev; ++ ++ dev = r->blr_rdev; ++ i = r->blr_inode; ++ ++ dprintk("--> %s\n", __func__); ++ bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL); ++ if (!bl_candidates) ++ return NULL; ++ INIT_LIST_HEAD(bl_candidates); ++ extents_setup(&fei); ++ ++ list_for_each_entry(b, bl_possible, bll_list) { ++ if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ extents_count(&fei, i, b->bll_foff, b->bll_len); ++ if (fei.fi_extents_mapped) { ++ ++ /* ++ * Common case here. Got a range which has ++ * extents. Now get those extents and process ++ * them into pNFS extents. ++ */ ++ if (extents_get(&fei, i, b->bll_foff, ++ b->bll_len) == False) ++ goto cleanup; ++ if (extents_process(&fei, bl_candidates, ++ seg, dev, b) == False) ++ goto cleanup; ++ extents_cleanup(&fei); ++ ++ } else if (seg->iomode == IOMODE_READ) { ++ ++ /* ++ * Found a hole in a file while reading. No ++ * problem, just create a pNFS extent for the ++ * range and let the client know there's no ++ * backing store. ++ */ ++ n = bll_alloc(b->bll_foff, b->bll_len, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += b->bll_len; ++ } else { ++ ++ /* ++ * There's a problem here. Since the iomode ++ * is read/write fallocate should have allocated ++ * any necessary storage for the given range. 
++ */ ++ dprintk(" Extent count for RW is 0\n"); ++ goto cleanup; ++ } ++ ++ } else { ++ n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates); ++ seg->length += n->bll_len; ++ } ++ ++ if (r->blr_ext_size < (b->bll_foff + b->bll_len)) ++ r->blr_ext_size = b->bll_foff + b->bll_len; ++ } ++ ++ while (!list_empty(bl_possible)) { ++ b = list_entry(bl_possible->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ ++ b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout, ++ bll_list); ++ seg->offset = b->bll_foff; ++ dprintk("<-- %s okay\n", __func__); ++ return bl_candidates; ++ ++cleanup: ++ extents_cleanup(&fei); ++ if (bl_candidates) ++ kfree(bl_candidates); ++ dprintk("<-- %s, error occurred\n", __func__); ++ return NULL; ++} ++ ++/* ++ * layout_cache_merge -- collapse layouts which make up a contiguous range. ++ */ ++static void ++layout_cache_merge(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *p; ++ ++ dprintk("--> %s\n", __func__); ++restart: ++ p = NULL; ++ list_for_each_entry(b, h, bll_list) { ++ if (p && (BLL_S_END(p) == b->bll_soff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ /* ++ * We've got a candidate. 
++ */ ++#ifdef too_verbose ++ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), ++ _2SECTS(p->bll_foff), _2SECTS(p->bll_len), ++ _2SECTS(b->bll_soff)); ++#endif ++ ++ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE) ++ p->bll_cache_state = BLOCK_LAYOUT_UPDATE; ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else if (p && (BLL_F_END(p) == b->bll_foff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es == PNFS_BLOCK_NONE_DATA)) { ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else ++ p = b; ++ } ++ dprintk("<-- %s\n", __func__); ++} ++ ++static int ++layout_cache_update(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *c, ++ *n; ++ boolean_t status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ if (list_empty(&r->blr_layouts)) { ++ /* ---- Just add entries and return ---- */ ++ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev, ++ r->blr_inode->i_ino); ++ list_for_each_entry(b, h, bll_list) { ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, ++ &r->blr_layouts); ++ if (!c) { ++ status = -ENOMEM; ++ break; ++ } ++ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), c->bll_es); ++ } ++ return status; ++ } ++ ++ list_for_each_entry(b, h, bll_list) { ++ BUG_ON(!b->bll_vol_id.devid); ++ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) { ++ boolean_t found = False; ++ list_for_each_entry(c, &r->blr_layouts, bll_list) { ++ if ((b->bll_soff >= c->bll_soff) && ++ (b->bll_soff < BLL_S_END(c)) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ u64 u; ++ ++ if ((b->bll_foff < c->bll_foff) || ++ (b->bll_foff > BLL_F_END(c))) ++ BUG(); ++ ++ u = BLL_S_END(b) - BLL_S_END(c); ++ /* ++ * The updated cache entry has to be ++ * different than the current. ++ * Otherwise the cache state for 'b' ++ * should be BLOCK_LAYOUT_CACHE. 
++ */ ++ BUG_ON(BLL_S_END(b) < BLL_S_END(c)); ++ ++ dprintk(" " ++ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n", ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), ++ _2SECTS(c->bll_len + u)); ++ c->bll_len += u; ++ bll_collapse(r, c); ++ found = True; ++ break; ++ } ++ } ++ ++ if (found == False) { ++ dprintk(" ERROR Expected to find" ++ " %Lu(f):%Lu(l):%Lu(s), but didn't\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_for_each_entry(c, &r->blr_layouts, bll_list) ++ print_bll(c, "Cached"); ++ BUG(); ++ } ++ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ c = list_first_entry(&r->blr_layouts, ++ struct pnfs_blocklayout_layout, bll_list); ++ if (b->bll_foff < c->bll_foff) { ++ /* ++ * Special case where new entry is before ++ * first cached entry. ++ */ ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL); ++ list_add(&c->bll_list, &r->blr_layouts); ++ dprintk(" new entry at head of list at %Lu, " ++ "len %Lu\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len)); ++ } else { ++ list_for_each_entry(c, &r->blr_layouts, ++ bll_list) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, ++ bll_list); ++ /* ++ * This is ugly, but can't think of ++ * another way to examine this case. ++ * Consider the following. Need to ++ * add an entry which starts at 40 ++ * and the cache has the following ++ * entries: ++ * Start Length ++ * 10 5 ++ * 30 5 ++ * 50 5 ++ * So, need to look and see if the new ++ * entry starts after the current ++ * cache, but before the next one. ++ * There's a catch in that the next ++ * entry might not be valid as it's ++ * really just a pointer to the list ++ * head. 
++ */ ++ if (((b->bll_foff >= ++ BLL_F_END(c)) && ++ (c->bll_list.next == &r->blr_layouts)) || ++ ((b->bll_foff >= ++ BLL_F_END(c)) && ++ (b->bll_foff < n->bll_foff))) { ++ ++ n = bll_alloc_dup(b, ++ BLOCK_LAYOUT_CACHE, NULL); ++ dprintk(" adding new %Lu:%Lu" ++ " after %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len), ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len)); ++ list_add(&n->bll_list, ++ &c->bll_list); ++ break; ++ } ++ } ++ } ++ } ++ } ++ dprintk("<-- %s\n", __func__); ++ return status; ++} ++ ++static void ++layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in) ++{ ++ struct pnfs_blocklayout_layout *b, ++ *n; ++ u64 len; ++ struct nfsd4_layout_seg seg = *seg_in; ++ ++ dprintk("--> %s\n", __func__); ++ if (seg.length == NFS4_MAX_UINT64) { ++ r->blr_recalled = 0; ++ dprintk(" Fast return of all layouts\n"); ++ while (!list_empty(&r->blr_layouts)) { ++ b = list_entry(r->blr_layouts.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ dprintk(" foff %Lu, len %Lu, soff %Lu\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ dprintk("<-- %s\n", __func__); ++ return; ++ } ++ ++restart: ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg.offset == b->bll_foff) { ++ /* ++ * This handles the following three cases: ++ * (1) return layout matches entire cache layout ++ * (2) return layout matches beginning portion of cache ++ * (3) return layout matches entire cache layout and ++ * into next entry. Varies from #1 in end case. 
++ */ ++ dprintk(" match on offsets, %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length)); ++ len = MIN(seg.length, b->bll_len); ++ b->bll_foff += len; ++ b->bll_soff += len; ++ b->bll_len -= len; ++ seg.length -= len; ++ seg.offset += len; ++ if (!b->bll_len) { ++ list_del(&b->bll_list); ++ kfree(b); ++ dprintk(" removing cache line\n"); ++ if (!seg.length) { ++ dprintk(" also finished\n"); ++ goto complete; ++ } ++ /* ++ * Since 'b' was freed we can't continue at the ++ * next entry which is referenced as ++ * b->bll_list.next by the list_for_each_entry ++ * macro. Need to restart the loop. ++ * TODO: Think about creating a dummy 'b' which ++ * would keep list_for_each_entry() happy. ++ */ ++ goto restart; ++ } ++ if (!seg.length) { ++ dprintk(" finished, but cache line not" ++ "empty\n"); ++ goto complete; ++ } ++ } else if ((seg.offset >= b->bll_foff) && ++ (seg.offset < BLL_F_END(b))) { ++ /* ++ * layout being returned is within this cache line. ++ */ ++ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ BUG_ON(!seg.length); ++ if ((seg.offset + seg.length) >= BLL_F_END(b)) { ++ /* ++ * Layout returned starts in the middle of ++ * cache entry and just need to trim back ++ * cache to shorter length. ++ */ ++ dprintk(" trim back cache line\n"); ++ len = seg.offset - b->bll_foff; ++ seg.offset += b->bll_len - len; ++ seg.length -= b->bll_len - len; ++ b->bll_len = len; ++ if (!seg.length) ++ return; ++ } else { ++ /* ++ * Need to split current cache layout because ++ * chunk is being removed from the middle. 
++ */ ++ dprintk(" split cache line\n"); ++ len = seg.offset + seg.length; ++ n = bll_alloc(len, ++ (b->bll_foff + b->bll_len) - len, ++ BLOCK_LAYOUT_CACHE, NULL); ++ n->bll_soff = b->bll_soff + len; ++ list_add(&n->bll_list, &b->bll_list); ++ b->bll_len = seg.offset - b->bll_foff; ++ return; ++ } ++ } ++ } ++complete: ++ if (list_empty(&r->blr_layouts)) ++ r->blr_recalled = 0; ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++ * layout_cache_fill_from_list -- fills from cache list ++ * ++ * NOTE: This routine was only separated out from layout_cache_fill_from() ++ * to reduce the indentation level which makes the code easier to read. ++ */ ++static inline boolean_t ++layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n; ++ enum pnfs_block_extent_state4 s; ++ ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg->offset < b->bll_foff) { ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, b->bll_foff - seg->offset), ++ BLOCK_LAYOUT_NEW, NULL); ++ if (!n) ++ return False; ++ ++ list_add(&n->bll_list, h->prev); ++ dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ ++ if ((seg->offset >= b->bll_foff) && ++ (seg->offset < BLL_F_END(b))) { ++ if (layout_conflict(b, seg->iomode, &s) == False) { ++ dprintk(" CONFLICT FOUND: " ++ "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), b->bll_es, ++ seg->iomode); ++ return False; ++ } ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, BLL_F_END(b) - seg->offset), ++ BLOCK_LAYOUT_CACHE, h); ++ dprintk(" CACHE hit: Found %Lu(f):%Lu(l): " ++ "in %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ 
_2SECTS(b->bll_soff), b->bll_es); ++ if (!n) ++ return False; ++ ++ n->bll_soff = b->bll_soff + seg->offset - b->bll_foff; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ n->bll_es = s; ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ } ++ return True; ++} ++ ++static u64 ++bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length, ++ dev_t dev) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates); ++ if (!n) ++ return 0; ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ ++ return n->bll_len; ++} ++ ++static void ++extents_setup(struct fiemap_extent_info *fei) ++{ ++ fei->fi_extents_start = NULL; ++} ++ ++/* ++ * extents_count -- Determine the number of extents for a given range. ++ * ++ * No need to call set_fs() here because the function ++ * doesn't use copy_to_user() if it's only counting ++ * the number of extents needed. ++ */ ++static void ++extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len)); ++ fei->fi_flags = FIEMAP_FLAG_SYNC; ++ fei->fi_extents_max = 0; ++ fei->fi_extents_start = NULL; ++ fei->fi_extents_mapped = 0; ++ i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1); ++} ++ ++/* ++ * extents_get -- Get list of extents for range ++ * ++ * extents_count() must have been called before this routine such that ++ * fi_extents_mapped is known. ++ */ ++static boolean_t ++extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ int m_space, ++ rval; ++ struct fiemap_extent *fe; ++ mm_segment_t old_fs = get_fs(); ++ ++ /* ++ * Now malloc the correct amount of space ++ * needed. It's possible for the file to have changed ++ * between calls which would require more space for ++ * the extents. 
If that occurs the last extent will ++ * not have FIEMAP_EXTENT_LAST set and the error will ++ * be caught in extents_process(). ++ */ ++ m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent); ++ fe = kmalloc(m_space, GFP_KERNEL); ++ if (!fe) ++ return False; ++ memset(fe, 0, m_space); ++ ++ fei->fi_extents_max = fei->fi_extents_mapped; ++ fei->fi_extents_mapped = 0; ++ fei->fi_extents_start = fe; ++ ++ set_fs(KERNEL_DS); ++ rval = i->i_op->fiemap(i, fei, foff, len + ++ (1 << i->i_sb->s_blocksize_bits) - 1); ++ set_fs(old_fs); ++ ++ if (rval || !fei->fi_extents_mapped) { ++ dprintk(" No extents. Wanted %d, got %d\n", ++ fei->fi_extents_max, fei->fi_extents_mapped); ++ kfree(fe); ++ fei->fi_extents_start = NULL; ++ return False; ++ } else ++ return True; ++} ++ ++/* ++ * extents_process -- runs through the extents returned from the file system and ++ * creates block layout entries. ++ */ ++static boolean_t ++extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates, ++ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b) ++{ ++ struct fiemap_extent *fep, ++ *fep_last = NULL; ++ int i; ++ pnfs_blocklayout_layout_t *n; ++ u64 last_end, ++ rval; ++ ++ dprintk("--> %s\n", __func__); ++ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped; ++ i++, fep++) { ++ ++ BUG_ON(!fep->fe_physical); ++ /* ++ * Deal with corner cases of hole-y files. ++ */ ++ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) != ++ fep->fe_logical)) { ++ ++ /* ++ * If the last extent doesn't end logically ++ * at the beginning of the current we've got ++ * a hole and need to create a pNFS extent. 
++ */ ++ dprintk(" Got a hole at %Ld:%Ld \n", ++ _2SECTS(fep_last->fe_logical), ++ _2SECTS(fep_last->fe_length)); ++ last_end = fep_last->fe_logical + fep_last->fe_length; ++ rval = bll_alloc_holey(bl_candidates, last_end, ++ fep->fe_logical - last_end, dev); ++ if (!rval) ++ return False; ++ seg->length += rval; ++ } ++ ++ n = bll_alloc(fep->fe_logical, fep->fe_length, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ if (unlikely(n == NULL)) { ++ dprintk("%s: bll_alloc failed\n", __func__); ++ return False; ++ } ++ ++ n->bll_soff = fep->fe_physical; ++ n->bll_es = seg->iomode == IOMODE_READ ? ++ PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += fep->fe_length; ++ print_bll(n, "New extent"); ++ fep_last = fep; ++ } ++ dprintk("<-- %s (i=%d)\n", __func__, i); ++ ++ return True; ++} ++ ++static void ++extents_cleanup(struct fiemap_extent_info *fei) ++{ ++ if (fei->fi_extents_start) { ++ kfree(fei->fi_extents_start); ++ fei->fi_extents_start = NULL; ++ } ++} ++ ++/* ++ * device_slice -- check to see if device is a slice or DM ++ */ ++static boolean_t ++device_slice(dev_t devid) ++{ ++ struct block_device *bd = open_by_devnum(devid, FMODE_READ); ++ boolean_t rval = False; ++ ++ if (bd) { ++ if (bd->bd_disk->minors > 1) ++ rval = True; ++ blkdev_put(bd, FMODE_READ); ++ } ++ return rval; ++} ++ ++/* ++ * device_dm -- check to see if device is a Device Mapper volume. 
++ * ++ * Returns 1 for DM or 0 if not ++ */ ++static boolean_t ++device_dm(dev_t devid) ++{ ++ boolean_t rval = False; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMCHK; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Failed upcall to check on DM status\n"); ++ } else if (res->u.dm_vol) { ++ rval = True; ++ dprintk("Device is DM volume\n"); ++ } else ++ dprintk("Device is not DM volume\n"); ++ kfree(res); ++ ++ return rval; ++} ++ ++static boolean_t ++layout_inode_add(struct inode *i, bl_layout_rec_t **p) ++{ ++ bl_layout_rec_t *r = NULL; ++ ++ if (!i->i_op->fiemap || !i->i_op->fallocate) { ++ printk("pNFS: file system doesn't support required fiemap or" ++ "fallocate methods\n"); ++ return False; ++ } ++ ++ r = kmalloc(sizeof (*r), GFP_KERNEL); ++ if (!r) ++ goto error; ++ ++ r->blr_rdev = i->i_sb->s_dev; ++ r->blr_inode = i; ++ r->blr_orig_size = i->i_size; ++ r->blr_ext_size = 0; ++ r->blr_recalled = 0; ++ INIT_LIST_HEAD(&r->blr_layouts); ++ spin_lock_init(&r->blr_lock); ++ spin_lock(&layout_hashtbl_lock); ++ list_add_tail(&r->blr_hash, &layout_hash); ++ spin_unlock(&layout_hashtbl_lock); ++ *p = r; ++ return True; ++ ++error: ++ if (r) ++ kfree(r); ++ return False; ++} ++ ++static bl_layout_rec_t * ++__layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ if (!list_empty(&layout_hash)) { ++ list_for_each_entry(r, &layout_hash, blr_hash) { ++ if ((r->blr_inode->i_ino == i->i_ino) && ++ (r->blr_rdev == i->i_sb->s_dev)) { ++ return r; ++ } ++ } ++ } ++ return NULL; ++} ++ ++static bl_layout_rec_t * ++layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ spin_unlock(&layout_hashtbl_lock); ++ ++ return r; ++} ++ ++static void ++layout_inode_del(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); 
++ if (list_empty(&r->blr_layouts)) { ++ list_del(&r->blr_hash); ++ spin_unlock(&r->blr_lock); ++ kfree(r); ++ } else { ++ spin_unlock(&r->blr_lock); ++ } ++ } else { ++ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n", ++ __func__, i->i_sb->s_dev, i->i_ino); ++ } ++ spin_unlock(&layout_hashtbl_lock); ++} ++ ++/* ++ * map_state2name -- converts state in ascii string. ++ * ++ * Used for debug messages only. ++ */ ++static char * ++map_state2name(enum pnfs_block_extent_state4 s) ++{ ++ switch (s) { ++ case PNFS_BLOCK_READWRITE_DATA: return " RW"; ++ case PNFS_BLOCK_READ_DATA: return " RO"; ++ case PNFS_BLOCK_INVALID_DATA: return "INVALID"; ++ case PNFS_BLOCK_NONE_DATA: return " NONE"; ++ default: ++ BUG(); ++ } ++} ++ ++static pnfs_blocklayout_devinfo_t * ++bld_alloc(struct list_head *volumes, int type) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ return NULL; ++ ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = type; ++ list_add_tail(&bld->bld_list, volumes); ++ ++ return bld; ++} ++ ++static void ++bld_free(pnfs_blocklayout_devinfo_t *bld) ++{ ++ list_del(&bld->bld_list); ++ kfree(bld); ++} ++ ++static void ++print_bll(pnfs_blocklayout_layout_t *b, char *text) ++{ ++ dprintk(" BLL: %s\n", text); ++ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len), ++ map_state2name(b->bll_es)); ++} ++ ++static inline void ++bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c) ++{ ++ pnfs_blocklayout_layout_t *n; ++ int dbg_count = 0; ++ u64 endpoint; ++ ++ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA); ++ while (c->bll_list.next != &r->blr_layouts) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ endpoint = BLL_S_END(c); ++ if ((n->bll_soff >= c->bll_soff) && ++ (n->bll_soff < endpoint)) { ++ if (endpoint < BLL_S_END(n)) { ++ /* ++ * The following is possible. 
++ * ++ * ++ * Existing: +---+ +---+ ++ * New: +-----------------------+ ++ * The client request merge entries together ++ * but didn't require picking up all of the ++ * last entry. So, we still need to delete ++ * the last entry and add the remaining space ++ * to the new entry. ++ */ ++ c->bll_len += BLL_S_END(n) - endpoint; ++ } ++ dbg_count++; ++ list_del(&n->bll_list); ++ kfree(n); ++ } else { ++ break; ++ } ++ } ++ /* ---- Debug only, remove before integration ---- */ ++ if (dbg_count) ++ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n", ++ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c))); ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = kmalloc(sizeof (*n), GFP_KERNEL); ++ if (n) { ++ memset(n, 0, sizeof (*n)); ++ n->bll_foff = offset; ++ n->bll_len = len; ++ n->bll_cache_state = state; ++ if (h) ++ list_add_tail(&n->bll_list, h); ++ } ++ return n; ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c, ++ struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = bll_alloc(b->bll_foff, b->bll_len, c, h); ++ if (n) { ++ n->bll_es = b->bll_es; ++ n->bll_soff = b->bll_soff; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ } ++ return n; ++} ++ ++static inline boolean_t ++layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s) ++{ ++ /* ---- Normal case ---- */ ++ *s = b->bll_es; ++ ++ switch (b->bll_es) { ++ case PNFS_BLOCK_READWRITE_DATA: ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_READ_DATA; ++ /* ---- Any use is permitted. ---- */ ++ break; ++ case PNFS_BLOCK_READ_DATA: ++ /* ---- Committed as read only data. 
---- */ ++ if (iomode == IOMODE_RW) ++ return False; ++ break; ++ case PNFS_BLOCK_INVALID_DATA: ++ /* ---- Blocks have been allocated, but not initialized ---- */ ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_NONE_DATA; ++ break; ++ case PNFS_BLOCK_NONE_DATA: ++ /* ---- Hole-y file. No backing store avail. ---- */ ++ if (iomode != IOMODE_READ) ++ return False; ++ break; ++ default: ++ BUG(); ++ } ++ return True; ++} ++ ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-23 12:08:29.037481540 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-23 12:09:03.300491952 -0400 +@@ -104,7 +104,8 @@ again: + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; +- if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) ++ if (memcmp(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)) != 0) + continue; + get_nfs_open_context(ctx); + spin_unlock(&inode->i_lock); +@@ -133,8 +134,8 @@ void nfs_inode_reclaim_delegation(struct + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; +@@ -187,8 +188,9 @@ static struct nfs_delegation *nfs_detach + if (delegation == NULL) + goto nomatch; + spin_lock(&delegation->lock); +- if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (stateid != NULL && memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data)) != 0) + goto nomatch_unlock; + list_del_rcu(&delegation->super_list); + 
delegation->inode = NULL; +@@ -216,8 +218,8 @@ int nfs_inode_set_delegation(struct inod + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + delegation->change_attr = nfsi->change_attr; +@@ -471,9 +473,7 @@ void nfs_expire_unreferenced_delegations + /* + * Asynchronous delegation recall! + */ +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)) ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) + { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; +@@ -481,7 +481,7 @@ int nfs_async_inode_return_delegation(st + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + +- if (!validate_stateid(delegation, stateid)) { ++ if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { + rcu_read_unlock(); + return -ENOENT; + } +@@ -562,7 +562,8 @@ int nfs4_copy_delegation_stateid(nfs4_st + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) { +- memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); ++ memcpy(dst->u.data, delegation->stateid.u.data, ++ sizeof(dst->u.data)); + ret = 1; + } + rcu_read_unlock(); +diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h +--- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-23 12:09:03.301431797 -0400 +@@ -34,9 +34,7 @@ enum { + int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct 
nfs_openres *res); + void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); + int nfs_inode_return_delegation(struct inode *inode); +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)); ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); + void nfs_inode_return_delegation_noreclaim(struct inode *inode); + + struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); +diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-23 12:08:29.089481525 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 12:09:03.302511603 -0400 +@@ -17,11 +17,19 @@ + #include + #include + ++#include ++#if defined(CONFIG_SPNFS) ++#include ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif ++#endif + #include + #include + + #include "nfsd.h" + #include "nfsfh.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_EXPORT + +@@ -352,6 +360,40 @@ static int svc_export_upcall(struct cach + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); + } + ++#if defined(CONFIG_PNFSD) ++static struct pnfsd_cb_operations pnfsd_cb_op = { ++ .cb_layout_recall = nfsd_layout_recall_cb, ++ .cb_device_notify = nfsd_device_notify_cb, ++ ++ .cb_get_state = nfs4_pnfs_cb_get_state, ++ .cb_change_state = nfs4_pnfs_cb_change_state, ++}; ++ ++#if defined(CONFIG_SPNFS) ++static struct pnfs_export_operations spnfs_export_ops = { ++ .layout_type = spnfs_layout_type, ++ .get_device_info = spnfs_getdeviceinfo, ++ .get_device_iter = spnfs_getdeviceiter, ++ .layout_get = spnfs_layoutget, ++ .layout_return = spnfs_layoutreturn, ++}; ++ ++static struct pnfs_export_operations spnfs_ds_export_ops = { ++ .get_state = spnfs_get_state, ++}; ++ ++#if 
defined(CONFIG_SPNFS_BLOCK) ++static struct pnfs_export_operations bl_export_ops = { ++ .layout_type = bl_layout_type, ++ .get_device_info = bl_getdeviceinfo, ++ .get_device_iter = bl_getdeviceiter, ++ .layout_get = bl_layoutget, ++ .layout_return = bl_layoutreturn, ++}; ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_SPNFS */ ++#endif /* CONFIG_PNFSD */ ++ + static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); + static struct svc_export *svc_export_lookup(struct svc_export *); +@@ -395,6 +437,47 @@ static int check_export(struct inode *in + return -EINVAL; + } + ++#if !defined(CONFIG_SPNFS) ++ if (inode->i_sb->s_pnfs_op && ++ (!inode->i_sb->s_pnfs_op->layout_type || ++ !inode->i_sb->s_pnfs_op->get_device_info || ++ !inode->i_sb->s_pnfs_op->layout_get)) { ++ dprintk("exp_export: export of invalid fs pnfs export ops.\n"); ++ return -EINVAL; ++ } ++#endif /* CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ if (!inode->i_sb->s_pnfs_op) ++ pnfsd_lexp_init(inode); ++ return 0; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, *flags)) { ++ dprintk("set pnfs block export structure... \n"); ++ inode->i_sb->s_pnfs_op = &bl_export_ops; ++ } else ++#endif /* CONFIG_SPNFS_BLOCK */ ++ /* ++ * spnfs_enabled() indicates we're an MDS. ++ * XXX Better to check an export time option as well. ++ */ ++ if (spnfs_enabled()) { ++ dprintk("set spnfs export structure...\n"); ++ inode->i_sb->s_pnfs_op = &spnfs_export_ops; ++ } else { ++ dprintk("%s spnfs not in use\n", __func__); ++ ++ /* ++ * get_state is needed if we're a DS using spnfs. ++ * XXX Better to check an export time option instead. 
++ */ ++ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops; ++ } ++#endif /* CONFIG_SPNFS */ ++ + return 0; + + } +@@ -586,6 +669,8 @@ static int svc_export_parse(struct cache + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } ++ } else if (strcmp(buf, "pnfs") == 0) { ++ exp.ex_pnfs = 1; + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else +@@ -660,6 +745,8 @@ static int svc_export_show(struct seq_fi + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } ++ if (exp->ex_pnfs) ++ seq_puts(m, ",pnfs"); + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); +@@ -687,6 +774,7 @@ static void svc_export_init(struct cache + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; ++ new->ex_pnfs = 0; + } + + static void export_update(struct cache_head *cnew, struct cache_head *citem) +@@ -699,6 +787,7 @@ static void export_update(struct cache_h + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; ++ new->ex_pnfs = item->ex_pnfs; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_pathname = item->ex_pathname; +@@ -1635,8 +1724,17 @@ nfsd_export_init(void) + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); +- if (rv) ++ if (rv) { + cache_unregister(&svc_export_cache); ++ goto out; ++ } ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = THIS_MODULE; ++ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ ++out: + return rv; + + } +@@ -1664,6 +1762,12 @@ nfsd_export_shutdown(void) + + exp_writelock(); + ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = NULL; ++ pnfsd_cb_ctl.cb_op = NULL; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); + svcauth_unix_purge(); +diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig 
linux-2.6.34.noarch/fs/nfs/direct.c +--- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-23 12:09:03.303491500 -0400 +@@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea + .rpc_release = nfs_direct_read_release, + }; + ++static long nfs_direct_read_execute(struct nfs_read_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ nfs_fattr_init(&data->fattr); ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ NFS_PROTO(inode)->read_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct read call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. 
If nfs_readdata_alloc() or get_user_pages() fails, +@@ -283,7 +315,6 @@ static ssize_t nfs_direct_read_schedule_ + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -343,26 +374,9 @@ static ssize_t nfs_direct_read_schedule_ + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; +- nfs_fattr_init(&data->fattr); +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct read call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_read_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +@@ -448,12 +462,15 @@ static void nfs_direct_free_writedata(st + } + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg); ++ + static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) + { + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; +@@ -487,25 +504,7 @@ static void nfs_direct_write_reschedule( + * Reuse data->task; data->args should not have changed + * since the original request was sent. 
+ */ +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- /* +- * We're called via an RPC callback, so BKL is already held. +- */ +- task = rpc_run_task(&task_setup_data); +- if (!IS_ERR(task)) +- rpc_put_task(task); +- +- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- data->args.count, +- (unsigned long long)data->args.offset); ++ nfs_direct_write_execute(data, &task_setup_data, &msg); + } + + if (put_dreq(dreq)) +@@ -548,10 +547,31 @@ static const struct rpc_call_ops nfs_com + .rpc_release = nfs_direct_commit_release, + }; + ++static long nfs_direct_commit_execute(struct nfs_direct_req *dreq, ++ struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct rpc_task *task; ++ ++ NFS_PROTO(data->inode)->commit_setup(data, msg); ++ ++ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ ++ dreq->commit_data = NULL; ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ return 0; ++} ++ + static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) + { + struct nfs_write_data *data = dreq->commit_data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +@@ -579,16 +599,7 @@ static void nfs_direct_commit_schedule(s + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- NFS_PROTO(data->inode)->commit_setup(data, &msg); +- +- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ +- dreq->commit_data = NULL; +- +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +- +- task = rpc_run_task(&task_setup_data); +- if 
(!IS_ERR(task)) +- rpc_put_task(task); ++ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg); + } + + static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +@@ -690,6 +701,36 @@ static const struct rpc_call_ops nfs_wri + .rpc_release = nfs_direct_write_release, + }; + ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ NFS_PROTO(inode)->write_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. 
If nfs_writedata_alloc() or get_user_pages() fails, +@@ -705,7 +746,6 @@ static ssize_t nfs_direct_write_schedule + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -771,24 +811,8 @@ static ssize_t nfs_direct_write_schedule + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct write call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_write_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig +--- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-23 12:09:03.304505472 -0400 +@@ -79,3 +79,52 @@ config NFSD_V4 + available from http://linux-nfs.org/. + + If unsure, say N. ++ ++config PNFSD ++ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)" ++ depends on NFSD_V4 && EXPERIMENTAL ++ select EXPORTFS_FILE_LAYOUT ++ help ++ This option enables support for the parallel NFS features of the ++ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) ++ in the kernel's NFS server. ++ ++ Unless you're an NFS developer, say N. 
++ ++config PNFSD_LOCAL_EXPORT ++ bool "Enable pNFS support for exporting local filesystems for debugging purposes" ++ depends on PNFSD ++ help ++ Say Y here if you want your pNFS server to export local file systems ++ over the files layout type. With this option the MDS (metadata ++ server) functions also as a single DS (data server). This is mostly ++ useful for development and debugging purposes. ++ ++ If unsure, say N. ++ ++config SPNFS ++ bool "Provide spNFS server support (EXPERIMENTAL)" ++ depends on PNFSD ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS server support. ++ ++ If unsure, say N. ++ ++config SPNFS_LAYOUTSEGMENTS ++ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)" ++ depends on SPNFS ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS to be able to return layout segments. ++ ++ If unsure, say N. ++ ++config SPNFS_BLOCK ++ bool "Provide Block Layout server support (EXPERIMENTAL)" ++ depends on SPNFS ++ select EXPORTFS_BLOCK_LAYOUT ++ help ++ Say Y here if you want spNFS block layout support ++ ++ If unsure, say N. 
+diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile +--- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-23 12:09:03.304505472 -0400 +@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs + nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o + nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o ++nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o ++nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o ++nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o ++nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-23 12:08:29.090501507 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 12:09:03.306491345 -0400 +@@ -40,7 +40,6 @@ + + #define NFSPROC4_CB_NULL 0 + #define NFSPROC4_CB_COMPOUND 1 +-#define NFS4_STATEID_SIZE 16 + + /* Index of predefined Linux callback client operations */ + +@@ -48,11 +47,17 @@ enum { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, ++#if defined(CONFIG_PNFSD) ++ NFSPROC4_CLNT_CB_LAYOUT, ++ NFSPROC4_CLNT_CB_DEVICE, ++#endif + }; + + enum nfs_cb_opnum4 { + OP_CB_RECALL = 4, ++ OP_CB_LAYOUT = 5, + OP_CB_SEQUENCE = 11, ++ OP_CB_DEVICE = 14, + }; + + #define NFS4_MAXTAGLEN 20 +@@ -78,6 +83,19 @@ enum nfs_cb_opnum4 { + #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) ++#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 3 + \ ++ enc_nfs4_fh_sz + 4) ++#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) ++#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 6) ++#define NFS4_dec_cb_device_sz 
(cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) + + /* + * Generic encode routines from fs/nfs/nfs4xdr.c +@@ -94,6 +112,10 @@ xdr_writemem(__be32 *p, const void *ptr, + } + + #define WRITE32(n) *p++ = htonl(n) ++#define WRITE64(n) do { \ ++ *p++ = htonl((u32)((n) >> 32)); \ ++ *p++ = htonl((u32)(n)); \ ++} while (0) + #define WRITEMEM(ptr,nbytes) do { \ + p = xdr_writemem(p, ptr, nbytes); \ + } while (0) +@@ -204,6 +226,16 @@ nfs_cb_stat_to_errno(int stat) + */ + + static void ++encode_stateid(struct xdr_stream *xdr, stateid_t *sid) ++{ ++ __be32 *p; ++ ++ RESERVE_SPACE(sizeof(stateid_t)); ++ WRITE32(sid->si_generation); ++ WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); ++} ++ ++static void + encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) + { + __be32 * p; +@@ -228,10 +260,10 @@ encode_cb_recall(struct xdr_stream *xdr, + __be32 *p; + int len = dp->dl_fh.fh_size; + +- RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); ++ RESERVE_SPACE(4); + WRITE32(OP_CB_RECALL); +- WRITE32(dp->dl_stateid.si_generation); +- WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); ++ encode_stateid(xdr, &dp->dl_stateid); ++ RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); + WRITE32(0); /* truncate optimization not implemented */ + WRITE32(len); + WRITEMEM(&dp->dl_fh.fh_base, len); +@@ -259,6 +291,111 @@ encode_cb_sequence(struct xdr_stream *xd + hdr->nops++; + } + ++#if defined(CONFIG_PNFSD) ++ ++#include "pnfsd.h" ++ ++static void ++encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(20); ++ WRITE32(OP_CB_LAYOUT); ++ WRITE32(clr->cb.cbl_seg.layout_type); ++ WRITE32(clr->cb.cbl_seg.iomode); ++ WRITE32(clr->cb.cbl_layoutchanged); ++ WRITE32(clr->cb.cbl_recall_type); ++ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) { ++ struct nfs4_fsid fsid = clr->cb.cbl_fsid; ++ ++ RESERVE_SPACE(16); ++ 
WRITE64(fsid.major); ++ WRITE64(fsid.minor); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "fsid 0x%llx-0x%llx\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, fsid.major, fsid.minor); ++ } else if (clr->cb.cbl_recall_type == RETURN_FILE) { ++ int len = clr->clr_file->fi_fhlen; ++ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid; ++ ++ RESERVE_SPACE(20 + len); ++ WRITE32(len); ++ WRITEMEM(clr->clr_file->fi_fhval, len); ++ WRITE64(clr->cb.cbl_seg.offset); ++ WRITE64(clr->cb.cbl_seg.length); ++ encode_stateid(xdr, cbl_sid); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "offset %lld length %lld stateid " STATEID_FMT "\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, ++ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length, ++ STATEID_VAL(cbl_sid)); ++ } else { ++ dprintk("%s: type %x iomode %d changed %d recall_type %d\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type); ++ } ++ hdr->nops++; ++} ++ ++static void ++encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ int i; ++ int len = nd->nd_list->cbd_len; ++ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list; ++ ++ dprintk("NFSD %s: --> num %d\n", __func__, len); ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(8); ++ WRITE32(OP_CB_DEVICE); ++ ++ /* notify4 cnda_changes<>; */ ++ WRITE32(len); ++ for (i = 0; i < len; i++) { ++ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n", ++ __func__, cbd[i].cbd_notify_type, ++ cbd[i].cbd_layout_type, ++ cbd[i].cbd_devid.sbid, ++ cbd[i].cbd_devid.devid, ++ cbd[i].cbd_immediate, i); ++ ++ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE); ++ 
RESERVE_SPACE(32); ++ /* bitmap4 notify_mask; */ ++ WRITE32(1); ++ WRITE32(cbd[i].cbd_notify_type); ++ /* opaque notify_vals<>; */ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) ++ WRITE32(24); ++ else ++ WRITE32(20); ++ WRITE32(cbd[i].cbd_layout_type); ++ WRITE64(cbd[i].cbd_devid.sbid); ++ WRITE64(cbd[i].cbd_devid.devid); ++ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { ++ RESERVE_SPACE(4); ++ WRITE32(cbd[i].cbd_immediate); ++ } ++ } ++ hdr->nops++; ++} ++#endif /* CONFIG_PNFSD */ ++ + static int + nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) + { +@@ -288,6 +425,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst * + return 0; + } + ++#if defined(CONFIG_PNFSD) ++static int ++nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_layoutrecall *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_layout(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++ ++static int ++nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_notify_device *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_device(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++#endif /* CONFIG_PNFSD */ + + static int + decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ +@@ -403,6 +579,48 @@ out: + return status; + } + ++#if defined(CONFIG_PNFSD) ++static int 
++nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT); ++out: ++ return status; ++} ++ ++static int ++nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE); ++out: ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * RPC procedure tables + */ +@@ -420,6 +638,10 @@ out: + static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), + PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), ++#if defined(CONFIG_PNFSD) ++ PROC(CB_LAYOUT, COMPOUND, enc_cb_layout, dec_cb_layout), ++ PROC(CB_DEVICE, COMPOUND, enc_cb_device, dec_cb_device), ++#endif + }; + + static struct rpc_version nfs_cb_version4 = { +@@ -606,10 +828,9 @@ out: + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. 
+ */ +-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_prepare_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) + { +- struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; + struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; + u32 minorversion = clp->cl_cb_conn.cb_minorversion; + int status = 0; +@@ -629,11 +850,15 @@ static void nfsd4_cb_prepare(struct rpc_ + rpc_call_start(task); + } + +-static void nfsd4_cb_done(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata) + { + struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; ++ nfsd4_cb_prepare_sequence(task, dp->dl_client); ++} + ++static void nfsd4_cb_done_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) ++{ + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_cb_conn.cb_minorversion); + +@@ -657,7 +882,7 @@ static void nfsd4_cb_recall_done(struct + struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + +- nfsd4_cb_done(task, calldata); ++ nfsd4_cb_done_sequence(task, clp); + + if (current_rpc_client == NULL) { + /* We're shutting down; give up. 
*/ +@@ -688,7 +913,7 @@ static void nfsd4_cb_recall_done(struct + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); + task->tk_status = 0; +- rpc_restart_call(task); ++ rpc_restart_call_prepare(task); + return; + } else { + atomic_set(&clp->cl_cb_set, 0); +@@ -704,7 +929,7 @@ static void nfsd4_cb_recall_release(void + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +- .rpc_call_prepare = nfsd4_cb_prepare, ++ .rpc_call_prepare = nfsd4_cb_recall_prepare, + .rpc_call_done = nfsd4_cb_recall_done, + .rpc_release = nfsd4_cb_recall_release, + }; +@@ -781,3 +1006,173 @@ void nfsd4_cb_recall(struct nfs4_delegat + { + queue_work(callback_wq, &dp->dl_recall.cb_work); + } ++ ++#if defined(CONFIG_PNFSD) ++static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ nfsd4_cb_prepare_sequence(task, clr->clr_client); ++} ++ ++static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ struct nfs4_client *clp = clr->clr_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ if (!task->tk_status) ++ return; ++ ++ printk("%s: clp %p cb_client %p fp %p failed with status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ clr->clr_file, ++ task->tk_status); ++ ++ switch (task->tk_status) { ++ case -EIO: ++ /* Network partition? 
*/ ++ atomic_set(&clp->cl_cb_set, 0); ++ warn_no_callback_path(clp, task->tk_status); ++ /* FIXME: ++ * The pnfs standard states that we need to only expire ++ * the client after at-least "lease time" .eg lease-time * 2 ++ * when failing to communicate a recall ++ */ ++ break; ++ case -NFS4ERR_DELAY: ++ /* Pole the client until it's done with the layout */ ++ rpc_delay(task, HZ/100); /* 10 mili-seconds */ ++ task->tk_status = 0; ++ rpc_restart_call_prepare(task); ++ break; ++ case -NFS4ERR_NOMATCHING_LAYOUT: ++ task->tk_status = 0; ++ nomatching_layout(clr); ++ } ++} ++ ++static void nfsd4_cb_layout_release(void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ kfree(clr->clr_args); ++ clr->clr_args = NULL; ++ put_layoutrecall(clr); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_layout_ops = { ++ .rpc_call_prepare = nfsd4_cb_layout_prepare, ++ .rpc_call_done = nfsd4_cb_layout_done, ++ .rpc_release = nfsd4_cb_layout_release, ++}; ++ ++/* ++ * Called with state lock. ++ */ ++int ++nfsd4_cb_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfs4_client *clp = clr->clr_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT], ++ .rpc_cred = callback_cred ++ }; ++ int status; ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ clr->clr_args = args; ++ args->args_op = clr; ++ msg.rpc_argp = args; ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_layout_ops, clr); ++out: ++ if (status) { ++ kfree(args); ++ put_layoutrecall(clr); ++ } ++ dprintk("NFSD: nfsd4_cb_layout: status %d\n", status); ++ return status; ++} ++ ++static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ nfsd4_cb_prepare_sequence(task, cbnd->nd_client); ++} ++ ++static void nfsd4_cb_device_done(struct rpc_task *task, void 
*calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ struct nfs4_client *clp = cbnd->nd_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ dprintk("%s: clp %p cb_client %p: status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ task->tk_status); ++ ++ if (task->tk_status == -EIO) { ++ /* Network partition? */ ++ atomic_set(&clp->cl_cb_set, 0); ++ warn_no_callback_path(clp, task->tk_status); ++ } ++} ++ ++static void nfsd4_cb_device_release(void *calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ kfree(cbnd->nd_args); ++ cbnd->nd_args = NULL; ++ kfree(cbnd); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_device_ops = { ++ .rpc_call_prepare = nfsd4_cb_device_prepare, ++ .rpc_call_done = nfsd4_cb_device_done, ++ .rpc_release = nfsd4_cb_device_release, ++}; ++ ++/* ++ * Called with state lock. ++ */ ++int ++nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd) ++{ ++ struct nfs4_client *clp = cbnd->nd_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE], ++ .rpc_cred = callback_cred ++ }; ++ int status = -EIO; ++ ++ dprintk("%s: clp %p\n", __func__, clp); ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ args->args_op = cbnd; ++ msg.rpc_argp = args; ++ ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_device_ops, cbnd); ++out: ++ if (status) ++ kfree(args); ++ dprintk("%s: status %d\n", __func__, status); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-23 12:09:03.307491492 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-23 12:09:03.308491262 -0400 +@@ -0,0 +1,1679 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network 
Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ *****************************************************************************/ ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Globals */ ++static u32 current_layoutid = 1; ++ ++/* ++ * Currently used for manipulating the layout state. 
++ */ ++static DEFINE_SPINLOCK(layout_lock); ++ ++#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP) ++# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock)) ++#else ++# define BUG_ON_UNLOCKED_LAYOUT() ++#endif ++ ++/* ++ * Layout state - NFSv4.1 pNFS ++ */ ++static struct kmem_cache *pnfs_layout_slab; ++static struct kmem_cache *pnfs_layoutrecall_slab; ++ ++/* hash table for nfsd4_pnfs_deviceid.sbid */ ++#define SBID_HASH_BITS 8 ++#define SBID_HASH_SIZE (1 << SBID_HASH_BITS) ++#define SBID_HASH_MASK (SBID_HASH_SIZE - 1) ++ ++struct sbid_tracker { ++ u64 id; ++ struct super_block *sb; ++ struct list_head hash; ++}; ++ ++static u64 current_sbid; ++static struct list_head sbid_hashtbl[SBID_HASH_SIZE]; ++ ++static inline unsigned long ++sbid_hashval(struct super_block *sb) ++{ ++ return hash_ptr(sb, SBID_HASH_BITS); ++} ++ ++static inline struct sbid_tracker * ++alloc_sbid(void) ++{ ++ return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL); ++} ++ ++static void ++destroy_sbid(struct sbid_tracker *sbid) ++{ ++ spin_lock(&layout_lock); ++ list_del(&sbid->hash); ++ spin_unlock(&layout_lock); ++ kfree(sbid); ++} ++ ++void ++nfsd4_free_pnfs_slabs(void) ++{ ++ int i; ++ struct sbid_tracker *sbid; ++ ++ nfsd4_free_slab(&pnfs_layout_slab); ++ nfsd4_free_slab(&pnfs_layoutrecall_slab); ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ while (!list_empty(&sbid_hashtbl[i])) { ++ sbid = list_first_entry(&sbid_hashtbl[i], ++ struct sbid_tracker, ++ hash); ++ destroy_sbid(sbid); ++ } ++ } ++} ++ ++int ++nfsd4_init_pnfs_slabs(void) ++{ ++ int i; ++ ++ pnfs_layout_slab = kmem_cache_create("pnfs_layouts", ++ sizeof(struct nfs4_layout), 0, 0, NULL); ++ if (pnfs_layout_slab == NULL) ++ return -ENOMEM; ++ pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls", ++ sizeof(struct nfs4_layoutrecall), 0, 0, NULL); ++ if (pnfs_layoutrecall_slab == NULL) ++ return -ENOMEM; ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ INIT_LIST_HEAD(&sbid_hashtbl[i]); ++ } ++ ++ 
return 0; ++} ++ ++/* XXX: Need to implement the notify types and track which ++ * clients have which devices. */ ++void pnfs_set_device_notify(clientid_t *clid, unsigned int types) ++{ ++ struct nfs4_client *clp; ++ dprintk("%s: -->\n", __func__); ++ ++ nfs4_lock_state(); ++ /* Indicate that client has a device so we can only notify ++ * the correct clients */ ++ clp = find_confirmed_client(clid); ++ if (clp) { ++ atomic_inc(&clp->cl_deviceref); ++ dprintk("%s: Incr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++ } ++ nfs4_unlock_state(); ++} ++ ++/* Clear notifications for this client ++ * XXX: Do we need to loop through a clean up all ++ * krefs when nfsd cleans up the client? */ ++void pnfs_clear_device_notify(struct nfs4_client *clp) ++{ ++ atomic_dec(&clp->cl_deviceref); ++ dprintk("%s: Decr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++} ++ ++static struct nfs4_layout_state * ++alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid) ++{ ++ struct nfs4_layout_state *new; ++ ++ /* FIXME: use a kmem_cache */ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return new; ++ get_nfs4_file(fp); ++ INIT_LIST_HEAD(&new->ls_perfile); ++ INIT_LIST_HEAD(&new->ls_layouts); ++ kref_init(&new->ls_ref); ++ new->ls_client = clp; ++ new->ls_file = fp; ++ new->ls_stateid.si_boot = stateid->si_boot; ++ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */ ++ new->ls_stateid.si_generation = 1; ++ spin_lock(&layout_lock); ++ new->ls_stateid.si_fileid = current_layoutid++; ++ list_add(&new->ls_perfile, &fp->fi_layout_states); ++ spin_unlock(&layout_lock); ++ return new; ++} ++ ++static inline void ++get_layout_state(struct nfs4_layout_state *ls) ++{ ++ kref_get(&ls->ls_ref); ++} ++ ++static void ++destroy_layout_state_common(struct nfs4_layout_state *ls) ++{ ++ struct nfs4_file *fp = ls->ls_file; ++ ++ dprintk("pNFS %s: ls %p fp %p clp 
%p\n", __func__, ls, fp, ++ ls->ls_client); ++ BUG_ON(!list_empty(&ls->ls_layouts)); ++ kfree(ls); ++ put_nfs4_file(fp); ++} ++ ++static void ++destroy_layout_state(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ spin_lock(&layout_lock); ++ list_del(&ls->ls_perfile); ++ spin_unlock(&layout_lock); ++ destroy_layout_state_common(ls); ++} ++ ++static void ++destroy_layout_state_locked(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ list_del(&ls->ls_perfile); ++ destroy_layout_state_common(ls); ++} ++ ++static inline void ++put_layout_state(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state); ++} ++ ++static inline void ++put_layout_state_locked(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state_locked); ++} ++ ++/* ++ * Search the fp->fi_layout_state list for a layout state with the clientid. ++ * If not found, then this is a 'first open/delegation/lock stateid' from ++ * the client for this file. ++ * Called under the layout_lock. 
++ */ ++static struct nfs4_layout_state * ++find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp) ++{ ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) { ++ if (ls->ls_client == clp) { ++ dprintk("pNFS %s: before GET ls %p ls_ref %d\n", ++ __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ get_layout_state(ls); ++ return ls; ++ } ++ } ++ return NULL; ++} ++ ++static __be32 ++verify_stateid(struct nfs4_file *fp, stateid_t *stateid) ++{ ++ struct nfs4_stateid *local = NULL; ++ struct nfs4_delegation *temp = NULL; ++ ++ /* check if open or lock stateid */ ++ local = find_stateid(stateid, RD_STATE); ++ if (local) ++ return 0; ++ temp = find_delegation_stateid(fp->fi_inode, stateid); ++ if (temp) ++ return 0; ++ return nfserr_bad_stateid; ++} ++ ++/* ++ * nfs4_process_layout_stateid() ++ * ++ * We have looked up the nfs4_file corresponding to the current_fh, and ++ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op() ++ * that make sense with a layout stateid. ++ * ++ * Called with the state_lock held. ++ * Returns zero and stateid is updated, or error. ++ * ++ * Note: the struct nfs4_layout_state pointer is only set by layoutget. ++ */ ++static __be32 ++nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid, struct nfs4_layout_state **lsp) ++{ ++ struct nfs4_layout_state *ls = NULL; ++ __be32 status = 0; ++ ++ dprintk("--> %s clp %p fp %p \n", __func__, clp, fp); ++ ++ dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ goto out; ++ ++ /* Is this the first use of this layout? */ ++ spin_lock(&layout_lock); ++ ls = find_get_layout_state(clp, fp); ++ spin_unlock(&layout_lock); ++ if (!ls) { ++ /* Only alloc layout state on layoutget (which sets lsp). 
*/ ++ if (!lsp) { ++ dprintk("%s ERROR: Not layoutget & no layout stateid\n", ++ __func__); ++ status = nfserr_bad_stateid; ++ goto out; ++ } ++ dprintk("%s Initial stateid for layout: file %p client %p\n", ++ __func__, fp, clp); ++ ++ /* verify input stateid */ ++ status = verify_stateid(fp, stateid); ++ if (status) { ++ dprintk("%s ERROR: invalid open/deleg/lock stateid\n", ++ __func__); ++ goto out; ++ } ++ ls = alloc_init_layout_state(clp, fp, stateid); ++ if (!ls) { ++ dprintk("%s pNFS ERROR: no memory for layout state\n", ++ __func__); ++ status = nfserr_resource; ++ goto out; ++ } ++ } else { ++ dprintk("%s Not initial stateid. Layout state %p file %p\n", ++ __func__, ls, fp); ++ ++ /* BAD STATEID */ ++ status = nfserr_bad_stateid; ++ if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque, ++ sizeof(stateid_opaque_t)) != 0) { ++ ++ /* if a LAYOUTGET operation and stateid is a valid ++ * open/deleg/lock stateid, accept it as a parallel ++ * initial layout stateid ++ */ ++ if (lsp && ((verify_stateid(fp, stateid)) == 0)) { ++ dprintk("%s parallel initial layout state\n", ++ __func__); ++ goto update; ++ } ++ ++ dprintk("%s ERROR bad opaque in stateid 1\n", __func__); ++ goto out_put; ++ } ++ ++ /* stateid is a valid layout stateid for this file. 
*/ ++ if (stateid->si_generation > ls->ls_stateid.si_generation) { ++ dprintk("%s bad stateid 1\n", __func__); ++ goto out_put; ++ } ++update: ++ update_stateid(&ls->ls_stateid); ++ dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", ++ __func__, ls->ls_stateid.si_generation, ls); ++ } ++ status = 0; ++ /* Set the stateid to be encoded */ ++ memcpy(stateid, &ls->ls_stateid, sizeof(stateid_t)); ++ ++ /* Return the layout state if requested */ ++ if (lsp) { ++ get_layout_state(ls); ++ *lsp = ls; ++ } ++ dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(&ls->ls_stateid)); ++out_put: ++ dprintk("%s PUT LO STATE:\n", __func__); ++ put_layout_state(ls); ++out: ++ dprintk("<-- %s status %d\n", __func__, htonl(status)); ++ ++ return status; ++} ++ ++static inline struct nfs4_layout * ++alloc_layout(void) ++{ ++ return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL); ++} ++ ++static inline void ++free_layout(struct nfs4_layout *lp) ++{ ++ kmem_cache_free(pnfs_layout_slab, lp); ++} ++ ++static void ++init_layout(struct nfs4_layout_state *ls, ++ struct nfs4_layout *lp, ++ struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__, ++ ls, lp, clp, fp, fp->fi_inode); ++ ++ get_nfs4_file(fp); ++ lp->lo_client = clp; ++ lp->lo_file = fp; ++ get_layout_state(ls); ++ lp->lo_state = ls; ++ memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg)); ++ spin_lock(&layout_lock); ++ list_add_tail(&lp->lo_perstate, &ls->ls_layouts); ++ list_add_tail(&lp->lo_perclnt, &clp->cl_layouts); ++ list_add_tail(&lp->lo_perfile, &fp->fi_layouts); ++ spin_unlock(&layout_lock); ++ dprintk("pNFS %s end\n", __func__); ++} ++ ++static void ++dequeue_layout(struct nfs4_layout *lp) ++{ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del(&lp->lo_perclnt); ++ list_del(&lp->lo_perfile); ++ list_del(&lp->lo_perstate); ++} ++ ++static void ++destroy_layout(struct nfs4_layout *lp) ++{ 
++ struct nfs4_client *clp; ++ struct nfs4_file *fp; ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ clp = lp->lo_client; ++ fp = lp->lo_file; ++ ls = lp->lo_state; ++ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n", ++ __func__, lp, clp, fp, fp->fi_inode, ++ list_empty(&ls->ls_layouts)); ++ ++ kmem_cache_free(pnfs_layout_slab, lp); ++ /* release references taken by init_layout */ ++ put_layout_state_locked(ls); ++ put_nfs4_file(fp); ++} ++ ++void fs_layout_return(struct super_block *sb, struct inode *ino, ++ struct nfsd4_pnfs_layoutreturn *lrp, int flags, ++ void *recall_cookie) ++{ ++ int ret; ++ ++ if (unlikely(!sb->s_pnfs_op->layout_return)) ++ return; ++ ++ lrp->lr_flags = flags; ++ lrp->args.lr_cookie = recall_cookie; ++ ++ if (!ino) /* FSID or ALL */ ++ ino = sb->s_root->d_inode; ++ ++ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args); ++ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx " ++ "cookie = %p flags 0x%x status=%d\n", ++ __func__, ino->i_ino, lrp->args.lr_seg.iomode, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, ++ recall_cookie, flags, ret); ++} ++ ++static u64 ++alloc_init_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ struct sbid_tracker *new = alloc_sbid(); ++ unsigned long hash_idx = sbid_hashval(sb); ++ u64 id = 0; ++ ++ if (likely(new)) { ++ spin_lock(&layout_lock); ++ id = ++current_sbid; ++ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK); ++ id = new->id; ++ BUG_ON(id == 0); ++ new->sb = sb; ++ ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) ++ if (sbid->sb == sb) { ++ kfree(new); ++ id = sbid->id; ++ spin_unlock(&layout_lock); ++ return id; ++ } ++ list_add(&new->hash, &sbid_hashtbl[hash_idx]); ++ spin_unlock(&layout_lock); ++ } ++ return id; ++} ++ ++struct super_block * ++find_sbid_id(u64 id) ++{ ++ struct sbid_tracker *sbid; ++ struct super_block *sb = NULL; ++ unsigned long hash_idx = id & SBID_HASH_MASK; ++ int pos = 0; ++ ++ 
spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->id != id) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ sb = sbid->sb; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return sb; ++} ++ ++u64 ++find_create_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ unsigned long hash_idx = sbid_hashval(sb); ++ int pos = 0; ++ u64 id = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->sb != sb) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ id = sbid->id; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ if (!id) ++ id = alloc_init_sbid(sb); ++ ++ return id; ++} ++ ++/* ++ * Create a layoutrecall structure ++ * An optional layoutrecall can be cloned (except for the layoutrecall lists) ++ */ ++static struct nfs4_layoutrecall * ++alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_client *clp, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ dprintk("NFSD %s\n", __func__); ++ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL); ++ if (clr == NULL) ++ return clr; ++ ++ dprintk("NFSD %s -->\n", __func__); ++ ++ memset(clr, 0, sizeof(*clr)); ++ if (lrfile) ++ get_nfs4_file(lrfile); ++ clr->clr_client = clp; ++ clr->clr_file = lrfile; ++ clr->cb = *cbl; ++ ++ kref_init(&clr->clr_ref); ++ INIT_LIST_HEAD(&clr->clr_perclnt); ++ ++ dprintk("NFSD %s return %p\n", __func__, clr); ++ return clr; ++} ++ ++static void ++get_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ kref_get(&clr->clr_ref); ++} ++ ++static void ++destroy_layoutrecall(struct kref *kref) ++{ ++ struct nfs4_layoutrecall *clr = ++ container_of(kref, struct nfs4_layoutrecall, clr_ref); ++ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr, ++ 
clr->clr_file, clr->clr_client); ++ BUG_ON(!list_empty(&clr->clr_perclnt)); ++ if (clr->clr_file) ++ put_nfs4_file(clr->clr_file); ++ kmem_cache_free(pnfs_layoutrecall_slab, clr); ++} ++ ++int ++put_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ return kref_put(&clr->clr_ref, destroy_layoutrecall); ++} ++ ++void * ++layoutrecall_done(struct nfs4_layoutrecall *clr) ++{ ++ void *recall_cookie = clr->cb.cbl_cookie; ++ struct nfs4_layoutrecall *parent = clr->parent; ++ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del_init(&clr->clr_perclnt); ++ put_layoutrecall(clr); ++ ++ if (parent && !put_layoutrecall(parent)) ++ recall_cookie = NULL; ++ ++ return recall_cookie; ++} ++ ++/* ++ * get_state() and cb_get_state() are ++ */ ++void ++release_pnfs_ds_dev_list(struct nfs4_stateid *stp) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ while (!list_empty(&stp->st_pnfs_ds_id)) { ++ ddp = list_entry(stp->st_pnfs_ds_id.next, ++ struct pnfs_ds_dev_entry, dd_dev_entry); ++ list_del(&ddp->dd_dev_entry); ++ kfree(ddp); ++ } ++} ++ ++static int ++nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL); ++ if (!ddp) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&ddp->dd_dev_entry); ++ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id); ++ ddp->dd_dsid = dsid; ++ return 0; ++} ++ ++/* ++ * are two octet ranges overlapping? 
++ * start1 last1 ++ * |-----------------| ++ * start2 last2 ++ * |----------------| ++ */ ++static inline int ++lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 last1 = last_byte_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 last2 = last_byte_offset(start2, l2->length); ++ int ret; ++ ++ /* if last1 == start2 there's a single byte overlap */ ++ ret = (last2 >= start1) && (last1 >= start2); ++ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__, ++ l1->offset, l1->length, l2->offset, l2->length, ret); ++ return ret; ++} ++ ++static inline int ++same_fsid_major(struct nfs4_fsid *fsid, u64 major) ++{ ++ return fsid->major == major; ++} ++ ++static inline int ++same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh) ++{ ++ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid); ++} ++ ++/* ++ * find a layout recall conflicting with the specified layoutget ++ */ ++static int ++is_layout_recalled(struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != seg->layout_type) ++ continue; ++ if (clr->cb.cbl_recall_type == RETURN_ALL) ++ goto found; ++ if (clr->cb.cbl_recall_type == RETURN_FSID) { ++ if (same_fsid(&clr->cb.cbl_fsid, current_fh)) ++ goto found; ++ else ++ continue; ++ } ++ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE); ++ if (clr->cb.cbl_seg.clientid == seg->clientid && ++ lo_seg_overlapping(&clr->cb.cbl_seg, seg)) ++ goto found; ++ } ++ spin_unlock(&layout_lock); ++ return 0; ++found: ++ spin_unlock(&layout_lock); ++ return 1; ++} ++ ++/* ++ * are two octet ranges overlapping or adjacent? 
++ */ ++static inline int ++lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ /* is end1 == start2 ranges are adjacent */ ++ return (end2 >= start1) && (end1 >= start2); ++} ++ ++static void ++extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lg_start = lg->offset; ++ u64 lg_end = end_offset(lg_start, lg->length); ++ ++ /* lo already covers lg? */ ++ if (lo_start <= lg_start && lg_end <= lo_end) ++ return; ++ ++ /* extend start offset */ ++ if (lo_start > lg_start) ++ lo_start = lg_start; ++ ++ /* extend end offset */ ++ if (lo_end < lg_end) ++ lo_end = lg_end; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? ++ lo_end : lo_end - lo_start; ++} ++ ++static struct nfs4_layout * ++merge_layout(struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layout *lp = NULL; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (lp, &fp->fi_layouts, lo_perfile) ++ if (lp->lo_seg.layout_type == seg->layout_type && ++ lp->lo_seg.clientid == seg->clientid && ++ lp->lo_seg.iomode == seg->iomode && ++ lo_seg_mergeable(&lp->lo_seg, seg)) { ++ extend_layout(&lp->lo_seg, seg); ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ return lp; ++} ++ ++__be32 ++nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp, ++ struct exp_xdr_stream *xdr) ++{ ++ u32 status; ++ __be32 nfserr; ++ struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode; ++ struct super_block *sb = ino->i_sb; ++ int can_merge; ++ struct nfs4_file *fp; ++ struct nfs4_client *clp; ++ struct nfs4_layout *lp = NULL; ++ struct nfs4_layout_state *ls = NULL; ++ struct nfsd4_pnfs_layoutget_arg args = { ++ .lg_minlength = lgp->lg_minlength, ++ .lg_fh = 
&lgp->lg_fhp->fh_handle, ++ }; ++ struct nfsd4_pnfs_layoutget_res res = { ++ .lg_seg = lgp->lg_seg, ++ }; ++ ++ dprintk("NFSD: %s Begin\n", __func__); ++ ++ args.lg_sbid = find_create_sbid(sb); ++ if (!args.lg_sbid) { ++ nfserr = nfserr_layouttrylater; ++ goto out; ++ } ++ ++ can_merge = sb->s_pnfs_op->can_merge_layouts != NULL && ++ sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type); ++ ++ nfs4_lock_state(); ++ fp = find_alloc_file(ino, lgp->lg_fhp); ++ clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid); ++ dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp); ++ if (!fp || !clp) { ++ nfserr = nfserr_inval; ++ goto out_unlock; ++ } ++ ++ /* Check decoded layout stateid */ ++ nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls); ++ if (nfserr) ++ goto out_unlock; ++ ++ if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) { ++ nfserr = nfserr_recallconflict; ++ goto out; ++ } ++ ++ /* pre-alloc layout in case we can't merge after we call ++ * the file system ++ */ ++ lp = alloc_layout(); ++ if (!lp) { ++ nfserr = nfserr_layouttrylater; ++ goto out_unlock; ++ } ++ ++ dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd " ++ "iomode %u offset %llu length %llu\n", ++ __func__, lgp->lg_seg.layout_type, ++ exp_xdr_qbytes(xdr->end - xdr->p), ++ lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length); ++ ++ /* FIXME: need to eliminate the use of the state lock */ ++ nfs4_unlock_state(); ++ status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res); ++ nfs4_lock_state(); ++ ++ dprintk("pNFS %s: post-export status %u " ++ "iomode %u offset %llu length %llu\n", ++ __func__, status, res.lg_seg.iomode, ++ res.lg_seg.offset, res.lg_seg.length); ++ ++ /* ++ * The allowable error codes for the layout_get pNFS export ++ * operations vector function (from the file system) can be ++ * expanded as needed to include other errors defined for ++ * the RFC 5561 LAYOUTGET operation. 
++ */ ++ switch (status) { ++ case 0: ++ nfserr = NFS4_OK; ++ break; ++ case NFS4ERR_ACCESS: ++ case NFS4ERR_BADIOMODE: ++ /* No support for LAYOUTIOMODE4_RW layouts */ ++ case NFS4ERR_BADLAYOUT: ++ /* No layout matching loga_minlength rules */ ++ case NFS4ERR_INVAL: ++ case NFS4ERR_IO: ++ case NFS4ERR_LAYOUTTRYLATER: ++ case NFS4ERR_LAYOUTUNAVAILABLE: ++ case NFS4ERR_LOCKED: ++ case NFS4ERR_NOSPC: ++ case NFS4ERR_RECALLCONFLICT: ++ case NFS4ERR_SERVERFAULT: ++ case NFS4ERR_TOOSMALL: ++ /* Requested layout too big for loga_maxcount */ ++ case NFS4ERR_WRONG_TYPE: ++ /* Not a regular file */ ++ nfserr = cpu_to_be32(status); ++ goto out_freelayout; ++ default: ++ BUG(); ++ nfserr = nfserr_serverfault; ++ } ++ ++ lgp->lg_seg = res.lg_seg; ++ lgp->lg_roc = res.lg_return_on_close; ++ ++ /* SUCCESS! ++ * Can the new layout be merged into an existing one? ++ * If so, free unused layout struct ++ */ ++ if (can_merge && merge_layout(fp, clp, &res.lg_seg)) ++ goto out_freelayout; ++ ++ /* Can't merge, so let's initialize this new layout */ ++ init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg); ++out_unlock: ++ if (ls) ++ put_layout_state(ls); ++ if (fp) ++ put_nfs4_file(fp); ++ nfs4_unlock_state(); ++out: ++ dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp, ++ be32_to_cpu(nfserr)); ++ return nfserr; ++out_freelayout: ++ free_layout(lp); ++ goto out_unlock; ++} ++ ++static void ++trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lr_start = lr->offset; ++ u64 lr_end = end_offset(lr_start, lr->length); ++ ++ dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__, ++ lo->offset, lo->length, lr->offset, lr->length); ++ ++ /* lr fully covers lo? */ ++ if (lr_start <= lo_start && lo_end <= lr_end) { ++ lo->length = 0; ++ goto out; ++ } ++ ++ /* ++ * split not supported yet. retain layout segment. 
++ * remains must be returned by the client ++ * on the final layout return. ++ */ ++ if (lo_start < lr_start && lr_end < lo_end) { ++ dprintk("%s: split not supported\n", __func__); ++ goto out; ++ } ++ ++ if (lo_start < lr_start) ++ lo_end = lr_start - 1; ++ else /* lr_end < lo_end */ ++ lo_start = lr_end + 1; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start; ++out: ++ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length); ++} ++ ++static int ++pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ dprintk("%s: clp %p fp %p\n", __func__, clp, fp); ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) { ++ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n", ++ __func__, lp, ++ lp->lo_client, clp, ++ lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type, ++ lp->lo_seg.iomode, lrp->args.lr_seg.iomode); ++ if (lp->lo_client != clp || ++ lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type || ++ (lp->lo_seg.iomode != lrp->args.lr_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) || ++ !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg)) ++ continue; ++ layouts_found++; ++ trim_layout(&lp->lo_seg, &lrp->args.lr_seg); ++ if (!lp->lo_seg.length) { ++ lrp->lrs_present = 0; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++pnfs_return_client_layouts(struct nfs4_client *clp, ++ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) { ++ if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type || ++ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode && ++ lrp->args.lr_seg.iomode != 
IOMODE_ANY)) ++ continue; ++ ++ if (lrp->args.lr_return_type == RETURN_FSID && ++ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid)) ++ continue; ++ ++ layouts_found++; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++recall_return_perfect_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode || ++ clr->cb.cbl_recall_type != lrp->args.lr_return_type) ++ return 0; ++ ++ return (clr->cb.cbl_recall_type == RETURN_FILE && ++ clr->clr_file == fp && ++ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset && ++ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) || ++ ++ (clr->cb.cbl_recall_type == RETURN_FSID && ++ same_fsid(&clr->cb.cbl_fsid, current_fh)) || ++ ++ clr->cb.cbl_recall_type == RETURN_ALL; ++} ++ ++static int ++recall_return_partial_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ /* iomode matching? */ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode && ++ clr->cb.cbl_seg.iomode != IOMODE_ANY && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) ++ return 0; ++ ++ if (clr->cb.cbl_recall_type == RETURN_ALL || ++ lrp->args.lr_return_type == RETURN_ALL) ++ return 1; ++ ++ /* fsid matches? */ ++ if (clr->cb.cbl_recall_type == RETURN_FSID || ++ lrp->args.lr_return_type == RETURN_FSID) ++ return same_fsid(&clr->cb.cbl_fsid, current_fh); ++ ++ /* file matches, range overlapping? 
*/ ++ return clr->clr_file == fp && ++ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg); ++} ++ ++int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status = 0; ++ int layouts_found = 0; ++ struct inode *ino = current_fh->fh_dentry->d_inode; ++ struct nfs4_file *fp = NULL; ++ struct nfs4_client *clp; ++ struct nfs4_layoutrecall *clr, *nextclr; ++ u64 ex_fsid = current_fh->fh_export->ex_fsid; ++ void *recall_cookie = NULL; ++ ++ dprintk("NFSD: %s\n", __func__); ++ ++ nfs4_lock_state(); ++ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid); ++ if (!clp) ++ goto out; ++ ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ fp = find_file(ino); ++ if (!fp) { ++ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for " ++ "ino %p:%lu\n", ++ __func__, ino, ino ? ino->i_ino : 0L); ++ goto out; ++ } ++ ++ /* Check the stateid */ ++ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino); ++ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, ++ NULL); ++ if (status) ++ goto out_put_file; ++ ++ /* update layouts */ ++ layouts_found = pnfs_return_file_layouts(clp, fp, lrp); ++ /* optimize for the all-empty case */ ++ if (list_empty(&fp->fi_layouts)) ++ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ } else { ++ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid); ++ } ++ ++ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d " ++ "return_type %d fsid 0x%llx offset %llu length %llu: " ++ "layouts_found %d\n", ++ __func__, clp, fp, lrp->args.lr_seg.layout_type, ++ lrp->args.lr_seg.iomode, lrp->args.lr_return_type, ++ ex_fsid, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found); ++ ++ /* update layoutrecalls ++ * note: for RETURN_{FSID,ALL}, fp may be NULL ++ */ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls, ++ clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != 
lrp->args.lr_seg.layout_type) ++ continue; ++ ++ if (recall_return_perfect_match(clr, lrp, fp, current_fh)) ++ recall_cookie = layoutrecall_done(clr); ++ else if (layouts_found && ++ recall_return_partial_match(clr, lrp, fp, current_fh)) ++ clr->clr_time = CURRENT_TIME; ++ } ++ spin_unlock(&layout_lock); ++ ++out_put_file: ++ if (fp) ++ put_nfs4_file(fp); ++out: ++ nfs4_unlock_state(); ++ ++ /* call exported filesystem layout_return (ignore return-code) */ ++ fs_layout_return(sb, ino, lrp, 0, recall_cookie); ++ ++ dprintk("pNFS %s: exit status %d \n", __func__, status); ++ return status; ++} ++ ++/* ++ * PNFS Metadata server export operations callback for get_state ++ * ++ * called by the cluster fs when it receives a get_state() from a data ++ * server. ++ * returns status, or pnfs_get_state* with pnfs_get_state->status set. ++ * ++ */ ++int ++nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg) ++{ ++ struct nfs4_stateid *stp; ++ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */ ++ int status = -EINVAL; ++ struct inode *ino; ++ struct nfs4_delegation *dl; ++ stateid_t *stid = (stateid_t *)&arg->stid; ++ ++ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__, ++ STATEID_VAL(stid), arg->ino); ++ ++ nfs4_lock_state(); ++ stp = find_stateid(stid, flags); ++ if (!stp) { ++ ino = iget_locked(sb, arg->ino); ++ if (!ino) ++ goto out; ++ ++ if (ino->i_state & I_NEW) { ++ iget_failed(ino); ++ goto out; ++ } ++ ++ dl = find_delegation_stateid(ino, stid); ++ if (dl) ++ status = 0; ++ ++ iput(ino); ++ } else { ++ /* XXX ANDROS: marc removed nfs4_check_fh - how come? 
*/ ++ ++ /* arg->devid is the Data server id, set by the cluster fs */ ++ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid); ++ if (status) ++ goto out; ++ ++ arg->access = stp->st_access_bmap; ++ *(clientid_t *)&arg->clid = ++ stp->st_stateowner->so_client->cl_clientid; ++ } ++out: ++ nfs4_unlock_state(); ++ return status; ++} ++ ++static int ++cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile, ++ stateid_t *lsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ struct nfs4_layout_state *ls; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) { ++ if (lp->lo_file != lrfile) ++ continue; ++ ++ ls = find_get_layout_state(clp, lrfile); ++ if (!ls) { ++ /* This shouldn't happen as the file should have a ++ * layout stateid if it has a layout. ++ */ ++ printk(KERN_ERR "%s: file %p has no layout stateid\n", ++ __func__, lrfile); ++ WARN_ON(1); ++ break; ++ } ++ update_stateid(&ls->ls_stateid); ++ memcpy(lsid, &ls->ls_stateid, sizeof(stateid_t)); ++ put_layout_state_locked(ls); ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ return found; ++} ++ ++static int ++cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ ++ /* note: minor version unused */ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) ++ if (lp->lo_file->fi_fsid.major == fsid->major) { ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return found; ++} ++ ++static int ++cl_has_any_layout(struct nfs4_client *clp) ++{ ++ return !list_empty(&clp->cl_layouts); ++} ++ ++static int ++cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile, stateid_t *lsid) ++{ ++ switch (cbl->cbl_recall_type) { ++ case RETURN_FILE: ++ return cl_has_file_layout(clp, lrfile, lsid); ++ case RETURN_FSID: ++ return cl_has_fsid_layout(clp, &cbl->cbl_fsid); ++ default: ++ return cl_has_any_layout(clp); ++ } ++} 
++ ++/* ++ * Called without the layout_lock. ++ */ ++void ++nomatching_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfsd4_pnfs_layoutreturn lr = { ++ .args.lr_return_type = clr->cb.cbl_recall_type, ++ .args.lr_seg = clr->cb.cbl_seg, ++ }; ++ struct inode *inode; ++ void *recall_cookie; ++ ++ if (clr->clr_file) { ++ inode = igrab(clr->clr_file->fi_inode); ++ if (WARN_ON(!inode)) ++ return; ++ } else { ++ inode = NULL; ++ } ++ ++ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__, ++ clr->clr_client, clr->clr_file); ++ ++ if (clr->cb.cbl_recall_type == RETURN_FILE) ++ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr); ++ else ++ pnfs_return_client_layouts(clr->clr_client, &lr, ++ clr->cb.cbl_fsid.major); ++ ++ spin_lock(&layout_lock); ++ recall_cookie = layoutrecall_done(clr); ++ spin_unlock(&layout_lock); ++ ++ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN, ++ recall_cookie); ++ iput(inode); ++} ++ ++void pnfs_expire_client(struct nfs4_client *clp) ++{ ++ for (;;) { ++ struct nfs4_layoutrecall *lrp = NULL; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layoutrecalls)) { ++ lrp = list_entry(clp->cl_layoutrecalls.next, ++ struct nfs4_layoutrecall, clr_perclnt); ++ get_layoutrecall(lrp); ++ } ++ spin_unlock(&layout_lock); ++ if (!lrp) ++ break; ++ ++ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file); ++ BUG_ON(lrp->clr_client != clp); ++ nomatching_layout(lrp); ++ put_layoutrecall(lrp); ++ } ++ ++ for (;;) { ++ struct nfs4_layout *lp = NULL; ++ struct inode *inode = NULL; ++ struct nfsd4_pnfs_layoutreturn lr; ++ bool empty = false; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layouts)) { ++ lp = list_entry(clp->cl_layouts.next, ++ struct nfs4_layout, lo_perclnt); ++ inode = igrab(lp->lo_file->fi_inode); ++ memset(&lr, 0, sizeof(lr)); ++ lr.args.lr_return_type = RETURN_FILE; ++ lr.args.lr_seg = lp->lo_seg; ++ empty = list_empty(&lp->lo_file->fi_layouts); ++ BUG_ON(lp->lo_client != clp); ++ 
dequeue_layout(lp); ++ destroy_layout(lp); /* do not access lp after this */ ++ } ++ spin_unlock(&layout_lock); ++ if (!lp) ++ break; ++ ++ if (WARN_ON(!inode)) ++ break; ++ ++ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino, ++ lp, clp); ++ ++ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE, ++ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL); ++ iput(inode); ++ } ++} ++ ++struct create_recall_list_arg { ++ struct nfsd4_pnfs_cb_layout *cbl; ++ struct nfs4_file *lrfile; ++ struct list_head *todolist; ++ unsigned todo_count; ++}; ++ ++/* ++ * look for matching layout for the given client ++ * and add a pending layout recall to the todo list ++ * if found any. ++ * returns: ++ * 0 if layouts found or negative error. ++ */ ++static int ++lo_recall_per_client(struct nfs4_client *clp, void *p) ++{ ++ stateid_t lsid; ++ struct nfs4_layoutrecall *pending; ++ struct create_recall_list_arg *arg = p; ++ ++ memset(&lsid, 0, sizeof(lsid)); ++ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid)) ++ return 0; ++ ++ /* Matching put done by layoutreturn */ ++ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile); ++ /* out of memory, drain todo queue */ ++ if (!pending) ++ return -ENOMEM; ++ ++ *(stateid_t *)&pending->cb.cbl_sid = lsid; ++ list_add(&pending->clr_perclnt, arg->todolist); ++ arg->todo_count++; ++ return 0; ++} ++ ++/* Create a layoutrecall structure for each client based on the ++ * original structure. 
*/ ++int ++create_layout_recall_list(struct list_head *todolist, unsigned *todo_len, ++ struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_client *clp; ++ struct create_recall_list_arg arg = { ++ .cbl = cbl, ++ .lrfile = lrfile, ++ .todolist = todolist, ++ }; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* If client given by fs, just do single client */ ++ if (cbl->cbl_seg.clientid) { ++ clp = find_confirmed_client( ++ (clientid_t *)&cbl->cbl_seg.clientid); ++ if (!clp) { ++ status = -ENOENT; ++ dprintk("%s: clientid %llx not found\n", __func__, ++ (unsigned long long)cbl->cbl_seg.clientid); ++ goto out; ++ } ++ ++ status = lo_recall_per_client(clp, &arg); ++ } else { ++ /* Check all clients for layout matches */ ++ status = filter_confirmed_clients(lo_recall_per_client, &arg); ++ } ++ ++out: ++ *todo_len = arg.todo_count; ++ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status); ++ return status; ++} ++ ++/* ++ * Recall layouts asynchronously ++ * Called with state lock. ++ */ ++static int ++spawn_layout_recall(struct super_block *sb, struct list_head *todolist, ++ unsigned todo_len) ++{ ++ struct nfs4_layoutrecall *pending; ++ struct nfs4_layoutrecall *parent = NULL; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ if (todo_len > 1) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ ++ parent = alloc_init_layoutrecall(&pending->cb, NULL, ++ pending->clr_file); ++ if (unlikely(!parent)) { ++ /* We want forward progress. If parent cannot be ++ * allocated take the first one as parent but don't ++ * execute it. Caller must check for -EAGAIN, if so ++ * When the partial recalls return, ++ * nfsd_layout_recall_cb should be called again. 
++ */ ++ list_del_init(&pending->clr_perclnt); ++ if (todo_len > 2) { ++ parent = pending; ++ } else { ++ parent = NULL; ++ put_layoutrecall(pending); ++ } ++ --todo_len; ++ status = -ENOMEM; ++ } ++ } ++ ++ while (!list_empty(todolist)) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ list_del_init(&pending->clr_perclnt); ++ dprintk("%s: clp %p cb_client %p fp %p\n", __func__, ++ pending->clr_client, ++ pending->clr_client->cl_cb_client, ++ pending->clr_file); ++ if (unlikely(!pending->clr_client->cl_cb_client)) { ++ printk(KERN_INFO ++ "%s: clientid %08x/%08x has no callback path\n", ++ __func__, ++ pending->clr_client->cl_clientid.cl_boot, ++ pending->clr_client->cl_clientid.cl_id); ++ put_layoutrecall(pending); ++ continue; ++ } ++ ++ pending->clr_time = CURRENT_TIME; ++ pending->clr_sb = sb; ++ if (parent) { ++ /* If we created a parent its initial ref count is 1. ++ * We will need to de-ref it eventually. So we just ++ * don't increment on behalf of the last one. 
++ */ ++ if (todo_len != 1) ++ get_layoutrecall(parent); ++ } ++ pending->parent = parent; ++ get_layoutrecall(pending); ++ /* Add to list so corresponding layoutreturn can find req */ ++ list_add(&pending->clr_perclnt, ++ &pending->clr_client->cl_layoutrecalls); ++ ++ nfsd4_cb_layout(pending); ++ --todo_len; ++ } ++ ++ return status; ++} ++ ++/* ++ * Spawn a thread to perform a recall layout ++ * ++ */ ++int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode, ++ struct nfsd4_pnfs_cb_layout *cbl) ++{ ++ int status; ++ struct nfs4_file *lrfile = NULL; ++ struct list_head todolist; ++ unsigned todo_len = 0; ++ ++ dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl); ++ BUG_ON(!cbl); ++ BUG_ON(cbl->cbl_recall_type != RETURN_FILE && ++ cbl->cbl_recall_type != RETURN_FSID && ++ cbl->cbl_recall_type != RETURN_ALL); ++ BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode); ++ BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ && ++ cbl->cbl_seg.iomode != IOMODE_RW && ++ cbl->cbl_seg.iomode != IOMODE_ANY); ++ ++ if (nfsd_serv == NULL) { ++ dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n"); ++ return -ENOENT; ++ } ++ ++ nfs4_lock_state(); ++ status = -ENOENT; ++ if (inode) { ++ lrfile = find_file(inode); ++ if (!lrfile) { ++ dprintk("NFSD nfsd_layout_recall_cb: " ++ "nfs4_file not found\n"); ++ goto err; ++ } ++ if (cbl->cbl_recall_type == RETURN_FSID) ++ cbl->cbl_fsid = lrfile->fi_fsid; ++ } ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ /* If no cookie provided by FS, return a default one */ ++ if (!cbl->cbl_cookie) ++ cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ ++ status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile); ++ if (list_empty(&todolist)) { ++ status = -ENOENT; ++ } else { ++ /* process todolist even if create_layout_recall_list ++ * returned an error */ ++ int status2 = spawn_layout_recall(sb, &todolist, todo_len); ++ if (status2) ++ status = status2; ++ } ++ ++err: ++ nfs4_unlock_state(); ++ if (lrfile) ++ 
put_nfs4_file(lrfile); ++ return (todo_len && status) ? -EAGAIN : status; ++} ++ ++struct create_device_notify_list_arg { ++ struct list_head *todolist; ++ struct nfsd4_pnfs_cb_dev_list *ndl; ++}; ++ ++static int ++create_device_notify_per_cl(struct nfs4_client *clp, void *p) ++{ ++ struct nfs4_notify_device *cbnd; ++ struct create_device_notify_list_arg *arg = p; ++ ++ if (atomic_read(&clp->cl_deviceref) <= 0) ++ return 0; ++ ++ cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL); ++ if (!cbnd) ++ return -ENOMEM; ++ ++ cbnd->nd_list = arg->ndl; ++ cbnd->nd_client = clp; ++ list_add(&cbnd->nd_perclnt, arg->todolist); ++ return 0; ++} ++ ++/* Create a list of clients to send device notifications. */ ++int ++create_device_notify_list(struct list_head *todolist, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ int status; ++ struct create_device_notify_list_arg arg = { ++ .todolist = todolist, ++ .ndl = ndl, ++ }; ++ ++ nfs4_lock_state(); ++ status = filter_confirmed_clients(create_device_notify_per_cl, &arg); ++ nfs4_unlock_state(); ++ ++ return status; ++} ++ ++/* ++ * For each client that a device, send a device notification. ++ * XXX: Need to track which clients have which devices. 
++ */ ++int nfsd_device_notify_cb(struct super_block *sb, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ struct nfs4_notify_device *cbnd; ++ unsigned int notify_num = 0; ++ int status2, status = 0; ++ struct list_head todolist; ++ ++ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list); ++ ++ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len); ++ ++ if (nfsd_serv == NULL) ++ return -ENOENT; ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ status = create_device_notify_list(&todolist, ndl); ++ ++ while (!list_empty(&todolist)) { ++ cbnd = list_entry(todolist.next, struct nfs4_notify_device, ++ nd_perclnt); ++ list_del_init(&cbnd->nd_perclnt); ++ status2 = nfsd4_cb_notify_device(cbnd); ++ pnfs_clear_device_notify(cbnd->nd_client); ++ if (status2) { ++ kfree(cbnd); ++ status = status2; ++ } ++ notify_num++; ++ } ++ ++ dprintk("NFSD %s: status %d clients %u\n", ++ __func__, status, notify_num); ++ return status; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-23 12:09:03.309501439 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-23 12:09:03.309501439 -0400 +@@ -0,0 +1,461 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsfh.h" ++#include "nfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Just use a linked list. Do not expect more than 32 dlm_device_entries ++ * the first implementation will just use one device per cluster file system ++ */ ++ ++static LIST_HEAD(dlm_device_list); ++static DEFINE_SPINLOCK(dlm_device_list_lock); ++ ++struct dlm_device_entry { ++ struct list_head dlm_dev_list; ++ char disk_name[DISK_NAME_LEN]; ++ int num_ds; ++ char ds_list[NFSD_DLM_DS_LIST_MAX]; ++}; ++ ++/* ++ * Find the registered entry whose disk_name is exactly @disk_name. ++ * Returns the entry, or NULL if there is none. The list lock is dropped ++ * before returning, so the entry is not protected against removal. ++ */ ++static struct dlm_device_entry * ++_nfsd4_find_pnfs_dlm_device(char *disk_name) ++{ ++ struct dlm_device_entry *dlm_pdev; ++ ++ dprintk("--> %s disk name %s\n", __func__, disk_name); ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) { ++ dprintk("%s Look for dlm_pdev %s\n", __func__, ++ dlm_pdev->disk_name); ++ /* whole-name compare: a prefix such as "sda" must not match "sda1" */ ++ if (!strncmp(dlm_pdev->disk_name, disk_name, DISK_NAME_LEN)) { ++ spin_unlock(&dlm_device_list_lock); ++ return dlm_pdev; ++ } ++ } ++ spin_unlock(&dlm_device_list_lock); ++ return NULL; ++} ++ ++static struct dlm_device_entry * ++nfsd4_find_pnfs_dlm_device(struct super_block *sb) { ++ char dname[BDEVNAME_SIZE]; ++ ++ bdevname(sb->s_bdev, dname); ++ return _nfsd4_find_pnfs_dlm_device(dname); ++} ++ ++ssize_t 
++nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen) ++{ ++ char *pos = buf; ++ ssize_t size = 0; ++ struct dlm_device_entry *dlm_pdev; ++ int ret = -EINVAL; ++ ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) ++ { ++ int advanced; ++ advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list); ++ if (advanced >= buflen - size) ++ goto out; ++ size += advanced; ++ pos += advanced; ++ } ++ ret = size; ++ ++out: ++ spin_unlock(&dlm_device_list_lock); ++ return ret; ++} ++ ++/* ++ * Check that @ds_list is a comma separated list of addresses that ++ * rpc_pton() accepts, counting the entries in @num_ds. ++ * Returns true if every entry parsed, false at the first one that did not ++ * (@num_ds then holds the number of entries accepted before it). ++ */ ++bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds) ++{ ++ char *start = ds_list; ++ ++ *num_ds = 0; ++ ++ while (*start) { ++ struct sockaddr_storage tempAddr; ++ int ipLen = strcspn(start, ","); ++ ++ if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr))) ++ return false; ++ (*num_ds)++; ++ /* step over the separator, but never past the terminating NUL */ ++ start += ipLen; ++ if (*start == ',') ++ start++; ++ } ++ return true; ++} ++ ++/* ++ * pnfs_dlm_device string format: ++ * block-device-path:ds-address[,ds-address...] ++ * ++ * Examples ++ * /dev/sda:192.168.1.96,192.168.1.97' creates a data server list with ++ * two data servers for the dlm cluster file system mounted on /dev/sda. ++ * ++ * /dev/sda:192.168.1.96,192.168.1.100' ++ * replaces the data server list for /dev/sda ++ * ++ * Only the deviceid == 1 is supported. Can add device id to ++ * pnfs_dlm_device string when needed. ++ * ++ * Only the round robin each data server once stripe index is supported. ++ */ ++int ++nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len) ++ ++{ ++ struct dlm_device_entry *new, *found; ++ char *bufp = pnfs_dlm_device; ++ char *endp = bufp + strlen(bufp); ++ int err = -ENOMEM; ++ ++ dprintk("--> %s len %d\n", __func__, len); ++ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return err; ++ ++ err = -EINVAL; ++ /* disk_name */ ++ /* FIXME: need to check for valid disk_name. search superblocks? ++ * check for slash dev slash ? 
++ */ ++ len = strcspn(bufp, ":"); ++ /* new is zero-filled: keep one byte so disk_name stays NUL terminated */ ++ if (len >= DISK_NAME_LEN) ++ goto out_free; ++ memcpy(new->disk_name, bufp, len); ++ ++ err = -EINVAL; ++ bufp += len + 1; ++ if (bufp >= endp) ++ goto out_free; ++ ++ /* data server list */ ++ /* FIXME: need to check for comma separated valid ip format */ ++ len = strcspn(bufp, ":"); ++ /* likewise keep room for the terminating NUL of ds_list */ ++ if (len >= NFSD_DLM_DS_LIST_MAX) ++ goto out_free; ++ memcpy(new->ds_list, bufp, len); ++ ++ ++ /* validate the ips */ ++ if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds))) ++ goto out_free; ++ ++ dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__, ++ new->disk_name, new->num_ds, new->ds_list); ++ ++ found = _nfsd4_find_pnfs_dlm_device(new->disk_name); ++ if (found) { ++ /* FIXME: should compare found->ds_list with new->ds_list ++ * and if it is different, kick off a CB_NOTIFY change ++ * deviceid. ++ */ ++ dprintk("%s pnfs_dlm_device %s:%s already in cache " ++ " replace ds_list with new ds_list %s\n", __func__, ++ found->disk_name, found->ds_list, new->ds_list); ++ /* clear the whole old list so no stale tail survives a shorter one */ ++ memset(found->ds_list, 0, sizeof(found->ds_list)); ++ memcpy(found->ds_list, new->ds_list, strlen(new->ds_list)); ++ found->num_ds = new->num_ds; ++ kfree(new); ++ } else { ++ dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__, ++ new->disk_name, new->ds_list); ++ spin_lock(&dlm_device_list_lock); ++ list_add(&new->dlm_dev_list, &dlm_device_list); ++ spin_unlock(&dlm_device_list_lock); ++ } ++ dprintk("<-- %s Success\n", __func__); ++ return 0; ++ ++out_free: ++ kfree(new); ++ dprintk("<-- %s returns %d\n", __func__, err); ++ return err; ++} ++ ++void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ struct dlm_device_entry *dlm_pdev, *next; ++ ++ dprintk("--> %s\n", __func__); ++ ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list, ++ dlm_dev_list) { ++ list_del(&dlm_pdev->dlm_dev_list); ++ kfree(dlm_pdev); ++ } ++ spin_unlock(&dlm_device_list_lock); ++} ++ ++static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb, ++ u32 
layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ if (layout_type != LAYOUT_NFSV4_1_FILES) { ++ printk(KERN_ERR "%s: ERROR: layout type isn't 'file' " ++ "(type: %x)\n", __func__, layout_type); ++ return -ENOTSUPP; ++ } ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ return 0; ++} ++ ++/* ++ * Encode the device info for deviceid 1: one single-path "tcp" address ++ * (port 2049) per data server registered for sb's block device, with ++ * stripe indices 0 .. num_ds - 1. ++ * Returns the filelayout_encode_devinfo() result, -ENOTSUPP for a non-file ++ * layout type, -EINVAL for another deviceid or an unregistered disk, and ++ * -ENOMEM when an allocation fails. ++ */ ++static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err, len, i = 0; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_devaddr *daddr; ++ struct dlm_device_entry *dlm_pdev; ++ char *bufp; ++ ++ err = -ENOTSUPP; ++ if (layout_type != LAYOUT_NFSV4_1_FILES) { ++ dprintk("%s: ERROR: layout type isn't 'file' " ++ "(type: %x)\n", __func__, layout_type); ++ return err; ++ } ++ ++ /* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO ++ * with a gdia_device_id != 1 is invalid. 
++ */ ++ err = -EINVAL; ++ if (devid->devid != 1) { ++ dprintk("%s: WARNING: didn't receive a deviceid of " ++ "1 (got: 0x%llx)\n", __func__, devid->devid); ++ return err; ++ } ++ ++ /* ++ * If the DS list has not been established, return -EINVAL ++ */ ++ dlm_pdev = nfsd4_find_pnfs_dlm_device(sb); ++ if (!dlm_pdev) { ++ dprintk("%s: DEBUG: disk %s Not Found\n", __func__, ++ sb->s_bdev->bd_disk->disk_name); ++ return err; ++ } ++ ++ dprintk("%s: Found disk %s with DS list |%s|\n", ++ __func__, dlm_pdev->disk_name, dlm_pdev->ds_list); ++ ++ memset(&fdev, '\0', sizeof(fdev)); ++ fdev.fl_device_length = dlm_pdev->num_ds; ++ ++ err = -ENOMEM; ++ len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length; ++ fdev.fl_device_list = kzalloc(len, GFP_KERNEL); ++ if (!fdev.fl_device_list) { ++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list " ++ "buffer for %d DSes.\n", __func__, dlm_pdev->num_ds); ++ fdev.fl_device_length = 0; ++ goto out; ++ } ++ ++ /* Set a simple stripe index list */ ++ fdev.fl_stripeindices_length = fdev.fl_device_length; ++ fdev.fl_stripeindices_list = kzalloc(sizeof(u32) * ++ fdev.fl_stripeindices_length, GFP_KERNEL); ++ ++ if (!fdev.fl_stripeindices_list) { ++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices " ++ "list buffer for %d DSes.\n", __func__, dlm_pdev->num_ds); ++ goto out; ++ } ++ for (i = 0; i < fdev.fl_stripeindices_length; i++) ++ fdev.fl_stripeindices_list[i] = i; ++ ++ /* Transfer the data server list with a single multipath entry */ ++ bufp = dlm_pdev->ds_list; ++ for (i = 0; i < fdev.fl_device_length; i++) { ++ daddr = kmalloc(sizeof(*daddr), GFP_KERNEL); ++ if (!daddr) { ++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device " ++ "addr buffer.\n", __func__); ++ goto out; ++ } ++ ++ daddr->r_netid.data = "tcp"; ++ daddr->r_netid.len = 3; ++ ++ len = strcspn(bufp, ","); ++ daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL); ++ if (!daddr->r_addr.data) { ++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device " ++ "addr string.\n", __func__); ++ /* daddr is not on fl_device_list yet; free it here */ ++ kfree(daddr); ++ goto out; ++ } ++ memcpy(daddr->r_addr.data, bufp, len); ++ /* ++ * append the port number. 
interpreted as two more bytes ++ * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049. ++ */ ++ memcpy(daddr->r_addr.data + len, ".8.1", 4); ++ daddr->r_addr.len = len + 4; ++ ++ fdev.fl_device_list[i].fl_multipath_length = 1; ++ fdev.fl_device_list[i].fl_multipath_list = daddr; ++ ++ dprintk("%s: encoding DS |%s|\n", __func__, bufp); ++ ++ bufp += len + 1; ++ } ++ ++ /* have nfsd encode the device info */ ++ err = filelayout_encode_devinfo(xdr, &fdev); ++out: ++ /* release each address string together with its devaddr */ ++ for (i = 0; i < fdev.fl_device_length; i++) { ++ daddr = fdev.fl_device_list[i].fl_multipath_list; ++ if (!daddr) ++ continue; /* slot never filled in */ ++ kfree(daddr->r_addr.data); ++ kfree(daddr); ++ } ++ kfree(fdev.fl_device_list); ++ kfree(fdev.fl_stripeindices_list); ++ dprintk("<-- %s returns %d\n", __func__, err); ++ return err; ++} ++ ++static int get_stripe_unit(int blocksize) ++{ ++ if (blocksize >= NFSSVC_MAXBLKSIZE) ++ return blocksize; ++ return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize); ++} ++ ++/* ++ * Look up inode block device in pnfs_dlm_device list. ++ * Hash on the inode->i_ino and number of data servers. 
++ */ ++static int dlm_ino_hash(struct inode *ino) ++{ ++ struct dlm_device_entry *de; ++ u32 hash_mask = 0; ++ ++ /* If can't find the inode block device in the pnfs_dlm_deivce list ++ * then don't hand out a layout ++ */ ++ de = nfsd4_find_pnfs_dlm_device(ino->i_sb); ++ if (!de) ++ return -1; ++ hash_mask = de->num_ds - 1; ++ return ino->i_ino & hash_mask; ++} ++ ++static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct pnfs_filelayout_layout *layout = NULL; ++ struct knfsd_fh *fhp = NULL; ++ int index; ++ enum nfsstat4 rc = NFS4_OK; ++ ++ dprintk("%s: LAYOUT_GET\n", __func__); ++ ++ /* DLM exported file systems only support layouts for READ */ ++ if (res->lg_seg.iomode == IOMODE_RW) ++ return NFS4ERR_BADIOMODE; ++ ++ index = dlm_ino_hash(inode); ++ dprintk("%s first stripe index %d i_ino %lu\n", __func__, index, ++ inode->i_ino); ++ if (index < 0) ++ return NFS4ERR_LAYOUTUNAVAILABLE; ++ ++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ /* Always give out whole file layouts */ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ /* Always give out READ ONLY layouts */ ++ res->lg_seg.iomode = IOMODE_READ; ++ ++ layout = kzalloc(sizeof(*layout), GFP_KERNEL); ++ if (layout == NULL) { ++ rc = NFS4ERR_LAYOUTTRYLATER; ++ goto error; ++ } ++ ++ /* Set file layout response args */ ++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; ++ layout->lg_stripe_type = STRIPE_SPARSE; ++ layout->lg_commit_through_mds = false; ++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); ++ layout->lg_fh_length = 1; ++ layout->device_id.sbid = args->lg_sbid; ++ layout->device_id.devid = 1; /*FSFTEMP*/ ++ layout->lg_first_stripe_index = index; /*FSFTEMP*/ ++ layout->lg_pattern_offset = 0; ++ ++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); ++ if (fhp == NULL) { ++ rc = NFS4ERR_LAYOUTTRYLATER; ++ goto error; ++ } ++ ++ 
memcpy(fhp, args->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc = filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++nfsd4_pnfs_dlm_layouttype(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++const struct pnfs_export_operations pnfs_dlm_export_ops = { ++ .layout_type = nfsd4_pnfs_dlm_layouttype, ++ .get_device_info = nfsd4_pnfs_dlm_getdevinfo, ++ .get_device_iter = nfsd4_pnfs_dlm_getdeviter, ++ .layout_get = nfsd4_pnfs_dlm_layoutget, ++}; ++EXPORT_SYMBOL(pnfs_dlm_export_ops); +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-23 12:09:03.310501390 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-23 12:09:03.310501390 -0400 +@@ -0,0 +1,620 @@ ++/* ++* linux/fs/nfsd/nfs4pnfsds.c ++* ++* Copyright (c) 2005 The Regents of the University of Michigan. ++* All rights reserved. ++* ++* Andy Adamson ++* ++* Redistribution and use in source and binary forms, with or without ++* modification, are permitted provided that the following conditions ++* are met: ++* ++* 1. Redistributions of source code must retain the above copyright ++* notice, this list of conditions and the following disclaimer. ++* 2. Redistributions in binary form must reproduce the above copyright ++* notice, this list of conditions and the following disclaimer in the ++* documentation and/or other materials provided with the distribution. ++* 3. Neither the name of the University nor the names of its ++* contributors may be used to endorse or promote products derived ++* from this software without specific prior written permission. 
++* ++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++* ++*/ ++#if defined(CONFIG_PNFSD) ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsd.h" ++#include "pnfsd.h" ++#include "state.h" ++ ++/* ++ ******************* ++ * PNFS ++ ******************* ++ */ ++/* ++ * Hash tables for pNFS Data Server state ++ * ++ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using ++ * this data server (DS). ++ * ++ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained ++ * from any MDS. ++ * ++ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained ++ * from any MDS. 
++ * ++ */ ++/* Hash tables for clientid state */ ++#define CLIENT_HASH_BITS 4 ++#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) ++#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) ++ ++#define clientid_hashval(id) \ ++ ((id) & CLIENT_HASH_MASK) ++ ++/* hash table for pnfs_ds_stateid */ ++#define STATEID_HASH_BITS 10 ++#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) ++#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) ++ ++#define stateid_hashval(owner_id, file_id) \ ++ (((owner_id) + (file_id)) & STATEID_HASH_MASK) ++ ++static struct list_head mds_id_tbl; ++static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE]; ++static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE]; ++ ++static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp); ++static inline void put_ds_mdsid(struct pnfs_mds_id *mdp); ++ ++/* Mutex for data server state. Needs to be separate from ++ * mds state mutex since a node can be both mds and ds */ ++static DEFINE_MUTEX(ds_mutex); ++static struct thread_info *ds_mutex_owner; ++ ++static void ++ds_lock_state(void) ++{ ++ mutex_lock(&ds_mutex); ++ ds_mutex_owner = current_thread_info(); ++} ++ ++static void ++ds_unlock_state(void) ++{ ++ BUG_ON(ds_mutex_owner != current_thread_info()); ++ ds_mutex_owner = NULL; ++ mutex_unlock(&ds_mutex); ++} ++ ++static int ++cmp_clid(const clientid_t *cl1, const clientid_t *cl2) ++{ ++ return (cl1->cl_boot == cl2->cl_boot) && ++ (cl1->cl_id == cl2->cl_id); ++} ++ ++void ++nfs4_pnfs_state_init(void) ++{ ++ int i; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&mds_clid_hashtbl[i]); ++ ++ for (i = 0; i < STATEID_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ds_stid_hashtbl[i]); ++ ++ INIT_LIST_HEAD(&mds_id_tbl); ++} ++ ++static struct pnfs_mds_id * ++find_pnfs_mds_id(u32 mdsid) ++{ ++ struct pnfs_mds_id *local = NULL; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ list_for_each_entry(local, &mds_id_tbl, di_hash) { ++ if (local->di_mdsid == mdsid) ++ return local; ++ } ++ return NULL; ++} ++ ++static 
struct pnfs_ds_clientid * ++find_pnfs_ds_clientid(const clientid_t *clid) ++{ ++ struct pnfs_ds_clientid *local = NULL; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = clientid_hashval(clid->cl_id); ++ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) { ++ if (cmp_clid(&local->dc_mdsclid, clid)) ++ return local; ++ } ++ return NULL; ++} ++ ++static struct pnfs_ds_stateid * ++find_pnfs_ds_stateid(stateid_t *stid) ++{ ++ struct pnfs_ds_stateid *local = NULL; ++ u32 st_id = stid->si_stateownerid; ++ u32 f_id = stid->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash) ++ if ((local->ds_stid.si_stateownerid == st_id) && ++ (local->ds_stid.si_fileid == f_id) && ++ (local->ds_stid.si_boot == stid->si_boot)) { ++ stateid_t *sid = &local->ds_stid; ++ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, local, local->ds_flags, ++ STATEID_VAL(sid)); ++ return local; ++ } ++ return NULL; ++} ++ ++static void ++release_ds_mdsid(struct kref *kref) ++{ ++ struct pnfs_mds_id *mdp = ++ container_of(kref, struct pnfs_mds_id, di_ref); ++ dprintk("pNFSD: %s\n", __func__); ++ ++ list_del(&mdp->di_hash); ++ list_del(&mdp->di_mdsclid); ++ kfree(mdp); ++} ++ ++static void ++release_ds_clientid(struct kref *kref) ++{ ++ struct pnfs_ds_clientid *dcp = ++ container_of(kref, struct pnfs_ds_clientid, dc_ref); ++ struct pnfs_mds_id *mdp; ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(dcp->dc_mdsid); ++ if (mdp) ++ put_ds_mdsid(mdp); ++ ++ list_del(&dcp->dc_hash); ++ list_del(&dcp->dc_stateid); ++ list_del(&dcp->dc_permdsid); ++ kfree(dcp); ++} ++ ++static void ++release_ds_stateid(struct kref *kref) ++{ ++ struct pnfs_ds_stateid *dsp = ++ container_of(kref, struct pnfs_ds_stateid, ds_ref); ++ struct pnfs_ds_clientid *dcp; ++ dprintk("pNFS %s: dsp %p\n", __func__, dsp); ++ ++ dcp 
= find_pnfs_ds_clientid(&dsp->ds_mdsclid); ++ if (dcp) ++ put_ds_clientid(dcp); ++ ++ list_del(&dsp->ds_hash); ++ list_del(&dsp->ds_perclid); ++ kfree(dsp); ++} ++ ++static inline void ++put_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_put(&dcp->dc_ref, release_ds_clientid); ++} ++ ++static inline void ++get_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_get(&dcp->dc_ref); ++} ++ ++static inline void ++put_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_put(&mdp->di_ref, release_ds_mdsid); ++} ++ ++static inline void ++get_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_get(&mdp->di_ref); ++} ++ ++static inline void ++put_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_put(&dsp->ds_ref, release_ds_stateid); ++} ++ ++static inline void ++get_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_get(&dsp->ds_ref); ++} ++ ++void ++nfs4_pnfs_state_shutdown(void) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int i; ++ ++ dprintk("pNFSD %s: -->\n", __func__); ++ ++ ds_lock_state(); ++ for (i = 0; i < STATEID_HASH_SIZE; i++) { ++ while (!list_empty(&ds_stid_hashtbl[i])) { ++ dsp = list_entry(ds_stid_hashtbl[i].next, ++ struct pnfs_ds_stateid, ds_hash); ++ put_ds_stateid(dsp); ++ } ++ } ++ ds_unlock_state(); ++} ++ ++static struct pnfs_mds_id * ++alloc_init_mds_id(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL); ++ 
if (!mdp) ++ return NULL; ++ INIT_LIST_HEAD(&mdp->di_hash); ++ INIT_LIST_HEAD(&mdp->di_mdsclid); ++ list_add(&mdp->di_hash, &mds_id_tbl); ++ mdp->di_mdsid = gsp->dsid; ++ mdp->di_mdsboot = 0; ++ kref_init(&mdp->di_ref); ++ return mdp; ++} ++ ++static struct pnfs_ds_clientid * ++alloc_init_ds_clientid(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ struct pnfs_ds_clientid *dcp; ++ clientid_t *clid = (clientid_t *)&gsp->clid; ++ unsigned int hashval = clientid_hashval(clid->cl_id); ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(gsp->dsid); ++ if (!mdp) { ++ mdp = alloc_init_mds_id(gsp); ++ if (!mdp) ++ return NULL; ++ } else { ++ get_ds_mdsid(mdp); ++ } ++ ++ dcp = kmalloc(sizeof(*dcp), GFP_KERNEL); ++ if (!dcp) ++ return NULL; ++ ++ INIT_LIST_HEAD(&dcp->dc_hash); ++ INIT_LIST_HEAD(&dcp->dc_stateid); ++ INIT_LIST_HEAD(&dcp->dc_permdsid); ++ list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]); ++ list_add(&dcp->dc_permdsid, &mdp->di_mdsclid); ++ dcp->dc_mdsclid = *clid; ++ kref_init(&dcp->dc_ref); ++ dcp->dc_mdsid = gsp->dsid; ++ return dcp; ++} ++ ++static struct pnfs_ds_stateid * ++alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp) ++{ ++ struct pnfs_ds_stateid *dsp; ++ u32 st_id = stidp->si_stateownerid; ++ u32 f_id = stidp->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ dsp = kmalloc(sizeof(*dsp), GFP_KERNEL); ++ if (!dsp) ++ return dsp; ++ ++ INIT_LIST_HEAD(&dsp->ds_hash); ++ INIT_LIST_HEAD(&dsp->ds_perclid); ++ memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t)); ++ fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle); ++ dsp->ds_access = 0; ++ dsp->ds_status = 0; ++ dsp->ds_flags = 0L; ++ kref_init(&dsp->ds_ref); ++ set_bit(DS_STATEID_NEW, &dsp->ds_flags); ++ clear_bit(DS_STATEID_VALID, &dsp->ds_flags); ++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ init_waitqueue_head(&dsp->ds_waitq); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]); ++ 
dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return dsp; ++} ++ ++static int ++update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh, ++ struct pnfs_get_state *gsp) ++{ ++ struct pnfs_ds_clientid *dcp; ++ int new = 0; ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid); ++ if (!dcp) { ++ dcp = alloc_init_ds_clientid(gsp); ++ if (!dcp) ++ return 1; ++ new = 1; ++ } ++ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) { ++ list_add(&dsp->ds_perclid, &dcp->dc_stateid); ++ if (!new) ++ get_ds_clientid(dcp); ++ } ++ ++ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t)); ++ dsp->ds_access = gsp->access; ++ dsp->ds_status = 0; ++ dsp->ds_verifier[0] = gsp->verifier[0]; ++ dsp->ds_verifier[1] = gsp->verifier[1]; ++ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t)); ++ set_bit(DS_STATEID_VALID, &dsp->ds_flags); ++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ clear_bit(DS_STATEID_NEW, &dsp->ds_flags); ++ return 0; ++} ++ ++int ++nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs) ++{ ++ stateid_t *stid = (stateid_t *)&gs->stid; ++ struct pnfs_ds_stateid *dsp; ++ ++ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stid)); ++ ++ ds_lock_state(); ++ dsp = find_pnfs_ds_stateid(stid); ++ if (dsp) ++ put_ds_stateid(dsp); ++ ds_unlock_state(); ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ if (dsp) ++ return 0; ++ return -ENOENT; ++} ++ ++/* Retrieves and validates stateid. ++ * If stateid exists and its fields match, return it. ++ * If stateid exists but either the generation or ++ * ownerids don't match, check with mds to see if it is valid. ++ * If the stateid doesn't exist, the first thread creates an ++ * invalid *marker* stateid, then checks to see if the ++ * stateid exists on the mds. If so, it validates the *marker* ++ * stateid and updates its fields. Subsequent threads that ++ * find the *marker* stateid wait until it is valid or an error ++ * occurs. 
++ * Called with ds_state_lock. ++ */ ++static struct pnfs_ds_stateid * ++nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp) ++{ ++ struct inode *ino = cfh->fh_dentry->d_inode; ++ struct super_block *sb; ++ struct pnfs_ds_stateid *dsp = NULL; ++ struct pnfs_get_state gs = { ++ .access = 0, ++ }; ++ int status = 0, waiter = 0; ++ ++ dprintk("pNFSD: %s -->\n", __func__); ++ ++ dsp = find_pnfs_ds_stateid(stidp); ++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) && ++ (stidp->si_generation == dsp->ds_stid.si_generation)) ++ goto out_noput; ++ ++ sb = ino->i_sb; ++ if (!sb || !sb->s_pnfs_op->get_state) ++ goto out_noput; ++ ++ /* Uninitialize current state if it exists yet it doesn't match. ++ * If it is already invalid, another thread is checking state */ ++ if (dsp) { ++ if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags)) ++ waiter = 1; ++ } else { ++ dsp = alloc_init_ds_stateid(cfh, stidp); ++ if (!dsp) ++ goto out_noput; ++ } ++ ++ dprintk("pNFSD: %s Starting loop\n", __func__); ++ get_ds_stateid(dsp); ++ while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { ++ ds_unlock_state(); ++ ++ /* Another thread is checking the state */ ++ if (waiter) { ++ dprintk("pNFSD: %s waiting\n", __func__); ++ wait_event_interruptible_timeout(dsp->ds_waitq, ++ (test_bit(DS_STATEID_VALID, &dsp->ds_flags) || ++ test_bit(DS_STATEID_ERROR, &dsp->ds_flags)), ++ msecs_to_jiffies(1024)); ++ dprintk("pNFSD: %s awake\n", __func__); ++ ds_lock_state(); ++ if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) ++ goto out; ++ ++ continue; ++ } ++ ++ /* Validate stateid on mds */ ++ dprintk("pNFSD: %s Checking state on MDS\n", __func__); ++ memcpy(&gs.stid, stidp, sizeof(stateid_t)); ++ status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs); ++ dprintk("pNFSD: %s from MDS status %d\n", __func__, status); ++ ds_lock_state(); ++ /* if !status and stateid is valid, update id and mark valid */ ++ if (status || update_ds_stateid(dsp, cfh, &gs)) { ++ set_bit(DS_STATEID_ERROR, 
&dsp->ds_flags); ++ /* remove invalid stateid from list */ ++ put_ds_stateid(dsp); ++ wake_up(&dsp->ds_waitq); ++ goto out; ++ } ++ ++ wake_up(&dsp->ds_waitq); ++ } ++out: ++ if (dsp) ++ put_ds_stateid(dsp); ++out_noput: ++ if (dsp) ++ dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid)); ++ /* If error, return null */ ++ if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) ++ dsp = NULL; ++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return dsp; ++} ++ ++int ++nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int status = 0; ++ ++ dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ /* Must release state lock while verifying stateid on mds */ ++ nfs4_unlock_state(); ++ ds_lock_state(); ++ dsp = nfsv4_ds_get_state(cfh, stateid); ++ if (dsp) { ++ get_ds_stateid(dsp); ++ dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__, ++ STATEID_VAL(&dsp->ds_stid)); ++ ++ dprintk("NFSD: %s: dsp %p fh_size %u:%u " ++ "fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] " ++ "gen %x:%x\n", ++ __func__, dsp, ++ cfh->fh_handle.fh_size, dsp->ds_fh.fh_size, ++ ((unsigned *)&cfh->fh_handle.fh_base)[0], ++ ((unsigned *)&cfh->fh_handle.fh_base)[1], ++ ((unsigned *)&cfh->fh_handle.fh_base)[2], ++ ((unsigned *)&cfh->fh_handle.fh_base)[3], ++ ((unsigned *)&dsp->ds_fh.fh_base)[0], ++ ((unsigned *)&dsp->ds_fh.fh_base)[1], ++ ((unsigned *)&dsp->ds_fh.fh_base)[2], ++ ((unsigned *)&dsp->ds_fh.fh_base)[3], ++ stateid->si_generation, dsp->ds_stid.si_generation); ++ } ++ ++ if (!dsp || ++ (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) || ++ (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base, ++ dsp->ds_fh.fh_size) != 0) || ++ (stateid->si_generation > dsp->ds_stid.si_generation)) ++ status = nfserr_bad_stateid; ++ else if (stateid->si_generation < dsp->ds_stid.si_generation) ++ status = nfserr_old_stateid; ++ ++ if (dsp) ++ 
put_ds_stateid(dsp); ++ ds_unlock_state(); ++ nfs4_lock_state(); ++ dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status)); ++ return status; ++} ++ ++void ++nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p) ++{ ++ struct pnfs_ds_stateid *dsp = NULL; ++ ++ dprintk("pNFSD: %s --> stid %p\n", __func__, stateid); ++ ++ ds_lock_state(); ++ if (stateid != NULL) { ++ dsp = find_pnfs_ds_stateid(stateid); ++ if (dsp) ++ get_ds_stateid(dsp); ++ } ++ ++ /* XXX: Should we fetch the stateid or wait if some other ++ * thread is currently retrieving the stateid ? */ ++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { ++ *p++ = dsp->ds_verifier[0]; ++ *p++ = dsp->ds_verifier[1]; ++ put_ds_stateid(dsp); ++ } else { ++ /* must be on MDS */ ++ ds_unlock_state(); ++ sb->s_pnfs_op->get_verifier(sb, p); ++ ds_lock_state(); ++ p += 2; ++ } ++ ds_unlock_state(); ++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-23 12:08:29.091491685 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 12:09:03.311501496 -0400 +@@ -34,10 +34,14 @@ + */ + #include + #include ++#include ++#include ++#include + + #include "cache.h" + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_PROC + +@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc + * set, (2) sets open->op_stateid, (3) sets open->op_delegation. 
+ */ + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); ++#if defined(CONFIG_SPNFS) ++ if (!status && spnfs_enabled()) { ++ struct inode *inode = cstate->current_fh.fh_dentry->d_inode; ++ ++ status = spnfs_open(inode, open); ++ if (status) { ++ dprintk( ++ "nfsd: pNFS could not be enabled for inode: %lu\n", ++ inode->i_ino); ++ /* ++ * XXX When there's a failure then need to indicate to ++ * future ops that no pNFS is available. Should I save ++ * the status in the inode? It's kind of a big hammer. ++ * But there may be no stripes available? ++ */ ++ } ++ } ++#endif /* CONFIG_SPNFS */ + out: + if (open->op_stateowner) { + nfs4_get_stateowner(open->op_stateowner); +@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str + &access->ac_supported); + } + ++static void ++nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf) ++{ ++ u32 *p = (u32 *)verf->data; ++ ++#if defined(CONFIG_PNFSD) ++ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) { ++ nfs4_ds_get_verifier(NULL, sb, p); ++ return; ++ } ++#endif /* CONFIG_PNFSD */ ++ ++ *p++ = nfssvc_boot.tv_sec; ++ *p++ = nfssvc_boot.tv_usec; ++} ++ + static __be32 + nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) + { + __be32 status; + +- u32 *p = (u32 *)commit->co_verf.data; +- *p++ = nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; +- ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &commit->co_verf); + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); + if (status == nfserr_symlink) +@@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru + { + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; +- u32 *p; + __be32 status = nfs_ok; + unsigned long cnt; + +@@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; +- p = (u32 *)write->wr_verifier.data; +- *p++ = 
nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; + ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &write->wr_verifier); ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) { ++ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode, ++ RETURN_FILE, write->wr_offset, write->wr_buflen); ++ if (!status) { ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++ } ++ } else ++#endif ++ ++ if (spnfs_enabled()) { ++ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode, ++ write->wr_offset, write->wr_buflen, write->wr_vlen, ++ rqstp); ++ if (status == nfs_ok) { ++ /* DMXXX: HACK to get filesize set */ ++ /* write one byte at offset+length-1 */ ++ struct kvec k[1]; ++ char zero = 0; ++ unsigned long cnt = 1; ++ ++ k[0].iov_base = (void *)&zero; ++ k[0].iov_len = 1; ++ nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset+write->wr_buflen-1, k, 1, ++ &cnt, &write->wr_how_written); ++ } ++ } else /* we're not an MDS */ ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++#else + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, &write->wr_how_written); ++#endif /* CONFIG_SPNFS */ ++ + if (filp) + fput(filp); + +@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str + return status == nfserr_same ? nfs_ok : status; + } + ++#if defined(CONFIG_PNFSD) ++ ++static __be32 ++nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp, ++ unsigned int layout_type) ++{ ++ int status, type; ++ ++ /* check to see if pNFS is supported. 
*/ ++ status = nfserr_layoutunavailable; ++ if (exp && exp->ex_pnfs == 0) { ++ dprintk("%s: Underlying file system " ++ "is not exported over pNFS\n", __func__); ++ goto out; ++ } ++ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) { ++ dprintk("%s: Underlying file system " ++ "does not support pNFS\n", __func__); ++ goto out; ++ } ++ ++ type = sb->s_pnfs_op->layout_type(sb); ++ ++ /* check to see if requested layout type is supported. */ ++ status = nfserr_unknown_layouttype; ++ if (!type) ++ dprintk("BUG: %s: layout_type 0 is reserved and must not be " ++ "used by filesystem\n", __func__); ++ else if (type != layout_type) ++ dprintk("%s: requested layout type %d " ++ "does not match supported type %d\n", ++ __func__, layout_type, type); ++ else ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevlist(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevlist *gdlp) ++{ ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ int status; ++ ++ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n", ++ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices, ++ gdlp->gd_cookie, gdlp->gd_verf); ++ ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* We must be able to encode at least one device */ ++ if (!gdlp->gd_maxdevices) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ gdlp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Do nothing if underlying file system does not support ++ * getdevicelist */ ++ if (!sb->s_pnfs_op->get_device_iter) { ++ status = nfserr_notsupp; ++ goto out; ++ } ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdlp->gd_fhp = &cstate->current_fh; 
++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevinfo(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevinfo *gdp) ++{ ++ struct super_block *sb; ++ int status; ++ clientid_t clid; ++ ++ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n", ++ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid, ++ gdp->gd_devid.devid, gdp->gd_maxcount); ++ ++ status = nfserr_inval; ++ sb = find_sbid_id(gdp->gd_devid.sbid); ++ dprintk("%s: sb %p\n", __func__, sb); ++ if (!sb) { ++ status = nfserr_noent; ++ goto out; ++ } ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdp->gd_sb = sb; ++ ++ /* Update notifications */ ++ copy_clientid(&clid, cstate->session); ++ pnfs_set_device_notify(&clid, gdp->gd_notify_types); ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutget(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lgp->lg_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_badiomode; ++ if (lgp->lg_seg.iomode != IOMODE_READ && ++ lgp->lg_seg.iomode != IOMODE_RW) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lgp->lg_seg.iomode); ++ goto out; ++ } ++ ++ /* Set up arguments so layout can be retrieved at encode time */ ++ lgp->lg_fhp = current_fh; ++ copy_clientid((clientid_t 
*)&lgp->lg_seg.clientid, cstate->session); ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutcommit(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ int status; ++ struct inode *ino = NULL; ++ struct iattr ia; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ dprintk("NFSD: nfsd4_layoutcommit \n"); ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ ino = current_fh->fh_dentry->d_inode; ++ if (!ino) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = ino->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lcp->args.lc_seg.layout_type); ++ if (status) ++ goto out; ++ ++ /* This will only extend the file length. Do a quick ++ * check to see if there is any point in waiting for the update ++ * locks. ++ * TODO: Is this correct for all back ends? 
++ */ ++ dprintk("%s:new offset: %d new size: %llu old size: %lld\n", ++ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1, ++ ino->i_size); ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session); ++ lcp->res.lc_size_chg = 0; ++ if (sb->s_pnfs_op->layout_commit) { ++ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res); ++ dprintk("%s:layout_commit result %d\n", __func__, status); ++ } else { ++ fh_lock(current_fh); ++ if ((lcp->args.lc_newoffset == 0) || ++ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) { ++ status = 0; ++ lcp->res.lc_size_chg = 0; ++ fh_unlock(current_fh); ++ goto out; ++ } ++ ++ /* Try our best to update the file size */ ++ dprintk("%s: Modifying file size\n", __func__); ++ ia.ia_valid = ATTR_SIZE; ++ ia.ia_size = lcp->args.lc_last_wr + 1; ++ status = notify_change(current_fh->fh_dentry, &ia); ++ fh_unlock(current_fh); ++ dprintk("%s:notify_change result %d\n", __func__, status); ++ } ++ ++ if (!status && lcp->res.lc_size_chg && ++ EX_ISSYNC(current_fh->fh_export)) { ++ dprintk("%s: Synchronously writing inode size %llu\n", ++ __func__, ino->i_size); ++ write_inode_now(ino, 1); ++ lcp->res.lc_newsize = i_size_read(ino); ++ } ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutreturn(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lrp->args.lr_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_return_type != 
RETURN_FILE && ++ lrp->args.lr_return_type != RETURN_FSID && ++ lrp->args.lr_return_type != RETURN_ALL) { ++ dprintk("pNFS %s: invalid return_type %d\n", __func__, ++ lrp->args.lr_return_type); ++ goto out; ++ } ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_seg.iomode != IOMODE_READ && ++ lrp->args.lr_seg.iomode != IOMODE_RW && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lrp->args.lr_seg.iomode); ++ goto out; ++ } ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session); ++ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE); ++ status = nfs4_pnfs_return_layout(sb, current_fh, lrp); ++out: ++ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n", ++ __func__, status, lrp->args.lr_return_type, lrp->lrs_present); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * NULL call. + */ +@@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[ + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_RECLAIM_COMPLETE", + }, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICELIST] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevlist, ++ .op_name = "OP_GETDEVICELIST", ++ }, ++ [OP_GETDEVICEINFO] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevinfo, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_GETDEVICEINFO", ++ }, ++ [OP_LAYOUTGET] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutget, ++ .op_name = "OP_LAYOUTGET", ++ }, ++ [OP_LAYOUTCOMMIT] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutcommit, ++ .op_name = "OP_LAYOUTCOMMIT", ++ }, ++ [OP_LAYOUTRETURN] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutreturn, ++ .op_name = "OP_LAYOUTRETURN", ++ }, ++#endif /* CONFIG_PNFSD */ + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-23 12:08:29.093491375 -0400 ++++ 
linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 12:09:03.313491310 -0400 +@@ -42,6 +42,8 @@ + #include "xdr4.h" + #include "vfs.h" + ++#include "pnfsd.h" ++ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +@@ -60,8 +62,6 @@ static u64 current_sessionid = 1; + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + + /* forward declarations */ +-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); + static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; + static void nfs4_set_recdir(char *recdir); + +@@ -69,6 +69,7 @@ static void nfs4_set_recdir(char *recdir + + /* Currently used for almost all code touching nfsv4 state: */ + static DEFINE_MUTEX(client_mutex); ++struct task_struct *client_mutex_owner; + + /* + * Currently used for the del_recall_lru and file hash table. In an +@@ -86,11 +87,21 @@ void + nfs4_lock_state(void) + { + mutex_lock(&client_mutex); ++ client_mutex_owner = current; ++} ++ ++#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current) ++ ++void ++nfs4_bug_on_unlocked_state(void) ++{ ++ BUG_ON(client_mutex_owner != current); + } + + void + nfs4_unlock_state(void) + { ++ client_mutex_owner = NULL; + mutex_unlock(&client_mutex); + } + +@@ -109,7 +120,7 @@ opaque_hashval(const void *ptr, int nbyt + + static struct list_head del_recall_lru; + +-static inline void ++inline void + put_nfs4_file(struct nfs4_file *fi) + { + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { +@@ -120,7 +131,7 @@ put_nfs4_file(struct nfs4_file *fi) + } + } + +-static inline void ++inline void + get_nfs4_file(struct nfs4_file *fi) + { + atomic_inc(&fi->fi_ref); +@@ -230,7 +241,10 @@ nfs4_close_delegation(struct nfs4_delega + * but we want to remove the lease in any case. 
*/ + if (dp->dl_flock) + vfs_setlease(filp, F_UNLCK, &dp->dl_flock); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(filp); ++ nfs4_lock_state(); + } + + /* Called under the state lock. */ +@@ -266,8 +280,8 @@ static DEFINE_SPINLOCK(client_lock); + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * +- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed +- * setclientid_confirmed info. ++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold ++ * confirmed setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * setclientid info. +@@ -292,6 +306,7 @@ static void unhash_generic_stateid(struc + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); ++ release_pnfs_ds_dev_list(stp); + } + + static void free_generic_stateid(struct nfs4_stateid *stp) +@@ -345,7 +360,10 @@ static void release_open_stateid(struct + { + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(stp->st_vfs_file); ++ nfs4_lock_state(); + free_generic_stateid(stp); + } + +@@ -739,6 +757,8 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + ++ BUG_ON_UNLOCKED_STATE(); ++ + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -758,6 +778,7 @@ expire_client(struct nfs4_client *clp) + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } ++ pnfs_expire_client(clp); + nfsd4_set_callback_client(clp, NULL); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); +@@ -770,6 +791,13 @@ expire_client(struct nfs4_client *clp) + spin_unlock(&client_lock); + } + ++void expire_client_lock(struct nfs4_client 
*clp) ++{ ++ nfs4_lock_state(); ++ expire_client(clp); ++ nfs4_unlock_state(); ++} ++ + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) + { + memcpy(target->cl_verifier.data, source->data, +@@ -859,6 +887,11 @@ static struct nfs4_client *create_client + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++ INIT_LIST_HEAD(&clp->cl_layoutrecalls); ++ atomic_set(&clp->cl_deviceref, 0); ++#endif /* CONFIG_PNFSD */ + INIT_LIST_HEAD(&clp->cl_sessions); + INIT_LIST_HEAD(&clp->cl_lru); + clp->cl_time = get_seconds(); +@@ -908,7 +941,7 @@ move_to_confirmed(struct nfs4_client *cl + renew_client(clp); + } + +-static struct nfs4_client * ++struct nfs4_client * + find_confirmed_client(clientid_t *clid) + { + struct nfs4_client *clp; +@@ -978,6 +1011,24 @@ find_unconfirmed_client_by_str(const cha + return NULL; + } + ++int ++filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), ++ void *arg) ++{ ++ struct nfs4_client *clp, *next; ++ int i, status = 0; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i], ++ cl_strhash) { ++ status = func(clp, arg); ++ if (status) ++ break; ++ } ++ ++ return status; ++} ++ + static void + gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) + { +@@ -1110,8 +1161,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co + static void + nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) + { +- /* pNFS is not supported */ ++#if defined(CONFIG_PNFSD) ++ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS | ++ EXCHGID4_FLAG_USE_PNFS_DS; ++#else /* CONFIG_PNFSD */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; ++#endif /* CONFIG_PNFSD */ + + /* Referrals are supported, Migration is not. 
*/ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; +@@ -1301,6 +1356,13 @@ nfsd4_create_session(struct svc_rqst *rq + struct nfsd4_clid_slot *cs_slot = NULL; + int status = 0; + ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ /* XXX hack to get local ip address */ ++ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local, ++ sizeof(pnfsd_lexp_addr)); ++ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); +@@ -1340,25 +1402,26 @@ nfsd4_create_session(struct svc_rqst *rq + cs_slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + +- if (cr_ses->flags & SESSION4_BACK_CHAN) { +- unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; +- svc_xprt_get(rqstp->rq_xprt); +- rpc_copy_addr( +- (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, +- sa); +- unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); +- unconf->cl_cb_conn.cb_minorversion = +- cstate->minorversion; +- unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; +- unconf->cl_cb_seq_nr = 1; +- nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); +- } ++ if (is_ds_only_session(unconf->cl_exchange_flags)) ++ cr_ses->flags &= ~SESSION4_BACK_CHAN; ++ + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + ++ if (cr_ses->flags & SESSION4_BACK_CHAN) { ++ conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; ++ svc_xprt_get(rqstp->rq_xprt); ++ rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa); ++ conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); ++ conf->cl_cb_conn.cb_minorversion = cstate->minorversion; ++ conf->cl_cb_conn.cb_prog = cr_ses->callback_prog; ++ conf->cl_cb_seq_nr = 1; ++ nfsd4_probe_callback(conf, &conf->cl_cb_conn); ++ } ++ + /* + * We do not support RDMA or persistent sessions + */ +@@ -1746,7 +1809,7 @@ out: + + /* OPEN Share state helper functions */ + static inline struct nfs4_file * +-alloc_init_file(struct 
inode *ino) ++alloc_init_file(struct inode *ino, struct svc_fh *current_fh) + { + struct nfs4_file *fp; + unsigned int hashval = file_hashval(ino); +@@ -1760,6 +1823,16 @@ alloc_init_file(struct inode *ino) + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&fp->fi_layouts); ++ INIT_LIST_HEAD(&fp->fi_layout_states); ++ fp->fi_fsid.major = current_fh->fh_export->ex_fsid; ++ fp->fi_fsid.minor = 0; ++ fp->fi_fhlen = current_fh->fh_handle.fh_size; ++ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval)); ++ memcpy(fp->fi_fhval, &current_fh->fh_handle.fh_base, ++ fp->fi_fhlen); ++#endif /* CONFIG_PNFSD */ + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); +@@ -1768,7 +1841,7 @@ alloc_init_file(struct inode *ino) + return NULL; + } + +-static void ++void + nfsd4_free_slab(struct kmem_cache **slab) + { + if (*slab == NULL) +@@ -1784,6 +1857,7 @@ nfsd4_free_slabs(void) + nfsd4_free_slab(&file_slab); + nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); ++ nfsd4_free_pnfs_slabs(); + } + + static int +@@ -1805,6 +1879,8 @@ nfsd4_init_slabs(void) + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_nomem; ++ if (nfsd4_init_pnfs_slabs()) ++ goto out_nomem; + return 0; + out_nomem: + nfsd4_free_slabs(); +@@ -1878,6 +1954,9 @@ init_stateid(struct nfs4_stateid *stp, s + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); + INIT_LIST_HEAD(&stp->st_perfile); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); +@@ -1919,6 +1998,7 @@ find_openstateowner_str(unsigned int has + { + struct nfs4_stateowner *so = NULL; + ++ BUG_ON_UNLOCKED_STATE(); + list_for_each_entry(so,
&ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) + return so; +@@ -1927,7 +2007,7 @@ find_openstateowner_str(unsigned int has + } + + /* search file_hashtbl[] for file */ +-static struct nfs4_file * ++struct nfs4_file * + find_file(struct inode *ino) + { + unsigned int hashval = file_hashval(ino); +@@ -1945,6 +2025,18 @@ find_file(struct inode *ino) + return NULL; + } + ++struct nfs4_file * ++find_alloc_file(struct inode *ino, struct svc_fh *current_fh) ++{ ++ struct nfs4_file *fp; ++ ++ fp = find_file(ino); ++ if (fp) ++ return fp; ++ ++ return alloc_init_file(ino, current_fh); ++} ++ + static inline int access_valid(u32 x, u32 minorversion) + { + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) +@@ -2503,7 +2595,7 @@ nfsd4_process_open2(struct svc_rqst *rqs + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; + status = nfserr_resource; +- fp = alloc_init_file(ino); ++ fp = alloc_init_file(ino, current_fh); + if (fp == NULL) + goto out; + } +@@ -2730,7 +2822,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct + return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; + } + +-static int ++int + STALE_STATEID(stateid_t *stateid) + { + if (stateid->si_boot == boot_time) +@@ -2740,6 +2832,16 @@ STALE_STATEID(stateid_t *stateid) + return 1; + } + ++__be32 ++nfs4_check_stateid(stateid_t *stateid) ++{ ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ return nfserr_bad_stateid; ++ if (STALE_STATEID(stateid)) ++ return nfserr_stale_stateid; ++ return 0; ++} ++ + static inline int + access_permit_read(unsigned long access_bmap) + { +@@ -2848,6 +2950,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + if (grace_disallows_io(ino)) + return nfserr_grace; + ++#if defined(CONFIG_PNFSD) ++ if (pnfs_fh_is_ds(&current_fh->fh_handle)) { ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ status = nfserr_bad_stateid; ++ else ++#ifdef CONFIG_GFS2_FS_LOCKING_DLM ++ { ++ dprintk("%s
Don't check DS stateid\n", __func__); ++ return 0; ++ } ++#else /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ status = nfs4_preprocess_pnfs_ds_stateid(current_fh, ++ stateid); ++#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ goto out; ++ } ++#endif /* CONFIG_PNFSD */ ++ + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + +@@ -2924,13 +3044,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + *stpp = NULL; + *sopp = NULL; + +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { +- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); +- return nfserr_bad_stateid; +- } +- +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ return status; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; +@@ -3205,11 +3321,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + nfs4_lock_state(); +- status = nfserr_bad_stateid; +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) +- goto out; +- status = nfserr_stale_stateid; +- if (STALE_STATEID(stateid)) ++ status = nfs4_check_stateid(stateid); ++ if (status) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) +@@ -3238,26 +3351,6 @@ out: + #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) + #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +-static inline u64 +-end_offset(u64 start, u64 len) +-{ +- u64 end; +- +- end = start + len; +- return end >= start ? end: NFS4_MAX_UINT64; +-} +- +-/* last octet in a range */ +-static inline u64 +-last_byte_offset(u64 start, u64 len) +-{ +- u64 end; +- +- BUG_ON(!len); +- end = start + len; +- return end > start ? 
end - 1: NFS4_MAX_UINT64; +-} +- + #define lockownerid_hashval(id) \ + ((id) & LOCK_HASH_MASK) + +@@ -3274,7 +3367,7 @@ static struct list_head lock_ownerid_has + static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; + static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; + +-static struct nfs4_stateid * ++struct nfs4_stateid * + find_stateid(stateid_t *stid, int flags) + { + struct nfs4_stateid *local; +@@ -3303,7 +3396,7 @@ find_stateid(stateid_t *stid, int flags) + return NULL; + } + +-static struct nfs4_delegation * ++struct nfs4_delegation * + find_delegation_stateid(struct inode *ino, stateid_t *stid) + { + struct nfs4_file *fp; +@@ -3436,6 +3529,9 @@ alloc_init_lock_stateid(struct nfs4_stat + INIT_LIST_HEAD(&stp->st_perfile); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); +@@ -3998,6 +4094,9 @@ nfs4_state_init(void) + INIT_LIST_HEAD(&client_lru); + INIT_LIST_HEAD(&del_recall_lru); + reclaim_str_hashtbl_size = 0; ++#if defined(CONFIG_PNFSD) ++ nfs4_pnfs_state_init(); ++#endif /* CONFIG_PNFSD */ + return 0; + } + +@@ -4110,6 +4209,7 @@ __nfs4_state_shutdown(void) + } + + nfsd4_shutdown_recdir(); ++ nfs4_pnfs_state_shutdown(); + nfs4_init = 0; + } + +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 12:09:03.315491356 -0400 +@@ -47,9 +47,14 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +@@ -1234,6 +1239,138 @@ 
nfsd4_decode_sequence(struct nfsd4_compo + DECODE_TAIL; + } + ++#if defined(CONFIG_PNFSD) ++static __be32 ++nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16 + sizeof(nfs4_verifier)); ++ READ32(gdevl->gd_layout_type); ++ READ32(gdevl->gd_maxdevices); ++ READ64(gdevl->gd_cookie); ++ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ u32 num; ++ DECODE_HEAD; ++ ++ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid)); ++ READ64(gdev->gd_devid.sbid); ++ READ64(gdev->gd_devid.devid); ++ READ32(gdev->gd_layout_type); ++ READ32(gdev->gd_maxcount); ++ READ32(num); ++ if (num) { ++ READ_BUF(4); ++ READ32(gdev->gd_notify_types); ++ } else { ++ gdev->gd_notify_types = 0; ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(36); ++ READ32(lgp->lg_signal); ++ READ32(lgp->lg_seg.layout_type); ++ READ32(lgp->lg_seg.iomode); ++ READ64(lgp->lg_seg.offset); ++ READ64(lgp->lg_seg.length); ++ READ64(lgp->lg_minlength); ++ nfsd4_decode_stateid(argp, &lgp->lg_sid); ++ READ_BUF(4); ++ READ32(lgp->lg_maxcount); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ DECODE_HEAD; ++ u32 timechange; ++ ++ READ_BUF(20); ++ READ64(lcp->args.lc_seg.offset); ++ READ64(lcp->args.lc_seg.length); ++ READ32(lcp->args.lc_reclaim); ++ nfsd4_decode_stateid(argp, &lcp->lc_sid); ++ READ_BUF(4); ++ READ32(lcp->args.lc_newoffset); ++ if (lcp->args.lc_newoffset) { ++ READ_BUF(8); ++ READ64(lcp->args.lc_last_wr); ++ } else ++ lcp->args.lc_last_wr = 0; ++ READ_BUF(4); ++ READ32(timechange); ++ if (timechange) { ++ READ_BUF(12); ++ READ64(lcp->args.lc_mtime.seconds); ++ 
READ32(lcp->args.lc_mtime.nseconds); ++ } else { ++ lcp->args.lc_mtime.seconds = 0; ++ lcp->args.lc_mtime.nseconds = 0; ++ } ++ READ_BUF(8); ++ READ32(lcp->args.lc_seg.layout_type); ++ /* XXX: saving XDR'ed layout update. Since we don't have the ++ * current_fh yet, and therefore no export_ops, we can't call ++ * the layout specific decode routines. File and pVFS2 ++ * do not use the layout update.... ++ */ ++ READ32(lcp->args.lc_up_len); ++ if (lcp->args.lc_up_len > 0) { ++ READ_BUF(lcp->args.lc_up_len); ++ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len); ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16); ++ READ32(lrp->args.lr_reclaim); ++ READ32(lrp->args.lr_seg.layout_type); ++ READ32(lrp->args.lr_seg.iomode); ++ READ32(lrp->args.lr_return_type); ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ READ_BUF(16); ++ READ64(lrp->args.lr_seg.offset); ++ READ64(lrp->args.lr_seg.length); ++ nfsd4_decode_stateid(argp, &lrp->lr_sid); ++ READ_BUF(4); ++ READ32(lrp->args.lrf_body_len); ++ if (lrp->args.lrf_body_len > 0) { ++ READ_BUF(lrp->args.lrf_body_len); ++ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len); ++ } ++ } ++ ++ DECODE_TAIL; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) + { +@@ -1335,11 +1472,19 @@ static nfsd4_dec nfsd41_dec_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, ++#else /* 
CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, +@@ -2136,6 +2281,36 @@ out_acl: + } + WRITE64(stat.ino); + } ++#if defined(CONFIG_PNFSD) ++ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { ++ struct super_block *sb = dentry->d_inode->i_sb; ++ int type = 0; ++ ++ /* Query the filesystem for supported pNFS layout types. ++ * Currently, we only support one layout type per file system. ++ * The export_ops->layout_type() returns the pnfs_layouttype4. ++ */ ++ buflen -= 4; ++ if (buflen < 0) /* length */ ++ goto out_resource; ++ ++ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type) ++ type = sb->s_pnfs_op->layout_type(sb); ++ if (type) { ++ if ((buflen -= 4) < 0) /* type */ ++ goto out_resource; ++ WRITE32(1); /* length */ ++ WRITE32(type); /* type */ ++ } else ++ WRITE32(0); /* length */ ++ } ++ ++ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ if ((buflen -= 4) < 0) ++ goto out_resource; ++ WRITE32(stat.blksize); ++ } ++#endif /* CONFIG_PNFSD */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); +@@ -2366,6 +2541,10 @@ nfsd4_encode_commit(struct nfsd4_compoun + if (!nfserr) { + RESERVE_SPACE(8); + WRITEMEM(commit->co_verf.data, 8); ++ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n", ++ ((u32 *)(&commit->co_verf.data))[0], ++ ((u32 *)(&commit->co_verf.data))[1]); ++ + ADJUST_ARGS(); + } + return nfserr; +@@ -2620,9 +2799,20 @@ nfsd4_encode_read(struct nfsd4_compoundr + } + read->rd_vlen = v; + ++#if defined(CONFIG_SPNFS) ++ if (spnfs_enabled()) ++ nfserr = 
spnfs_read(read->rd_fhp->fh_dentry->d_inode, ++ read->rd_offset, &maxcount, read->rd_vlen, ++ resp->rqstp); ++ else /* we're not an MDS */ ++ nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, ++ read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, ++ &maxcount); ++#else + nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); ++#endif /* CONFIG_SPNFS */ + + if (nfserr == nfserr_symlink) + nfserr = nfserr_inval; +@@ -2926,6 +3116,9 @@ nfsd4_encode_write(struct nfsd4_compound + WRITE32(write->wr_bytes_written); + WRITE32(write->wr_how_written); + WRITEMEM(write->wr_verifier.data, 8); ++ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n", ++ ((u32 *)(&write->wr_verifier.data))[0], ++ ((u32 *)(&write->wr_verifier.data))[1]); + ADJUST_ARGS(); + } + return nfserr; +@@ -3069,6 +3262,343 @@ nfsd4_encode_sequence(struct nfsd4_compo + return 0; + } + ++#if defined(CONFIG_PNFSD) ++ ++/* Uses the export interface to iterate through the available devices ++ * and encodes them on the response stream. 
++ */ ++static __be32 ++nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp, ++ struct nfsd4_pnfs_getdevlist *gdevl, ++ unsigned int *dev_count) ++{ ++ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb; ++ __be32 nfserr; ++ int status; ++ __be32 *p; ++ struct nfsd4_pnfs_dev_iter_res res = { ++ .gd_cookie = gdevl->gd_cookie, ++ .gd_verf = gdevl->gd_verf, ++ .gd_eof = 0 ++ }; ++ u64 sbid; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ sbid = find_create_sbid(sb); ++ *dev_count = 0; ++ do { ++ status = sb->s_pnfs_op->get_device_iter(sb, ++ gdevl->gd_layout_type, ++ &res); ++ if (status) { ++ if (status == -ENOENT) { ++ res.gd_eof = 1; ++ /* return success */ ++ break; ++ } ++ nfserr = nfserrno(status); ++ goto out_err; ++ } ++ ++ /* Encode device id and layout type */ ++ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid)); ++ WRITE64((__be64)sbid); ++ WRITE64(res.gd_devid); /* devid minor */ ++ ADJUST_ARGS(); ++ (*dev_count)++; ++ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof); ++ gdevl->gd_cookie = res.gd_cookie; ++ gdevl->gd_verf = res.gd_verf; ++ gdevl->gd_eof = res.gd_eof; ++ nfserr = nfs_ok; ++out_err: ++ dprintk("%s: Encoded %u devices\n", __func__, *dev_count); ++ return nfserr; ++} ++ ++/* Encodes the response of get device list. 
++*/ ++static __be32 ++nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ unsigned int dev_count = 0, lead_count; ++ u32 *p_in = resp->p; ++ __be32 *p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ /* Ensure we have room for cookie, verifier, and devlist len, ++ * which we will backfill in after we encode as many devices as possible ++ */ ++ lead_count = 8 + sizeof(nfs4_verifier) + 4; ++ RESERVE_SPACE(lead_count); ++ /* skip past these values */ ++ p += XDR_QUADLEN(lead_count); ++ ADJUST_ARGS(); ++ ++ /* Iterate over as many device ids as possible on the xdr stream */ ++ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count); ++ if (nfserr) ++ goto out_err; ++ ++ /* Backfill in cookie, verf and number of devices encoded */ ++ p = p_in; ++ WRITE64(gdevl->gd_cookie); ++ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ WRITE32(dev_count); ++ ++ /* Skip over devices */ ++ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid)); ++ ADJUST_ARGS(); ++ ++ /* are we at the end of devices? */ ++ RESERVE_SPACE(4); ++ WRITE32(gdevl->gd_eof); ++ ADJUST_ARGS(); ++ ++ dprintk("%s: done.\n", __func__); ++ ++ nfserr = nfs_ok; ++out: ++ return nfserr; ++out_err: ++ p = p_in; ++ ADJUST_ARGS(); ++ goto out; ++} ++ ++/* For a given device id, have the file system retrieve and encode the ++ * associated device. For file layout, the encoding function is ++ * passed down to the file system. The file system then has the option ++ * of using this encoding function or one of its own. ++ * ++ * Note: the file system must return the XDR size of struct device_addr4 ++ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the ++ * gdir_mincount calculation. 
++ */ ++static __be32 ++nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ struct super_block *sb; ++ int maxcount = 0, type_notify_len = 12; ++ __be32 *p, *p_save = NULL, *p_in = resp->p; ++ struct exp_xdr_stream xdr; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = gdev->gd_sb; ++ ++ if (gdev->gd_maxcount != 0) { ++ /* FIXME: this will be bound by the session max response */ ++ maxcount = svc_max_payload(resp->rqstp); ++ if (maxcount > gdev->gd_maxcount) ++ maxcount = gdev->gd_maxcount; ++ ++ /* Ensure have room for type and notify field */ ++ maxcount -= type_notify_len; ++ if (maxcount < 0) { ++ nfserr = -ETOOSMALL; ++ goto toosmall; ++ } ++ } ++ ++ RESERVE_SPACE(4); ++ WRITE32(gdev->gd_layout_type); ++ ADJUST_ARGS(); ++ ++ /* If maxcount is 0 then just update notifications */ ++ if (gdev->gd_maxcount == 0) ++ goto handle_notifications; ++ ++ xdr.p = p_save = resp->p; ++ xdr.end = resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type, ++ &gdev->gd_devid); ++ if (nfserr) ++ goto err; ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ BUG_ON(xdr.p > xdr.end); ++ ++ /* Update the xdr stream with the number of bytes encoded ++ * by the file system. 
++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++handle_notifications: ++ /* Encode supported device notifications */ ++ RESERVE_SPACE(4); ++ if (sb->s_pnfs_op->set_device_notify) { ++ struct pnfs_devnotify_arg dn_args; ++ ++ dn_args.dn_layout_type = gdev->gd_layout_type; ++ dn_args.dn_devid = gdev->gd_devid; ++ dn_args.dn_notify_types = gdev->gd_notify_types; ++ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args); ++ if (nfserr) ++ goto err; ++ WRITE32(dn_args.dn_notify_types); ++ } else { ++ WRITE32(0); ++ } ++ ADJUST_ARGS(); ++ ++out: ++ return nfserrno(nfserr); ++toosmall: ++ dprintk("%s: maxcount too small\n", __func__); ++ RESERVE_SPACE(4); ++ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len); ++ ADJUST_ARGS(); ++ goto out; ++err: ++ /* Rewind to the beginning */ ++ p = p_in; ++ ADJUST_ARGS(); ++ if (nfserr == -ETOOSMALL) ++ goto toosmall; ++ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr); ++ goto out; ++} ++ ++static __be32 ++nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, ++ __be32 nfserr, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int maxcount, leadcount; ++ struct super_block *sb; ++ struct exp_xdr_stream xdr; ++ __be32 *p, *p_save, *p_start = resp->p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb; ++ maxcount = PAGE_SIZE; ++ if (maxcount > lgp->lg_maxcount) ++ maxcount = lgp->lg_maxcount; ++ ++ /* Check for space on xdr stream */ ++ leadcount = 36 + sizeof(stateid_opaque_t); ++ RESERVE_SPACE(leadcount); ++ /* encode layout metadata after file system encodes layout */ ++ p += XDR_QUADLEN(leadcount); ++ ADJUST_ARGS(); ++ ++ /* Ensure have room for ret_on_close, off, len, iomode, type */ ++ maxcount -= leadcount; ++ if (maxcount < 0) { ++ printk(KERN_ERR "%s: buffer too small\n", __func__); ++ nfserr = nfserr_toosmall; ++ goto err; ++ } ++ ++ /* Set xdr info so file system can encode layout */ ++ xdr.p = p_save = resp->p; ++ xdr.end = 
resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ /* Retrieve, encode, and merge layout; process stateid */ ++ nfserr = nfs4_pnfs_get_layout(lgp, &xdr); ++ if (nfserr) ++ goto err; ++ ++ /* Ensure file system returned enough bytes for the client ++ * to access. ++ */ ++ if (lgp->lg_seg.length < lgp->lg_minlength) { ++ nfserr = nfserr_badlayout; ++ goto err; ++ } ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ ++ /* Rewind to beginning and encode attrs */ ++ resp->p = p_start; ++ RESERVE_SPACE(4); ++ WRITE32(lgp->lg_roc); /* return on close */ ++ ADJUST_ARGS(); ++ nfsd4_encode_stateid(resp, &lgp->lg_sid); ++ RESERVE_SPACE(28); ++ /* Note: response logr_layout array count, always one for now */ ++ WRITE32(1); ++ WRITE64(lgp->lg_seg.offset); ++ WRITE64(lgp->lg_seg.length); ++ WRITE32(lgp->lg_seg.iomode); ++ WRITE32(lgp->lg_seg.layout_type); ++ ++ /* Update the xdr stream with the number of bytes written ++ * by the file system ++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++ return nfs_ok; ++err: ++ resp->p = p_start; ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lcp->res.lc_size_chg); ++ ADJUST_ARGS(); ++ if (lcp->res.lc_size_chg) { ++ RESERVE_SPACE(8); ++ WRITE64(lcp->res.lc_newsize); ++ ADJUST_ARGS(); ++ } ++out: ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lrp->lrs_present != 0); /* got stateid? 
*/ ++ ADJUST_ARGS(); ++ if (lrp->lrs_present) ++ nfsd4_encode_stateid(resp, &lrp->lr_sid); ++out: ++ return nfserr; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) + { +@@ -3129,11 +3659,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, +diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-23 12:08:29.094491943 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 12:09:03.317501495 -0400 +@@ -13,10 +13,15 @@ + #include + #include + #include ++#include + + #include "nfsd.h" + #include "cache.h" + ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++#include ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + /* + * We have a single directory with 9 nodes in it. 
+ */ +@@ -49,6 +54,9 @@ enum { + NFSD_Gracetime, + NFSD_RecoveryDir, + #endif ++#ifdef CONFIG_PNFSD ++ NFSD_pnfs_dlm_device, ++#endif + }; + + /* +@@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct fi + static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif ++#ifdef CONFIG_PNFSD ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size); ++#endif + + static ssize_t (*write_op[])(struct file *, char *, size_t) = { + [NFSD_Svc] = write_svc, +@@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device, ++#endif + }; + + static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +@@ -1349,6 +1363,68 @@ static ssize_t write_recoverydir(struct + + #endif + ++#ifdef CONFIG_PNFSD ++ ++static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf, ++ size_t size) ++{ ++ char *mesg = buf; ++ char *pnfs_dlm_device; ++ int max_size = NFSD_PNFS_DLM_DEVICE_MAX; ++ int len, ret = 0; ++ ++ if (size > 0) { ++ ret = -EINVAL; ++ if (size > max_size || buf[size-1] != '\n') ++ return ret; ++ buf[size-1] = 0; ++ ++ pnfs_dlm_device = mesg; ++ len = qword_get(&mesg, pnfs_dlm_device, size); ++ if (len <= 0) ++ return ret; ++ ++ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len); ++ } else ++ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT); ++ ++ return ret <= 0 ? 
ret : strlen(buf); ++} ++ ++/** ++ * write_pnfs_dlm_device - Set or report the current pNFS data server list ++ * ++ * Input: ++ * buf: ignored ++ * size: zero ++ * ++ * OR ++ * ++ * Input: ++ * buf: C string containing a block device name, ++ * a colon, and then a comma separated ++ * list of pNFS data server IPv4 addresses ++ * size: non-zero length of C string in @buf ++ * Output: ++ * On success: passed-in buffer filled with '\n'-terminated C ++ * string containing a block device name, a colon, and ++ * then a comma separated list of pNFS ++ * data server IPv4 addresses. ++ * return code is the size in bytes of the string ++ * On error: return code is a negative errno value ++ */ ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __write_pnfs_dlm_device(file, buf, size); ++ mutex_unlock(&nfsd_mutex); ++ return rv; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ + /*----------------------------------------------------------------------------*/ + /* + * populating the filesystem. 
+@@ -1383,6 +1459,10 @@ static int nfsd_fill_super(struct super_ + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops, ++ S_IWUSR|S_IRUSR}, ++#endif + /* last one */ {""} + }; + return simple_fill_super(sb, 0x6e667364, nfsd_files); +@@ -1421,6 +1501,9 @@ static int create_proc_exports_entry(voi + } + #endif + ++#if defined(CONFIG_SPNFS_BLOCK) ++int nfsd_bl_init(void); ++#endif + static int __init init_nfsd(void) + { + int retval; +@@ -1443,6 +1526,15 @@ static int __init init_nfsd(void) + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ retval = spnfs_init_proc(); ++ if (retval != 0) ++ goto out_free_idmap; ++#if defined(CONFIG_SPNFS_BLOCK) ++ nfsd_bl_init(); ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_all; +@@ -1465,7 +1557,22 @@ out_free_stat: + + static void __exit exit_nfsd(void) + { ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ remove_proc_entry("fs/nfs/spnfs/recall", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/getfh", NULL); ++ remove_proc_entry("fs/nfs/spnfs/config", NULL); ++ remove_proc_entry("fs/nfs/spnfs/ctl", NULL); ++ remove_proc_entry("fs/nfs/spnfs", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ + nfsd_export_shutdown(); ++ nfsd4_pnfs_dlm_shutdown(); + nfsd_reply_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); + 
remove_proc_entry("fs/nfs", NULL); +diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-23 12:08:29.095491390 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 12:09:03.318355741 -0400 +@@ -285,11 +285,17 @@ extern time_t nfsd4_grace; + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + ++#if defined(CONFIG_PNFSD) ++#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ ++ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES) ++#else /* CONFIG_PNFSD */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 ++#endif /* CONFIG_PNFSD */ + + #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ +- (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) ++ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \ ++ FATTR4_WORD2_LAYOUT_BLKSIZE) + + static inline u32 nfsd_suppattrs0(u32 minorversion) + { +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-23 12:09:03.319511586 -0400 +@@ -10,6 +10,7 @@ + #include + + #include ++#include + #include "nfsd.h" + #include "vfs.h" + #include "auth.h" +@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s + static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) + { + struct knfsd_fh *fh = &fhp->fh_handle; ++ int fsid_type; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; +@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct + return error; + if (fh->fh_auth_type != 0) + return error; +- len = key_len(fh->fh_fsid_type) / 4; ++ fsid_type = pnfs_fh_fsid_type(fh); ++ len = key_len(fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { +@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct + data_left -= len; + if (data_left < 0) + 
return error; +- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); ++ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth); + fid = (struct fid *)(fh->fh_auth + len); + } else { + __u32 tfh[2]; +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-23 12:09:03.319511586 -0400 +@@ -14,6 +14,7 @@ enum nfsd_fsid { + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, ++ FSID_MAX + }; + + enum fsid_source { +@@ -205,4 +206,42 @@ fh_unlock(struct svc_fh *fhp) + } + } + ++#if defined(CONFIG_PNFSD) ++ ++/* ++ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied ++ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how ++ * to handle a given stateid. ++ */ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return fh->fh_fsid_type >= FSID_MAX; ++} ++ ++static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh) ++{ ++ BUG_ON(fh->fh_version != 1); ++ BUG_ON(pnfs_fh_is_ds(fh)); ++ fh->fh_fsid_type += FSID_MAX; ++} ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ ++/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). 
*/ ++static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh) ++{ ++ int fsid_type = fh->fh_fsid_type; ++ ++ if (pnfs_fh_is_ds(fh)) ++ return fsid_type - FSID_MAX; ++ return fsid_type; ++} ++ + #endif /* _LINUX_NFSD_FH_INT_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-23 12:08:27.631563969 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-23 12:09:03.320416974 -0400 +@@ -115,7 +115,7 @@ struct svc_program nfsd_program = { + + }; + +-u32 nfsd_supported_minorversion; ++u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION; + + int nfsd_vers(int vers, enum vers_op change) + { +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-23 12:09:03.321376171 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-23 12:09:03.321376171 -0400 +@@ -0,0 +1,143 @@ ++/* ++ * Copyright (c) 2005 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#ifndef LINUX_NFSD_PNFSD_H ++#define LINUX_NFSD_PNFSD_H ++ ++#include ++#include ++ ++#include "state.h" ++#include "xdr4.h" ++ ++/* outstanding layout stateid */ ++struct nfs4_layout_state { ++ struct list_head ls_perfile; ++ struct list_head ls_layouts; /* list of nfs4_layouts */ ++ struct kref ls_ref; ++ struct nfs4_client *ls_client; ++ struct nfs4_file *ls_file; ++ stateid_t ls_stateid; ++}; ++ ++/* outstanding layout */ ++struct nfs4_layout { ++ struct list_head lo_perfile; /* hash by f_id */ ++ struct list_head lo_perclnt; /* hash by clientid */ ++ struct list_head lo_perstate; ++ struct nfs4_file *lo_file; /* backpointer */ ++ struct nfs4_client *lo_client; ++ struct nfs4_layout_state *lo_state; ++ struct nfsd4_layout_seg lo_seg; ++}; ++ ++struct pnfs_inval_state { ++ struct knfsd_fh mdsfh; /* needed only by invalidate all */ ++ stateid_t stid; ++ clientid_t clid; ++ u32 status; ++}; ++ ++/* pNFS Data Server state */ ++#define DS_STATEID_VALID 0 ++#define DS_STATEID_ERROR 1 ++#define DS_STATEID_NEW 2 ++ ++struct pnfs_ds_stateid { ++ struct list_head ds_hash; /* ds_stateid hash entry */ ++ struct list_head ds_perclid; /* per client hash entry */ ++ stateid_t ds_stid; ++ struct knfsd_fh 
ds_fh; ++ unsigned long ds_access; ++ u32 ds_status; /* from MDS */ ++ u32 ds_verifier[2]; /* from MDS */ ++ wait_queue_head_t ds_waitq; ++ unsigned long ds_flags; ++ struct kref ds_ref; ++ clientid_t ds_mdsclid; ++}; ++ ++struct pnfs_ds_clientid { ++ struct list_head dc_hash; /* mds_clid_hashtbl entry */ ++ struct list_head dc_stateid; /* ds_stateid head */ ++ struct list_head dc_permdsid; /* per mdsid hash entry */ ++ clientid_t dc_mdsclid; ++ struct kref dc_ref; ++ uint32_t dc_mdsid; ++}; ++ ++struct pnfs_mds_id { ++ struct list_head di_hash; /* mds_nodeid list entry */ ++ struct list_head di_mdsclid; /* mds_clientid head */ ++ uint32_t di_mdsid; ++ time_t di_mdsboot; /* mds boot time */ ++ struct kref di_ref; ++}; ++ ++/* notify device request (from exported filesystem) */ ++struct nfs4_notify_device { ++ struct nfsd4_pnfs_cb_dev_list *nd_list; ++ struct nfs4_client *nd_client; ++ struct list_head nd_perclnt; ++ ++ void *nd_args; /* nfsd internal */ ++}; ++ ++u64 find_create_sbid(struct super_block *); ++struct super_block *find_sbid_id(u64); ++__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *); ++int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *, ++ struct nfsd4_pnfs_layoutreturn *); ++int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *); ++int nfs4_pnfs_cb_change_state(struct pnfs_get_state *); ++void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++int put_layoutrecall(struct nfs4_layoutrecall *); ++void nomatching_layout(struct nfs4_layoutrecall *); ++void *layoutrecall_done(struct nfs4_layoutrecall *); ++int nfsd4_cb_layout(struct nfs4_layoutrecall *); ++int nfsd_layout_recall_cb(struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++int nfsd_device_notify_cb(struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++int nfsd4_cb_notify_device(struct nfs4_notify_device *); ++void pnfs_set_device_notify(clientid_t *, unsigned int types); ++void 
pnfs_clear_device_notify(struct nfs4_client *); ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++extern struct sockaddr pnfsd_lexp_addr; ++extern size_t pnfs_lexp_addr_len; ++ ++extern void pnfsd_lexp_init(struct inode *); ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#endif /* LINUX_NFSD_PNFSD_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-23 12:09:03.321376171 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-23 12:09:03.322501672 -0400 +@@ -0,0 +1,225 @@ ++/* ++ * linux/fs/nfsd/pnfsd_lexp.c ++ * ++ * pNFS export of local filesystems. ++ * ++ * Export local file systems over the files layout type. ++ * The MDS (metadata server) functions also as a single DS (data server). ++ * This is mostly useful for development and debugging purposes. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * Copyright (C) 2008 Benny Halevy, ++ * ++ * Initial implementation was based on the pnfs-gfs2 patches done ++ * by David M.
Richter ++ */ ++ ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++struct sockaddr pnfsd_lexp_addr; ++size_t pnfs_lexp_addr_len; ++ ++static int ++pnfsd_lexp_layout_type(struct super_block *sb) ++{ ++ int ret = LAYOUT_NFSV4_1_FILES; ++ dprintk("<-- %s: return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++pnfsd_lexp_get_device_iter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ ++ dprintk("<-- %s: return 0\n", __func__); ++ return 0; ++} ++ ++static int ++pnfsd_lexp_get_device_info(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_multipath fl_devices[1]; ++ u32 fl_stripe_indices[1] = { 0 }; ++ struct pnfs_filelayout_devaddr daddr; ++ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */ ++ char daddr_buf[8*4 + 2*3 + 10]; ++ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ memset(&fdev, '\0', sizeof(fdev)); ++ ++ if (devid->devid != 1) { ++ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 " ++ "(got: 0x%llx)\n", __func__, devid->devid); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ /* count the number of comma-delimited DS IPs */ ++ fdev.fl_device_length = 1; ++ fdev.fl_device_list = fl_devices; ++ ++ fdev.fl_stripeindices_length = fdev.fl_device_length; ++ fdev.fl_stripeindices_list = fl_stripe_indices; ++ ++ daddr.r_addr.data = daddr_buf; ++ daddr.r_addr.len = sizeof(daddr_buf); ++ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr); ++ if (err < 0) ++ goto out; ++ daddr.r_addr.len = err; ++ switch (pnfsd_lexp_addr.sa_family) 
{ ++ case AF_INET: ++ daddr.r_netid.data = "tcp"; ++ daddr.r_netid.len = 3; ++ break; ++ case AF_INET6: ++ daddr.r_netid.data = "tcp6"; ++ daddr.r_netid.len = 4; ++ break; ++ default: ++ BUG(); ++ } ++ fdev.fl_device_list[0].fl_multipath_length = 1; ++ fdev.fl_device_list[0].fl_multipath_list = &daddr; ++ ++ /* have nfsd encode the device info */ ++ err = filelayout_encode_devinfo(xdr, &fdev); ++out: ++ dprintk("<-- %s: return %d\n", __func__, err); ++ return err; ++} ++ ++static int get_stripe_unit(int blocksize) ++{ ++ if (blocksize < NFSSVC_MAXBLKSIZE) ++ blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize); ++ dprintk("%s: return %d\n", __func__, blocksize); ++ return blocksize; ++} ++ ++static enum nfsstat4 ++pnfsd_lexp_layout_get(struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ enum nfsstat4 rc = NFS4_OK; ++ struct pnfs_filelayout_layout *layout = NULL; ++ struct knfsd_fh *fhp = NULL; ++ ++ dprintk("--> %s: inode=%p\n", __func__, inode); ++ ++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ ++ layout = kzalloc(sizeof(*layout), GFP_KERNEL); ++ if (layout == NULL) { ++ rc = -ENOMEM; ++ goto error; ++ } ++ ++ /* Set file layout response args */ ++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; ++ layout->lg_stripe_type = STRIPE_SPARSE; ++ layout->lg_commit_through_mds = true; ++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); ++ layout->lg_fh_length = 1; ++ layout->device_id.sbid = arg->lg_sbid; ++ layout->device_id.devid = 1; /*FSFTEMP*/ ++ layout->lg_first_stripe_index = 0; /*FSFTEMP*/ ++ layout->lg_pattern_offset = 0; ++ ++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); ++ if (fhp == NULL) { ++ rc = -ENOMEM; ++ goto error; ++ } ++ ++ memcpy(fhp, arg->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc 
= filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ dprintk("<-- %s: return %d\n", __func__, rc); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++pnfsd_lexp_layout_commit(struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int ++pnfsd_lexp_layout_return(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh, ++ struct pnfs_get_state *p) ++{ ++ return 0; /* just use the current stateid */ ++} ++ ++static struct pnfs_export_operations pnfsd_lexp_ops = { ++ .layout_type = pnfsd_lexp_layout_type, ++ .get_device_info = pnfsd_lexp_get_device_info, ++ .get_device_iter = pnfsd_lexp_get_device_iter, ++ .layout_get = pnfsd_lexp_layout_get, ++ .layout_commit = pnfsd_lexp_layout_commit, ++ .layout_return = pnfsd_lexp_layout_return, ++ .get_state = pnfsd_lexp_get_state, ++}; ++ ++void ++pnfsd_lexp_init(struct inode *inode) ++{ ++ dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops); ++ inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c +--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-23 12:09:03.322501672 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-23 12:09:03.323511608 -0400 +@@ -0,0 +1,535 @@ ++/* ++ * fs/nfsd/spnfs_com.c ++ * ++ * Communication layer between spNFS kernel and userspace ++ * Based heavily on idmap.c ++ * ++ */ ++ ++/* ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved.
++ * ++ * Marius Aamodt Eriksen ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++static ssize_t spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t spnfs_pipe_downcall(struct file *, const char __user *, ++ size_t); ++static void spnfs_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops spnfs_upcall_ops = { ++ .upcall = spnfs_pipe_upcall, ++ .downcall = spnfs_pipe_downcall, ++ .destroy_msg = spnfs_pipe_destroy_msg, ++}; ++ ++/* evil global variable */ ++struct spnfs *global_spnfs; ++struct spnfs_config *spnfs_config; ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++int spnfs_use_layoutsegments; ++uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++/* ++ * Used by spnfs_enabled() ++ * Tracks if the subsystem has been initialized at some point. It doesn't ++ * matter if it's not currently initialized. ++ */ ++static int spnfs_enabled_at_some_point; ++ ++/* call this to start the ball rolling */ ++/* code it like we're going to avoid the global variable in the future */ ++int ++nfsd_spnfs_new(void) ++{ ++ struct spnfs *spnfs = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ if (global_spnfs != NULL) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL); ++ if (spnfs == NULL){ ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs, ++ &spnfs_upcall_ops, 0); ++ if (IS_ERR(spnfs->spnfs_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ ++ mutex_init(&spnfs->spnfs_lock); ++ mutex_init(&spnfs->spnfs_plock); ++ init_waitqueue_head(&spnfs->spnfs_wq); ++ ++ global_spnfs = spnfs; ++ spnfs_enabled_at_some_point = 1; ++ ++ return 0; 
++err: ++ rpc_put_mount(); ++ kfree(spnfs); ++ return rc; ++} ++ ++/* again, code it like we're going to remove the global variable */ ++void ++nfsd_spnfs_delete(void) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ ++ if (!spnfs) ++ return; ++ rpc_unlink(spnfs->spnfs_dentry); ++ rpc_put_mount(); ++ global_spnfs = NULL; ++ kfree(spnfs); ++} ++ ++/* RPC pipefs upcall/downcall routines */ ++/* looks like this code is invoked by the rpc_pipe code */ ++/* to handle upcalls on things we've queued elsewhere */ ++/* See nfs_idmap_id for an example of enqueueing */ ++static ssize_t ++spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, ++ char __user *dst, size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied; ++ ssize_t left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ return mlen; ++} ++ ++static ssize_t ++spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ struct spnfs *spnfs = (struct spnfs *)rpci->private; ++ struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im; ++ int ret; ++ ++ if (mlen != sizeof(struct spnfs_msg)) ++ return -ENOSPC; ++ ++ im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im_in == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(im_in, src, mlen) != 0) ++ return -EFAULT; ++ ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ ret = mlen; ++ im->im_status = im_in->im_status; ++ /* If we got an error, terminate now, and wake up pending upcalls */ ++ if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) { ++ wake_up(&spnfs->spnfs_wq); ++ goto out; ++ } ++ ++ ret = -EINVAL; ++ /* Did we match the current upcall?
*/ ++ /* DMXXX: do not understand the comment above, from original code */ ++ /* DMXXX: when do we _not_ match the current upcall? */ ++ /* DMXXX: anyway, let's do a simplistic check */ ++ if (im_in->im_type == im->im_type) { ++ /* copy the response into the spnfs struct */ ++ memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res)); ++ ret = mlen; ++ } else ++ dprintk("spnfs: downcall type != upcall type\n"); ++ ++ ++ wake_up(&spnfs->spnfs_wq); ++/* DMXXX handle rval processing */ ++out: ++ mutex_unlock(&spnfs->spnfs_plock); ++ kfree(im_in); ++ return ret; ++} ++ ++static void ++spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ struct spnfs_msg *im = msg->data; ++ struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im); ++ ++ if (msg->errno >= 0) ++ return; ++ mutex_lock(&spnfs->spnfs_plock); ++ im->im_status = SPNFS_STATUS_FAIL; /* DMXXX */ ++ wake_up(&spnfs->spnfs_wq); ++ mutex_unlock(&spnfs->spnfs_plock); ++} ++ ++/* generic upcall. called by functions in spnfs_ops.c */ ++int ++spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg, ++ union spnfs_msg_res *res) ++{ ++ struct rpc_pipe_msg msg; ++ struct spnfs_msg *im; ++ DECLARE_WAITQUEUE(wq, current); ++ int ret = -EIO; ++ int rval; ++ ++ im = &spnfs->spnfs_im; ++ ++ mutex_lock(&spnfs->spnfs_lock); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ memset(im, 0, sizeof(*im)); ++ memcpy(im, upmsg, sizeof(*upmsg)); ++ ++ memset(&msg, 0, sizeof(msg)); ++ msg.data = im; ++ msg.len = sizeof(*im); ++ ++ add_wait_queue(&spnfs->spnfs_wq, &wq); ++ rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&spnfs->spnfs_plock); ++ schedule(); ++ current->state = TASK_RUNNING; ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ if (im->im_status & SPNFS_STATUS_SUCCESS) { ++ /* copy our result from the upcall */ ++ memcpy(res,
&im->im_res, sizeof(*res)); ++ ret = 0; ++ } ++ ++out: ++ memset(im, 0, sizeof(*im)); ++ mutex_unlock(&spnfs->spnfs_plock); ++ mutex_unlock(&spnfs->spnfs_lock); ++ return(ret); ++} ++ ++/* ++ * This is used to determine if the spnfsd daemon has been started at ++ * least once since the system came up. This is used by the export ++ * mechanism to decide if spnfs is in use. ++ * ++ * Returns non-zero if the spnfsd has initialized the communication pipe ++ * at least once. ++ */ ++int spnfs_enabled(void) ++{ ++ return spnfs_enabled_at_some_point; ++} ++ ++#ifdef CONFIG_PROC_FS ++ ++/* ++ * procfs virtual files for user/kernel space communication: ++ * ++ * ctl - currently just an on/off switch...can be expanded ++ * getfh - fd to fh conversion ++ * recall - recall a layout from the command line, for example: ++ * echo > /proc/fs/spnfs/recall ++ * config - configuration info, e.g., stripe size, num ds, etc. ++ */ ++ ++/*-------------- start ctl -------------------------*/ ++static ssize_t ctl_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int cmd, rc; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (cmd) { ++ rc = nfsd_spnfs_new(); ++ if (rc != 0) ++ return rc; ++ } else ++ nfsd_spnfs_delete(); ++ ++ return count; ++} ++ ++static const struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++/*-------------- end ctl ---------------------------*/ ++ ++/*-------------- start config -------------------------*/ ++static ssize_t config_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ static struct spnfs_config cfg; ++ ++ if (copy_from_user(&cfg, buf, count)) ++ return -EFAULT; ++ ++ spnfs_config = &cfg; ++ return 0; ++} ++ ++static const struct file_operations config_ops = { ++ .write = config_write, ++}; ++/*-------------- end config ---------------------------*/ ++ ++/*-------------- start getfh -----------------------*/ ++static int
getfh_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); ++ if (file->private_data == NULL) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static ssize_t getfh_read(struct file *file, char __user *buf, size_t count, ++ loff_t *offset) ++{ ++ if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh))) ++ return -EFAULT; ++ ++ return count; ++} ++ ++static ssize_t getfh_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int fd; ++ ++ if (copy_from_user((int *)&fd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (spnfs_getfh(fd, file->private_data) != 0) ++ return -EIO; ++ ++ return count; ++} ++ ++static int getfh_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static const struct file_operations getfh_ops = { ++ .open = getfh_open, ++ .read = getfh_read, ++ .write = getfh_write, ++ .release = getfh_release, ++}; ++/*-------------- end getfh ------------------------*/ ++ ++ ++/*-------------- start recall layout --------------*/ ++static ssize_t recall_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char input[128]; ++ char *path, *str, *p; ++ int rc; ++ u64 off = 0, len = 0; ++ ++ if (count > 128) ++ return -EINVAL; ++ ++ if (copy_from_user(input, buf, count)) ++ return -EFAULT; ++ ++ /* assumes newline-terminated path */ ++ p = memchr(input, '\n', count); ++ if (p == NULL) ++ return -EINVAL; ++ *p = '\0'; ++ ++ /* ++ * Scan for path and, optionally, an offset and length ++ * of a layout segment to be recalled; if there are two ++ * fields, they're assumed to be path and offset. 
++ */ ++ p = input; ++ path = strsep(&p, " "); ++ if (path == NULL) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &off); ++ if (rc != 0) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &len); ++ if (rc != 0) ++ return -EINVAL; ++ } ++ } ++ ++ rc = spnfs_test_layoutrecall(path, off, len); ++ if (rc != 0) ++ return rc; ++ ++ return count; ++} ++ ++static const struct file_operations recall_ops = { ++ .write = recall_write, ++}; ++/*-------------- end recall layout --------------*/ ++ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++/*-------------- start layoutseg -------------------------*/ ++static ssize_t layoutseg_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[3]; ++ ++ if (copy_from_user(cmd, buf, 1)) ++ return -EFAULT; ++ if (cmd[0] == '0') ++ spnfs_use_layoutsegments = 0; ++ else ++ spnfs_use_layoutsegments = 1; ++ ++ return count; ++} ++ ++static const struct file_operations layoutseg_ops = { ++ .write = layoutseg_write, ++}; ++/*-------------- end layoutseg ---------------------------*/ ++ ++/*-------------- start layoutsegsize -------------------------*/ ++static ssize_t layoutsegsize_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[50]; ++ ++ if (copy_from_user(cmd, buf, 49)) ++ return -EFAULT; ++ layoutsegment_size = simple_strtoull(cmd, NULL, 10); ++ ++ return count; ++} ++ ++static const struct file_operations layoutsegsize_ops = { ++ .write = layoutsegsize_write, ++}; ++/*-------------- end layoutsegsize ---------------------------*/ ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++int ++spnfs_init_proc(void) ++{ ++ struct proc_dir_entry *entry; ++ ++ entry = proc_mkdir("fs/spnfs", NULL); ++ if (!entry) ++ return -ENOMEM; ++ ++ entry = create_proc_entry("fs/spnfs/ctl", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &ctl_ops; ++ ++ 
entry = create_proc_entry("fs/spnfs/config", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &config_ops; ++ ++ entry = create_proc_entry("fs/spnfs/getfh", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &getfh_ops; ++ ++ entry = create_proc_entry("fs/spnfs/recall", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &recall_ops; ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++ entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutseg_ops; ++ ++ entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutsegsize_ops; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ return 0; ++} ++#endif /* CONFIG_PROC_FS */ +diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c +--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-23 12:09:03.324501390 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-23 12:09:03.324501390 -0400 +@@ -0,0 +1,878 @@ ++/* ++ * fs/nfsd/spnfs_ops.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * ++ */ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++******************************************************************************/
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "pnfsd.h"
++
++/* comment out CONFIG_SPNFS_TEST for non-test behaviour */
++/* #define CONFIG_SPNFS_TEST 1 */
++
++#define NFSDDBG_FACILITY	NFSDDBG_PNFS
++
++/*
++ * The functions that are called from elsewhere in the kernel
++ * to perform tasks in userspace
++ *
++ */
++
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++extern int spnfs_use_layoutsegments;
++extern uint64_t layoutsegment_size;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++extern struct spnfs *global_spnfs;
++
++int
++spnfs_layout_type(struct super_block *sb)
++{
++	return LAYOUT_NFSV4_1_FILES;
++}
++
++/*
++ * Ask the userspace daemon (via spnfs_upcall) for the layout of @inode and
++ * encode it into @xdr with filelayout_encode_layout().
++ *
++ * On success lg_res->lg_return_on_close is cleared and lg_res->lg_seg.length
++ * is set (see the CONFIG_SPNFS_LAYOUTSEGMENTS block below for the one case
++ * where it is left as found).
++ *
++ * Returns the status of filelayout_encode_layout() on the success path,
++ * NFS4ERR_LAYOUTTRYLATER when memory cannot be allocated, and an NFS4ERR_*
++ * translation of the daemon's status otherwise.  Everything allocated here
++ * is released before returning.
++ */
++enum nfsstat4
++spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr,
++		const struct nfsd4_pnfs_layoutget_arg *lg_arg,
++		struct nfsd4_pnfs_layoutget_res *lg_res)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	struct pnfs_filelayout_layout *flp = NULL;
++	int status, i;
++	enum nfsstat4 nfserr;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++
++	im->im_type = SPNFS_TYPE_LAYOUTGET;
++	im->im_args.layoutget_args.inode = inode->i_ino;
++	im->im_args.layoutget_args.generation = inode->i_generation;
++
++	/* call function to queue the msg for upcall */
++	if (spnfs_upcall(spnfs, im, res) != 0) {
++		dprintk("failed spnfs upcall: layoutget\n");
++		nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++		goto layoutget_cleanup;
++	}
++	status = res->layoutget_res.status;
++	if (status != 0) {
++		/* FIXME? until user mode is fixed, translate system error */
++		switch (status) {
++		case -E2BIG:
++		case -ETOOSMALL:
++			nfserr = NFS4ERR_TOOSMALL;
++			break;
++		case -ENOMEM:
++		case -EAGAIN:
++		case -EINTR:
++			nfserr = NFS4ERR_LAYOUTTRYLATER;
++			break;
++		case -ENOENT:
++			nfserr = NFS4ERR_BADLAYOUT;
++			break;
++		default:
++			nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++		}
++		dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n",
++			status, nfserr);
++		goto layoutget_cleanup;
++	}
++
++	lg_res->lg_return_on_close = 0;
++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
++	/* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */
++	/* the amount requested by the client. */
++	if (spnfs_use_layoutsegments) {
++		if (layoutsegment_size != 0)
++			lg_res->lg_seg.length = layoutsegment_size;
++	} else
++		lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#else
++	lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++
++	/* zeroed, so lg_fh_list is NULL until it is really allocated */
++	flp = kzalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL);
++	if (flp == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	flp->device_id.sbid = lg_arg->lg_sbid;
++	flp->device_id.devid = res->layoutget_res.devid;
++	flp->lg_layout_type = 1; /* XXX */
++	flp->lg_stripe_type = res->layoutget_res.stripe_type;
++	flp->lg_commit_through_mds = 0;
++	flp->lg_stripe_unit = res->layoutget_res.stripe_size;
++	flp->lg_first_stripe_index = 0;
++	flp->lg_pattern_offset = 0;
++	flp->lg_fh_length = res->layoutget_res.stripe_count;
++
++	/* kcalloc checks the count * size multiplication for overflow */
++	flp->lg_fh_list = kcalloc(flp->lg_fh_length, sizeof(struct knfsd_fh),
++				  GFP_KERNEL);
++	if (flp->lg_fh_list == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	/*
++	 * FIX: Doing an extra copy here.  Should group res.flist's fh_len
++	 * and fh_val into a knfsd_fh structure.
++	 *
++	 * NOTE(review): stripe_count is not checked against the capacity of
++	 * res->layoutget_res.flist[] here -- confirm who guarantees it.
++	 */
++	for (i = 0; i < flp->lg_fh_length; i++) {
++		/* the length comes from the upcall reply: never trust it
++		 * to fit the fixed-size destination handle */
++		if (res->layoutget_res.flist[i].fh_len >
++		    sizeof(flp->lg_fh_list[i].fh_base)) {
++			dprintk("spnfs layout_get: oversized fh in reply\n");
++			nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++			goto layoutget_cleanup;
++		}
++		flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len;
++		memcpy(&flp->lg_fh_list[i].fh_base,
++		       res->layoutget_res.flist[i].fh_val,
++		       res->layoutget_res.flist[i].fh_len);
++	}
++
++	/* encode the layoutget body */
++	nfserr = filelayout_encode_layout(xdr, flp);
++
++layoutget_cleanup:
++	if (flp) {
++		kfree(flp->lg_fh_list);	/* kfree(NULL) is a no-op */
++		kfree(flp);
++	}
++	kfree(im);
++	kfree(res);
++
++	return nfserr;
++}
++
++int
++spnfs_layoutcommit(void)
++{
++	return 0;
++}
++
++int
++spnfs_layoutreturn(struct inode *inode,
++		   const struct nfsd4_pnfs_layoutreturn_arg *args)
++{
++	return 0;
++}
++
++int
++spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
++{
++	struct super_block *sb;
++	struct nfsd4_pnfs_cb_layout lr;
++
++	switch (type) {
++	case RETURN_FILE:
++		sb = inode->i_sb;
++		dprintk("%s: recalling layout for ino = %lu\n",
++			__func__, inode->i_ino);
++		break;
++	case RETURN_FSID:
++		sb = inode->i_sb;
++		dprintk("%s: recalling layout for fsid x (unimplemented)\n",
++			__func__);
++		return 0;
++	case RETURN_ALL:
++		/* XXX figure out how to get a sb since there's no inode ptr */
++		dprintk("%s: recalling all layouts (unimplemented)\n",
++			__func__);
++		return 0;
++	default:
++		return -EINVAL;
++	}
++
++	lr.cbl_recall_type = type;
++	lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	lr.cbl_seg.clientid = 0;
++	lr.cbl_seg.offset = offset;
++	lr.cbl_seg.length = len;
++	lr.cbl_seg.iomode = IOMODE_ANY;
++	lr.cbl_layoutchanged = 0;
++
++	nfsd_layout_recall_cb(sb, inode, &lr);
++
++	return 0;
++}
++
++
++int
++spnfs_test_layoutrecall(char *path, u64 offset, u64 len)
++{
++	struct nameidata nd;
++	struct inode *inode;
++	int type, rc;
++
++	dprintk("%s: path=%s, offset=%llu, len=%llu\n",
++		__func__, path, offset, len);
++
++	if (strcmp(path, "all") == 0) {
++		inode = NULL;
++		type = RETURN_ALL;
++	} else {
++		rc = path_lookup(path, 0, &nd);
++		if (rc != 0)
++			return -ENOENT;
++
++		/*
++		 * XXX todo: add a RETURN_FSID scenario here...maybe if
++		 * inode is a dir...
++		 */
++
++		inode = nd.path.dentry->d_inode;
++		type = RETURN_FILE;
++	}
++
++	if (len == 0)
++		len = NFS4_MAX_UINT64;
++
++	rc = spnfs_layoutrecall(inode, type, offset, len);
++
++	if (type != RETURN_ALL)
++		path_put(&nd.path);
++	return rc;
++}
++
++int
++spnfs_getdeviceiter(struct super_block *sb,
++		    u32 layout_type,
++		    struct nfsd4_pnfs_dev_iter_res *gd_res)
++{
++	struct spnfs *spnfs = global_spnfs;   /* XXX keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto getdeviceiter_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto getdeviceiter_out;
++	}
++
++	im->im_type = SPNFS_TYPE_GETDEVICEITER;
++	im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie;
++	im->im_args.getdeviceiter_args.verf = gd_res->gd_verf;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto getdeviceiter_out;
++	}
++	status = res->getdeviceiter_res.status;
++
++	if (res->getdeviceiter_res.eof)
++		gd_res->gd_eof = 1;
++	else {
++		gd_res->gd_devid = res->getdeviceiter_res.devid;
++		gd_res->gd_cookie = res->getdeviceiter_res.cookie;
++		gd_res->gd_verf = res->getdeviceiter_res.verf;
++		gd_res->gd_eof = 0;
++	}
++
++getdeviceiter_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++#ifdef CONFIG_SPNFS_TEST
++/*
++ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the
++ * 1024 encoded stripe indices.
++ *
++ * Skip the devaddr4 length and encode the indices count (1024) in the
++ * rq_res.head and set the rq_res.head length.
++ *
++ * Set the rq_res page_len to 4096 (for the 1024 stripe indices).
++ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the
++ * rq_res head to hold the rest of the getdeviceinfo return.
++ *
++ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and
++ * rq_respages[rq_resused] contains the rq_res.pages.
++ */
++static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info,
++				   const struct pnfs_filelayout_device *fdev)
++{
++	struct nfsd4_compoundres *resp = info->resp;
++	struct svc_rqst *rqstp = resp->rqstp;
++	struct xdr_buf *buf = &rqstp->rq_res;
++	__be32 *cursor;
++
++	/*
++	 * Two words are reserved: word 0 is the length (filled in later),
++	 * word 1 is the stripe-index count (1024).
++	 */
++	cursor = nfsd4_xdr_reserve_space(resp, 8);
++	cursor[1] = cpu_to_be32(fdev->fl_stripeindices_length);
++	resp->p = cursor + 2;
++
++	/* head ends where the encoder now stands */
++	buf->head[0].iov_len = (char *)resp->p - (char *)buf->head[0].iov_base;
++
++	/* the page of 1024 encoded indices is the page data */
++	buf->pages = &rqstp->rq_respages[rqstp->rq_resused];
++	buf->page_base = 0;
++	buf->page_len = PAGE_SIZE;
++
++	/* tail takes whatever is left of the head's page */
++	resp->end = buf->head[0].iov_base + PAGE_SIZE;
++	buf->tail[0].iov_base = resp->p;
++	buf->tail[0].iov_len = (char *)resp->end - (char *)resp->p;
++
++	return 0;
++}
++/*
++ * Return a stripeindices of length 1024 to test
++ * the pNFS client multipage getdeviceinfo implementation.
++ *
++ * Encode a page of stripe indices.
++ */
++static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev,
++				    struct spnfs_device *dev,
++				    struct pnfs_devinfo_arg *info)
++{
++	struct svc_rqst *rqstp = info->xdr.resp->rqstp;
++	__be32 *p;
++	int i, j = 0;
++
++	p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]);
++	fldev->fl_stripeindices_length = 1024;
++	/* round-robin the data servers device index into the stripe indices */
++	for (i = 0; i < 1024; i++) {
++		*p++ = cpu_to_be32(j);
++		if (j < dev->dscount - 1)
++			j++;
++		else
++			j = 0;
++	}
++	fldev->fl_stripeindices_list = NULL;
++}
++#endif /* CONFIG_SPNFS_TEST */
++
++/*
++ * Fetch the description of device @devid from the userspace daemon (via
++ * spnfs_upcall) and encode it into @xdr with filelayout_encode_devinfo().
++ *
++ * One stripe index and one single-path device address (netid + addr) are
++ * built per data server reported by the daemon.
++ *
++ * Returns the status of filelayout_encode_devinfo() on the success path,
++ * -ENOMEM on allocation failure, -EIO when the upcall itself fails, or the
++ * non-zero status reported by the daemon.  Everything allocated here is
++ * released before returning.
++ *
++ * Every allocation that the cleanup path walks is zero-initialised, so a
++ * failure part-way through the per-server loop only ever frees pointers
++ * that were really allocated.
++ */
++int
++spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
++		    u32 layout_type,
++		    const struct nfsd4_pnfs_deviceid *devid)
++{
++	struct spnfs *spnfs = global_spnfs;
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	struct spnfs_device *dev;
++	struct pnfs_filelayout_device *fldev = NULL;
++	struct pnfs_filelayout_multipath *mp = NULL;
++	struct pnfs_filelayout_devaddr *fldap = NULL;
++	int status = 0, i, len;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++
++	im->im_type = SPNFS_TYPE_GETDEVICEINFO;
++	/* XXX FIX: figure out what to do about fsid */
++	im->im_args.getdeviceinfo_args.devid = devid->devid;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto getdeviceinfo_out;
++	}
++	status = res->getdeviceinfo_res.status;
++	if (status != 0)
++		goto getdeviceinfo_out;
++
++	dev = &res->getdeviceinfo_res.devinfo;
++
++	/* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */
++	fldev = kzalloc(sizeof(struct pnfs_filelayout_device), GFP_KERNEL);
++	if (fldev == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++
++	/*
++	 * Stripe count is the same as data server count for our purposes
++	 *
++	 * NOTE(review): dscount is not checked against the capacity of
++	 * dev->dslist[] here -- confirm who guarantees it.
++	 */
++	fldev->fl_stripeindices_length = dev->dscount;
++	fldev->fl_device_length = dev->dscount;
++
++	/* Set stripe indices */
++#ifdef CONFIG_SPNFS_TEST
++	/*
++	 * NOTE(review): no 'info' is declared in this function, so this
++	 * branch looks like it cannot compile as written -- confirm before
++	 * enabling CONFIG_SPNFS_TEST.
++	 */
++	spnfs_set_test_indices(fldev, dev, info);
++	fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr;
++#else /* CONFIG_SPNFS_TEST */
++	fldev->fl_stripeindices_list =
++		kcalloc(fldev->fl_stripeindices_length, sizeof(u32),
++			GFP_KERNEL);
++	if (fldev->fl_stripeindices_list == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++	for (i = 0; i < fldev->fl_stripeindices_length; i++)
++		fldev->fl_stripeindices_list[i] = i;
++#endif /* CONFIG_SPNFS_TEST */
++
++	/*
++	 * Set the device's data server addresses  No multipath for spnfs,
++	 * so mp length is always 1.
++	 *
++	 * Zeroed allocation: entries the loop below never reaches keep a
++	 * NULL fl_multipath_list, which the cleanup path relies on.
++	 */
++	fldev->fl_device_list =
++		kcalloc(fldev->fl_device_length,
++			sizeof(struct pnfs_filelayout_multipath),
++			GFP_KERNEL);
++	if (fldev->fl_device_list == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++	for (i = 0; i < fldev->fl_device_length; i++) {
++		mp = &fldev->fl_device_list[i];
++		mp->fl_multipath_length = 1;
++		/* zeroed so r_netid.data / r_addr.data start out NULL */
++		mp->fl_multipath_list =
++			kzalloc(sizeof(struct pnfs_filelayout_devaddr),
++				GFP_KERNEL);
++		if (mp->fl_multipath_list == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		fldap = mp->fl_multipath_list;
++
++		/*
++		 * Copy the netid into the device address, for example: "tcp"
++		 */
++		len = strlen(dev->dslist[i].netid);
++		fldap->r_netid.data = kmalloc(len, GFP_KERNEL);
++		if (fldap->r_netid.data == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		memcpy(fldap->r_netid.data, dev->dslist[i].netid, len);
++		fldap->r_netid.len = len;
++
++		/*
++		 * Copy the network address into the device address,
++		 * for example: "10.35.9.16.08.01"
++		 */
++		len = strlen(dev->dslist[i].addr);
++		fldap->r_addr.data = kmalloc(len, GFP_KERNEL);
++		if (fldap->r_addr.data == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		memcpy(fldap->r_addr.data, dev->dslist[i].addr, len);
++		fldap->r_addr.len = len;
++	}
++
++	/* encode the device data */
++	status = filelayout_encode_devinfo(xdr, fldev);
++
++getdeviceinfo_out:
++	if (fldev) {
++		kfree(fldev->fl_stripeindices_list);
++		if (fldev->fl_device_list) {
++			for (i = 0; i < fldev->fl_device_length; i++) {
++				fldap =
++				    fldev->fl_device_list[i].fl_multipath_list;
++				/* entry never populated (early failure) */
++				if (fldap == NULL)
++					continue;
++				kfree(fldap->r_netid.data);
++				kfree(fldap->r_addr.data);
++				kfree(fldap);
++			}
++			kfree(fldev->fl_device_list);
++		}
++		kfree(fldev);
++	}
++
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++int
++spnfs_setattr(void)
++{
++	return 0;
++}
++
++int
++spnfs_open(struct inode *inode, struct nfsd4_open *open)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto open_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto open_out;
++	}
++
++	im->im_type = SPNFS_TYPE_OPEN;
++	im->im_args.open_args.inode = inode->i_ino;
++	im->im_args.open_args.generation = inode->i_generation;
++	im->im_args.open_args.create = open->op_create;
++	im->im_args.open_args.createmode = open->op_createmode;
++	im->im_args.open_args.truncate = open->op_truncate;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto open_out;
++	}
++	status = res->open_res.status;
++
++open_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++int
++spnfs_create(void)
++{
++	return 0;
++}
++
++/*
++ * Invokes the spnfsd with the inode number of the object to
remove.
++ * The file has already been removed on the MDS, so all the spnfsd
++ * daemon does is remove the stripes.
++ * Returns 0 on success otherwise error code
++ */
++int
++spnfs_remove(unsigned long ino, unsigned long generation)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto remove_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto remove_out;
++	}
++
++	im->im_type = SPNFS_TYPE_REMOVE;
++	im->im_args.remove_args.inode = ino;
++	im->im_args.remove_args.generation = generation;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto remove_out;
++	}
++	status = res->remove_res.status;
++
++remove_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++/*
++ * Read up to @len bytes starting at file offset @offset into @buf, one
++ * stripe unit at a time, from the per-data-server files in @filp.
++ * Returns the number of bytes read (short if a read returns 0), or -EIO.
++ */
++static int
++read_one(struct inode *inode, loff_t offset, size_t len, char *buf,
++	 struct file **filp)
++{
++	loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
++	size_t iolen;
++	int completed = 0, ds, err;
++
++	while (len > 0) {
++		tmp = offset;
++		soff = do_div(tmp, spnfs_config->stripe_size);
++		snum = tmp;
++		ds = do_div(tmp, spnfs_config->num_ds);
++		if (spnfs_config->dense_striping == 0)
++			soffset = offset;
++		else {
++			tmp = snum;
++			do_div(tmp, spnfs_config->num_ds);
++			soffset = tmp * spnfs_config->stripe_size + soff;
++		}
++		if (len < spnfs_config->stripe_size - soff)
++			iolen = len;
++		else
++			iolen = spnfs_config->stripe_size - soff;
++
++		pos = soffset;
++		err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos);
++		if (err < 0)
++			return -EIO;
++		if (err == 0)
++			break;
++		filp[ds]->f_pos = pos;
++		iolen = err;
++		completed += iolen;
++		len -= iolen;
++		offset += iolen;
++		bufoffset += iolen;
++	}
++
++	return completed;
++}
++
++/*
++ * Open the stripe file of @inode on every configured data server, fill the
++ * first @vlen entries of rqstp->rq_vec starting at file offset @offset, and
++ * close the files again.  *lenp receives the number of bytes read.
++ * Returns nfs_ok, or nfserr_io on any failure.
++ */
++static __be32
++read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++     struct svc_rqst *rqstp)
++{
++	int i, vnum, err, bytecount = 0;
++	char path[128];
++	struct file *filp[SPNFS_MAX_DATA_SERVERS];
++	size_t iolen;
++	__be32 status = nfs_ok;
++
++	/* filp[] only has room for SPNFS_MAX_DATA_SERVERS entries */
++	if (spnfs_config->num_ds > SPNFS_MAX_DATA_SERVERS) {
++		*lenp = 0;
++		return nfserr_io;
++	}
++
++	/*
++	 * XXX We should just be doing this at open time, but it gets
++	 * kind of messy storing this info in nfsd's state structures
++	 * and piggybacking its path through the various state handling
++	 * functions.  Revisit this.
++	 */
++	memset(filp, 0, sizeof(filp));
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		/* format kept as is: it names the stripe file */
++		err = snprintf(path, sizeof(path), "%s/%ld.%u",
++			       spnfs_config->ds_dir[i],
++			       inode->i_ino, inode->i_generation);
++		if ((size_t)err >= sizeof(path)) {
++			/* name would have been truncated */
++			status = nfserr_io;
++			goto read_out;
++		}
++		filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0);
++		if (IS_ERR(filp[i])) {
++			/*
++			 * filp_open() reports failure with an ERR_PTR,
++			 * never NULL; clear the slot so the cleanup loop
++			 * does not try to close an error pointer.
++			 */
++			filp[i] = NULL;
++			status = nfserr_io;
++			goto read_out;
++		}
++		get_file(filp[i]);
++	}
++
++	for (vnum = 0 ; vnum < vlen ; vnum++) {
++		iolen = rqstp->rq_vec[vnum].iov_len;
++		err = read_one(inode, offset + bytecount, iolen,
++			       (char *)rqstp->rq_vec[vnum].iov_base, filp);
++		if (err < 0) {
++			status = nfserr_io;
++			goto read_out;
++		}
++		if (err < iolen) {
++			/* short read: report what we got and stop */
++			bytecount += err;
++			goto read_out;
++		}
++		bytecount += rqstp->rq_vec[vnum].iov_len;
++	}
++
++read_out:
++	*lenp = bytecount;
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		if (filp[i]) {
++			filp_close(filp[i], current->files);
++			fput(filp[i]);	/* pairs with get_file() above */
++		}
++	}
++	return status;
++}
++
++__be32
++spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++	   struct svc_rqst *rqstp)
++{
++	if (spnfs_config)
++		return read(inode, offset, lenp, vlen, rqstp);
++	else {
++		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++		return nfserr_notsupp;
++	}
++}
++
++/*
++ * Write @len bytes from @buf at file offset @offset, one stripe unit at a
++ * time, to the per-data-server files in @filp.
++ * Returns the number of bytes written (short if a write makes no progress),
++ * or -EIO.
++ */
++static int
++write_one(struct inode *inode, loff_t offset, size_t len, char *buf,
++	  struct file **filp)
++{
++	loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
++	size_t iolen;
++	int completed = 0, ds, err;
++
++	while (len > 0) {
++		tmp = offset;
++		soff = do_div(tmp, spnfs_config->stripe_size);
++		snum = tmp;
++		ds = do_div(tmp, spnfs_config->num_ds);
++		if (spnfs_config->dense_striping == 0)
++			soffset = offset;
++		else {
++			tmp = snum;
++			do_div(tmp, spnfs_config->num_ds);
++			soffset = tmp * spnfs_config->stripe_size + soff;
++		}
++		if (len < spnfs_config->stripe_size - soff)
++			iolen = len;
++		else
++			iolen = spnfs_config->stripe_size - soff;
++
++		pos = soffset;
++		err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos);
++		if (err < 0)
++			return -EIO;
++		/*
++		 * No progress: stop instead of spinning forever (len would
++		 * never shrink).  The caller treats the short count as an
++		 * error.  Mirrors read_one().
++		 */
++		if (err == 0)
++			break;
++		filp[ds]->f_pos = pos;
++		iolen = err;
++		completed += iolen;
++		len -= iolen;
++		offset += iolen;
++		bufoffset += iolen;
++	}
++
++	return completed;
++}
++
++/*
++ * Open the stripe file of @inode on every configured data server, write the
++ * first @vlen entries of rqstp->rq_vec starting at file offset @offset, and
++ * close the files again.
++ * Returns nfs_ok, or nfserr_io on any failure (including a short write).
++ */
++static __be32
++write(struct inode *inode, loff_t offset, size_t len, int vlen,
++      struct svc_rqst *rqstp)
++{
++	int i, vnum, err, bytecount = 0;
++	char path[128];
++	struct file *filp[SPNFS_MAX_DATA_SERVERS];
++	size_t iolen;
++	__be32 status = nfs_ok;
++
++	/* filp[] only has room for SPNFS_MAX_DATA_SERVERS entries */
++	if (spnfs_config->num_ds > SPNFS_MAX_DATA_SERVERS)
++		return nfserr_io;
++
++	/*
++	 * XXX We should just be doing this at open time, but it gets
++	 * kind of messy storing this info in nfsd's state structures
++	 * and piggybacking its path through the various state handling
++	 * functions.  Revisit this.
++	 */
++	memset(filp, 0, sizeof(filp));
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		/* format kept as is: it names the stripe file */
++		err = snprintf(path, sizeof(path), "%s/%ld.%u",
++			       spnfs_config->ds_dir[i],
++			       inode->i_ino, inode->i_generation);
++		if ((size_t)err >= sizeof(path)) {
++			/* name would have been truncated */
++			status = nfserr_io;
++			goto write_out;
++		}
++		filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0);
++		if (IS_ERR(filp[i])) {
++			/*
++			 * filp_open() reports failure with an ERR_PTR,
++			 * never NULL; clear the slot so the cleanup loop
++			 * does not try to close an error pointer.
++			 */
++			filp[i] = NULL;
++			status = nfserr_io;
++			goto write_out;
++		}
++		get_file(filp[i]);
++	}
++
++	for (vnum = 0; vnum < vlen; vnum++) {
++		iolen = rqstp->rq_vec[vnum].iov_len;
++		err = write_one(inode, offset + bytecount, iolen,
++				(char *)rqstp->rq_vec[vnum].iov_base, filp);
++		if (err != iolen) {
++			/* report the size of the vector that failed */
++			dprintk("spnfs_write: err=%d expected %Zd\n", err, iolen);
++			status = nfserr_io;
++			goto write_out;
++		}
++		bytecount += rqstp->rq_vec[vnum].iov_len;
++	}
++
++write_out:
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		if (filp[i]) {
++			filp_close(filp[i], current->files);
++			fput(filp[i]);	/* pairs with get_file() above */
++		}
++	}
++
++	return status;
++}
++
++__be32
++spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen,
++	    struct svc_rqst *rqstp)
++{
++	if (spnfs_config)
++		return write(inode, offset, len, vlen, rqstp);
++	else {
++		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++		return nfserr_notsupp;
++	}
++}
++
++int
++spnfs_commit(void)
++{
++	return 0;
++}
++
++/*
++ * Return the state for this object.
++ * At this time simply return 0 to indicate success and use the existing state ++ */ ++int ++spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg) ++{ ++ return 0; ++} ++ ++/* ++ * Return the filehandle for the specified file descriptor ++ */ ++int ++spnfs_getfh(int fd, struct nfs_fh *fh) ++{ ++ struct file *file; ++ ++ file = fget(fd); ++ if (file == NULL) ++ return -EIO; ++ ++ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh)); ++ fput(file); ++ return 0; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-23 12:08:29.096512142 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 12:09:03.325501424 -0400 +@@ -242,6 +242,12 @@ struct nfs4_client { + u32 cl_cb_seq_nr; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ ++#if defined(CONFIG_PNFSD) ++ struct list_head cl_layouts; /* outstanding layouts */ ++ struct list_head cl_layoutrecalls; /* outstanding layoutrecall ++ callbacks */ ++ atomic_t cl_deviceref; /* Num outstanding devs */ ++#endif /* CONFIG_PNFSD */ + }; + + static inline void +@@ -342,12 +348,31 @@ struct nfs4_file { + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_stateids; + struct list_head fi_delegations; ++#if defined(CONFIG_PNFSD) ++ struct list_head fi_layouts; ++ struct list_head fi_layout_states; ++#endif /* CONFIG_PNFSD */ + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; ++#if defined(CONFIG_PNFSD) ++ /* used by layoutget / layoutrecall */ ++ struct nfs4_fsid fi_fsid; ++ u32 fi_fhlen; ++ u8 fi_fhval[NFS4_FHSIZE]; ++#endif /* CONFIG_PNFSD */ + }; + ++#if defined(CONFIG_PNFSD) ++/* pNFS Metadata server state */ ++ ++struct pnfs_ds_dev_entry { ++ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */ ++ u32 dd_dsid; ++}; ++#endif /* CONFIG_PNFSD 
*/ ++ + /* + * nfs4_stateid can either be an open stateid or (eventually) a lock stateid + * +@@ -370,6 +395,9 @@ struct nfs4_stateid { + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; ++#if defined(CONFIG_PNFSD) ++ struct list_head st_pnfs_ds_id; ++#endif /* CONFIG_PNFSD */ + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; +@@ -421,6 +449,34 @@ extern void nfsd4_recdir_purge_old(void) + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + extern void release_session_client(struct nfsd4_session *); ++extern void nfsd4_free_slab(struct kmem_cache **); ++extern struct nfs4_file *find_file(struct inode *); ++extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *); ++extern void put_nfs4_file(struct nfs4_file *); ++extern void get_nfs4_file(struct nfs4_file *); ++extern struct nfs4_client *find_confirmed_client(clientid_t *); ++extern struct nfs4_stateid *find_stateid(stateid_t *, int flags); ++extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *); ++extern __be32 nfs4_check_stateid(stateid_t *); ++extern void expire_client_lock(struct nfs4_client *); ++extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *); ++ ++#if defined(CONFIG_PNFSD) ++extern int nfsd4_init_pnfs_slabs(void); ++extern void nfsd4_free_pnfs_slabs(void); ++extern void pnfs_expire_client(struct nfs4_client *); ++extern void release_pnfs_ds_dev_list(struct nfs4_stateid *); ++extern void nfs4_pnfs_state_init(void); ++extern void nfs4_pnfs_state_shutdown(void); ++extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *); ++#else /* CONFIG_PNFSD */ ++static inline void nfsd4_free_pnfs_slabs(void) {} ++static inline int nfsd4_init_pnfs_slabs(void) { return 0; } ++static 
inline void pnfs_expire_client(struct nfs4_client *clp) {} ++static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {} ++static inline void nfs4_pnfs_state_shutdown(void) {} ++#endif /* CONFIG_PNFSD */ + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +@@ -434,4 +490,24 @@ nfs4_get_stateowner(struct nfs4_stateown + kref_get(&so->so_ref); + } + ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1 : NFS4_MAX_UINT64; ++} ++ + #endif /* NFSD4_STATE_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-23 12:08:27.632564132 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-23 12:09:03.326501490 -0400 +@@ -37,7 +37,12 @@ + #ifdef CONFIG_NFSD_V4 + #include + #include ++#include ++#include + #endif /* CONFIG_NFSD_V4 */ ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif + + #include "nfsd.h" + #include "vfs.h" +@@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, str + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, 0)) { ++ err = bl_layoutrecall(inode, RETURN_FILE, ++ iap->ia_size, inode->i_size - iap->ia_size); ++ } ++#endif /* CONFIG_SPNFS_BLOCK */ + } + + /* +@@ -1703,6 +1714,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru + struct inode *fdir, *tdir; + __be32 err; + int host_err; ++#ifdef CONFIG_SPNFS ++ unsigned long ino = 0; ++ unsigned long generation = 0; ++ unsigned int nlink = 0; ++#endif /* CONFIG_SPNFS */ + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) +@@ -1766,7 +1782,26 @@ nfsd_rename(struct svc_rqst *rqstp, stru + if (host_err) + goto out_dput_new; + 
++#ifdef CONFIG_SPNFS ++ /* ++ * if the target is a preexisting regular file, remember the ++ * inode number and generation so we can delete the stripes; ++ * save the link count as well so that the stripes only get ++ * get deleted when the last link is deleted ++ */ ++ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) { ++ ino = ndentry->d_inode->i_ino; ++ generation = ndentry->d_inode->i_generation; ++ nlink = ndentry->d_inode->i_nlink; ++ } ++#endif /* CONFIG_SPNFS */ ++ + host_err = vfs_rename(fdir, odentry, tdir, ndentry); ++#ifdef CONFIG_SPNFS ++ if (spnfs_enabled() && (!host_err && ino && nlink == 1)) ++ spnfs_remove(ino, generation); ++#endif /* CONFIG_SPNFS */ ++ + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) +@@ -1807,6 +1842,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + struct inode *dirp; + __be32 err; + int host_err; ++#if defined(CONFIG_SPNFS) ++ unsigned long ino; ++ unsigned long generation; ++ unsigned int nlink; ++#endif /* defined(CONFIG_SPNFS) */ + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) +@@ -1830,6 +1870,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + goto out; + } + ++#if defined(CONFIG_SPNFS) ++ /* ++ * Remember the inode number to communicate to the spnfsd ++ * for removal of stripes; save the link count as well so that ++ * the stripes only get get deleted when the last link is deleted ++ */ ++ ino = rdentry->d_inode->i_ino; ++ generation = rdentry->d_inode->i_generation; ++ nlink = rdentry->d_inode->i_nlink; ++#endif /* defined(CONFIG_SPNFS) */ ++ + if (!type) + type = rdentry->d_inode->i_mode & S_IFMT; + +@@ -1854,6 +1905,29 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + if (!host_err) + host_err = commit_metadata(fhp); + ++#if defined(CONFIG_SPNFS) ++ /* ++ * spnfs: notify spnfsd of removal to destroy stripes ++ */ ++/* ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (sb->s_export_op->spnfs_remove) { ++*/ ++ dprintk("%s check if spnfs_enabled\n", __FUNCTION__); ++ 
if (spnfs_enabled() && nlink == 1) { ++ BUG_ON(ino == 0); ++ dprintk("%s calling spnfs_remove inumber=%ld\n", ++ __FUNCTION__, ino); ++ if (spnfs_remove(ino, generation) == 0) { ++ dprintk("%s spnfs_remove success\n", __FUNCTION__); ++ } else { ++ /* XXX How do we make this atomic? */ ++ printk(KERN_WARNING "nfsd: pNFS could not " ++ "remove stripes for inode: %ld\n", ino); ++ } ++ } ++#endif /* defined(CONFIG_SPNFS) */ ++ + mnt_drop_write(fhp->fh_export->ex_path.mnt); + out_nfserr: + err = nfserrno(host_err); +diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-23 12:08:29.097425997 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 12:09:03.327451643 -0400 +@@ -37,6 +37,8 @@ + #ifndef _LINUX_NFSD_XDR4_H + #define _LINUX_NFSD_XDR4_H + ++#include ++ + #include "state.h" + #include "nfsd.h" + +@@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete { + u32 rca_one_fs; + }; + ++struct nfsd4_pnfs_getdevinfo { ++ struct nfsd4_pnfs_deviceid gd_devid; /* request */ ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxcount; /* request */ ++ u32 gd_notify_types;/* request */ ++ struct super_block *gd_sb; ++}; ++ ++struct nfsd4_pnfs_getdevlist { ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxdevices; /* request */ ++ u64 gd_cookie; /* request - response */ ++ u64 gd_verf; /* request - response */ ++ struct svc_fh *gd_fhp; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutget { ++ u64 lg_minlength; /* request */ ++ u32 lg_signal; /* request */ ++ u32 lg_maxcount; /* request */ ++ struct svc_fh *lg_fhp; /* request */ ++ stateid_t lg_sid; /* request/response */ ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_roc; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit { ++ struct nfsd4_pnfs_layoutcommit_arg args; ++ stateid_t lc_sid; /* request */ ++ struct nfsd4_pnfs_layoutcommit_res res; ++}; ++ ++enum layoutreturn_flags { ++ LR_FLAG_INTERN 
= 1 << 0, /* internal return */ ++ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */ ++}; ++ ++struct nfsd4_pnfs_layoutreturn { ++ struct nfsd4_pnfs_layoutreturn_arg args; ++ u32 lr_flags; ++ stateid_t lr_sid; /* request/resopnse */ ++ u32 lrs_present; /* response */ ++}; ++ + struct nfsd4_op { + int opnum; + __be32 status; +@@ -426,6 +473,13 @@ struct nfsd4_op { + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; ++#if defined(CONFIG_PNFSD) ++ struct nfsd4_pnfs_getdevlist pnfs_getdevlist; ++ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo; ++ struct nfsd4_pnfs_layoutget pnfs_layoutget; ++ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit; ++ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn; ++#endif /* CONFIG_PNFSD */ + } u; + struct nfs4_replay * replay; + }; +diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-23 12:08:29.039491912 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-23 12:09:03.328501680 -0400 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -36,6 +37,7 @@ + #include "internal.h" + #include "iostat.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_FILE + +@@ -388,12 +390,17 @@ static int nfs_write_begin(struct file * + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + ++ pnfs_update_layout(mapping->host, ++ nfs_file_open_context(file), ++ 0, NFS4_MAX_UINT64, IOMODE_RW, ++ &lseg); + start: + /* + * Prevent starvation issues if someone is doing a consistency +@@ -402,17 +409,22 @@ start: + ret = wait_on_bit(&NFS_I(mapping->host)->flags, 
NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) +- return ret; ++ goto out; + + page = grab_cache_page_write_begin(mapping, index, flags); +- if (!page) +- return -ENOMEM; ++ if (!page) { ++ ret = -ENOMEM; ++ goto out; ++ } + *pagep = page; + +- ret = nfs_flush_incompatible(file, page); ++ ret = nfs_flush_incompatible(file, page, lseg); + if (ret) { + unlock_page(page); + page_cache_release(page); ++ *pagep = NULL; ++ *fsdata = NULL; ++ goto out; + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; +@@ -421,6 +433,12 @@ start: + if (!ret) + goto start; + } ++ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); ++ out: ++ if (ret) { ++ put_lseg(lseg); ++ *fsdata = NULL; ++ } + return ret; + } + +@@ -430,6 +448,7 @@ static int nfs_write_end(struct file *fi + { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, +@@ -456,10 +475,17 @@ static int nfs_write_end(struct file *fi + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + +- status = nfs_updatepage(file, page, offset, copied); ++ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); ++ status = pnfs_write_end(file, page, pos, len, copied, lseg); ++ if (status) ++ goto out; ++ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); + ++ out: + unlock_page(page); + page_cache_release(page); ++ pnfs_write_end_cleanup(file, fsdata); ++ put_lseg(lseg); + + if (status < 0) + return status; +@@ -570,6 +596,8 @@ static int nfs_vm_page_mkwrite(struct vm + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + ++ /* XXX Do we want to call pnfs_update_layout here? 
*/ ++ + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) +@@ -580,11 +608,11 @@ static int nfs_vm_page_mkwrite(struct vm + if (pagelen == 0) + goto out_unlock; + +- ret = nfs_flush_incompatible(filp, page); ++ ret = nfs_flush_incompatible(filp, page, NULL); + if (ret != 0) + goto out_unlock; + +- ret = nfs_updatepage(filp, page, 0, pagelen); ++ ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL); + out_unlock: + if (!ret) + return VM_FAULT_LOCKED; +diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-23 12:08:29.042511552 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-23 12:09:03.329501644 -0400 +@@ -48,6 +48,7 @@ + #include "internal.h" + #include "fscache.h" + #include "dns_resolve.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -278,7 +279,7 @@ nfs_fhget(struct super_block *sb, struct + */ + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { +- inode->i_fop = &nfs_file_operations; ++ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; + inode->i_data.a_ops = &nfs_file_aops; + inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; + } else if (S_ISDIR(inode->i_mode)) { +@@ -530,6 +531,68 @@ out: + return err; + } + ++static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ atomic_set(&l_ctx->count, 1); ++ l_ctx->lockowner = current->files; ++ l_ctx->pid = current->tgid; ++ INIT_LIST_HEAD(&l_ctx->list); ++} ++ ++static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) ++{ ++ struct nfs_lock_context *pos; ++ ++ list_for_each_entry(pos, &ctx->lock_context.list, list) { ++ if (pos->lockowner != current->files) ++ continue; ++ if (pos->pid != current->tgid) ++ continue; ++ atomic_inc(&pos->count); ++ return pos; ++ } ++ return NULL; ++} ++ ++struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context 
*ctx) ++{ ++ struct nfs_lock_context *res, *new = NULL; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ spin_unlock(&inode->i_lock); ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (new == NULL) ++ return NULL; ++ nfs_init_lock_context(new); ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ list_add_tail(&new->list, &ctx->lock_context.list); ++ new->open_context = ctx; ++ res = new; ++ new = NULL; ++ } ++ } ++ spin_unlock(&inode->i_lock); ++ kfree(new); ++ return res; ++} ++ ++void nfs_put_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ struct nfs_open_context *ctx = l_ctx->open_context; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) ++ return; ++ list_del(&l_ctx->list); ++ spin_unlock(&inode->i_lock); ++ kfree(l_ctx); ++} ++ + /** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context +@@ -566,11 +629,11 @@ static struct nfs_open_context *alloc_nf + path_get(&ctx->path); + ctx->cred = get_rpccred(cred); + ctx->state = NULL; +- ctx->lockowner = current->files; + ctx->flags = 0; + ctx->error = 0; + ctx->dir_cookie = 0; +- atomic_set(&ctx->count, 1); ++ nfs_init_lock_context(&ctx->lock_context); ++ ctx->lock_context.open_context = ctx; + } + return ctx; + } +@@ -578,15 +641,16 @@ static struct nfs_open_context *alloc_nf + struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) + { + if (ctx != NULL) +- atomic_inc(&ctx->count); ++ atomic_inc(&ctx->lock_context.count); + return ctx; + } ++EXPORT_SYMBOL(get_nfs_open_context); + + static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) + { + struct inode *inode = ctx->path.dentry->d_inode; + +- if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) ++ if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; 
+ list_del(&ctx->list); + spin_unlock(&inode->i_lock); +@@ -933,6 +997,7 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + } ++EXPORT_SYMBOL(nfs_fattr_init); + + struct nfs_fattr *nfs_alloc_fattr(void) + { +@@ -1142,6 +1207,14 @@ static int nfs_update_inode(struct inode + server->fsid = fattr->fsid; + + /* ++ * file needs layout commit, server attributes may be stale ++ */ ++ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) { ++ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n", ++ __func__, inode->i_sb->s_id, inode->i_ino); ++ return 0; ++ } ++ /* + * Update the read time so we don't revalidate too often. + */ + nfsi->read_cache_jiffies = fattr->time_start; +@@ -1340,9 +1413,10 @@ static int nfs_update_inode(struct inode + */ + void nfs4_clear_inode(struct inode *inode) + { ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + /* If we are holding a delegation, return it! 
*/ + nfs_inode_return_delegation_noreclaim(inode); +- /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); + } + #endif +@@ -1367,7 +1441,10 @@ struct inode *nfs_alloc_inode(struct sup + + void nfs_destroy_inode(struct inode *inode) + { +- kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); ++ struct nfs_inode *nfsi = NFS_I(inode); ++ ++ pnfs_destroy_layout(nfsi); ++ kmem_cache_free(nfs_inode_cachep, nfsi); + } + + static inline void nfs4_init_once(struct nfs_inode *nfsi) +@@ -1377,6 +1454,11 @@ static inline void nfs4_init_once(struct + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); ++#ifdef CONFIG_NFS_V4_1 ++ init_waitqueue_head(&nfsi->lo_waitq); ++ nfsi->pnfs_layout_suspend = 0; ++ nfsi->layout = NULL; ++#endif /* CONFIG_NFS_V4_1 */ + #endif + } + +@@ -1488,6 +1570,12 @@ static int __init init_nfs_fs(void) + if (err) + goto out0; + ++#ifdef CONFIG_NFS_V4_1 ++ err = pnfs_initialize(); ++ if (err) ++ goto out00; ++#endif /* CONFIG_NFS_V4_1 */ ++ + #ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); + #endif +@@ -1498,6 +1586,10 @@ out: + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++out00: ++ pnfs_uninitialize(); ++#endif /* CONFIG_NFS_V4_1 */ + nfs_destroy_directcache(); + out0: + nfs_destroy_writepagecache(); +@@ -1531,6 +1623,9 @@ static void __exit exit_nfs_fs(void) + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_uninitialize(); ++#endif + unregister_nfs_fs(); + nfs_fs_proc_exit(); + nfsiod_stop(); +diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-23 12:08:29.042511552 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-23 12:09:03.330502148 -0400 +@@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv + struct nfs_fattr *); + extern void nfs_mark_client_ready(struct nfs_client *clp, int state); + extern 
int nfs4_check_client_ready(struct nfs_client *clp); ++extern int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++ const struct sockaddr *sa2); ++extern int nfs4_set_client(struct nfs_server *server, ++ const char *hostname, ++ const struct sockaddr *addr, ++ const size_t addrlen, ++ const char *ip_addr, ++ rpc_authflavor_t authflavour, ++ int proto, const struct rpc_timeout *timeparms, ++ u32 minorversion); + #ifdef CONFIG_PROC_FS + extern int __init nfs_fs_proc_init(void); + extern void nfs_fs_proc_exit(void); +@@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead + extern struct rpc_procinfo nfs4_procedures[]; + #endif + ++extern int nfs4_recover_expired_lease(struct nfs_client *clp); ++ + /* proc.c */ + void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + +@@ -248,10 +260,31 @@ extern int nfs4_get_rootfh(struct nfs_se + #endif + + /* read.c */ ++extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); ++extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); + extern void nfs_read_prepare(struct rpc_task *task, void *calldata); + + /* write.c */ ++extern int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs); + extern void nfs_write_prepare(struct rpc_task *task, void *calldata); ++extern void nfs_mark_list_commit(struct list_head *head); + #ifdef CONFIG_MIGRATION + extern int nfs_migrate_page(struct 
address_space *, + struct page *, struct page *); +diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig +--- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-23 12:09:03.331395814 -0400 +@@ -79,10 +79,48 @@ config NFS_V4_1 + depends on NFS_V4 && EXPERIMENTAL + help + This option enables support for minor version 1 of the NFSv4 protocol +- (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. ++ (RFC5661) including support for the parallel NFS (pNFS) features ++ in the kernel's NFS client. + + Unless you're an NFS developer, say N. + ++config PNFS_FILE_LAYOUT ++ tristate "NFS client support for the pNFS nfs-files layout (DEVELOPER ONLY)" ++ depends on NFS_FS && NFS_V4_1 ++ default y ++ help ++ This option enables support for the pNFS nfs-files layout. ++ ++ Unless you're an NFS developer, say N. ++ ++config PNFS_OBJLAYOUT ++ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD ++ help ++ Say M here if you want your pNFS client to support the Objects Layout Driver. ++ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and ++ upper level driver (SCSI_OSD_ULD). ++ ++ If unsure, say N. ++ ++config PNFS_PANLAYOUT ++ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on PNFS_OBJLAYOUT ++ help ++ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver. ++ ++ If unsure, say N. ++ ++config PNFS_BLOCK ++ tristate "Provide a pNFS block client (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 ++ select MD ++ select BLK_DEV_DM ++ help ++ Say M or y here if you want your pNfs client to support the block protocol ++ ++ If unsure, say N. 
++ + config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP +diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile +--- linux-2.6.34.noarch/fs/nfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-23 12:09:03.331395814 -0400 +@@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x + delegation.o idmap.o \ + callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o ++nfs-$(CONFIG_NFS_V4_1) += pnfs.o + nfs-$(CONFIG_SYSCTL) += sysctl.o + nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o ++ ++obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o ++nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o ++ ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ ++obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-23 12:08:29.045525837 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-23 12:09:03.332511640 -0400 +@@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs3_dir_inode_operations, + .file_inode_ops = &nfs3_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-23 12:09:03.333512111 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-23 12:09:03.334491472 -0400 +@@ -0,0 +1,765 @@ ++/* ++ * linux/fs/nfs/nfs4filelayout.c ++ * ++ * Module for the pnfs nfs4 file layout driver. ++ * Defines all I/O and Policy interface operations, plus code ++ * to register itself with the pNFS client. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. 
++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfs4filelayout.h" ++#include "nfs4_fs.h" ++#include "internal.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Dean Hildebrand "); ++MODULE_DESCRIPTION("The NFSv4 file layout driver"); ++ ++/* Callback operations to the pNFS client */ ++struct pnfs_client_operations *pnfs_callback_ops; ++ ++/* Forward declaration */ ++struct layoutdriver_io_operations filelayout_io_operations; ++ ++int ++filelayout_initialize_mountpoint(struct nfs_server *nfss, ++ const struct nfs_fh *mntfh) ++{ ++ int status = nfs4_alloc_init_deviceid_cache(nfss->nfs_client, ++ nfs4_fl_free_deviceid_callback); ++ if (status) { ++ printk(KERN_WARNING "%s: deviceid cache could not be " ++ "initialized\n", __func__); ++ return status; ++ } ++ dprintk("%s: deviceid cache has been initialized successfully\n", ++ __func__); ++ return 0; ++} ++ ++/* Uninitialize a mountpoint by destroying its device list */ ++int ++filelayout_uninitialize_mountpoint(struct nfs_server *nfss) ++{ ++ dprintk("--> %s\n", __func__); ++ ++ if (nfss->pnfs_curr_ld && nfss->nfs_client->cl_devid_cache) ++ nfs4_put_deviceid_cache(nfss->nfs_client); ++ return 0; ++} ++ ++/* This function is used by the layout driver to calculate the ++ * offset of the file on the dserver based on whether the ++ * layout type is STRIPE_DENSE or STRIPE_SPARSE ++ */ ++static loff_t ++filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ ++ switch (flseg->stripe_type) { ++ case STRIPE_SPARSE: ++ return offset; ++ ++ case STRIPE_DENSE: ++ { ++ u32 stripe_width; ++ u64 tmp, off; ++ u32 unit = flseg->stripe_unit; ++ ++ stripe_width = unit * FILE_DSADDR(lseg)->stripe_count; ++ tmp = off = offset - flseg->pattern_offset; ++ 
do_div(tmp, stripe_width); ++ return tmp * unit + do_div(off, unit); ++ } ++ default: ++ BUG(); ++ } ++ ++ /* We should never get here... just to stop the gcc warning */ ++ return 0; ++} ++ ++/* ++ * Call ops for the async read/write cases ++ * In the case of dense layouts, the offset needs to be reset to its ++ * original value. ++ */ ++static void filelayout_read_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ if (rdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ rdata->args.offset, rdata->fldata.orig_offset); ++ rdata->args.offset = rdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ rdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_read_release(void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ put_lseg(rdata->pdata.lseg); ++ rdata->pdata.lseg = NULL; ++ rdata->pdata.call_ops->rpc_release(data); ++} ++ ++static void filelayout_write_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ if (wdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ wdata->args.offset, wdata->fldata.orig_offset); ++ wdata->args.offset = wdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_write_release(void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ put_lseg(wdata->pdata.lseg); ++ wdata->pdata.lseg = NULL; ++ wdata->pdata.call_ops->rpc_release(data); ++} ++ ++struct rpc_call_ops filelayout_read_call_ops = { ++ .rpc_call_prepare = nfs_read_prepare, ++ .rpc_call_done = filelayout_read_call_done, ++ .rpc_release = filelayout_read_release, ++}; ++ ++struct rpc_call_ops filelayout_write_call_ops = { ++ .rpc_call_prepare = 
nfs_write_prepare, ++ .rpc_call_done = filelayout_write_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* Perform sync or async reads. ++ * ++ * An optimization for the NFS file layout driver ++ * allows the original read/write data structs to be passed in the ++ * last argument. ++ * ++ * TODO: join with write_pagelist? ++ */ ++static enum pnfs_try_status ++filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n", ++ __func__, data->inode->i_ino, nr_pages, ++ data->args.pgbase, (size_t)data->args.count, offset); ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s USE DS:ip %x %s\n", __func__, ++ htonl(ds->ds_ip_addr), ds->r_addr); ++ ++ /* just try the first data server for the index..*/ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ ++ /* ++ * Now get the file offset on the dserver ++ * Set the read offset to this offset, and ++ * save the original offset in orig_offset ++ * In the case of aync reads, the offset will be reset in the ++ * call_ops->rpc_call_done() routine. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* Perform an asynchronous read */ ++ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_read_call_ops); ++ ++ data->pdata.pnfs_error = 0; ++ ++ return PNFS_ATTEMPTED; ++} ++ ++/* Perform async writes. 
*/ ++static enum pnfs_try_status ++filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu %s\n", __func__, ++ data->inode->i_ino, sync, (size_t) data->args.count, offset, ++ htonl(ds->ds_ip_addr), ntohs(ds->ds_port), ds->r_addr); ++ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ /* ++ * Get the file offset on the dserver. Set the write offset to ++ * this offset and save the original offset. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* ++ * Perform an asynchronous write The offset will be reset in the ++ * call_ops->rpc_call_done() routine ++ */ ++ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_write_call_ops, sync); ++ ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++} ++ ++/* ++ * Create a filelayout layout structure and return it. The pNFS client ++ * will use the pnfs_layout_type type to refer to the layout for this ++ * inode from now on. ++ */ ++static struct pnfs_layout_type * ++filelayout_alloc_layout(struct inode *inode) ++{ ++ struct nfs4_filelayout *flp; ++ ++ dprintk("NFS_FILELAYOUT: allocating layout\n"); ++ flp = kzalloc(sizeof(struct nfs4_filelayout), GFP_KERNEL); ++ return flp ? 
&flp->fl_layout : NULL; ++} ++ ++/* Free a filelayout layout structure */ ++static void ++filelayout_free_layout(struct pnfs_layout_type *lo) ++{ ++ dprintk("NFS_FILELAYOUT: freeing layout\n"); ++ kfree(FILE_LO(lo)); ++} ++ ++/* ++ * filelayout_check_layout() ++ * ++ * Make sure layout segment parameters are sane WRT the device. ++ * ++ * Notes: ++ * 1) current code insists that # stripe index = # data servers in ds_list ++ * which is wrong. ++ * 2) pattern_offset is ignored and must == 0 which is wrong; ++ * 3) the pattern_offset needs to be a mutliple of the stripe unit. ++ * 4) stripe unit is multiple of page size ++ */ ++ ++static int ++filelayout_check_layout(struct pnfs_layout_type *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs4_filelayout_segment *fl = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ int status = -EINVAL; ++ struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo)); ++ ++ dprintk("--> %s\n", __func__); ++ dsaddr = nfs4_pnfs_device_item_find(nfss->nfs_client, &fl->dev_id); ++ if (dsaddr == NULL) { ++ dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id); ++ if (dsaddr == NULL) { ++ dprintk("%s NO device for dev_id %s\n", ++ __func__, deviceid_fmt(&fl->dev_id)); ++ goto out; ++ } ++ } ++ if (fl->first_stripe_index < 0 || ++ fl->first_stripe_index > dsaddr->stripe_count) { ++ dprintk("%s Bad first_stripe_index %d\n", ++ __func__, fl->first_stripe_index); ++ goto out; ++ } ++ ++ if (fl->pattern_offset != 0) { ++ dprintk("%s Unsupported no-zero pattern_offset %Ld\n", ++ __func__, fl->pattern_offset); ++ goto out; ++ } ++ ++ if (fl->stripe_unit % PAGE_SIZE) { ++ dprintk("%s Stripe unit (%u) not page aligned\n", ++ __func__, fl->stripe_unit); ++ goto out; ++ } ++ ++ /* XXX only support SPARSE packing. 
Don't support use MDS open fh */ ++ if (!(fl->num_fh == 1 || fl->num_fh == dsaddr->ds_num)) { ++ dprintk("%s num_fh %u not equal to 1 or ds_num %u\n", ++ __func__, fl->num_fh, dsaddr->ds_num); ++ goto out; ++ } ++ ++ if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { ++ dprintk("%s Stripe unit (%u) not aligned with rsize %u " ++ "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, ++ nfss->wsize); ++ } ++ ++ /* reference the device */ ++ nfs4_set_layout_deviceid(lseg, &dsaddr->deviceid); ++ ++ status = 0; ++out: ++ dprintk("--> %s returns %d\n", __func__, status); ++ return status; ++} ++ ++static void _filelayout_free_lseg(struct pnfs_layout_segment *lseg); ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl); ++ ++/* Decode layout and store in layoutid. Overwrite any existing layout ++ * information for this file. ++ */ ++static int ++filelayout_set_layout(struct nfs4_filelayout *flo, ++ struct nfs4_filelayout_segment *fl, ++ struct nfs4_pnfs_layoutget_res *lgr) ++{ ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t nfl_util; ++ int i; ++ ++ dprintk("%s: set_layout_map Begin\n", __func__); ++ ++ memcpy(&fl->dev_id, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ nfl_util = be32_to_cpup(p++); ++ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) ++ fl->commit_through_mds = 1; ++ if (nfl_util & NFL4_UFLG_DENSE) ++ fl->stripe_type = STRIPE_DENSE; ++ else ++ fl->stripe_type = STRIPE_SPARSE; ++ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; ++ ++ if (!flo->stripe_unit) ++ flo->stripe_unit = fl->stripe_unit; ++ else if (flo->stripe_unit != fl->stripe_unit) { ++ printk(KERN_NOTICE "%s: updating strip_unit from %u to %u\n", ++ __func__, flo->stripe_unit, fl->stripe_unit); ++ flo->stripe_unit = fl->stripe_unit; ++ } ++ ++ fl->first_stripe_index = be32_to_cpup(p++); ++ p = xdr_decode_hyper(p, &fl->pattern_offset); ++ fl->num_fh = be32_to_cpup(p++); ++ ++ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po 
%llu dev_id %s\n", ++ __func__, nfl_util, fl->num_fh, fl->first_stripe_index, ++ fl->pattern_offset, deviceid_fmt(&fl->dev_id)); ++ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) { ++ fl->fh_array = vmalloc(fl->num_fh * sizeof(struct nfs_fh)); ++ if (fl->fh_array) ++ memset(fl->fh_array, 0, ++ fl->num_fh * sizeof(struct nfs_fh)); ++ } else { ++ fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh), ++ GFP_KERNEL); ++ } ++ if (!fl->fh_array) ++ return -ENOMEM; ++ ++ for (i = 0; i < fl->num_fh; i++) { ++ /* fh */ ++ fl->fh_array[i].size = be32_to_cpup(p++); ++ if (sizeof(struct nfs_fh) < fl->fh_array[i].size) { ++ printk(KERN_ERR "Too big fh %d received %d\n", ++ i, fl->fh_array[i].size); ++ /* Layout is now invalid, pretend it doesn't exist */ ++ filelayout_free_fh_array(fl); ++ fl->num_fh = 0; ++ break; ++ } ++ memcpy(fl->fh_array[i].data, p, fl->fh_array[i].size); ++ p += XDR_QUADLEN(fl->fh_array[i].size); ++ dprintk("DEBUG: %s: fh len %d\n", __func__, ++ fl->fh_array[i].size); ++ } ++ ++ return 0; ++} ++ ++static struct pnfs_layout_segment * ++filelayout_alloc_lseg(struct pnfs_layout_type *layoutid, ++ struct nfs4_pnfs_layoutget_res *lgr) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(layoutid); ++ struct pnfs_layout_segment *lseg; ++ int rc; ++ ++ dprintk("--> %s\n", __func__); ++ lseg = kzalloc(sizeof(struct pnfs_layout_segment) + ++ sizeof(struct nfs4_filelayout_segment), GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ ++ rc = filelayout_set_layout(flo, LSEG_LD_DATA(lseg), lgr); ++ ++ if (rc != 0 || filelayout_check_layout(layoutid, lseg)) { ++ _filelayout_free_lseg(lseg); ++ lseg = NULL; ++ } ++ return lseg; ++} ++ ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) ++{ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) ++ vfree(fl->fh_array); ++ else ++ kfree(fl->fh_array); ++ ++ fl->fh_array = NULL; ++} ++ ++static void ++_filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ 
filelayout_free_fh_array(LSEG_LD_DATA(lseg)); ++ kfree(lseg); ++} ++ ++static void ++filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("--> %s\n", __func__); ++ nfs4_unset_layout_deviceid(lseg, lseg->deviceid, ++ nfs4_fl_free_deviceid_callback); ++ _filelayout_free_lseg(lseg); ++} ++ ++/* Allocate a new nfs_write_data struct and initialize */ ++static struct nfs_write_data * ++filelayout_clone_write_data(struct nfs_write_data *old) ++{ ++ static struct nfs_write_data *new; ++ ++ new = nfs_commitdata_alloc(); ++ if (!new) ++ goto out; ++ kref_init(&new->refcount); ++ new->parent = old; ++ kref_get(&old->refcount); ++ new->inode = old->inode; ++ new->cred = old->cred; ++ new->args.offset = 0; ++ new->args.count = 0; ++ new->res.count = 0; ++ new->res.fattr = &new->fattr; ++ nfs_fattr_init(&new->fattr); ++ new->res.verf = &new->verf; ++ new->args.context = get_nfs_open_context(old->args.context); ++ new->pdata.lseg = NULL; ++ new->pdata.call_ops = old->pdata.call_ops; ++ new->pdata.how = old->pdata.how; ++out: ++ return new; ++} ++ ++static void filelayout_commit_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static struct rpc_call_ops filelayout_commit_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ .rpc_call_done = filelayout_commit_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* ++ * Execute a COMMIT op to the MDS or to each data server on which a page ++ * in 'pages' exists. ++ * Invoke the pnfs_commit_complete callback. 
++ */ ++enum pnfs_try_status ++filelayout_commit(struct nfs_write_data *data, int sync) ++{ ++ LIST_HEAD(head); ++ struct nfs_page *req; ++ loff_t file_offset = 0; ++ u16 idx, i; ++ struct list_head **ds_page_list = NULL; ++ u16 *indices_used; ++ int num_indices_seen = 0; ++ const struct rpc_call_ops *call_ops; ++ struct rpc_clnt *clnt; ++ struct nfs_write_data **clone_list = NULL; ++ struct nfs_write_data *dsdata; ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("%s data %p sync %d\n", __func__, data, sync); ++ ++ /* Alloc room for both in one go */ ++ ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) * ++ (sizeof(u16) + sizeof(struct list_head *)), ++ GFP_KERNEL); ++ if (!ds_page_list) ++ goto mem_error; ++ indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1); ++ /* ++ * Sort pages based on which ds to send to. ++ * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT. ++ * Note we are assuming there is only a single lseg in play. ++ * When that is not true, we could first sort on lseg, then ++ * sort within each as we do here. 
++ */ ++ while (!list_empty(&data->pages)) { ++ req = nfs_list_entry(data->pages.next); ++ nfs_list_remove_request(req); ++ if (!req->wb_lseg || ++ ((struct nfs4_filelayout_segment *) ++ LSEG_LD_DATA(req->wb_lseg))->commit_through_mds) ++ idx = NFS4_PNFS_MAX_MULTI_CNT; ++ else { ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset); ++ } ++ if (ds_page_list[idx]) { ++ /* Already seen this idx */ ++ list_add(&req->wb_list, ds_page_list[idx]); ++ } else { ++ /* New idx not seen so far */ ++ list_add_tail(&req->wb_list, &head); ++ indices_used[num_indices_seen++] = idx; ++ } ++ ds_page_list[idx] = &req->wb_list; ++ } ++ /* Once created, clone must be released via call_op */ ++ clone_list = kzalloc(num_indices_seen * ++ sizeof(struct nfs_write_data *), GFP_KERNEL); ++ if (!clone_list) ++ goto mem_error; ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ clone_list[i] = filelayout_clone_write_data(data); ++ if (!clone_list[i]) ++ goto mem_error; ++ } ++ clone_list[i] = data; ++ /* ++ * Now send off the RPCs to each ds. Note that it is important ++ * that any RPC to the MDS be sent last (or at least after all ++ * clones have been made.) 
++ */ ++ for (i = 0; i < num_indices_seen; i++) { ++ dsdata = clone_list[i]; ++ idx = indices_used[i]; ++ list_cut_position(&dsdata->pages, &head, ds_page_list[idx]); ++ if (idx == NFS4_PNFS_MAX_MULTI_CNT) { ++ call_ops = data->pdata.call_ops;; ++ clnt = NFS_CLIENT(dsdata->inode); ++ ds = NULL; ++ } else { ++ struct nfs_fh *fh; ++ ++ call_ops = &filelayout_commit_call_ops; ++ req = nfs_list_entry(dsdata->pages.next); ++ ds = nfs4_fl_prepare_ds(req->wb_lseg, idx); ++ if (!ds) { ++ /* Trigger retry of this chunk through MDS */ ++ dsdata->task.tk_status = -EIO; ++ data->pdata.call_ops->rpc_release(dsdata); ++ continue; ++ } ++ clnt = ds->ds_clp->cl_rpcclient; ++ dsdata->fldata.ds_nfs_client = ds->ds_clp; ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset); ++ if (fh) ++ dsdata->args.fh = fh; ++ } ++ dprintk("%s: Initiating commit: %llu USE DS:\n", ++ __func__, file_offset); ++ print_ds(ds); ++ ++ /* Send COMMIT to data server */ ++ nfs_initiate_commit(dsdata, clnt, call_ops, sync); ++ } ++ kfree(clone_list); ++ kfree(ds_page_list); ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++ ++ mem_error: ++ if (clone_list) { ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ if (!clone_list[i]) ++ break; ++ data->pdata.call_ops->rpc_release(clone_list[i]); ++ } ++ kfree(clone_list); ++ } ++ kfree(ds_page_list); ++ /* One of these will be empty, but doesn't hurt to do both */ ++ nfs_mark_list_commit(&head); ++ nfs_mark_list_commit(&data->pages); ++ data->pdata.call_ops->rpc_release(data); ++ return PNFS_ATTEMPTED; ++} ++ ++/* Return the stripesize for the specified file */ ++ssize_t ++filelayout_get_stripesize(struct pnfs_layout_type *layoutid) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(layoutid); ++ ++ return flo->stripe_unit; ++} ++ ++/* ++ * filelayout_pg_test(). 
Called by nfs_can_coalesce_requests() ++ * ++ * return 1 : coalesce page ++ * return 0 : don't coalesce page ++ */ ++int ++filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ u64 p_stripe, r_stripe; ++ ++ if (pgio->pg_boundary == 0) ++ return 1; ++ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; ++ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; ++ ++ do_div(p_stripe, pgio->pg_boundary); ++ do_div(r_stripe, pgio->pg_boundary); ++ ++ return (p_stripe == r_stripe); ++} ++ ++struct layoutdriver_io_operations filelayout_io_operations = { ++ .commit = filelayout_commit, ++ .read_pagelist = filelayout_read_pagelist, ++ .write_pagelist = filelayout_write_pagelist, ++ .alloc_layout = filelayout_alloc_layout, ++ .free_layout = filelayout_free_layout, ++ .alloc_lseg = filelayout_alloc_lseg, ++ .free_lseg = filelayout_free_lseg, ++ .initialize_mountpoint = filelayout_initialize_mountpoint, ++ .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, ++}; ++ ++struct layoutdriver_policy_operations filelayout_policy_operations = { ++ .flags = PNFS_USE_RPC_CODE, ++ .get_stripesize = filelayout_get_stripesize, ++ .pg_test = filelayout_pg_test, ++}; ++ ++struct pnfs_layoutdriver_type filelayout_type = { ++ .id = LAYOUT_NFSV4_1_FILES, ++ .name = "LAYOUT_NFSV4_1_FILES", ++ .ld_io_ops = &filelayout_io_operations, ++ .ld_policy_ops = &filelayout_policy_operations, ++}; ++ ++static int __init nfs4filelayout_init(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", ++ __func__); ++ ++ /* ++ * Need to register file_operations struct with global list to indicate ++ * that NFS4 file layout is a possible pNFS I/O module ++ */ ++ pnfs_callback_ops = pnfs_register_layoutdriver(&filelayout_type); ++ ++ return 0; ++} ++ ++static void __exit nfs4filelayout_exit(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", ++ __func__); ++ ++ /* Unregister NFS4 file layout driver 
with pNFS client*/ ++ pnfs_unregister_layoutdriver(&filelayout_type); ++} ++ ++module_init(nfs4filelayout_init); ++module_exit(nfs4filelayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-23 12:09:03.334491472 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-23 12:09:03.335501543 -0400 +@@ -0,0 +1,636 @@ ++/* ++ * linux/fs/nfs/nfs4filelayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 file layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * Garth Goodson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include "nfs4filelayout.h" ++#include "internal.h" ++#include "nfs4_fs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++DEFINE_SPINLOCK(nfs4_ds_cache_lock); ++static LIST_HEAD(nfs4_data_server_cache); ++ ++void ++print_ds(struct nfs4_pnfs_ds *ds) ++{ ++ if (ds == NULL) { ++ dprintk("%s NULL device \n", __func__); ++ return; ++ } ++ dprintk(" ip_addr %x\n", ntohl(ds->ds_ip_addr)); ++ dprintk(" port %hu\n", ntohs(ds->ds_port)); ++ dprintk(" client %p\n", ds->ds_clp); ++ dprintk(" ref count %d\n", atomic_read(&ds->ds_count)); ++ if (ds->ds_clp) ++ dprintk(" cl_exchange_flags %x\n", ++ ds->ds_clp->cl_exchange_flags); ++ dprintk(" ip:port %s\n", ds->r_addr); ++} ++ ++void ++print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ int i; ++ ++ dprintk("%s dsaddr->ds_num %d\n", __func__, ++ dsaddr->ds_num); ++ for (i = 0; i < dsaddr->ds_num; i++) ++ print_ds(dsaddr->ds_list[i]); ++} ++ ++/* Debugging function assuming a 64bit major/minor split of the deviceid */ ++char * ++deviceid_fmt(const struct pnfs_deviceid *dev_id) ++{ ++ static char buf[17]; ++ uint32_t *p = (uint32_t *)dev_id->data; ++ uint64_t major, minor; ++ ++ p = xdr_decode_hyper(p, &major); ++ p = xdr_decode_hyper(p, &minor); ++ ++ sprintf(buf, "%08llu %08llu", major, minor); ++ return buf; ++} ++ ++/* nfs4_ds_cache_lock is held */ ++static inline 
struct nfs4_pnfs_ds * ++_data_server_lookup(u32 ip_addr, u32 port) ++{ ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", ++ ntohl(ip_addr), ntohs(port)); ++ ++ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { ++ if (ds->ds_ip_addr == ip_addr && ++ ds->ds_port == port) { ++ return ds; ++ } ++ } ++ return NULL; ++} ++ ++/* Create an rpc to the data server defined in 'dev_list' */ ++static int ++nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) ++{ ++ struct nfs_server *tmp; ++ struct sockaddr_in sin; ++ struct rpc_clnt *mds_clnt = mds_srv->client; ++ struct nfs_client *clp = mds_srv->nfs_client; ++ struct sockaddr *mds_addr; ++ int err = 0; ++ ++ dprintk("--> %s ip:port %s au_flavor %d\n", __func__, ++ ds->r_addr, mds_clnt->cl_auth->au_flavor); ++ ++ sin.sin_family = AF_INET; ++ sin.sin_addr.s_addr = ds->ds_ip_addr; ++ sin.sin_port = ds->ds_port; ++ ++ /* ++ * If this DS is also the MDS, use the MDS session only if the ++ * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role. ++ */ ++ mds_addr = (struct sockaddr *)&clp->cl_addr; ++ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) { ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data " ++ "Server\n", ds->r_addr); ++ err = -ENODEV; ++ } else { ++ atomic_inc(&clp->cl_count); ++ ds->ds_clp = clp; ++ dprintk("%s Using MDS Session for DS\n", __func__); ++ } ++ goto out; ++ } ++ ++ /* Temporay server for nfs4_set_client */ ++ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); ++ if (!tmp) ++ goto out; ++ ++ /* ++ * Set a retrans, timeout interval, and authflavor equual to the MDS ++ * values. Use the MDS nfs_client cl_ipaddr field so as to use the ++ * same co_ownerid as the MDS. 
++ */ ++ err = nfs4_set_client(tmp, ++ mds_srv->nfs_client->cl_hostname, ++ (struct sockaddr *)&sin, ++ sizeof(struct sockaddr), ++ mds_srv->nfs_client->cl_ipaddr, ++ mds_clnt->cl_auth->au_flavor, ++ IPPROTO_TCP, ++ mds_clnt->cl_xprt->timeout, ++ 1 /* minorversion */); ++ if (err < 0) ++ goto out_free; ++ ++ clp = tmp->nfs_client; ++ ++ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */ ++ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp); ++ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS; ++ ++ err = nfs4_recover_expired_lease(clp); ++ if (!err) ++ err = nfs4_check_client_ready(clp); ++ if (err) ++ goto out_put; ++ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data Server\n", ++ ds->r_addr); ++ err = -ENODEV; ++ goto out_put; ++ } ++ /* ++ * Mask the (possibly) returned EXCHGID4_FLAG_USE_PNFS_MDS pNFS role ++ * The is_ds_only_session depends on this. ++ */ ++ clp->cl_exchange_flags &= ~EXCHGID4_FLAG_USE_PNFS_MDS; ++ /* ++ * Set DS lease equal to the MDS lease, renewal is scheduled in ++ * create_session ++ */ ++ spin_lock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; ++ spin_unlock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_last_renewal = jiffies; ++ ++ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); ++ ds->ds_clp = clp; ++ ++ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), ++ clp->cl_rpcclient); ++out_free: ++ kfree(tmp); ++out: ++ dprintk("%s Returns %d\n", __func__, err); ++ return err; ++out_put: ++ nfs_put_client(clp); ++ goto out_free; ++} ++ ++static void ++destroy_ds(struct nfs4_pnfs_ds *ds) ++{ ++ dprintk("--> %s\n", __func__); ++ print_ds(ds); ++ ++ if (ds->ds_clp) ++ nfs_put_client(ds->ds_clp); ++ kfree(ds); ++} ++ ++static void ++nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ struct nfs4_pnfs_ds *ds; ++ int i; ++ ++ dprintk("%s: device id=%s\n", 
__func__, ++ deviceid_fmt(&dsaddr->deviceid.de_id)); ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ ds = dsaddr->ds_list[i]; ++ if (ds != NULL) { ++ if (atomic_dec_and_lock(&ds->ds_count, ++ &nfs4_ds_cache_lock)) { ++ list_del_init(&ds->ds_node); ++ spin_unlock(&nfs4_ds_cache_lock); ++ destroy_ds(ds); ++ } ++ } ++ } ++ kfree(dsaddr->stripe_indices); ++ kfree(dsaddr); ++} ++ ++void ++nfs4_fl_free_deviceid_callback(struct kref *kref) ++{ ++ struct nfs4_deviceid *device = ++ container_of(kref, struct nfs4_deviceid, de_kref); ++ struct nfs4_file_layout_dsaddr *dsaddr = ++ container_of(device, struct nfs4_file_layout_dsaddr, deviceid); ++ ++ nfs4_fl_free_deviceid(dsaddr); ++} ++ ++static void ++nfs4_pnfs_ds_add(struct inode *inode, struct nfs4_pnfs_ds **dsp, ++ u32 ip_addr, u32 port, char *r_addr, int len) ++{ ++ struct nfs4_pnfs_ds *tmp_ds, *ds; ++ ++ *dsp = NULL; ++ ++ ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); ++ if (!ds) ++ return; ++ ++ spin_lock(&nfs4_ds_cache_lock); ++ tmp_ds = _data_server_lookup(ip_addr, port); ++ if (tmp_ds == NULL) { ++ ds->ds_ip_addr = ip_addr; ++ ds->ds_port = port; ++ strncpy(ds->r_addr, r_addr, len); ++ atomic_set(&ds->ds_count, 1); ++ INIT_LIST_HEAD(&ds->ds_node); ++ ds->ds_clp = NULL; ++ list_add(&ds->ds_node, &nfs4_data_server_cache); ++ *dsp = ds; ++ dprintk("%s add new data server ip 0x%x\n", __func__, ++ ds->ds_ip_addr); ++ spin_unlock(&nfs4_ds_cache_lock); ++ } else { ++ atomic_inc(&tmp_ds->ds_count); ++ *dsp = tmp_ds; ++ dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", ++ __func__, tmp_ds->ds_ip_addr, ++ atomic_read(&tmp_ds->ds_count)); ++ spin_unlock(&nfs4_ds_cache_lock); ++ kfree(ds); ++ } ++} ++ ++static struct nfs4_pnfs_ds * ++decode_and_add_ds(uint32_t **pp, struct inode *inode) ++{ ++ struct nfs4_pnfs_ds *ds = NULL; ++ char r_addr[29]; /* max size of ip/port string */ ++ int len; ++ u32 ip_addr, port; ++ int tmp[6]; ++ uint32_t *p = *pp; ++ ++ dprintk("%s enter\n", __func__); ++ /* check and skip r_netid */ 
++ len = be32_to_cpup(p++); ++ /* "tcp" */ ++ if (len != 3) { ++ printk("%s: ERROR: non TCP r_netid len %d\n", ++ __func__, len); ++ goto out_err; ++ } ++ /* ++ * Read the bytes into a temporary buffer ++ * XXX: should probably sanity check them ++ */ ++ tmp[0] = be32_to_cpup(p++); ++ ++ len = be32_to_cpup(p++); ++ if (len >= sizeof(r_addr)) { ++ printk("%s: ERROR: Device ip/port too long (%d)\n", ++ __func__, len); ++ goto out_err; ++ } ++ memcpy(r_addr, p, len); ++ p += XDR_QUADLEN(len); ++ *pp = p; ++ r_addr[len] = '\0'; ++ sscanf(r_addr, "%d.%d.%d.%d.%d.%d", &tmp[0], &tmp[1], ++ &tmp[2], &tmp[3], &tmp[4], &tmp[5]); ++ ip_addr = htonl((tmp[0]<<24) | (tmp[1]<<16) | (tmp[2]<<8) | (tmp[3])); ++ port = htons((tmp[4] << 8) | (tmp[5])); ++ ++ nfs4_pnfs_ds_add(inode, &ds, ip_addr, port, r_addr, len); ++ ++ dprintk("%s: addr:port string = %s\n", __func__, r_addr); ++ return ds; ++out_err: ++ dprintk("%s returned NULL\n", __func__); ++ return NULL; ++} ++ ++/* Decode opaque device data and return the result */ ++static struct nfs4_file_layout_dsaddr* ++decode_device(struct inode *ino, struct pnfs_device *pdev) ++{ ++ int i, dummy; ++ u32 cnt, num; ++ u8 *indexp; ++ uint32_t *p = (u32 *)pdev->area, *indicesp; ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ /* Get the stripe count (number of stripe index) */ ++ cnt = be32_to_cpup(p++); ++ dprintk("%s stripe count %d\n", __func__, cnt); ++ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { ++ printk(KERN_WARNING "%s: stripe count %d greater than " ++ "supported maximum %d\n", __func__, ++ cnt, NFS4_PNFS_MAX_STRIPE_CNT); ++ goto out_err; ++ } ++ ++ /* Check the multipath list count */ ++ indicesp = p; ++ p += XDR_QUADLEN(cnt << 2); ++ num = be32_to_cpup(p++); ++ dprintk("%s ds_num %u\n", __func__, num); ++ if (num > NFS4_PNFS_MAX_MULTI_CNT) { ++ printk(KERN_WARNING "%s: multipath count %d greater than " ++ "supported maximum %d\n", __func__, ++ num, NFS4_PNFS_MAX_MULTI_CNT); ++ goto out_err; ++ } ++ dsaddr = kzalloc(sizeof(*dsaddr) + ++ 
(sizeof(struct nfs4_pnfs_ds *) * (num - 1)), ++ GFP_KERNEL); ++ if (!dsaddr) ++ goto out_err; ++ ++ dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); ++ if (!dsaddr->stripe_indices) ++ goto out_err_free; ++ ++ dsaddr->stripe_count = cnt; ++ dsaddr->ds_num = num; ++ ++ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ ++ /* Go back an read stripe indices */ ++ p = indicesp; ++ indexp = &dsaddr->stripe_indices[0]; ++ for (i = 0; i < dsaddr->stripe_count; i++) { ++ dummy = be32_to_cpup(p++); ++ *indexp = dummy; /* bound by NFS4_PNFS_MAX_MULTI_CNT */ ++ indexp++; ++ } ++ /* Skip already read multipath list count */ ++ p++; ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ int j; ++ ++ dummy = be32_to_cpup(p++); /* multipath count */ ++ if (dummy > 1) { ++ printk(KERN_WARNING ++ "%s: Multipath count %d not supported, " ++ "skipping all greater than 1\n", __func__, ++ dummy); ++ } ++ for (j = 0; j < dummy; j++) { ++ if (j == 0) { ++ dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); ++ if (dsaddr->ds_list[i] == NULL) ++ goto out_err_free; ++ } else { ++ u32 len; ++ /* skip extra multipath */ ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ continue; ++ } ++ } ++ } ++ nfs4_init_deviceid_node(&dsaddr->deviceid); ++ ++ return dsaddr; ++ ++out_err_free: ++ nfs4_fl_free_deviceid(dsaddr); ++out_err: ++ dprintk("%s ERROR: returning NULL\n", __func__); ++ return NULL; ++} ++ ++/* ++ * Decode the opaque device specified in 'dev' ++ * and add it to the list of available devices. ++ * If the deviceid is already cached, nfs4_add_deviceid will return ++ * a pointer to the cached struct and throw away the new. 
++ */ ++static struct nfs4_file_layout_dsaddr* ++decode_and_add_device(struct inode *inode, struct pnfs_device *dev) ++{ ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ struct nfs4_deviceid *d; ++ ++ dsaddr = decode_device(inode, dev); ++ if (!dsaddr) { ++ printk(KERN_WARNING "%s: Could not decode or add device\n", ++ __func__); ++ return NULL; ++ } ++ ++ d = nfs4_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, ++ &dsaddr->deviceid); ++ ++ return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Retrieve the information for dev_id, add it to the list ++ * of available devices, and return it. ++ */ ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id) ++{ ++ struct pnfs_device *pdev = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ struct nfs4_file_layout_dsaddr *dsaddr = NULL; ++ int rc, i; ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s inode %p max_resp_sz %u max_pages %d\n", ++ __func__, inode, max_resp_sz, max_pages); ++ ++ pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); ++ if (pdev == NULL) ++ return NULL; ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(pdev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set pdev->area */ ++ pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!pdev->area) ++ goto out_free; ++ ++ memcpy(&pdev->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE); ++ pdev->layout_type = LAYOUT_NFSV4_1_FILES; ++ pdev->pages = pages; ++ pdev->pgbase = 0; ++ pdev->pglen = PAGE_SIZE * max_pages; ++ pdev->mincount = 0; ++ /* TODO: Update 
types when CB_NOTIFY_DEVICEID is available */ ++ pdev->dev_notify_types = 0; ++ ++ rc = pnfs_callback_ops->nfs_getdeviceinfo(server, pdev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ /* ++ * Found new device, need to decode it and then add it to the ++ * list of known devices for this mountpoint. ++ */ ++ dsaddr = decode_and_add_device(inode, pdev); ++out_free: ++ if (pdev->area != NULL) ++ vunmap(pdev->area); ++ for (i = 0; i < max_pages; i++) ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(pdev); ++ dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); ++ return dsaddr; ++} ++ ++struct nfs4_file_layout_dsaddr * ++nfs4_pnfs_device_item_find(struct nfs_client *clp, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ ++ d = nfs4_find_deviceid(clp->cl_devid_cache, id); ++ dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__, ++ deviceid_fmt(id), d); ++ return (d == NULL) ? NULL : ++ container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit ++ * Then: ((res + fsi) % dsaddr->stripe_count) ++ */ ++static inline u32 ++_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u64 tmp; ++ ++ tmp = offset - flseg->pattern_offset; ++ do_div(tmp, flseg->stripe_unit); ++ tmp += flseg->first_stripe_index; ++ return do_div(tmp, FILE_DSADDR(lseg)->stripe_count); ++} ++ ++u32 ++nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ u32 j; ++ ++ j = _nfs4_fl_calc_j_index(lseg, offset); ++ return FILE_DSADDR(lseg)->stripe_indices[j]; ++} ++ ++struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u32 i; ++ ++ if (flseg->stripe_type == STRIPE_SPARSE) { ++ if (flseg->num_fh == 1) ++ i = 0; ++ else if (flseg->num_fh == 0) ++ return 
NULL; ++ else ++ i = nfs4_fl_calc_ds_index(lseg, offset); ++ } else ++ i = _nfs4_fl_calc_j_index(lseg, offset); ++ return &flseg->fh_array[i]; ++} ++ ++struct nfs4_pnfs_ds * ++nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ dsaddr = FILE_DSADDR(lseg); ++ if (dsaddr->ds_list[ds_idx] == NULL) { ++ printk(KERN_ERR "%s: No data server for device id (%s)!!\n", ++ __func__, deviceid_fmt(&flseg->dev_id)); ++ return NULL; ++ } ++ ++ if (!dsaddr->ds_list[ds_idx]->ds_clp) { ++ int err; ++ ++ err = nfs4_pnfs_ds_create(PNFS_NFS_SERVER(lseg->layout), ++ dsaddr->ds_list[ds_idx]); ++ if (err) { ++ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n", ++ __func__, err); ++ return NULL; ++ } ++ } ++ dprintk("%s: dev_id=%s, ds_idx=%u\n", ++ __func__, deviceid_fmt(&flseg->dev_id), ds_idx); ++ ++ return dsaddr->ds_list[ds_idx]; ++} ++ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-23 12:09:03.335501543 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-23 12:09:03.335501543 -0400 +@@ -0,0 +1,97 @@ ++/* ++ * pnfs_nfs4filelayout.h ++ * ++ * NFSv4 file layout driver data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_NFS4FILELAYOUT_H ++#define FS_NFS_NFS4FILELAYOUT_H ++ ++#include ++#include ++#include ++ ++#define NFS4_PNFS_DEV_HASH_BITS 5 ++#define NFS4_PNFS_DEV_HASH_SIZE (1 << NFS4_PNFS_DEV_HASH_BITS) ++#define NFS4_PNFS_DEV_HASH_MASK (NFS4_PNFS_DEV_HASH_SIZE - 1) ++ ++#define NFS4_PNFS_MAX_STRIPE_CNT 4096 ++#define NFS4_PNFS_MAX_MULTI_CNT 64 /* 256 fit into a u8 stripe_index */ ++#define NFS4_PNFS_MAX_MULTI_DS 2 ++ ++#define FILE_DSADDR(lseg) (container_of(lseg->deviceid, \ ++ struct nfs4_file_layout_dsaddr, \ ++ deviceid)) ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++/* Individual ip address */ ++struct nfs4_pnfs_ds { ++ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ ++ u32 ds_ip_addr; ++ u32 ds_port; ++ struct nfs_client *ds_clp; ++ atomic_t ds_count; ++ char r_addr[29]; ++}; ++ ++struct nfs4_file_layout_dsaddr { ++ struct nfs4_deviceid deviceid; ++ u32 stripe_count; ++ u8 *stripe_indices; ++ u32 ds_num; ++ struct nfs4_pnfs_ds *ds_list[1]; ++}; ++ ++struct nfs4_pnfs_dev_hlist { ++ rwlock_t dev_lock; ++ struct hlist_head dev_list[NFS4_PNFS_DEV_HASH_SIZE]; ++}; ++ ++struct nfs4_filelayout_segment { ++ u32 stripe_type; ++ u32 commit_through_mds; ++ u32 stripe_unit; ++ u32 first_stripe_index; ++ u64 pattern_offset; ++ struct pnfs_deviceid dev_id; ++ unsigned int num_fh; ++ struct nfs_fh *fh_array; ++}; ++ ++struct nfs4_filelayout { ++ struct pnfs_layout_type fl_layout; ++ u32 stripe_unit; ++}; ++ ++extern struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset); ++ ++static inline struct nfs4_filelayout * ++FILE_LO(struct pnfs_layout_type *lo) ++{ ++ return container_of(lo, struct nfs4_filelayout, fl_layout); ++} ++ ++extern struct pnfs_client_operations *pnfs_callback_ops; ++ ++extern void nfs4_fl_free_deviceid_callback(struct kref *); ++extern void print_ds(struct nfs4_pnfs_ds *ds); ++char *deviceid_fmt(const struct pnfs_deviceid *dev_id); ++u32 
nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset); ++struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, ++ u32 ds_idx); ++extern struct nfs4_file_layout_dsaddr * ++nfs4_pnfs_device_item_find(struct nfs_client *, struct pnfs_deviceid *dev_id); ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id); ++ ++#endif /* FS_NFS_NFS4FILELAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-23 12:08:29.047512264 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-23 12:09:03.336490079 -0400 +@@ -45,8 +45,28 @@ enum nfs4_client_state { + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, + NFS4CLNT_SESSION_RESET, +- NFS4CLNT_SESSION_DRAINING, + NFS4CLNT_RECALL_SLOT, ++ NFS4CLNT_LAYOUT_RECALL, ++}; ++ ++enum nfs4_session_state { ++ NFS4_SESSION_INITING, ++ NFS4_SESSION_DRAINING, ++}; ++ ++struct nfs4_minor_version_ops { ++ u32 minor_version; ++ ++ int (*call_sync)(struct nfs_server *server, ++ struct rpc_message *msg, ++ struct nfs4_sequence_args *args, ++ struct nfs4_sequence_res *res, ++ int cache_reply); ++ int (*validate_stateid)(struct nfs_delegation *, ++ const nfs4_stateid *); ++ const struct nfs4_state_recovery_ops *reboot_recovery_ops; ++ const struct nfs4_state_recovery_ops *nograce_recovery_ops; ++ const struct nfs4_state_maintenance_ops *state_renewal_ops; + }; + + /* +@@ -89,7 +109,6 @@ struct nfs_unique_id { + */ + struct nfs4_state_owner { + struct nfs_unique_id so_owner_id; +- struct nfs_client *so_client; + struct nfs_server *so_server; + struct rb_node so_client_node; + +@@ -99,7 +118,6 @@ struct nfs4_state_owner { + atomic_t so_count; + unsigned long so_flags; + struct list_head so_states; +- struct list_head so_delegations; + struct nfs_seqid_counter so_seqid; + struct rpc_sequence so_sequence; + }; +@@ -125,10 +143,20 @@ enum { + * LOCK: one nfs4_state (LOCK) to 
hold the lock stateid nfs4_state(OPEN) + */ + ++struct nfs4_lock_owner { ++ unsigned int lo_type; ++#define NFS4_ANY_LOCK_TYPE (0U) ++#define NFS4_FLOCK_LOCK_TYPE (1U << 0) ++#define NFS4_POSIX_LOCK_TYPE (1U << 1) ++ union { ++ fl_owner_t posix_owner; ++ pid_t flock_owner; ++ } lo_u; ++}; ++ + struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ +- fl_owner_t ls_owner; /* POSIX lock owner */ + #define NFS_LOCK_INITIALIZED 1 + int ls_flags; + struct nfs_seqid_counter ls_seqid; +@@ -136,6 +164,7 @@ struct nfs4_lock_state { + struct nfs_unique_id ls_id; + nfs4_stateid ls_stateid; + atomic_t ls_count; ++ struct nfs4_lock_owner ls_owner; + }; + + /* bits for nfs4_state->flags */ +@@ -219,22 +248,34 @@ extern int nfs4_open_revalidate(struct i + extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); + extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations *fs_locations, struct page *page); ++extern void nfs4_release_lockowner(const struct nfs4_lock_state *); + +-extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[]; +-extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[]; + #if defined(CONFIG_NFS_V4_1) +-extern int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return server->nfs_client->cl_session; ++} ++ ++extern int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); + extern void nfs4_destroy_session(struct nfs4_session *session); + extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); ++extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_create_session(struct nfs_client *); + 
extern int nfs4_proc_destroy_session(struct nfs4_session *); + extern int nfs4_init_session(struct nfs_server *server); + extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); + #else /* CONFIG_NFS_v4_1 */ +-static inline int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return NULL; ++} ++ ++static inline int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task) + { +@@ -247,12 +288,12 @@ static inline int nfs4_init_session(stru + } + #endif /* CONFIG_NFS_V4_1 */ + +-extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; ++extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; + + extern const u32 nfs4_fattr_bitmap[2]; + extern const u32 nfs4_statfs_bitmap[2]; + extern const u32 nfs4_pathconf_bitmap[2]; +-extern const u32 nfs4_fsinfo_bitmap[2]; ++extern const u32 nfs4_fsinfo_bitmap[3]; + extern const u32 nfs4_fs_locations_bitmap[2]; + + /* nfs4renewd.c */ +@@ -284,7 +325,7 @@ extern void nfs41_handle_sequence_flag_e + extern void nfs41_handle_recall_slot(struct nfs_client *clp); + extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); + extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); +-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); ++extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); + + extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); + extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); +@@ -293,6 +334,7 @@ extern void nfs_increment_lock_seqid(int + extern void nfs_release_seqid(struct nfs_seqid *seqid); + extern void nfs_free_seqid(struct nfs_seqid *seqid); + ++/* write.c */ + extern const 
nfs4_stateid zero_stateid; + + /* nfs4xdr.c */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-23 12:08:29.050481368 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-23 12:09:03.339481253 -0400 +@@ -49,12 +49,15 @@ + #include + #include + #include ++#include ++#include + + #include "nfs4_fs.h" + #include "delegation.h" + #include "internal.h" + #include "iostat.h" + #include "callback.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PROC + +@@ -67,7 +70,7 @@ struct nfs4_opendata; + static int _nfs4_proc_open(struct nfs4_opendata *data); + static int _nfs4_recover_proc_open(struct nfs4_opendata *data); + static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); ++static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, +@@ -125,11 +128,16 @@ const u32 nfs4_pathconf_bitmap[2] = { + 0 + }; + +-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE ++const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, ++#ifdef CONFIG_NFS_V4_1 ++ FATTR4_WORD1_FS_LAYOUT_TYPES, ++ FATTR4_WORD2_LAYOUT_BLKSIZE ++#else /* CONFIG_NFS_V4_1 */ + 0 ++#endif /* CONFIG_NFS_V4_1 */ + }; + + const u32 nfs4_fs_locations_bitmap[2] = { +@@ -356,7 +364,7 @@ static void nfs41_check_drain_session_co + { + struct rpc_task *task; + +- if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { ++ 
if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); + if (task) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +@@ -370,12 +378,11 @@ static void nfs41_check_drain_session_co + complete(&ses->complete); + } + +-static void nfs41_sequence_free_slot(const struct nfs_client *clp, +- struct nfs4_sequence_res *res) ++static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) + { + struct nfs4_slot_table *tbl; + +- tbl = &clp->cl_session->fc_slot_table; ++ tbl = &res->sr_session->fc_slot_table; + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { + /* just wake up the next guy waiting since + * we may have not consumed a slot after all */ +@@ -385,18 +392,17 @@ static void nfs41_sequence_free_slot(con + + spin_lock(&tbl->slot_tbl_lock); + nfs4_free_slot(tbl, res->sr_slotid); +- nfs41_check_drain_session_complete(clp->cl_session); ++ nfs41_check_drain_session_complete(res->sr_session); + spin_unlock(&tbl->slot_tbl_lock); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + } + +-static void nfs41_sequence_done(struct nfs_client *clp, +- struct nfs4_sequence_res *res, +- int rpc_status) ++static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) + { + unsigned long timestamp; + struct nfs4_slot_table *tbl; + struct nfs4_slot *slot; ++ struct nfs_client *clp; + + /* + * sr_status remains 1 if an RPC level error occurred. 
The server +@@ -411,13 +417,16 @@ static void nfs41_sequence_done(struct n + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) + goto out; + ++ tbl = &res->sr_session->fc_slot_table; ++ slot = tbl->slots + res->sr_slotid; ++ + /* Check the SEQUENCE operation status */ +- if (res->sr_status == 0) { +- tbl = &clp->cl_session->fc_slot_table; +- slot = tbl->slots + res->sr_slotid; ++ switch (res->sr_status) { ++ case 0: + /* Update the slot's sequence and clientid lease timer */ + ++slot->seq_nr; + timestamp = res->sr_renewal_time; ++ clp = res->sr_session->clp; + spin_lock(&clp->cl_lock); + if (time_before(clp->cl_last_renewal, timestamp)) + clp->cl_last_renewal = timestamp; +@@ -425,11 +434,39 @@ static void nfs41_sequence_done(struct n + /* Check sequence flags */ + if (atomic_read(&clp->cl_count) > 1) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); ++ break; ++ case -NFS4ERR_DELAY: ++ /* The server detected a resend of the RPC call and ++ * returned NFS4ERR_DELAY as per Section 2.10.6.2 ++ * of RFC5661. ++ */ ++ dprintk("%s: slot=%d seq=%d: Operation in progress\n", ++ __func__, res->sr_slotid, slot->seq_nr); ++ goto out_retry; ++ default: ++ /* Just update the slot sequence no. */ ++ ++slot->seq_nr; + } + out: + /* The session may be reset by one of the error handlers. 
*/ + dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); +- nfs41_sequence_free_slot(clp, res); ++ nfs41_sequence_free_slot(res); ++ return 1; ++out_retry: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ rpc_restart_call(task); ++ /* FIXME: rpc_restart_call() should be made to return success/fail */ ++ if (RPC_ASSASSINATED(task)) ++ goto out; ++ return 0; ++} ++ ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ if (res->sr_session == NULL) ++ return 1; ++ return nfs41_sequence_done(task, res); + } + + /* +@@ -480,12 +517,11 @@ static int nfs41_setup_sequence(struct n + if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) + return 0; + +- memset(res, 0, sizeof(*res)); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + tbl = &session->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && ++ if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + /* + * The state manager will wait until the slot table is empty. +@@ -525,6 +561,7 @@ static int nfs41_setup_sequence(struct n + res->sr_session = session; + res->sr_slotid = slotid; + res->sr_renewal_time = jiffies; ++ res->sr_status_flags = 0; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. 
+@@ -533,33 +570,36 @@ static int nfs41_setup_sequence(struct n + return 0; + } + +-int nfs4_setup_sequence(struct nfs_client *clp, ++int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + struct rpc_task *task) + { ++ struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + ++ if (ds_session) ++ session = ds_session; ++ if (session == NULL) { ++ args->sa_session = NULL; ++ res->sr_session = NULL; ++ goto out; ++ } ++ + dprintk("--> %s clp %p session %p sr_slotid %d\n", +- __func__, clp, clp->cl_session, res->sr_slotid); ++ __func__, session->clp, session, res->sr_slotid); + +- if (!nfs4_has_session(clp)) +- goto out; +- ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, ++ ret = nfs41_setup_sequence(session, args, res, cache_reply, + task); +- if (ret && ret != -EAGAIN) { +- /* terminate rpc task */ +- task->tk_status = ret; +- task->tk_action = NULL; +- } + out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; + } + + struct nfs41_call_sync_data { +- struct nfs_client *clp; ++ const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; + int cache_reply; +@@ -569,9 +609,9 @@ static void nfs41_call_sync_prepare(stru + { + struct nfs41_call_sync_data *data = calldata; + +- dprintk("--> %s data->clp->cl_session %p\n", __func__, +- data->clp->cl_session); +- if (nfs4_setup_sequence(data->clp, data->seq_args, ++ dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); ++ ++ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args, + data->seq_res, data->cache_reply, task)) + return; + rpc_call_start(task); +@@ -587,7 +627,7 @@ static void nfs41_call_sync_done(struct + { + struct nfs41_call_sync_data *data = calldata; + +- nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); ++ nfs41_sequence_done(task, 
data->seq_res); + } + + struct rpc_call_ops nfs41_call_sync_ops = { +@@ -600,8 +640,7 @@ struct rpc_call_ops nfs41_call_priv_sync + .rpc_call_done = nfs41_call_sync_done, + }; + +-static int nfs4_call_sync_sequence(struct nfs_client *clp, +- struct rpc_clnt *clnt, ++static int nfs4_call_sync_sequence(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, +@@ -611,13 +650,13 @@ static int nfs4_call_sync_sequence(struc + int ret; + struct rpc_task *task; + struct nfs41_call_sync_data data = { +- .clp = clp, ++ .seq_server = server, + .seq_args = args, + .seq_res = res, + .cache_reply = cache_reply, + }; + struct rpc_task_setup task_setup = { +- .rpc_client = clnt, ++ .rpc_client = server->client, + .rpc_message = msg, + .callback_ops = &nfs41_call_sync_ops, + .callback_data = &data +@@ -642,10 +681,15 @@ int _nfs4_call_sync_session(struct nfs_s + struct nfs4_sequence_res *res, + int cache_reply) + { +- return nfs4_call_sync_sequence(server->nfs_client, server->client, +- msg, args, res, cache_reply, 0); ++ return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); + } + ++#else ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ return 1; ++} + #endif /* CONFIG_NFS_V4_1 */ + + int _nfs4_call_sync(struct nfs_server *server, +@@ -659,18 +703,9 @@ int _nfs4_call_sync(struct nfs_server *s + } + + #define nfs4_call_sync(server, msg, args, res, cache_reply) \ +- (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ ++ (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ + &(res)->seq_res, (cache_reply)) + +-static void nfs4_sequence_done(const struct nfs_server *server, +- struct nfs4_sequence_res *res, int rpc_status) +-{ +-#ifdef CONFIG_NFS_V4_1 +- if (nfs4_has_session(server->nfs_client)) +- nfs41_sequence_done(server->nfs_client, res, rpc_status); +-#endif /* CONFIG_NFS_V4_1 */ +-} +- + static void 
update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) + { + struct nfs_inode *nfsi = NFS_I(dir); +@@ -745,19 +780,14 @@ static struct nfs4_opendata *nfs4_openda + p->o_arg.server = server; + p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; +- if (flags & O_EXCL) { +- if (nfs4_has_persistent_session(server->nfs_client)) { +- /* GUARDED */ +- p->o_arg.u.attrs = &p->attrs; +- memcpy(&p->attrs, attrs, sizeof(p->attrs)); +- } else { /* EXCLUSIVE4_1 */ +- u32 *s = (u32 *) p->o_arg.u.verifier.data; +- s[0] = jiffies; +- s[1] = current->pid; +- } +- } else if (flags & O_CREAT) { ++ if (flags & O_CREAT) { ++ u32 *s; ++ + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); ++ s = (u32 *) p->o_arg.u.verifier.data; ++ s[0] = jiffies; ++ s[1] = current->pid; + } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; +@@ -851,8 +881,10 @@ static void update_open_stateflags(struc + static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) + { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); +- memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); ++ memcpy(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)); ++ memcpy(state->open_stateid.u.data, stateid->u.data, ++ sizeof(state->open_stateid.u.data)); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); +@@ -880,7 +912,8 @@ static void __update_open_stateid(struct + */ + write_seqlock(&state->seqlock); + if (deleg_stateid != NULL) { +- memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, deleg_stateid->u.data, ++ sizeof(state->stateid.u.data)); + set_bit(NFS_DELEGATED_STATE, &state->flags); + } + if (open_stateid != NULL) +@@ -911,7 +944,8 @@ static int 
update_open_stateid(struct nf + + if (delegation == NULL) + delegation = &deleg_cur->stateid; +- else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) ++ else if (memcmp(deleg_cur->stateid.u.data, delegation->u.data, ++ NFS4_STATEID_SIZE) != 0) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); +@@ -973,7 +1007,8 @@ static struct nfs4_state *nfs4_try_open_ + break; + } + /* Save the delegation */ +- memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); ++ memcpy(stateid.u.data, delegation->stateid.u.data, ++ sizeof(stateid.u.data)); + rcu_read_unlock(); + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) +@@ -1127,10 +1162,13 @@ static int nfs4_open_recover(struct nfs4 + * Check if we need to update the current stateid. + */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && +- memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { ++ memcmp(state->stateid.u.data, state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)) != 0) { + write_seqlock(&state->seqlock); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, ++ state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)); + write_sequnlock(&state->seqlock); + } + return 0; +@@ -1199,8 +1237,8 @@ static int _nfs4_open_delegation_recall( + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; +- memcpy(opendata->o_arg.u.delegation.data, stateid->data, +- sizeof(opendata->o_arg.u.delegation.data)); ++ memcpy(opendata->o_arg.u.delegation.u.data, stateid->u.data, ++ sizeof(opendata->o_arg.u.delegation.u.data)); + ret = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return ret; +@@ -1258,8 +1296,8 @@ static void nfs4_open_confirm_done(struc + if (RPC_ASSASSINATED(task)) 
+ return; + if (data->rpc_status == 0) { +- memcpy(data->o_res.stateid.data, data->c_res.stateid.data, +- sizeof(data->o_res.stateid.data)); ++ memcpy(data->o_res.stateid.u.data, data->c_res.stateid.u.data, ++ sizeof(data->o_res.stateid.u.data)); + nfs_confirm_seqid(&data->owner->so_seqid, 0); + renew_lease(data->o_res.server, data->timestamp); + data->rpc_done = 1; +@@ -1356,13 +1394,13 @@ static void nfs4_open_prepare(struct rpc + } + /* Update sequence id. */ + data->o_arg.id = sp->so_owner_id.id; +- data->o_arg.clientid = sp->so_client->cl_clientid; ++ data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->o_arg.server->nfs_client, ++ if (nfs4_setup_sequence(data->o_arg.server, NULL, + &data->o_arg.seq_args, + &data->o_res.seq_res, 1, task)) + return; +@@ -1385,8 +1423,8 @@ static void nfs4_open_done(struct rpc_ta + + data->rpc_status = task->tk_status; + +- nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->o_res.seq_res)) ++ return; + + if (RPC_ASSASSINATED(task)) + return; +@@ -1539,9 +1577,8 @@ static int _nfs4_proc_open(struct nfs4_o + return 0; + } + +-static int nfs4_recover_expired_lease(struct nfs_server *server) ++int nfs4_recover_expired_lease(struct nfs_client *clp) + { +- struct nfs_client *clp = server->nfs_client; + unsigned int loop; + int ret; + +@@ -1557,6 +1594,7 @@ static int nfs4_recover_expired_lease(st + } + return ret; + } ++EXPORT_SYMBOL(nfs4_recover_expired_lease); + + /* + * OPEN_EXPIRED: +@@ -1646,7 +1684,7 @@ static int _nfs4_do_open(struct inode *d + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } +- status = nfs4_recover_expired_lease(server); ++ status = 
nfs4_recover_expired_lease(server->nfs_client); + if (status != 0) + goto err_put_state_owner; + if (path->dentry->d_inode != NULL) +@@ -1773,7 +1811,7 @@ static int _nfs4_do_setattr(struct inode + if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { + /* Use that stateid */ + } else if (state != NULL) { +- nfs4_copy_stateid(&arg.stateid, state, current->files); ++ nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); + } else + memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); + +@@ -1838,7 +1876,8 @@ static void nfs4_close_done(struct rpc_t + struct nfs4_state *state = calldata->state; + struct nfs_server *server = NFS_SERVER(calldata->inode); + +- nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + /* hmm. we are done with the inode, and in the process of freeing +@@ -1858,7 +1897,7 @@ static void nfs4_close_done(struct rpc_t + if (calldata->arg.fmode == 0) + break; + default: +- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) ++ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } + nfs_release_seqid(calldata->arg.seqid); +@@ -1903,7 +1942,7 @@ static void nfs4_close_prepare(struct rp + + nfs_fattr_init(calldata->res.fattr); + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, ++ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL, + &calldata->arg.seq_args, &calldata->res.seq_res, + 1, task)) + return; +@@ -2323,6 +2362,9 @@ nfs4_proc_setattr(struct dentry *dentry, + struct nfs4_state *state = NULL; + int status; + ++ if (pnfs_ld_layoutret_on_setattr(inode)) ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + nfs_fattr_init(fattr); + + /* Search for an existing open(O_WRITE) file */ +@@ -2648,8 +2690,9 @@ static int nfs4_proc_unlink_done(struct + { + struct nfs_removeres 
*res = task->tk_msg.rpc_resp; + +- nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); +- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) ++ if (!nfs4_sequence_done(task, &res->seq_res)) ++ return 0; ++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); + nfs_post_op_update_inode(dir, res->dir_attr); +@@ -3090,18 +3133,31 @@ static int nfs4_proc_pathconf(struct nfs + static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) + { + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; + + dprintk("--> %s\n", __func__); + +- nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; + +- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, server->nfs_client); ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } + + nfs_invalidate_atime(data->inode); +- if (task->tk_status > 0) ++ if (task->tk_status > 0 && client == server->nfs_client) + renew_lease(server, data->timestamp); + return 0; + } +@@ -3112,20 +3168,56 @@ static void nfs4_proc_read_setup(struct + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + } + ++static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ + static int 
nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs_client *client = server->nfs_client; + +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++#ifdef CONFIG_NFS_V4_1 ++ /* restore original count after retry? */ ++ if (data->pdata.orig_count) { ++ dprintk("%s: restoring original count %u\n", __func__, ++ data->pdata.orig_count); ++ data->args.count = data->pdata.orig_count; ++ } ++ ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } ++ ++ /* ++ * MDS write: renew lease ++ * DS write: update lastbyte written, mark for layout commit ++ */ + if (task->tk_status >= 0) { +- renew_lease(NFS_SERVER(inode), data->timestamp); +- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ if (client == server->nfs_client) { ++ renew_lease(server, data->timestamp); ++ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ } else ++ pnfs4_update_write_done(NFS_I(inode), data); + } + return 0; + } +@@ -3138,20 +3230,42 @@ static void nfs4_proc_write_setup(struct + data->res.server = server; + data->timestamp = jiffies; + ++#ifdef CONFIG_NFS_V4_1 ++ /* writes to DS use pnfs vector */ ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = 
&nfs4_procedures[NFSPROC4_CLNT_WRITE]; + } + + static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { ++ struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; ++ ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS commit\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) { + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + return -EAGAIN; + } +- nfs_refresh_inode(inode, data->res.fattr); ++ if (client == server->nfs_client) ++ nfs_refresh_inode(inode, data->res.fattr); + return 0; + } + +@@ -3161,6 +3275,12 @@ static void nfs4_proc_commit_setup(struc + + data->args.bitmask = server->cache_consistency_bitmask; + data->res.server = server; ++#if defined(CONFIG_NFS_V4_1) ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + } + +@@ -3464,9 +3584,12 @@ static int nfs4_proc_set_acl(struct inod + } + + static int +-_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state) ++nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp) + { +- if (!clp || task->tk_status >= 0) ++ if (!clp) ++ clp = server->nfs_client; ++ ++ if (task->tk_status >= 0) + return 0; + switch(task->tk_status) { + 
case -NFS4ERR_ADMIN_REVOKED: +@@ -3491,8 +3614,9 @@ _nfs4_async_handle_error(struct rpc_task + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: +- dprintk("%s ERROR %d, Reset session\n", __func__, +- task->tk_status); ++ dprintk("%s ERROR %d, Reset session. Exchangeid " ++ "flags 0x%x\n", __func__, task->tk_status, ++ clp->cl_exchange_flags); + nfs4_schedule_state_recovery(clp); + task->tk_status = 0; + return -EAGAIN; +@@ -3512,6 +3636,8 @@ _nfs4_async_handle_error(struct rpc_task + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; + do_state_recovery: ++ if (is_ds_only_client(clp)) ++ return 0; + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + nfs4_schedule_state_recovery(clp); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) +@@ -3520,12 +3646,6 @@ do_state_recovery: + return -EAGAIN; + } + +-static int +-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) +-{ +- return _nfs4_async_handle_error(task, server, server->nfs_client, state); +-} +- + int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, + unsigned short port, struct rpc_cred *cred, + struct nfs4_setclientid_res *res) +@@ -3641,8 +3761,8 @@ static void nfs4_delegreturn_done(struct + { + struct nfs4_delegreturndata *data = calldata; + +- nfs4_sequence_done(data->res.server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + switch (task->tk_status) { + case -NFS4ERR_STALE_STATEID: +@@ -3651,8 +3771,8 @@ static void nfs4_delegreturn_done(struct + renew_lease(data->res.server, data->timestamp); + break; + default: +- if (nfs4_async_handle_error(task, data->res.server, NULL) == +- -EAGAIN) { ++ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) ++ == -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } +@@ -3672,7 +3792,7 @@ static void nfs4_delegreturn_prepare(str 
+ + d_data = (struct nfs4_delegreturndata *)data; + +- if (nfs4_setup_sequence(d_data->res.server->nfs_client, ++ if (nfs4_setup_sequence(d_data->res.server, NULL, + &d_data->args.seq_args, + &d_data->res.seq_res, 1, task)) + return; +@@ -3892,15 +4012,16 @@ static void nfs4_locku_done(struct rpc_t + { + struct nfs4_unlockdata *calldata = data; + +- nfs4_sequence_done(calldata->server, &calldata->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + switch (task->tk_status) { + case 0: +- memcpy(calldata->lsp->ls_stateid.data, +- calldata->res.stateid.data, +- sizeof(calldata->lsp->ls_stateid.data)); ++ memcpy(calldata->lsp->ls_stateid.u.data, ++ calldata->res.stateid.u.data, ++ sizeof(calldata->lsp->ls_stateid.u. ++ data)); + renew_lease(calldata->server, calldata->timestamp); + break; + case -NFS4ERR_BAD_STATEID: +@@ -3909,7 +4030,7 @@ static void nfs4_locku_done(struct rpc_t + case -NFS4ERR_EXPIRED: + break; + default: +- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN) + nfs_restart_rpc(task, + calldata->server->nfs_client); + } +@@ -3927,7 +4048,7 @@ static void nfs4_locku_prepare(struct rp + return; + } + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence(calldata->server->nfs_client, ++ if (nfs4_setup_sequence(calldata->server, NULL, + &calldata->arg.seq_args, + &calldata->res.seq_res, 1, task)) + return; +@@ -4082,7 +4203,8 @@ static void nfs4_lock_prepare(struct rpc + } else + data->arg.new_lock_owner = 0; + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, ++ if (nfs4_setup_sequence(data->server, NULL, ++ &data->arg.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +@@ -4101,8 +4223,8 @@ static void nfs4_lock_done(struct rpc_ta + + dprintk("%s: begin!\n", __func__); + +- 
nfs4_sequence_done(data->server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) +@@ -4114,8 +4236,8 @@ static void nfs4_lock_done(struct rpc_ta + goto out; + } + if (data->rpc_status == 0) { +- memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, +- sizeof(data->lsp->ls_stateid.data)); ++ memcpy(data->lsp->ls_stateid.u.data, data->res.stateid.u.data, ++ sizeof(data->lsp->ls_stateid.u.data)); + data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); + } +@@ -4424,6 +4546,34 @@ out: + return err; + } + ++static void nfs4_release_lockowner_release(void *calldata) ++{ ++ kfree(calldata); ++} ++ ++const struct rpc_call_ops nfs4_release_lockowner_ops = { ++ .rpc_release = nfs4_release_lockowner_release, ++}; ++ ++void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) ++{ ++ struct nfs_server *server = lsp->ls_state->owner->so_server; ++ struct nfs_release_lockowner_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], ++ }; ++ ++ if (server->nfs_client->cl_mvops->minor_version != 0) ++ return; ++ args = kmalloc(sizeof(*args), GFP_NOFS); ++ if (!args) ++ return; ++ args->lock_owner.clientid = server->nfs_client->cl_clientid; ++ args->lock_owner.id = lsp->ls_id.id; ++ msg.rpc_argp = args; ++ rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); ++} ++ + #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" + + int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, +@@ -4526,7 +4676,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + nfs4_verifier verifier; + struct nfs41_exchange_id_args args = { + .client = clp, +- .flags = clp->cl_exchange_flags, ++ .flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R, + }; + struct nfs41_exchange_id_res res = { + .client = clp, +@@ -4574,6 
+4724,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + dprintk("<-- %s status= %d\n", __func__, status); + return status; + } ++EXPORT_SYMBOL(nfs4_proc_exchange_id); + + struct nfs4_get_lease_time_data { + struct nfs4_get_lease_time_args *args; +@@ -4611,7 +4762,8 @@ static void nfs4_get_lease_time_done(str + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); ++ if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) ++ return; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: +@@ -4805,13 +4957,6 @@ struct nfs4_session *nfs4_alloc_session( + if (!session) + return NULL; + +- /* +- * The create session reply races with the server back +- * channel probe. Mark the client NFS_CS_SESSION_INITING +- * so that the client back channel can find the +- * nfs_client struct +- */ +- clp->cl_cons_state = NFS_CS_SESSION_INITING; + init_completion(&session->complete); + + tbl = &session->fc_slot_table; +@@ -4824,6 +4969,8 @@ struct nfs4_session *nfs4_alloc_session( + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + ++ session->session_state = 1<<NFS4_SESSION_INITING; ++ + session->clp = clp; + return session; + } +@@ -5040,6 +5187,10 @@ int nfs4_init_session(struct nfs_server + if (!nfs4_has_session(clp)) + return 0; + ++ session = clp->cl_session; ++ if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) ++ return 0; ++ + rsize = server->rsize; + if (rsize == 0) + rsize = NFS_MAX_FILE_IO_SIZE; +@@ -5047,11 +5198,10 @@ int nfs4_init_session(struct nfs_server + if (wsize == 0) + wsize = NFS_MAX_FILE_IO_SIZE; + +- session = clp->cl_session; + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + +- ret = nfs4_recover_expired_lease(server); ++ ret = 
nfs4_check_client_ready(clp); + return ret; +@@ -5060,69 +5210,70 @@ int nfs4_init_session(struct nfs_server + /* + * Renew the cl_session lease. + */ +-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +-{ ++struct nfs4_sequence_data { ++ struct nfs_client *clp; + struct nfs4_sequence_args args; + struct nfs4_sequence_res res; +- +- struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], +- .rpc_argp = &args, +- .rpc_resp = &res, +- .rpc_cred = cred, +- }; +- +- args.sa_cache_this = 0; +- +- return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, +- &res, args.sa_cache_this, 1); +-} ++}; + + static void nfs41_sequence_release(void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); ++ kfree(calldata); ++} ++ ++static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; + } + + static void nfs41_sequence_call_done(struct rpc_task *task, void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + +- nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status); ++ if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) ++ return; + + if (task->tk_status < 0) { + dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; + +- if (_nfs4_async_handle_error(task, NULL, clp, NULL) +- == -EAGAIN) { +- nfs_restart_rpc(task, clp); ++ if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); + 
return; + } + } + dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); + out: +- kfree(task->tk_msg.rpc_argp); +- kfree(task->tk_msg.rpc_resp); +- + dprintk("<-- %s\n", __func__); + } + + static void nfs41_sequence_prepare(struct rpc_task *task, void *data) + { +- struct nfs_client *clp; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_args *args; + struct nfs4_sequence_res *res; + +- clp = (struct nfs_client *)data; + args = task->tk_msg.rpc_argp; + res = task->tk_msg.rpc_resp; + +- if (nfs4_setup_sequence(clp, args, res, 0, task)) ++ if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) + return; + rpc_call_start(task); + } +@@ -5133,32 +5284,67 @@ static const struct rpc_call_ops nfs41_s + .rpc_release = nfs41_sequence_release, + }; + +-static int nfs41_proc_async_sequence(struct nfs_client *clp, +- struct rpc_cred *cred) ++static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) + { +- struct nfs4_sequence_args *args; +- struct nfs4_sequence_res *res; ++ struct nfs4_sequence_data *calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], + .rpc_cred = cred, + }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = clp->cl_rpcclient, ++ .rpc_message = &msg, ++ .callback_ops = &nfs41_sequence_ops, ++ .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, ++ }; + + if (!atomic_inc_not_zero(&clp->cl_count)) +- return -EIO; +- args = kzalloc(sizeof(*args), GFP_NOFS); +- res = kzalloc(sizeof(*res), GFP_NOFS); +- if (!args || !res) { +- kfree(args); +- kfree(res); ++ return ERR_PTR(-EIO); ++ calldata = kmalloc(sizeof(*calldata), GFP_NOFS); ++ if (calldata == NULL) { + nfs_put_client(clp); +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + } +- res->sr_slotid = NFS4_MAX_SLOT_TABLE; +- msg.rpc_argp = args; +- msg.rpc_resp = res; ++ calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ msg.rpc_argp = &calldata->args; ++ 
msg.rpc_resp = &calldata->res; ++ calldata->clp = clp; ++ task_setup_data.callback_data = calldata; + +- return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, +- &nfs41_sequence_ops, (void *)clp); ++ return rpc_run_task(&task_setup_data); ++} ++ ++static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret = 0; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) ++ ret = PTR_ERR(task); ++ else ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; ++} ++ ++static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ ret = rpc_wait_for_completion_task(task); ++ if (!ret) ++ ret = task->tk_status; ++ rpc_put_task(task); ++out: ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; + } + + struct nfs4_reclaim_complete_data { +@@ -5172,13 +5358,31 @@ static void nfs4_reclaim_complete_prepar + struct nfs4_reclaim_complete_data *calldata = data; + + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +- if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, ++ if (nfs41_setup_sequence(calldata->clp->cl_session, ++ &calldata->arg.seq_args, + &calldata->res.seq_res, 0, task)) + return; + + rpc_call_start(task); + } + ++static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case 0: ++ case -NFS4ERR_COMPLETE_ALREADY: ++ case -NFS4ERR_WRONG_CRED: /* What to do here? 
*/ ++ break; ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; ++} ++ + static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) + { + struct nfs4_reclaim_complete_data *calldata = data; +@@ -5186,32 +5390,13 @@ static void nfs4_reclaim_complete_done(s + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(clp, res, task->tk_status); +- switch (task->tk_status) { +- case 0: +- case -NFS4ERR_COMPLETE_ALREADY: +- break; +- case -NFS4ERR_BADSESSION: +- case -NFS4ERR_DEADSESSION: +- /* +- * Handle the session error, but do not retry the operation, as +- * we have no way of telling whether the clientid had to be +- * reset before we got our reply. If reset, a new wave of +- * reclaim operations will follow, containing their own reclaim +- * complete. We don't want our retry to get on the way of +- * recovery by incorrectly indicating to the server that we're +- * done reclaiming state since the process had to be restarted. 
+- */ +- _nfs4_async_handle_error(task, NULL, clp, NULL); +- break; +- default: +- if (_nfs4_async_handle_error( +- task, NULL, clp, NULL) == -EAGAIN) { +- rpc_restart_call_prepare(task); +- return; +- } +- } ++ if (!nfs41_sequence_done(task, res)) ++ return; + ++ if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); ++ return; ++ } + dprintk("<-- %s\n", __func__); + } + +@@ -5268,6 +5453,404 @@ out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; + } ++ ++static void ++nfs4_pnfs_layoutget_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args, ++ &lgp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_pnfs_layoutget_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lgp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ pnfs_get_layout_done(lgp, task->tk_status); ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ lgp->status = task->tk_status; ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_pnfs_layoutget_release(void *calldata) ++{ ++ struct nfs4_pnfs_layoutget *lgp = calldata; ++ ++ dprintk("--> %s\n", __func__); ++ pnfs_layout_release(NFS_I(lgp->args.inode)->layout, NULL); ++ if (lgp->res.layout.buf != NULL) ++ free_page((unsigned long) lgp->res.layout.buf); ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_pnfs_layoutget_call_ops = { ++ .rpc_call_prepare = 
nfs4_pnfs_layoutget_prepare, ++ .rpc_call_done = nfs4_pnfs_layoutget_done, ++ .rpc_release = nfs4_pnfs_layoutget_release, ++}; ++ ++/* FIXME: We need to call nfs4_handle_exception ++ * and deal with retries. ++ * Currently we can't since we release lgp and its contents. ++ */ ++static int _pnfs4_proc_layoutget(struct nfs4_pnfs_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_LAYOUTGET], ++ .rpc_argp = &lgp->args, ++ .rpc_resp = &lgp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_pnfs_layoutget_call_ops, ++ .callback_data = lgp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); ++ if (lgp->res.layout.buf == NULL) { ++ nfs4_pnfs_layoutget_release(lgp); ++ return -ENOMEM; ++ } ++ ++ lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = lgp->status; ++ if (status != 0) ++ goto out; ++ status = pnfs_layout_process(lgp); ++out: ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int pnfs4_proc_layoutget(struct nfs4_pnfs_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, _pnfs4_proc_layoutget(lgp), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void pnfs_layoutcommit_prepare(struct rpc_task *task, void *data) ++{ ++ struct pnfs_layoutcommit_data *ldata = ++ (struct pnfs_layoutcommit_data *)data; ++ struct nfs_server *server = NFS_SERVER(ldata->args.inode); ++ 
++ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args, ++ &ldata->res.seq_res, 1, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void ++pnfs_layoutcommit_done(struct rpc_task *task, void *calldata) ++{ ++ struct pnfs_layoutcommit_data *data = ++ (struct pnfs_layoutcommit_data *)calldata; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ data->status = task->tk_status; ++} ++ ++static void pnfs_layoutcommit_release(void *lcdata) ++{ ++ struct pnfs_layoutcommit_data *data = ++ (struct pnfs_layoutcommit_data *)lcdata; ++ ++ put_rpccred(data->cred); ++ pnfs_cleanup_layoutcommit(lcdata); ++ pnfs_layoutcommit_free(lcdata); ++ /* Matched by get_layout in pnfs_layoutcommit_inode */ ++ put_layout(data->args.inode); ++} ++ ++static const struct rpc_call_ops pnfs_layoutcommit_ops = { ++ .rpc_call_prepare = pnfs_layoutcommit_prepare, ++ .rpc_call_done = pnfs_layoutcommit_done, ++ .rpc_release = pnfs_layoutcommit_release, ++}; ++ ++/* Execute a layoutcommit to the server */ ++static int ++_pnfs4_proc_layoutcommit(struct pnfs_layoutcommit_data *data, int issync) ++{ ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_LAYOUTCOMMIT], ++ .rpc_argp = &data->args, ++ .rpc_resp = &data->res, ++ .rpc_cred = data->cred, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .task = &data->task, ++ .rpc_client = NFS_CLIENT(data->args.inode), ++ .rpc_message = &msg, ++ .callback_ops = &pnfs_layoutcommit_ops, ++ .callback_data = data, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ struct rpc_task *task; ++ int status = 0; ++ ++ dprintk("NFS: %4d initiating layoutcommit call. 
%llu@%llu lbw: %llu " ++ "type: %d issync %d\n", ++ data->task.tk_pid, ++ data->args.lseg.length, ++ data->args.lseg.offset, ++ data->args.lastbytewritten, ++ data->args.layout_type, issync); ++ ++ data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = data->status; ++out: ++ dprintk("%s: status %d\n", __func__, status); ++ rpc_put_task(task); ++ return 0; ++} ++ ++int pnfs4_proc_layoutcommit(struct pnfs_layoutcommit_data *data, int issync) ++{ ++ struct nfs4_exception exception = { }; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _pnfs4_proc_layoutcommit(data, issync), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void ++nfs4_pnfs_layoutreturn_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args, ++ &lrp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_pnfs_layoutreturn_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lrp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_pnfs_layoutreturn_release(void *calldata) ++{ ++ struct nfs4_pnfs_layoutreturn *lrp = calldata; ++ struct 
pnfs_layout_type *lo = NFS_I(lrp->args.inode)->layout; ++ ++ dprintk("--> %s return_type %d lo %p\n", __func__, ++ lrp->args.return_type, lo); ++ ++ if (lrp->args.return_type == RETURN_FILE) { ++ if (!lrp->res.lrs_present) ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ pnfs_layout_release(lo, &lrp->args.lseg); ++ } ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_pnfs_layoutreturn_call_ops = { ++ .rpc_call_prepare = nfs4_pnfs_layoutreturn_prepare, ++ .rpc_call_done = nfs4_pnfs_layoutreturn_done, ++ .rpc_release = nfs4_pnfs_layoutreturn_release, ++}; ++ ++int _pnfs4_proc_layoutreturn(struct nfs4_pnfs_layoutreturn *lrp, bool issync) ++{ ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_LAYOUTRETURN], ++ .rpc_argp = &lrp->args, ++ .rpc_resp = &lrp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_pnfs_layoutreturn_call_ops, ++ .callback_data = lrp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = task->tk_status; ++out: ++ dprintk("<-- %s\n", __func__); ++ rpc_put_task(task); ++ return status; ++} ++ ++int pnfs4_proc_layoutreturn(struct nfs4_pnfs_layoutreturn *lrp, bool issync) ++{ ++ struct nfs_server *server = NFS_SERVER(lrp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, ++ _pnfs4_proc_layoutreturn(lrp, issync), ++ &exception); ++ } while (exception.retry); ++ ++ return err; ++} ++ ++/* ++ * Retrieve the list of 
Data Server devices from the MDS. ++ */ ++static int _nfs4_pnfs_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_pnfs_getdevicelist_arg arg = { ++ .fh = fh, ++ .layoutclass = server->pnfs_curr_ld->id, ++ }; ++ struct nfs4_pnfs_getdevicelist_res res = { ++ .devlist = devlist, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_GETDEVICELIST], ++ .rpc_argp = &arg, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &arg, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int nfs4_pnfs_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_pnfs_getdevicelist(server, fh, devlist), ++ &exception); ++ } while (exception.retry); ++ ++ dprintk("nfs4_pnfs_getdevlist: err=%d, num_devs=%u\n", ++ err, devlist->num_devs); ++ ++ return err; ++} ++ ++int nfs4_pnfs_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) ++{ ++ struct nfs4_pnfs_getdeviceinfo_arg args = { ++ .pdev = pdev, ++ }; ++ struct nfs4_pnfs_getdeviceinfo_res res = { ++ .pdev = pdev, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_GETDEVICEINFO], ++ .rpc_argp = &args, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ ++ return status; ++} ++ + #endif /* CONFIG_NFS_V4_1 */ + + struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { +@@ -5325,28 +5908,30 @@ struct nfs4_state_maintenance_ops nfs41_ + }; + #endif + +-/* +- * Per minor version reboot and network partition recovery ops +- */ +- +-struct nfs4_state_recovery_ops 
*nfs4_reboot_recovery_ops[] = { +- &nfs40_reboot_recovery_ops, +-#if defined(CONFIG_NFS_V4_1) +- &nfs41_reboot_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { ++ .minor_version = 0, ++ .call_sync = _nfs4_call_sync, ++ .validate_stateid = nfs4_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs40_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs40_nograce_recovery_ops, ++ .state_renewal_ops = &nfs40_state_renewal_ops, + }; + +-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = { +- &nfs40_nograce_recovery_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_nograce_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { ++ .minor_version = 1, ++ .call_sync = _nfs4_call_sync_session, ++ .validate_stateid = nfs41_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs41_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs41_nograce_recovery_ops, ++ .state_renewal_ops = &nfs41_state_renewal_ops, + }; ++#endif + +-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { +- &nfs40_state_renewal_ops, ++const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { ++ [0] = &nfs_v4_0_minor_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_state_renewal_ops, ++ [1] = &nfs_v4_1_minor_ops, + #endif + }; + +@@ -5364,6 +5949,7 @@ const struct nfs_rpc_ops nfs_v4_clientop + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, + .file_inode_ops = &nfs4_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs4_proc_get_root, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c +--- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-23 12:09:03.341491726 -0400 +@@ -54,17 +54,17 @@ + void + nfs4_renew_state(struct work_struct *work) + { +- struct 
nfs4_state_maintenance_ops *ops; ++ const struct nfs4_state_maintenance_ops *ops; + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_renewd.work); + struct rpc_cred *cred; + long lease; + unsigned long last, now; + +- ops = nfs4_state_renewal_ops[clp->cl_minorversion]; ++ ops = clp->cl_mvops->state_renewal_ops; + dprintk("%s: start\n", __func__); + /* Are there any active superblocks? */ +- if (list_empty(&clp->cl_superblocks)) ++ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) + goto out; + spin_lock(&clp->cl_lock); + lease = clp->cl_lease_time; +diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-23 12:08:29.052491341 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-23 12:09:03.342373443 -0400 +@@ -53,6 +53,9 @@ + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include ++#include ++#include "pnfs.h" + + #define OPENOWNER_POOL_SIZE 8 + +@@ -126,6 +129,11 @@ static int nfs41_setup_state_renewal(str + int status; + struct nfs_fsinfo fsinfo; + ++ if (is_ds_only_client(clp)) { ++ nfs4_schedule_state_renewal(clp); ++ return 0; ++ } ++ + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ +@@ -145,7 +153,9 @@ static void nfs4_end_drain_session(struc + struct nfs4_session *ses = clp->cl_session; + int max_slots; + +- if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { ++ if (ses == NULL) ++ return; ++ if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + spin_lock(&ses->fc_slot_table.slot_tbl_lock); + max_slots = ses->fc_slot_table.max_slots; + while (max_slots--) { +@@ -167,7 +177,7 @@ static int nfs4_begin_drain_session(stru + struct nfs4_slot_table *tbl = &ses->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); ++ set_bit(NFS4_SESSION_DRAINING, 
&ses->session_state); + if (tbl->highest_used_slotid != -1) { + INIT_COMPLETION(ses->complete); + spin_unlock(&tbl->slot_tbl_lock); +@@ -371,7 +381,6 @@ nfs4_alloc_state_owner(void) + return NULL; + spin_lock_init(&sp->so_lock); + INIT_LIST_HEAD(&sp->so_states); +- INIT_LIST_HEAD(&sp->so_delegations); + rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); + sp->so_seqid.sequence = &sp->so_sequence; + spin_lock_init(&sp->so_sequence.lock); +@@ -384,7 +393,7 @@ static void + nfs4_drop_state_owner(struct nfs4_state_owner *sp) + { + if (!RB_EMPTY_NODE(&sp->so_client_node)) { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + rb_erase(&sp->so_client_node, &clp->cl_state_owners); +@@ -406,7 +415,6 @@ struct nfs4_state_owner *nfs4_get_state_ + new = nfs4_alloc_state_owner(); + if (new == NULL) + return NULL; +- new->so_client = clp; + new->so_server = server; + new->so_cred = cred; + spin_lock(&clp->cl_lock); +@@ -423,7 +431,7 @@ struct nfs4_state_owner *nfs4_get_state_ + + void nfs4_put_state_owner(struct nfs4_state_owner *sp) + { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + struct rpc_cred *cred = sp->so_cred; + + if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) +@@ -583,8 +591,24 @@ static void __nfs4_close(struct path *pa + if (!call_close) { + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); +- } else ++ } else { ++ u32 roc_iomode; ++ struct nfs_inode *nfsi = NFS_I(state->inode); ++ ++ if (has_layout(nfsi) && ++ (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) { ++ struct nfs4_pnfs_layout_segment range = { ++ .iomode = roc_iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ pnfs_return_layout(state->inode, &range, NULL, ++ RETURN_FILE, wait); ++ } ++ + nfs4_do_close(path, state, gfp_mask, wait); ++ } + } + + void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) +@@ 
-602,12 +626,21 @@ void nfs4_close_sync(struct path *path, + * that is compatible with current->files + */ + static struct nfs4_lock_state * +-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *pos; + list_for_each_entry(pos, &state->lock_states, ls_locks) { +- if (pos->ls_owner != fl_owner) ++ if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) + continue; ++ switch (pos->ls_owner.lo_type) { ++ case NFS4_POSIX_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.posix_owner != fl_owner) ++ continue; ++ break; ++ case NFS4_FLOCK_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.flock_owner != fl_pid) ++ continue; ++ } + atomic_inc(&pos->ls_count); + return pos; + } +@@ -619,10 +652,10 @@ __nfs4_find_lock_state(struct nfs4_state + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *lsp; +- struct nfs_client *clp = state->owner->so_client; ++ struct nfs_client *clp = state->owner->so_server->nfs_client; + + lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) +@@ -633,7 +666,18 @@ static struct nfs4_lock_state *nfs4_allo + lsp->ls_seqid.sequence = &lsp->ls_sequence; + atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; +- lsp->ls_owner = fl_owner; ++ lsp->ls_owner.lo_type = type; ++ switch (lsp->ls_owner.lo_type) { ++ case NFS4_FLOCK_LOCK_TYPE: ++ lsp->ls_owner.lo_u.flock_owner = fl_pid; ++ break; ++ case NFS4_POSIX_LOCK_TYPE: ++ lsp->ls_owner.lo_u.posix_owner = fl_owner; ++ break; ++ default: ++ kfree(lsp); ++ return NULL; ++ } + spin_lock(&clp->cl_lock); + nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); + spin_unlock(&clp->cl_lock); +@@ -643,7 
+687,7 @@ static struct nfs4_lock_state *nfs4_allo + + static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) + { +- struct nfs_client *clp = lsp->ls_state->owner->so_client; ++ struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); +@@ -657,13 +701,13 @@ static void nfs4_free_lock_state(struct + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) ++static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) + { + struct nfs4_lock_state *lsp, *new = NULL; + + for(;;) { + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, owner); ++ lsp = __nfs4_find_lock_state(state, owner, pid, type); + if (lsp != NULL) + break; + if (new != NULL) { +@@ -674,7 +718,7 @@ static struct nfs4_lock_state *nfs4_get_ + break; + } + spin_unlock(&state->state_lock); +- new = nfs4_alloc_lock_state(state, owner); ++ new = nfs4_alloc_lock_state(state, owner, pid, type); + if (new == NULL) + return NULL; + } +@@ -701,6 +745,8 @@ void nfs4_put_lock_state(struct nfs4_loc + if (list_empty(&state->lock_states)) + clear_bit(LK_STATE_IN_USE, &state->flags); + spin_unlock(&state->state_lock); ++ if (lsp->ls_flags & NFS_LOCK_INITIALIZED) ++ nfs4_release_lockowner(lsp); + nfs4_free_lock_state(lsp); + } + +@@ -728,7 +774,12 @@ int nfs4_set_lock_state(struct nfs4_stat + + if (fl->fl_ops != NULL) + return 0; +- lsp = nfs4_get_lock_state(state, fl->fl_owner); ++ if (fl->fl_flags & FL_POSIX) ++ lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); ++ else if (fl->fl_flags & FL_FLOCK) ++ lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); ++ else ++ return -EINVAL; + if (lsp == NULL) + return -ENOMEM; + fl->fl_u.nfs4_fl.owner = lsp; +@@ -740,7 +791,7 @@ int nfs4_set_lock_state(struct 
nfs4_stat + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. + */ +-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) ++void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) + { + struct nfs4_lock_state *lsp; + int seq; +@@ -753,7 +804,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst + return; + + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, fl_owner); ++ lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); + if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + spin_unlock(&state->state_lock); +@@ -1031,8 +1082,8 @@ restart: + * Open state on this file cannot be recovered + * All we can do is revert to using the zero stateid. + */ +- memset(state->stateid.data, 0, +- sizeof(state->stateid.data)); ++ memset(state->stateid.u.data, 0, ++ sizeof(state->stateid.u.data)); + /* Mark the file as being 'closed' */ + state->state = 0; + break; +@@ -1041,11 +1092,11 @@ restart: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: +@@ -1120,8 +1171,7 @@ static void nfs4_state_end_reclaim_reboo + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return; + +- nfs4_reclaim_complete(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); + + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { + sp = rb_entry(pos, 
struct nfs4_state_owner, so_client_node); +@@ -1211,8 +1261,8 @@ restart: + static int nfs4_check_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_maintenance_ops *ops = +- nfs4_state_renewal_ops[clp->cl_minorversion]; ++ const struct nfs4_state_maintenance_ops *ops = ++ clp->cl_mvops->state_renewal_ops; + int status = -NFS4ERR_EXPIRED; + + /* Is the client already known to have an expired lease? */ +@@ -1235,8 +1285,8 @@ out: + static int nfs4_reclaim_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_recovery_ops *ops = +- nfs4_reboot_recovery_ops[clp->cl_minorversion]; ++ const struct nfs4_state_recovery_ops *ops = ++ clp->cl_mvops->reboot_recovery_ops; + int status = -ENOENT; + + cred = ops->get_clid_cred(clp); +@@ -1421,6 +1471,7 @@ static void nfs4_state_manager(struct nf + } + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); ++ pnfs_destroy_all_layouts(clp); + } + + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { +@@ -1444,7 +1495,7 @@ static void nfs4_state_manager(struct nf + /* First recover reboot state... */ + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->reboot_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + continue; +@@ -1458,7 +1509,7 @@ static void nfs4_state_manager(struct nf + /* Now recover expired state... 
*/ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_nograce_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->nograce_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || + test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) +diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-23 12:08:29.054481400 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-23 12:09:03.346481283 -0400 +@@ -50,8 +50,11 @@ + #include + #include + #include ++#include ++#include + #include "nfs4_fs.h" + #include "internal.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_XDR + +@@ -89,7 +92,7 @@ static int nfs4_stat_to_errno(int); + #define encode_getfh_maxsz (op_encode_hdr_maxsz) + #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +-#define nfs4_fattr_bitmap_maxsz 3 ++#define nfs4_fattr_bitmap_maxsz 4 + #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) + #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +@@ -111,7 +114,11 @@ static int nfs4_stat_to_errno(int); + #define encode_restorefh_maxsz (op_encode_hdr_maxsz) + #define decode_restorefh_maxsz (op_decode_hdr_maxsz) + #define encode_fsinfo_maxsz (encode_getattr_maxsz) +-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) ++/* The 5 accounts for the PNFS attributes, and assumes that at most three ++ * layout types will be returned. 
++ */ ++#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ ++ nfs4_fattr_bitmap_maxsz + 8 + 5) + #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) + #define decode_renew_maxsz (op_decode_hdr_maxsz) + #define encode_setclientid_maxsz \ +@@ -202,14 +209,17 @@ static int nfs4_stat_to_errno(int); + #define encode_link_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) + #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) ++#define encode_lockowner_maxsz (7) + #define encode_lock_maxsz (op_encode_hdr_maxsz + \ + 7 + \ +- 1 + encode_stateid_maxsz + 8) ++ 1 + encode_stateid_maxsz + 1 + \ ++ encode_lockowner_maxsz) + #define decode_lock_denied_maxsz \ + (8 + decode_lockowner_maxsz) + #define decode_lock_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +-#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) ++#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ ++ encode_lockowner_maxsz) + #define decode_lockt_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) + #define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ +@@ -217,6 +227,11 @@ static int nfs4_stat_to_errno(int); + 4) + #define decode_locku_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) ++#define encode_release_lockowner_maxsz \ ++ (op_encode_hdr_maxsz + \ ++ encode_lockowner_maxsz) ++#define decode_release_lockowner_maxsz \ ++ (op_decode_hdr_maxsz) + #define encode_access_maxsz (op_encode_hdr_maxsz + 1) + #define decode_access_maxsz (op_decode_hdr_maxsz + 2) + #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ +@@ -302,6 +317,35 @@ static int nfs4_stat_to_errno(int); + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) + #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) + #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) ++#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ ++ encode_verifier_maxsz) ++#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + \ ++ decode_verifier_maxsz + \ ++ 
XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ ++ NFS4_PNFS_DEVICEID4_SIZE)) ++#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ ++ XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE)) ++#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ ++ 4 /*layout type */ + \ ++ 4 /* opaque devaddr4 length */ +\ ++ 4 /* notification bitmap length */ + \ ++ 4 /* notification bitmap */) ++#define encode_layoutget_sz (op_encode_hdr_maxsz + 10 + \ ++ encode_stateid_maxsz) ++#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ ++ decode_stateid_maxsz + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) ++#define encode_layoutcommit_sz (18 + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ ++ op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz) ++#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz) ++#define encode_layoutreturn_sz (8 + op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz + \ ++ 1 /* FIXME: opaque lrf_body always empty at ++ *the moment */) ++#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ ++ 1 + decode_stateid_maxsz) + #else /* CONFIG_NFS_V4_1 */ + #define encode_sequence_maxsz 0 + #define decode_sequence_maxsz 0 +@@ -471,6 +515,12 @@ static int nfs4_stat_to_errno(int); + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_locku_maxsz) ++#define NFS4_enc_release_lockowner_sz \ ++ (compound_encode_hdr_maxsz + \ ++ encode_lockowner_maxsz) ++#define NFS4_dec_release_lockowner_sz \ ++ (compound_decode_hdr_maxsz + \ ++ decode_lockowner_maxsz) + #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ +@@ -685,6 +735,60 @@ static int nfs4_stat_to_errno(int); + #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) ++#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_getdevicelist_maxsz) ++#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ ++ 
decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_getdevicelist_maxsz) ++#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_getdeviceinfo_maxsz) ++#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_getdeviceinfo_maxsz) ++#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutget_sz) ++#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutget_maxsz) ++#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_layoutcommit_sz + \ ++ encode_getattr_maxsz) ++#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutcommit_maxsz + \ ++ decode_getattr_maxsz) ++#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutreturn_sz) ++#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutreturn_maxsz) ++#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_write_maxsz) ++#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_write_maxsz) ++#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_commit_maxsz) ++#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_commit_maxsz) + + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + +@@ -915,7 +1019,7 @@ static void encode_close(struct xdr_stre + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); + *p++ = 
cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); +- xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_close_maxsz; + } +@@ -989,6 +1093,35 @@ static void encode_getattr_two(struct xd + hdr->replen += decode_getattr_maxsz; + } + ++static void ++encode_getattr_three(struct xdr_stream *xdr, ++ uint32_t bm0, uint32_t bm1, uint32_t bm2, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_GETATTR); ++ if (bm2) { ++ p = reserve_space(xdr, 16); ++ *p++ = cpu_to_be32(3); ++ *p++ = cpu_to_be32(bm0); ++ *p++ = cpu_to_be32(bm1); ++ *p = cpu_to_be32(bm2); ++ } else if (bm1) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(2); ++ *p++ = cpu_to_be32(bm0); ++ *p = cpu_to_be32(bm1); ++ } else { ++ p = reserve_space(xdr, 8); ++ *p++ = cpu_to_be32(1); ++ *p = cpu_to_be32(bm0); ++ } ++ hdr->nops++; ++ hdr->replen += decode_getattr_maxsz; ++} ++ + static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { + encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], +@@ -997,8 +1130,11 @@ static void encode_getfattr(struct xdr_s + + static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { +- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], +- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); ++ encode_getattr_three(xdr, ++ bitmask[0] & nfs4_fsinfo_bitmap[0], ++ bitmask[1] & nfs4_fsinfo_bitmap[1], ++ bitmask[2] & nfs4_fsinfo_bitmap[2], ++ hdr); + } + + static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +@@ -1042,6 +1178,17 @@ static inline uint64_t nfs4_lock_length( + return fl->fl_end - fl->fl_start + 1; + } + ++static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) ++{ ++ __be32 *p; ++ ++ p = 
reserve_space(xdr, 28); ++ p = xdr_encode_hyper(p, lowner->clientid); ++ *p++ = cpu_to_be32(16); ++ p = xdr_encode_opaque_fixed(p, "lock id:", 8); ++ xdr_encode_hyper(p, lowner->id); ++} ++ + /* + * opcode,type,reclaim,offset,length,new_lock_owner = 32 + * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 +@@ -1058,18 +1205,16 @@ static void encode_lock(struct xdr_strea + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); + if (args->new_lock_owner){ +- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); ++ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->open_stateid->u.data, ++ NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + } + else { + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); +- p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->lock_stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); + } + hdr->nops++; +@@ -1080,15 +1225,12 @@ static void encode_lockt(struct xdr_stre + { + __be32 *p; + +- p = reserve_space(xdr, 52); ++ p = reserve_space(xdr, 24); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + hdr->nops++; + 
hdr->replen += decode_lockt_maxsz; + } +@@ -1101,13 +1243,25 @@ static void encode_locku(struct xdr_stre + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->stateid->u.data, ++ NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + hdr->nops++; + hdr->replen += decode_locku_maxsz; + } + ++static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); ++ encode_lockowner(xdr, lowner); ++ hdr->nops++; ++ hdr->replen += decode_release_lockowner_maxsz; ++} ++ + static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) + { + int len = name->len; +@@ -1172,7 +1326,7 @@ static inline void encode_createmode(str + break; + default: + clp = arg->server->nfs_client; +- if (clp->cl_minorversion > 0) { ++ if (clp->cl_mvops->minor_version > 0) { + if (nfs4_has_persistent_session(clp)) { + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); +@@ -1251,7 +1405,7 @@ static inline void encode_claim_delegate + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + encode_string(xdr, name->len, name->name); + } + +@@ -1282,7 +1436,7 @@ static void encode_open_confirm(struct x + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = 
cpu_to_be32(arg->seqid->sequence->counter); + hdr->nops++; + hdr->replen += decode_open_confirm_maxsz; +@@ -1294,7 +1448,7 @@ static void encode_open_downgrade(struct + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_share_access(xdr, arg->fmode); + hdr->nops++; +@@ -1324,17 +1478,17 @@ static void encode_putrootfh(struct xdr_ + hdr->replen += decode_putrootfh_maxsz; + } + +-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) ++static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) + { + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, NFS4_STATEID_SIZE); + if (ctx->state != NULL) { +- nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); +- xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); ++ nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); ++ xdr_encode_opaque_fixed(p, stateid.u.data, NFS4_STATEID_SIZE); + } else +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + } + + static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +@@ -1344,7 +1498,7 @@ static void encode_read(struct xdr_strea + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); +@@ -1448,7 +1602,7 @@ encode_setacl(struct xdr_stream *xdr, st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 
++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); +@@ -1479,7 +1633,7 @@ static void encode_setattr(struct xdr_st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid.u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_setattr_maxsz; + encode_attrs(xdr, arg->iap, server); +@@ -1523,7 +1677,7 @@ static void encode_write(struct xdr_stre + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); +@@ -1542,7 +1696,7 @@ static void encode_delegreturn(struct xd + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + + *p++ = cpu_to_be32(OP_DELEGRETURN); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_delegreturn_maxsz; + } +@@ -1696,6 +1850,162 @@ static void encode_sequence(struct xdr_s + #endif /* CONFIG_NFS_V4_1 */ + } + ++#ifdef CONFIG_NFS_V4_1 ++static void ++encode_getdevicelist(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_getdevicelist_arg *args, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ nfs4_verifier dummy = { ++ .data = "dummmmmy", ++ }; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_GETDEVICELIST); ++ *p++ = cpu_to_be32(args->layoutclass); ++ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); ++ xdr_encode_hyper(p, 0ULL); /* cookie */ ++ encode_nfs4_verifier(xdr, &dummy); ++ hdr->nops++; ++} ++ ++static void ++encode_getdeviceinfo(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_getdeviceinfo_arg *args, ++ struct compound_hdr *hdr) ++{ ++ int has_bitmap = (args->pdev->dev_notify_types != 
0); ++ int len = 16 + NFS4_PNFS_DEVICEID4_SIZE + (has_bitmap * 4); ++ __be32 *p; ++ ++ p = reserve_space(xdr, len); ++ *p++ = cpu_to_be32(OP_GETDEVICEINFO); ++ p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ *p++ = cpu_to_be32(args->pdev->layout_type); ++ *p++ = cpu_to_be32(args->pdev->pglen + len); /* gdia_maxcount */ ++ *p++ = cpu_to_be32(has_bitmap); /* bitmap length [01] */ ++ if (has_bitmap) ++ *p = cpu_to_be32(args->pdev->dev_notify_types); ++ hdr->nops++; ++} ++ ++static void ++encode_layoutget(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_layoutget_arg *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(OP_LAYOUTGET); ++ *p++ = cpu_to_be32(0); /* Signal layout available */ ++ *p++ = cpu_to_be32(args->type); ++ *p++ = cpu_to_be32(args->lseg.iomode); ++ p = xdr_encode_hyper(p, args->lseg.offset); ++ p = xdr_encode_hyper(p, args->lseg.length); ++ p = xdr_encode_hyper(p, args->minlength); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, NFS4_STATEID_SIZE); ++ *p = cpu_to_be32(args->maxcount); ++ ++ dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", ++ __func__, ++ args->type, ++ args->lseg.iomode, ++ (unsigned long)args->lseg.offset, ++ (unsigned long)args->lseg.length, ++ args->maxcount); ++ hdr->nops++; ++ hdr->replen += decode_layoutget_maxsz; ++} ++ ++static int ++encode_layoutcommit(struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *args, ++ struct compound_hdr *hdr) ++{ ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ __be32 *p; ++ ++ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__, ++ args->lseg.length, args->lseg.offset, args->lastbytewritten, ++ args->layout_type); ++ ++ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE); ++ *p++ = 
cpu_to_be32(OP_LAYOUTCOMMIT); ++ p = xdr_encode_hyper(p, args->lseg.offset); ++ p = xdr_encode_hyper(p, args->lseg.length); ++ *p++ = cpu_to_be32(0); /* reclaim */ ++ p = xdr_encode_opaque_fixed(p, args->stateid.u.data, NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(1); /* newoffset = TRUE */ ++ p = xdr_encode_hyper(p, args->lastbytewritten); ++ *p = cpu_to_be32(args->time_modify_changed != 0); ++ if (args->time_modify_changed) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(0); ++ *p++ = cpu_to_be32(args->time_modify.tv_sec); ++ *p = cpu_to_be32(args->time_modify.tv_nsec); ++ } ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(args->layout_type); ++ ++ if (ld_io_ops->encode_layoutcommit) { ++ ld_io_ops->encode_layoutcommit(NFS_I(args->inode)->layout, ++ xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ xdr_encode_opaque(p, NULL, 0); ++ } ++ ++ hdr->nops++; ++ hdr->replen += decode_layoutcommit_maxsz; ++ return 0; ++} ++ ++static void ++encode_layoutreturn(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_layoutreturn_arg *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_LAYOUTRETURN); ++ *p++ = cpu_to_be32(args->reclaim); ++ *p++ = cpu_to_be32(args->layout_type); ++ *p++ = cpu_to_be32(args->lseg.iomode); ++ *p = cpu_to_be32(args->return_type); ++ if (args->return_type == RETURN_FILE) { ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ ++ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); ++ p = xdr_encode_hyper(p, args->lseg.offset); ++ p = xdr_encode_hyper(p, args->lseg.length); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, ++ NFS4_STATEID_SIZE); ++ dprintk("%s: call %pF\n", __func__, ++ ld_io_ops->encode_layoutreturn); ++ if (ld_io_ops->encode_layoutreturn) { ++ ld_io_ops->encode_layoutreturn( ++ NFS_I(args->inode)->layout, xdr, args); ++ } 
else { ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(0); ++ } ++ } ++ hdr->nops++; ++ hdr->replen += decode_layoutreturn_maxsz; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" ENCODE ROUTINES. + */ +@@ -1704,7 +2014,7 @@ static u32 nfs4_xdr_minorversion(const s + { + #if defined(CONFIG_NFS_V4_1) + if (args->sa_session) +- return args->sa_session->clp->cl_minorversion; ++ return args->sa_session->clp->cl_mvops->minor_version; + #endif /* CONFIG_NFS_V4_1 */ + return 0; + } +@@ -2048,6 +2358,20 @@ static int nfs4_xdr_enc_locku(struct rpc + return 0; + } + ++static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = 0, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_release_lockowner(&xdr, &args->lock_owner, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ + /* + * Encode a READLINK request + */ +@@ -2330,7 +2654,7 @@ static int nfs4_xdr_enc_setclientid_conf + struct compound_hdr hdr = { + .nops = 0, + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2395,7 +2719,7 @@ static int nfs4_xdr_enc_exchange_id(stru + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2413,7 +2737,7 @@ static int nfs4_xdr_enc_create_session(s + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2431,7 +2755,7 @@ static int nfs4_xdr_enc_destroy_session( + { + struct 
xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = session->clp->cl_minorversion, ++ .minorversion = session->clp->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2469,7 +2793,7 @@ static int nfs4_xdr_enc_get_lease_time(s + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2499,6 +2823,159 @@ static int nfs4_xdr_enc_reclaim_complete + return 0; + } + ++/* ++ * Encode GETDEVICELIST request ++ */ ++static int ++nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_getdevicelist_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_getdevicelist(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode GETDEVICEINFO request ++ */ ++static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_getdeviceinfo_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ int replen; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_getdeviceinfo(&xdr, args, &hdr); ++ ++ /* set up reply kvec. 
Subtract notification bitmap max size (8) ++ * so that notification bitmap is put in xdr_buf tail */ ++ replen = (RPC_REPHDRSIZE + auth->au_rslack + ++ NFS4_dec_getdeviceinfo_sz - 8) << 2; ++ xdr_inline_pages(&req->rq_rcv_buf, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", ++ __func__, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTGET request ++ */ ++static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_layoutget_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutget(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTCOMMIT request ++ */ ++static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p, ++ struct pnfs_layoutcommit_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_layoutcommit(&xdr, args, &hdr); ++ encode_getfattr(&xdr, args->bitmask, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTRETURN request ++ */ ++static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_layoutreturn_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, 
&hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutreturn(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server WRITE request ++ */ ++static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_write(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server COMMIT request ++ */ ++static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_commit(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} + #endif /* CONFIG_NFS_V4_1 */ + + static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +@@ -2599,14 +3076,17 @@ static int decode_attr_bitmap(struct xdr + goto out_overflow; + bmlen = be32_to_cpup(p); + +- bitmap[0] = bitmap[1] = 0; ++ bitmap[0] = bitmap[1] = bitmap[2] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); +- if (bmlen > 1) +- bitmap[1] = be32_to_cpup(p); ++ if (bmlen > 1) { ++ bitmap[1] = be32_to_cpup(p++); ++ if (bmlen > 2) ++ bitmap[2] = be32_to_cpup(p); ++ } + } + return 0; + out_overflow: +@@ -2635,8 +3115,9 @@ static int decode_attr_supported(struct + 
decode_attr_bitmap(xdr, bitmask); + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else +- bitmask[0] = bitmask[1] = 0; +- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); ++ bitmask[0] = bitmask[1] = bitmask[2] = 0; ++ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, ++ bitmask[0], bitmask[1], bitmask[2]); + return 0; + } + +@@ -3565,7 +4046,7 @@ static int decode_opaque_fixed(struct xd + + static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) + { +- return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); ++ return decode_opaque_fixed(xdr, stateid->u.data, NFS4_STATEID_SIZE); + } + + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +@@ -3621,7 +4102,7 @@ out_overflow: + static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3647,7 +4128,7 @@ xdr_error: + static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3679,7 +4160,7 @@ xdr_error: + static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3705,7 +4186,7 @@ static int decode_getfattr(struct xdr_st + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}, ++ bitmap[3] = {0}, + type; + int status; + umode_t fmode = 0; +@@ -3824,24 +4305,101 @@ xdr_error: + return status; + } + +- +-static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * Decode potentially multiple layout types. 
Currently we only support ++ * one layout driver per file system. ++ */ ++static int decode_pnfs_list(struct xdr_stream *xdr, uint32_t *layoutclass) + { +- __be32 *savep; +- uint32_t attrlen, bitmap[2]; +- int status; ++ uint32_t *p; ++ int num; + +- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +- goto xdr_error; +- if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) +- goto xdr_error; +- if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) +- goto xdr_error; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ num = be32_to_cpup(p); + +- fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ /* pNFS is not supported by the underlying file system */ ++ if (num == 0) { ++ *layoutclass = 0; ++ return 0; ++ } + +- if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) +- goto xdr_error; ++ /* TODO: We will eventually support multiple layout drivers ? */ ++ if (num > 1) ++ printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " ++ "per filesystem not supported\n", __func__); ++ ++ /* Decode and set first layout type */ ++ p = xdr_inline_decode(xdr, num * 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ *layoutclass = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++/* ++ * The type of file system exported ++ */ ++static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *layoutclass) ++{ ++ int status = 0; ++ ++ dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); ++ if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) ++ return -EIO; ++ if (likely(bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES)) { ++ status = decode_pnfs_list(xdr, layoutclass); ++ bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; ++ } ++ return status; ++} ++ ++/* ++ * The preferred block size for layout directed io ++ */ ++static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *res) ++{ ++ __be32 *p; ++ 
dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); ++ *res = 0; ++ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) { ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++ } ++ *res = be32_to_cpup(p); ++ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; ++ } ++ return 0; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ ++static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++{ ++ __be32 *savep; ++ uint32_t attrlen, bitmap[3]; ++ int status; ++ ++ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) ++ goto xdr_error; ++ ++ fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ ++ if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) ++ goto xdr_error; + if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0) + goto xdr_error; + if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0) +@@ -3850,6 +4408,14 @@ static int decode_fsinfo(struct xdr_stre + if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) + goto xdr_error; + fsinfo->wtpref = fsinfo->wtmax; ++#if defined(CONFIG_NFS_V4_1) ++ status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); ++ if (status) ++ goto xdr_error; ++ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); ++ if (status) ++ goto xdr_error; ++#endif /* CONFIG_NFS_V4_1 */ + + status = verify_attr_len(xdr, savep, attrlen); + xdr_error: +@@ -3973,6 +4539,11 @@ static int decode_locku(struct xdr_strea + return status; + } + ++static int decode_release_lockowner(struct xdr_stream *xdr) ++{ ++ return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); ++} ++ + static int decode_lookup(struct xdr_stream *xdr) + { + return decode_op_hdr(xdr, OP_LOOKUP); +@@ -4333,7 +4904,7 @@ static int decode_getacl(struct xdr_stre + { + __be32 *savep; + 
uint32_t attrlen, +- bitmap[2] = {0}; ++ bitmap[3] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + +@@ -4682,6 +5253,226 @@ out_overflow: + #endif /* CONFIG_NFS_V4_1 */ + } + ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * TODO: Need to handle case when EOF != true; ++ */ ++static int decode_getdevicelist(struct xdr_stream *xdr, ++ struct pnfs_devicelist *res) ++{ ++ __be32 *p; ++ int status, i; ++ struct nfs_writeverf verftemp; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICELIST); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 8 + 8 + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ ++ /* TODO: Skip cookie for now */ ++ p += 2; ++ ++ /* Read verifier */ ++ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); ++ ++ res->num_devs = be32_to_cpup(p); ++ ++ dprintk("%s: num_dev %d\n", __func__, res->num_devs); ++ ++ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) ++ return -NFS4ERR_REP_TOO_BIG; ++ ++ p = xdr_inline_decode(xdr, ++ res->num_devs * NFS4_PNFS_DEVICEID4_SIZE + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ for (i = 0; i < res->num_devs; i++) ++ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ res->eof = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_getdeviceinfo(struct xdr_stream *xdr, ++ struct pnfs_device *pdev) ++{ ++ __be32 *p; ++ uint32_t len, type; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICEINFO); ++ if (status) { ++ if (status == -ETOOSMALL) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->mincount = be32_to_cpup(p); ++ dprintk("%s: Min count too small. 
mincnt = %u\n", ++ __func__, pdev->mincount); ++ } ++ return status; ++ } ++ ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ type = be32_to_cpup(p++); ++ if (type != pdev->layout_type) { ++ dprintk("%s: layout mismatch req: %u pdev: %u\n", ++ __func__, pdev->layout_type, type); ++ return -EINVAL; ++ } ++ /* ++ * Get the length of the opaque device_addr4. xdr_read_pages places ++ * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) ++ * and places the remaining xdr data in xdr_buf->tail ++ */ ++ pdev->mincount = be32_to_cpup(p); ++ xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ ++ ++ /* At most one bitmap word */ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ len = be32_to_cpup(p); ++ if (len) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->dev_notify_types = be32_to_cpup(p); ++ } else ++ pdev->dev_notify_types = 0; ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, ++ struct nfs4_pnfs_layoutget_res *res) ++{ ++ __be32 *p; ++ int status; ++ u32 layout_count, dummy; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTGET); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->return_on_close = be32_to_cpup(p++); ++ p = xdr_decode_opaque_fixed(p, res->stateid.u.data, NFS4_STATEID_SIZE); ++ layout_count = be32_to_cpup(p); ++ if (!layout_count) { ++ dprintk("%s: server responded with empty layout array\n", ++ __func__); ++ return -EINVAL; ++ } ++ ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ p = xdr_decode_hyper(p, &res->lseg.offset); ++ p = xdr_decode_hyper(p, &res->lseg.length); ++ res->lseg.iomode = be32_to_cpup(p++); ++ res->type = be32_to_cpup(p++); ++ ++ status = decode_opaque_inline(xdr, 
&res->layout.len, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ ++ dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", ++ __func__, ++ (unsigned long)res->lseg.offset, ++ (unsigned long)res->lseg.length, ++ res->lseg.iomode, ++ res->type, ++ res->layout.len); ++ ++ /* presuambly, pnfs4_proc_layoutget allocated a single page */ ++ if (res->layout.len > PAGE_SIZE) ++ return -ENOMEM; ++ memcpy(res->layout.buf, p, res->layout.len); ++ ++ /* FIXME: the whole layout array should be passed up to the pnfs ++ * client */ ++ if (layout_count > 1) { ++ dprintk("%s: server responded with %d layouts, dropping tail\n", ++ __func__, layout_count); ++ ++ while (--layout_count) { ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ status = decode_opaque_inline(xdr, &dummy, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ } ++ } ++ ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutreturn(struct xdr_stream *xdr, ++ struct nfs4_pnfs_layoutreturn_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTRETURN); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->lrs_present = be32_to_cpup(p); ++ if (res->lrs_present) ++ status = decode_stateid(xdr, &res->stateid); ++ return status; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutcommit(struct xdr_stream *xdr, ++ struct rpc_rqst *req, ++ struct pnfs_layoutcommit_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->sizechanged = be32_to_cpup(p); ++ ++ if (res->sizechanged) { ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ xdr_decode_hyper(p, &res->newsize); ++ } 
++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" DECODE ROUTINES. + */ +@@ -5259,6 +6050,19 @@ out: + return status; + } + ++static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (!status) ++ status = decode_release_lockowner(&xdr); ++ return status; ++} ++ + /* + * Decode READLINK response + */ +@@ -5696,6 +6500,186 @@ static int nfs4_xdr_dec_reclaim_complete + status = decode_reclaim_complete(&xdr, (void *)NULL); + return status; + } ++ ++/* ++ * Decode GETDEVICELIST response ++ */ ++static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_getdevicelist_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ dprintk("encoding getdevicelist!\n"); ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status != 0) ++ goto out; ++ status = decode_getdevicelist(&xdr, res->devlist); ++out: ++ return status; ++} ++ ++/* ++ * Decode GETDEVINFO response ++ */ ++static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_getdeviceinfo_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_getdeviceinfo(&xdr, res->pdev); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTGET response ++ */ ++static 
int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_layoutget_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutget(&xdr, rqstp, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTRETURN response ++ */ ++static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_layoutreturn_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutreturn(&xdr, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTCOMMIT response ++ */ ++static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct pnfs_layoutcommit_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutcommit(&xdr, rqstp, res); ++ if (status) ++ goto out; ++ decode_getfattr(&xdr, res->fattr, res->server, ++ !RPC_IS_ASYNC(rqstp->rq_task)); ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server WRITE response ++ */ ++static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream 
xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_write(&xdr, res); ++ if (!status) ++ return res->count; ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server COMMIT response ++ */ ++static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_commit(&xdr, res); ++out: ++ return status; ++} + #endif /* CONFIG_NFS_V4_1 */ + + __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +@@ -5866,6 +6850,7 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), ++ PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + #if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), +@@ -5873,6 +6858,13 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), ++ PROC(PNFS_GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), ++ PROC(PNFS_GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), ++ PROC(PNFS_LAYOUTGET, enc_layoutget, dec_layoutget), ++ PROC(PNFS_LAYOUTCOMMIT, 
enc_layoutcommit, dec_layoutcommit), ++ PROC(PNFS_LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), ++ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite), ++ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit), + #endif /* CONFIG_NFS_V4_1 */ + }; + +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-23 12:09:03.348511665 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-23 12:09:03.348511665 -0400 +@@ -0,0 +1,11 @@ ++# ++# Makefile for the pNFS Objects Layout Driver kernel module ++# ++objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o ++ ++# ++# Panasas pNFS Layout Driver kernel module ++# ++panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o ++obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-23 12:09:03.349501459 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-23 12:09:03.349501459 -0400 +@@ -0,0 +1,1087 @@ ++/* ++ * objio_osd.c ++ * ++ * pNFS Objects layout implementation over open-osd initiator library ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++#define _LLU(x) ((unsigned long long)x) ++ ++enum { BIO_MAX_PAGES_KMALLOC = ++ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), ++}; ++ ++/* A per mountpoint struct currently for device cache */ ++struct objio_mount_type { ++ struct list_head dev_list; ++ spinlock_t dev_list_lock; ++}; ++ ++struct _dev_ent { ++ struct list_head list; ++ struct pnfs_deviceid d_id; ++ struct osd_dev *od; ++}; ++ ++static void _dev_list_remove_all(struct objio_mount_type *omt) ++{ ++ spin_lock(&omt->dev_list_lock); ++ ++ while (!list_empty(&omt->dev_list)) { ++ struct _dev_ent *de = list_entry(omt->dev_list.next, ++ struct _dev_ent, list); ++ ++ list_del_init(&de->list); ++ osduld_put_device(de->od); ++ kfree(de); ++ } ++ ++ spin_unlock(&omt->dev_list_lock); ++} ++ ++static struct osd_dev *___dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct list_head *le; ++ ++ list_for_each(le, &omt->dev_list) { ++ struct _dev_ent *de = list_entry(le, struct _dev_ent, list); ++ ++ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id))) ++ return de->od; ++ } ++ ++ return NULL; ++} ++ ++static struct osd_dev *_dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct osd_dev *od; ++ ++ spin_lock(&omt->dev_list_lock); ++ od = ___dev_list_find(omt, d_id); ++ spin_unlock(&omt->dev_list_lock); ++ return od; ++} ++ ++static int _dev_list_add(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id, struct osd_dev *od) ++{ ++ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL); ++ ++ if (!de) ++ return -ENOMEM; ++ ++ spin_lock(&omt->dev_list_lock); ++ ++ if (___dev_list_find(omt, d_id)) { ++ kfree(de); ++ goto out; ++ } ++ ++ de->d_id = *d_id; ++ de->od = od; ++ list_add(&de->list, &omt->dev_list); ++ ++out: ++ spin_unlock(&omt->dev_list_lock); ++ return 0; ++} ++ ++struct 
objio_segment { ++ struct pnfs_osd_layout *layout; ++ ++ unsigned mirrors_p1; ++ unsigned stripe_unit; ++ unsigned group_width; /* Data stripe_units without integrity comps */ ++ u64 group_depth; ++ unsigned group_count; ++ ++ unsigned num_comps; ++ /* variable length */ ++ struct osd_dev *ods[1]; ++}; ++ ++struct objio_state; ++typedef ssize_t (*objio_done_fn)(struct objio_state *ios); ++ ++struct objio_state { ++ /* Generic layer */ ++ struct objlayout_io_state ol_state; ++ ++ struct objio_segment *objio_seg; ++ ++ struct kref kref; ++ objio_done_fn done; ++ void *private; ++ ++ unsigned long length; ++ unsigned numdevs; /* Actually used devs in this IO */ ++ /* A per-device variable array of size numdevs */ ++ struct _objio_per_comp { ++ struct bio *bio; ++ struct osd_request *or; ++ unsigned long length; ++ u64 offset; ++ unsigned dev; ++ } per_dev[]; ++}; ++ ++/* Send and wait for a get_device_info of devices in the layout, ++ then look them up with the osd_initiator library */ ++static struct osd_dev *_device_lookup(struct pnfs_layout_type *pnfslay, ++ struct objio_segment *objio_seg, unsigned comp) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ struct pnfs_osd_deviceaddr *deviceaddr; ++ struct pnfs_deviceid *d_id; ++ struct osd_dev *od; ++ struct osd_dev_info odi; ++ struct objio_mount_type *omt = PNFS_NFS_SERVER(pnfslay)->pnfs_ld_data; ++ int err; ++ ++ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id; ++ ++ od = _dev_list_find(omt, d_id); ++ if (od) ++ return od; ++ ++ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr); ++ if (unlikely(err)) { ++ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err); ++ return ERR_PTR(err); ++ } ++ ++ odi.systemid_len = deviceaddr->oda_systemid.len; ++ if (odi.systemid_len > sizeof(odi.systemid)) { ++ err = -EINVAL; ++ goto out; ++ } else if (odi.systemid_len) ++ memcpy(odi.systemid, deviceaddr->oda_systemid.data, ++ odi.systemid_len); ++ odi.osdname_len = 
deviceaddr->oda_osdname.len; ++ odi.osdname = (u8 *)deviceaddr->oda_osdname.data; ++ ++ if (!odi.osdname_len && !odi.systemid_len) { ++ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", ++ __func__); ++ err = -ENODEV; ++ goto out; ++ } ++ ++ od = osduld_info_lookup(&odi); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ dprintk("%s: osduld_info_lookup => %d\n", __func__, err); ++ goto out; ++ } ++ ++ _dev_list_add(omt, d_id, od); ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ objlayout_put_deviceinfo(deviceaddr); ++ return err ? ERR_PTR(err) : od; ++} ++ ++static int objio_devices_lookup(struct pnfs_layout_type *pnfslay, ++ struct objio_segment *objio_seg) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ unsigned i, num_comps = layout->olo_num_comps; ++ int err; ++ ++ /* lookup all devices */ ++ for (i = 0; i < num_comps; i++) { ++ struct osd_dev *od; ++ ++ od = _device_lookup(pnfslay, objio_seg, i); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ goto out; ++ } ++ objio_seg->ods[i] = od; ++ } ++ objio_seg->num_comps = num_comps; ++ err = 0; ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ return err; ++} ++ ++static int _verify_data_map(struct pnfs_osd_layout *layout) ++{ ++ struct pnfs_osd_data_map *data_map = &layout->olo_map; ++ u64 stripe_length; ++ u32 group_width; ++ ++/* FIXME: Only raid0 for now. 
if not go through MDS */ ++ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { ++ printk(KERN_ERR "Only RAID_0 for now\n"); ++ return -ENOTSUPP; ++ } ++ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { ++ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", ++ data_map->odm_num_comps, data_map->odm_mirror_cnt); ++ return -EINVAL; ++ } ++ ++ if (data_map->odm_group_width) ++ group_width = data_map->odm_group_width; ++ else ++ group_width = data_map->odm_num_comps / ++ (data_map->odm_mirror_cnt + 1); ++ ++ stripe_length = (u64)data_map->odm_stripe_unit * group_width; ++ if (stripe_length >= (1ULL << 32)) { ++ printk(KERN_ERR "Total Stripe length(0x%llx)" ++ " >= 32bit is not supported\n", _LLU(stripe_length)); ++ return -ENOTSUPP; ++ } ++ ++ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { ++ printk(KERN_ERR "Stripe Unit(0x%llx)" ++ " must be Multples of PAGE_SIZE(0x%lx)\n", ++ _LLU(data_map->odm_stripe_unit), PAGE_SIZE); ++ return -ENOTSUPP; ++ } ++ ++ return 0; ++} ++ ++int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_type *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) ++{ ++ struct objio_segment *objio_seg; ++ int err; ++ ++ err = _verify_data_map(layout); ++ if (unlikely(err)) ++ return err; ++ ++ objio_seg = kzalloc(sizeof(*objio_seg) + ++ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]), ++ GFP_KERNEL); ++ if (!objio_seg) ++ return -ENOMEM; ++ ++ objio_seg->layout = layout; ++ err = objio_devices_lookup(pnfslay, objio_seg); ++ if (err) ++ goto free_seg; ++ ++ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1; ++ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit; ++ if (layout->olo_map.odm_group_width) { ++ objio_seg->group_width = layout->olo_map.odm_group_width; ++ objio_seg->group_depth = layout->olo_map.odm_group_depth; ++ objio_seg->group_count = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1 / ++ objio_seg->group_width; ++ } else { ++ 
objio_seg->group_width = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1; ++ objio_seg->group_depth = -1; ++ objio_seg->group_count = 1; ++ } ++ ++ *outp = objio_seg; ++ return 0; ++ ++free_seg: ++ dprintk("%s: Error: return %d\n", __func__, err); ++ kfree(objio_seg); ++ *outp = NULL; ++ return err; ++} ++ ++void objio_free_lseg(void *p) ++{ ++ struct objio_segment *objio_seg = p; ++ ++ kfree(objio_seg); ++} ++ ++int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct objio_segment *objio_seg = seg; ++ struct objio_state *ios; ++ const unsigned first_size = sizeof(*ios) + ++ objio_seg->num_comps * sizeof(ios->per_dev[0]); ++ const unsigned sec_size = objio_seg->num_comps * ++ sizeof(ios->ol_state.ioerrs[0]); ++ ++ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps); ++ ios = kzalloc(first_size + sec_size, GFP_KERNEL); ++ if (unlikely(!ios)) ++ return -ENOMEM; ++ ++ ios->objio_seg = objio_seg; ++ ios->ol_state.ioerrs = ((void *)ios) + first_size; ++ ios->ol_state.num_comps = objio_seg->num_comps; ++ ++ *outp = &ios->ol_state; ++ return 0; ++} ++ ++void objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ ++ kfree(ios); ++} ++ ++enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) ++{ ++ switch (oep) { ++ case OSD_ERR_PRI_NO_ERROR: ++ return (enum pnfs_osd_errno)0; ++ ++ case OSD_ERR_PRI_CLEAR_PAGES: ++ BUG_ON(1); ++ return 0; ++ ++ case OSD_ERR_PRI_RESOURCE: ++ return PNFS_OSD_ERR_RESOURCE; ++ case OSD_ERR_PRI_BAD_CRED: ++ return PNFS_OSD_ERR_BAD_CRED; ++ case OSD_ERR_PRI_NO_ACCESS: ++ return PNFS_OSD_ERR_NO_ACCESS; ++ case OSD_ERR_PRI_UNREACHABLE: ++ return PNFS_OSD_ERR_UNREACHABLE; ++ case OSD_ERR_PRI_NOT_FOUND: ++ return PNFS_OSD_ERR_NOT_FOUND; ++ case OSD_ERR_PRI_NO_SPACE: ++ return PNFS_OSD_ERR_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case OSD_ERR_PRI_EIO: ++ return PNFS_OSD_ERR_EIO; ++ 
} ++} ++ ++static void _clear_bio(struct bio *bio) ++{ ++ struct bio_vec *bv; ++ unsigned i; ++ ++ __bio_for_each_segment(bv, bio, i, 0) { ++ unsigned this_count = bv->bv_len; ++ ++ if (likely(PAGE_SIZE == this_count)) ++ clear_highpage(bv->bv_page); ++ else ++ zero_user(bv->bv_page, bv->bv_offset, this_count); ++ } ++} ++ ++static int _io_check(struct objio_state *ios, bool is_write) ++{ ++ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; ++ int lin_ret = 0; ++ int i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_sense_info osi; ++ struct osd_request *or = ios->per_dev[i].or; ++ int ret; ++ ++ if (!or) ++ continue; ++ ++ ret = osd_req_decode_sense(or, &osi); ++ if (likely(!ret)) ++ continue; ++ ++ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { ++ /* start read offset passed endof file */ ++ BUG_ON(is_write); ++ _clear_bio(ios->per_dev[i].bio); ++ dprintk("%s: start read offset passed end of file " ++ "offset=0x%llx, length=0x%lx\n", __func__, ++ _LLU(ios->per_dev[i].offset), ++ ios->per_dev[i].length); ++ ++ continue; /* we recovered */ ++ } ++ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev, ++ osd_pri_2_pnfs_err(osi.osd_err_pri), ++ ios->per_dev[i].offset, ++ ios->per_dev[i].length, ++ is_write); ++ ++ if (osi.osd_err_pri >= oep) { ++ oep = osi.osd_err_pri; ++ lin_ret = ret; ++ } ++ } ++ ++ return lin_ret; ++} ++ ++/* ++ * Common IO state helpers. 
++ */ ++static void _io_free(struct objio_state *ios) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[i]; ++ ++ if (per_dev->or) { ++ osd_end_request(per_dev->or); ++ per_dev->or = NULL; ++ } ++ ++ if (per_dev->bio) { ++ bio_put(per_dev->bio); ++ per_dev->bio = NULL; ++ } ++ } ++} ++ ++struct osd_dev * _io_od(struct objio_state *ios, unsigned dev) ++{ ++ unsigned min_dev = ios->objio_seg->layout->olo_comps_index; ++ unsigned max_dev = min_dev + ios->ol_state.num_comps; ++ ++ BUG_ON(dev < min_dev || max_dev <= dev); ++ return ios->objio_seg->ods[dev - min_dev]; ++} ++ ++struct _striping_info { ++ u64 obj_offset; ++ u64 group_length; ++ u64 total_group_length; ++ u64 Major; ++ unsigned dev; ++ unsigned unit_off; ++}; ++ ++static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, ++ struct _striping_info *si) ++{ ++ u32 stripe_unit = ios->objio_seg->stripe_unit; ++ u32 group_width = ios->objio_seg->group_width; ++ u64 group_depth = ios->objio_seg->group_depth; ++ u32 U = stripe_unit * group_width; ++ ++ u64 T = U * group_depth; ++ u64 S = T * ios->objio_seg->group_count; ++ u64 M = div64_u64(file_offset, S); ++ ++ /* ++ G = (L - (M * S)) / T ++ H = (L - (M * S)) % T ++ */ ++ u64 LmodU = file_offset - M * S; ++ u32 G = div64_u64(LmodU, T); ++ u64 H = LmodU - G * T; ++ ++ u32 N = div_u64(H, U); ++ ++ div_u64_rem(file_offset, stripe_unit, &si->unit_off); ++ si->obj_offset = si->unit_off + (N * stripe_unit) + ++ (M * group_depth * stripe_unit); ++ ++ /* "H - (N * U)" is just "H % U" so it's bound to u32 */ ++ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; ++ si->dev *= ios->objio_seg->mirrors_p1; ++ ++ si->group_length = T - H; ++ si->total_group_length = T; ++ si->Major = M; ++} ++ ++static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, ++ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len) ++{ ++ unsigned pg = *cur_pg; ++ struct request_queue *q = ++ 
osd_request_queue(_io_od(ios, per_dev->dev)); ++ ++ per_dev->length += cur_len; ++ ++ if (per_dev->bio == NULL) { ++ unsigned stripes = ios->ol_state.num_comps / ++ ios->objio_seg->mirrors_p1; ++ unsigned pages_in_stripe = stripes * ++ (ios->objio_seg->stripe_unit / PAGE_SIZE); ++ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / ++ stripes; ++ ++ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); ++ if (unlikely(!per_dev->bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", bio_size); ++ return -ENOMEM; ++ } ++ } ++ ++ while (cur_len > 0) { ++ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); ++ unsigned added_len; ++ ++ BUG_ON(ios->ol_state.nr_pages <= pg); ++ cur_len -= pglen; ++ ++ added_len = bio_add_pc_page(q, per_dev->bio, ++ ios->ol_state.pages[pg], pglen, pgbase); ++ if (unlikely(pglen != added_len)) ++ return -ENOMEM; ++ pgbase = 0; ++ ++pg; ++ } ++ BUG_ON(cur_len); ++ ++ *cur_pg = pg; ++ return 0; ++} ++ ++static int _prepare_one_group(struct objio_state *ios, u64 length, ++ struct _striping_info *si, unsigned first_comp, ++ unsigned *last_pg) ++{ ++ unsigned stripe_unit = ios->objio_seg->stripe_unit; ++ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1; ++ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1; ++ unsigned dev = si->dev; ++ unsigned first_dev = dev - (dev % devs_in_group); ++ unsigned comp = first_comp + (dev - first_dev); ++ unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; ++ unsigned cur_pg = *last_pg; ++ int ret = 0; ++ ++ while (length) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[comp]; ++ unsigned cur_len, page_off = 0; ++ ++ if (!per_dev->length) { ++ per_dev->dev = dev; ++ if (dev < si->dev) { ++ per_dev->offset = si->obj_offset + stripe_unit - ++ si->unit_off; ++ cur_len = stripe_unit; ++ } else if (dev == si->dev) { ++ per_dev->offset = si->obj_offset; ++ cur_len = stripe_unit - si->unit_off; ++ page_off = si->unit_off & ~PAGE_MASK; ++ BUG_ON(page_off && ++ (page_off != ios->ol_state.pgbase)); ++ } else { /* dev > si->dev */ ++ per_dev->offset = si->obj_offset - si->unit_off; ++ cur_len = stripe_unit; ++ } ++ ++ if (max_comp < comp) ++ max_comp = comp; ++ ++ dev += mirrors_p1; ++ dev = (dev % devs_in_group) + first_dev; ++ } else { ++ cur_len = stripe_unit; ++ } ++ if (cur_len >= length) ++ cur_len = length; ++ ++ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, ++ cur_len); ++ if (unlikely(ret)) ++ goto out; ++ ++ comp += mirrors_p1; ++ comp = (comp % devs_in_group) + first_comp; ++ ++ length -= cur_len; ++ ios->length += cur_len; ++ } ++out: ++ ios->numdevs = max_comp + mirrors_p1; ++ *last_pg = cur_pg; ++ return ret; ++} ++ ++static int _io_rw_pagelist(struct objio_state *ios) ++{ ++ u64 length = ios->ol_state.count; ++ struct _striping_info si; ++ unsigned devs_in_group = ios->objio_seg->group_width * ++ ios->objio_seg->mirrors_p1; ++ unsigned first_comp = 0; ++ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps; ++ unsigned last_pg = 0; ++ int ret = 0; ++ ++ _calc_stripe_info(ios, ios->ol_state.offset, &si); ++ while (length) { ++ if (length < si.group_length) ++ si.group_length = length; ++ ++ ret = _prepare_one_group(ios, si.group_length, &si, first_comp, ++ &last_pg); ++ if (unlikely(ret)) ++ goto out; ++ ++ length -= si.group_length; ++ ++ si.group_length = si.total_group_length; ++ si.unit_off = 0; ++ ++si.Major; ++ si.obj_offset = si.Major * 
ios->objio_seg->stripe_unit * ++ ios->objio_seg->group_depth; ++ ++ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; ++ si.dev %= num_comps; ++ ++ first_comp += devs_in_group; ++ first_comp %= num_comps; ++ } ++ ++out: ++ if (!ios->length) ++ return ret; ++ ++ return 0; ++} ++ ++static ssize_t _sync_done(struct objio_state *ios) ++{ ++ struct completion *waiting = ios->private; ++ ++ complete(waiting); ++ return 0; ++} ++ ++static void _last_io(struct kref *kref) ++{ ++ struct objio_state *ios = container_of(kref, struct objio_state, kref); ++ ++ ios->done(ios); ++} ++ ++static void _done_io(struct osd_request *or, void *p) ++{ ++ struct objio_state *ios = p; ++ ++ kref_put(&ios->kref, _last_io); ++} ++ ++static ssize_t _io_exec(struct objio_state *ios) ++{ ++ DECLARE_COMPLETION_ONSTACK(wait); ++ ssize_t status = 0; /* sync status */ ++ unsigned i; ++ objio_done_fn saved_done_fn = ios->done; ++ bool sync = ios->ol_state.sync; ++ ++ if (sync) { ++ ios->done = _sync_done; ++ ios->private = &wait; ++ } ++ ++ kref_init(&ios->kref); ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_request *or = ios->per_dev[i].or; ++ ++ if (!or) ++ continue; ++ ++ kref_get(&ios->kref); ++ osd_execute_request_async(or, _done_io, ios); ++ } ++ ++ kref_put(&ios->kref, _last_io); ++ ++ if (sync) { ++ wait_for_completion(&wait); ++ status = saved_done_fn(ios); ++ } ++ ++ return status; ++} ++ ++/* ++ * read ++ */ ++static ssize_t _read_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, false); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) ++ status = ios->length; ++ else ++ status = ret; ++ ++ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct osd_request *or = NULL; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = per_dev->dev; ++ struct pnfs_osd_object_cred *cred = ++ 
&ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ int ret; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ ++err: ++ return ret; ++} ++ ++static ssize_t _read_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = _read_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _read_done; ++ return _io_exec(ios); /* In sync mode exec returns the io status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _read_exec(ios); ++} ++ ++/* ++ * write ++ */ ++static ssize_t _write_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, true); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) { ++ /* FIXME: should be based on the OSD's persistence model ++ * See OSD2r05 Section 4.13 Data persistence model */ ++ ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC; ++ status = ios->length; ++ } else { ++ status = ret; ++ } ++ ++ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int 
_write_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = ios->per_dev[cur_comp].dev; ++ unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1; ++ int ret; ++ ++ for (; cur_comp < last_comp; ++cur_comp, ++dev) { ++ struct osd_request *or = NULL; ++ struct pnfs_osd_object_cred *cred = ++ &ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ struct bio *bio; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ if (per_dev != master_dev) { ++ bio = bio_kmalloc(GFP_KERNEL, ++ master_dev->bio->bi_max_vecs); ++ if (unlikely(!bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", ++ master_dev->bio->bi_max_vecs); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ __bio_clone(bio, master_dev->bio); ++ bio->bi_bdev = NULL; ++ bio->bi_next = NULL; ++ per_dev->bio = bio; ++ per_dev->dev = dev; ++ per_dev->length = master_dev->length; ++ per_dev->offset = master_dev->offset; ++ } else { ++ bio = master_dev->bio; ++ /* FIXME: bio_set_dir() */ ++ bio->bi_rw |= (1 << BIO_RW); ++ } ++ ++ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ } ++ ++err: ++ return ret; ++} ++ ++static ssize_t _write_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = 
_write_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _write_done; ++ return _io_exec(ios); /* In sync mode exec returns the io->status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ /* TODO: ios->stable = stable; */ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _write_exec(ios); ++} ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++objlayout_get_stripesize(struct pnfs_layout_type *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zx\n", __func__, maxsz); ++ return maxsz; ++} ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++objlayout_get_blocksize(void) ++{ ++ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE; ++ ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations objlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. 
++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = objlayout_get_stripesize, ++ .get_blocksize = objlayout_get_blocksize, ++}; ++ ++static struct pnfs_layoutdriver_type objlayout_type = { ++ .id = LAYOUT_OSD2_OBJECTS, ++ .name = "LAYOUT_OSD2_OBJECTS", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &objlayout_policy_operations, ++}; ++ ++void *objio_init_mt(void) ++{ ++ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL); ++ ++ if (!omt) ++ return ERR_PTR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&omt->dev_list); ++ spin_lock_init(&omt->dev_list_lock); ++ return omt; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++ _dev_list_remove_all(mountid); ++ kfree(mountid); ++} ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++objlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++objlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", ++ __func__); ++} ++ ++module_init(objlayout_init); ++module_exit(objlayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-23 12:09:03.350491564 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-23 12:09:03.350491564 -0400 +@@ -0,0 +1,790 @@ ++/* ++ * objlayout.c ++ * ++ * pNFS layout driver for Panasas OSDs ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. 
++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pnfs_client_operations *pnfs_client_ops; ++ ++/* ++ * Create a objlayout layout structure for the given inode and return it. 
++ */ ++static struct pnfs_layout_type * ++objlayout_alloc_layout(struct inode *inode) ++{ ++ struct objlayout *objlay; ++ ++ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL); ++ if (objlay) { ++ spin_lock_init(&objlay->lock); ++ INIT_LIST_HEAD(&objlay->err_list); ++ } ++ dprintk("%s: Return %p\n", __func__, objlay); ++ return &objlay->pnfs_layout; ++} ++ ++/* ++ * Free an objlayout layout structure ++ */ ++static void ++objlayout_free_layout(struct pnfs_layout_type *lo) ++{ ++ struct objlayout *objlay = OBJLAYOUT(lo); ++ ++ dprintk("%s: objlay %p\n", __func__, objlay); ++ ++ WARN_ON(!list_empty(&objlay->err_list)); ++ kfree(objlay); ++} ++ ++/* ++ * Unmarshall layout and store it in pnfslay. ++ */ ++static struct pnfs_layout_segment * ++objlayout_alloc_lseg(struct pnfs_layout_type *pnfslay, ++ struct nfs4_pnfs_layoutget_res *lgr) ++{ ++ int status; ++ void *layout = lgr->layout.buf; ++ struct pnfs_layout_segment *lseg; ++ struct objlayout_segment *objlseg; ++ struct pnfs_osd_layout *pnfs_osd_layout; ++ ++ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout); ++ ++ BUG_ON(!layout); ++ ++ status = -ENOMEM; ++ lseg = kzalloc(sizeof(*lseg) + sizeof(*objlseg) + ++ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL); ++ if (!lseg) ++ goto err; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout); ++ ++ status = objio_alloc_lseg(&objlseg->internal, pnfslay, lseg, ++ pnfs_osd_layout); ++ if (status) ++ goto err; ++ ++ dprintk("%s: Return %p\n", __func__, lseg); ++ return lseg; ++ ++ err: ++ kfree(lseg); ++ return ERR_PTR(status); ++} ++ ++/* ++ * Free a layout segement ++ */ ++static void ++objlayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ struct objlayout_segment *objlseg; ++ ++ dprintk("%s: freeing layout segment %p\n", __func__, lseg); ++ ++ if (unlikely(!lseg)) ++ return; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++ 
objio_free_lseg(objlseg->internal); ++ kfree(lseg); ++} ++ ++/* ++ * I/O Operations ++ */ ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1 : NFS4_MAX_UINT64; ++} ++ ++static struct objlayout_io_state * ++objlayout_alloc_io_state(struct pnfs_layout_type *pnfs_layout_type, ++ struct page **pages, ++ unsigned pgbase, ++ unsigned nr_pages, ++ loff_t offset, ++ size_t count, ++ struct pnfs_layout_segment *lseg, ++ void *rpcdata) ++{ ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct objlayout_io_state *state; ++ u64 lseg_end_offset; ++ size_t size_nr_pages; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ if (objio_alloc_io_state(objlseg->internal, &state)) ++ return NULL; ++ ++ BUG_ON(offset < lseg->range.offset); ++ lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length); ++ BUG_ON(offset >= lseg_end_offset); ++ if (offset + count > lseg_end_offset) { ++ count = lseg->range.length - (offset - lseg->range.offset); ++ dprintk("%s: truncated count %Zd\n", __func__, count); ++ } ++ ++ if (pgbase > PAGE_SIZE) { ++ unsigned n = pgbase >> PAGE_SHIFT; ++ ++ pgbase &= ~PAGE_MASK; ++ pages += n; ++ nr_pages -= n; ++ } ++ ++ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ BUG_ON(nr_pages < size_nr_pages); ++ if (nr_pages > size_nr_pages) ++ nr_pages = size_nr_pages; ++ ++ INIT_LIST_HEAD(&state->err_list); ++ state->lseg = lseg; ++ state->rpcdata = rpcdata; ++ state->pages = pages; ++ state->pgbase = pgbase; ++ state->nr_pages = nr_pages; ++ state->offset = offset; ++ state->count = count; ++ state->sync = 0; ++ ++ return state; ++} ++ ++static void ++objlayout_free_io_state(struct objlayout_io_state *state) ++{ ++ dprintk("%s: freeing io_state\n", 
__func__); ++ if (unlikely(!state)) ++ return; ++ ++ objio_free_io_state(state); ++} ++ ++/* ++ * I/O done common code ++ */ ++static void ++objlayout_iodone(struct objlayout_io_state *state) ++{ ++ dprintk("%s: state %p status\n", __func__, state); ++ ++ if (likely(state->status >= 0)) { ++ objlayout_free_io_state(state); ++ } else { ++ struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); ++ ++ spin_lock(&objlay->lock); ++ objlay->delta_space_valid = OBJ_DSU_INVALID; ++ list_add(&objlay->err_list, &state->err_list); ++ spin_unlock(&objlay->lock); ++ } ++} ++ ++/* ++ * objlayout_io_set_result - Set an osd_error code on a specific osd comp. ++ * ++ * The @index component IO failed (error returned from target). Register ++ * the error for later reporting at layout-return. ++ */ ++void ++objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, ++ int osd_error, u64 offset, u64 length, bool is_write) ++{ ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; ++ ++ BUG_ON(index >= state->num_comps); ++ if (osd_error) { ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(state->lseg); ++ struct pnfs_osd_layout *layout = ++ (typeof(layout))objlseg->pnfs_osd_layout; ++ ++ ioerr->oer_component = layout->olo_comps[index].oc_object_id; ++ ioerr->oer_comp_offset = offset; ++ ioerr->oer_comp_length = length; ++ ioerr->oer_iswrite = is_write; ++ ioerr->oer_errno = osd_error; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " ++ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", ++ __func__, index, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ } else { ++ /* User need not call if no error is reported */ ++ ioerr->oer_errno = 0; ++ } ++} ++ ++static void _rpc_commit_complete(struct work_struct *work) ++{ 
++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_client_ops->nfs_commit_complete(wdata); ++} ++ ++/* ++ * Commit data remotely on OSDs ++ */ ++enum pnfs_try_status ++objlayout_commit(struct nfs_write_data *wdata, int how) ++{ ++ int status = PNFS_ATTEMPTED; ++ ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ dprintk("%s: Return %d\n", __func__, status); ++ return status; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). ++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_read_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++} ++ ++void ++objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) ++{ ++ int eof = state->eof; ++ struct nfs_read_data *rdata; ++ ++ state->status = status; ++ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); ++ rdata = state->rpcdata; ++ rdata->task.tk_status = status; ++ if (status >= 0) { ++ rdata->res.count = status; ++ rdata->res.eof = eof; ++ } ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++ else { ++ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); ++ schedule_work(&rdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async reads. 
++ */
++/*
++ * The request is clipped to the inode size.  A read that starts at or
++ * beyond EOF is answered locally with a zero count and the eof flag set,
++ * without allocating any I/O state.  The outcome of the attempt is stored
++ * in rdata->pdata.pnfs_error; the function itself always returns
++ * PNFS_ATTEMPTED.
++ */
++enum pnfs_try_status
++objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages)
++{
++	struct objlayout_io_state *ios;
++	loff_t pos = rdata->args.offset;
++	size_t len = rdata->args.count;
++	loff_t isize;
++	ssize_t ret;
++
++	dprintk("%s: Begin inode %p offset %llu count %d\n",
++		__func__, rdata->inode, pos, (int)len);
++
++	isize = i_size_read(rdata->inode);
++	if (unlikely(pos + len > isize)) {
++		if (pos < isize) {
++			/* short read: stop at the end of the file */
++			len = isize - pos;
++		} else {
++			/* nothing to read at all */
++			rdata->res.count = 0;
++			rdata->res.eof = 1;
++			ret = 0;
++			goto done;
++		}
++	}
++
++	ios = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
++				       rdata->args.pages, rdata->args.pgbase,
++				       nr_pages, pos, len,
++				       rdata->pdata.lseg, rdata);
++	if (likely(ios)) {
++		/* the state may have been clipped to the layout segment */
++		ios->eof = ios->offset + ios->count >= isize;
++		ret = objio_read_pagelist(ios);
++	} else {
++		ret = -ENOMEM;
++	}
++
++done:
++	dprintk("%s: Return status %Zd\n", __func__, ret);
++	rdata->pdata.pnfs_error = ret;
++	return PNFS_ATTEMPTED;
++}
++
++/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
++ * This is because the osd completion is called with ints-off from
++ * the block layer
++ */
++static void _rpc_write_complete(struct work_struct *work)
++{
++	struct rpc_task *rpc = container_of(work, struct rpc_task, u.tk_work);
++	struct nfs_write_data *data =
++		container_of(rpc, struct nfs_write_data, task);
++
++	dprintk("%s enter\n", __func__);
++
++	pnfs_client_ops->nfs_writelist_complete(data);
++}
++
++/*
++ * Write completion callback for the I/O engine.
++ *
++ * Stores @status in the state and in the RPC task; on success also
++ * publishes the written byte count and the commit level recorded in the
++ * state.  The state is then retired through objlayout_iodone() and the
++ * NFS completion runs inline (@sync) or from a work item.
++ */
++void
++objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
++		     bool sync)
++{
++	struct nfs_write_data *wdata = state->rpcdata;
++
++	dprintk("%s: Begin\n", __func__);
++	state->status = status;
++	wdata->task.tk_status = status;
++	if (status < 0) {
++		dprintk("%s: Return status %d\n",
++			__func__, wdata->task.tk_status);
++	} else {
++		wdata->res.count = status;
++		wdata->verf.committed = state->committed;
++		dprintk("%s: Return status %d committed %d\n",
++			__func__, wdata->task.tk_status,
++			wdata->verf.committed);
++	}
++
++	objlayout_iodone(state);
++	/* must not use state after this point */
++
++	if (!sync) {
++		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
++		schedule_work(&wdata->task.u.tk_work);
++		return;
++	}
++	pnfs_client_ops->nfs_writelist_complete(wdata);
++}
++
++/*
++ * Perform sync or async writes.
++ */ ++enum pnfs_try_status ++objlayout_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int how) ++{ ++ struct objlayout_io_state *state; ++ ssize_t status; ++ ++ dprintk("%s: Begin inode %p offset %llu count %u\n", ++ __func__, wdata->inode, wdata->args.offset, wdata->args.count); ++ ++ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, ++ wdata->args.pages, ++ wdata->args.pgbase, ++ nr_pages, ++ wdata->args.offset, ++ wdata->args.count, ++ wdata->pdata.lseg, wdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->sync = how & FLUSH_SYNC; ++ ++ status = objio_write_pagelist(state, how & FLUSH_STABLE); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ wdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++void ++objlayout_encode_layoutcommit(struct pnfs_layout_type *pnfslay, ++ struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct pnfs_osd_layoutupdate lou; ++ __be32 *start; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ spin_lock(&objlay->lock); ++ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); ++ lou.dsu_delta = objlay->delta_space_used; ++ objlay->delta_space_used = 0; ++ objlay->delta_space_valid = OBJ_DSU_INIT; ++ lou.olu_ioerr_flag = !list_empty(&objlay->err_list); ++ spin_unlock(&objlay->lock); ++ ++ start = xdr_reserve_space(xdr, 4); ++ ++ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ ++ dprintk("%s: Return delta_space_used %lld err %d\n", __func__, ++ lou.dsu_delta, lou.olu_ioerr_flag); ++} ++ ++static int ++err_prio(u32 oer_errno) ++{ ++ switch (oer_errno) { ++ case 0: ++ return 0; ++ ++ case PNFS_OSD_ERR_RESOURCE: ++ return OSD_ERR_PRI_RESOURCE; ++ case PNFS_OSD_ERR_BAD_CRED: ++ return OSD_ERR_PRI_BAD_CRED; ++ case PNFS_OSD_ERR_NO_ACCESS: ++ return OSD_ERR_PRI_NO_ACCESS; ++ case PNFS_OSD_ERR_UNREACHABLE: 
++ return OSD_ERR_PRI_UNREACHABLE; ++ case PNFS_OSD_ERR_NOT_FOUND: ++ return OSD_ERR_PRI_NOT_FOUND; ++ case PNFS_OSD_ERR_NO_SPACE: ++ return OSD_ERR_PRI_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case PNFS_OSD_ERR_EIO: ++ return OSD_ERR_PRI_EIO; ++ } ++} ++ ++static void ++merge_ioerr(struct pnfs_osd_ioerr *dest_err, ++ const struct pnfs_osd_ioerr *src_err) ++{ ++ u64 dest_end, src_end; ++ ++ if (!dest_err->oer_errno) { ++ *dest_err = *src_err; ++ /* accumulated device must be blank */ ++ memset(&dest_err->oer_component.oid_device_id, 0, ++ sizeof(dest_err->oer_component.oid_device_id)); ++ ++ return; ++ } ++ ++ if (dest_err->oer_component.oid_partition_id != ++ src_err->oer_component.oid_partition_id) ++ dest_err->oer_component.oid_partition_id = 0; ++ ++ if (dest_err->oer_component.oid_object_id != ++ src_err->oer_component.oid_object_id) ++ dest_err->oer_component.oid_object_id = 0; ++ ++ if (dest_err->oer_comp_offset > src_err->oer_comp_offset) ++ dest_err->oer_comp_offset = src_err->oer_comp_offset; ++ ++ dest_end = end_offset(dest_err->oer_comp_offset, ++ dest_err->oer_comp_length); ++ src_end = end_offset(src_err->oer_comp_offset, ++ src_err->oer_comp_length); ++ if (dest_end < src_end) ++ dest_end = src_end; ++ ++ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; ++ ++ if ((src_err->oer_iswrite == dest_err->oer_iswrite) && ++ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { ++ dest_err->oer_errno = src_err->oer_errno; ++ } else if (src_err->oer_iswrite) { ++ dest_err->oer_iswrite = true; ++ dest_err->oer_errno = src_err->oer_errno; ++ } ++} ++ ++static void ++encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr) ++{ ++ struct objlayout_io_state *state, *tmp; ++ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ ++ for (i = 0; i < state->num_comps; i++) { ++ struct pnfs_osd_ioerr *ioerr = 
&state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ merge_ioerr(&accumulated_err, ioerr); ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++ ++ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err)); ++} ++ ++void ++objlayout_encode_layoutreturn(struct pnfs_layout_type *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_pnfs_layoutreturn_arg *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct objlayout_io_state *state, *tmp; ++ __be32 *start, *uninitialized_var(last_xdr); ++ ++ dprintk("%s: Begin\n", __func__); ++ start = xdr_reserve_space(xdr, 4); ++ BUG_ON(!start); ++ ++ spin_lock(&objlay->lock); ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ int res = 0; ++ ++ for (i = 0; i < state->num_comps && !res; i++) { ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ last_xdr = xdr->p; ++ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]); ++ } ++ if (unlikely(res)) { ++ /* no space for even one error descriptor */ ++ BUG_ON(last_xdr == start + 1); ++ ++ /* we've 
encountered a situation with lots and lots of ++ * errors and no space to encode them all. Use the last ++ * available slot to report the union of all the ++ * remaining errors. ++ */ ++ xdr_rewind_stream(xdr, last_xdr - ++ pnfs_osd_ioerr_xdr_sz() / 4); ++ encode_accumulated_error(objlay, xdr); ++ goto loop_done; ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++loop_done: ++ spin_unlock(&objlay->lock); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ dprintk("%s: Return\n", __func__); ++} ++ ++struct objlayout_deviceinfo { ++ struct page *page; ++ struct pnfs_osd_deviceaddr da; /* This must be last */ ++}; ++ ++/* Initialize and call nfs_getdeviceinfo, then decode and return a ++ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() ++ * should be called. ++ */ ++int objlayout_get_deviceinfo(struct pnfs_layout_type *pnfslay, ++ struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi; ++ struct pnfs_device pd; ++ struct super_block *sb; ++ struct page *page; ++ size_t sz; ++ u32 *p; ++ int err; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ pd.area = page_address(page); ++ ++ memcpy(&pd.dev_id, d_id, sizeof(*d_id)); ++ pd.layout_type = LAYOUT_OSD2_OBJECTS; ++ pd.dev_notify_types = 0; ++ pd.pages = &page; ++ pd.pgbase = 0; ++ pd.pglen = PAGE_SIZE; ++ pd.mincount = 0; ++ ++ sb = PNFS_INODE(pnfslay)->i_sb; ++ err = pnfs_client_ops->nfs_getdeviceinfo(PNFS_NFS_SERVER(pnfslay), &pd); ++ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); ++ if (err) ++ goto err_out; ++ ++ p = pd.area; ++ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p); ++ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL); ++ if (!odi) { ++ err = -ENOMEM; ++ goto err_out; ++ } ++ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); ++ odi->page = page; ++ *deviceaddr = &odi->da; ++ return 0; ++ ++err_out: ++ __free_page(page); ++ return err; ++} ++ ++void 
objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi = container_of(deviceaddr, ++ struct objlayout_deviceinfo, ++ da); ++ ++ __free_page(odi->page); ++ kfree(odi); ++} ++ ++/* ++ * Initialize a mountpoint by retrieving the list of ++ * available devices for it. ++ * Return the pnfs_mount_type structure so the ++ * pNFS_client can refer to the mount point later on. ++ */ ++static int ++objlayout_initialize_mountpoint(struct nfs_server *server, ++ const struct nfs_fh *mntfh) ++{ ++ void *data; ++ ++ data = objio_init_mt(); ++ if (IS_ERR(data)) { ++ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n", ++ __func__, PTR_ERR(data)); ++ return PTR_ERR(data); ++ } ++ server->pnfs_ld_data = data; ++ ++ dprintk("%s: Return data=%p\n", __func__, data); ++ return 0; ++} ++ ++/* ++ * Uninitialize a mountpoint ++ */ ++static int ++objlayout_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data); ++ objio_fini_mt(server->pnfs_ld_data); ++ return 0; ++} ++ ++struct layoutdriver_io_operations objlayout_io_operations = { ++ .commit = objlayout_commit, ++ .read_pagelist = objlayout_read_pagelist, ++ .write_pagelist = objlayout_write_pagelist, ++ .alloc_layout = objlayout_alloc_layout, ++ .free_layout = objlayout_free_layout, ++ .alloc_lseg = objlayout_alloc_lseg, ++ .free_lseg = objlayout_free_lseg, ++ .encode_layoutcommit = objlayout_encode_layoutcommit, ++ .encode_layoutreturn = objlayout_encode_layoutreturn, ++ .initialize_mountpoint = objlayout_initialize_mountpoint, ++ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, ++}; +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-23 12:09:03.351434439 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-23 12:09:03.351434439 -0400 +@@ -0,0 +1,171 @@ ++/* ++ * 
objlayout.h ++ * ++ * Data types and function declarations for interfacing with the ++ * pNFS standard object layout driver. ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */
++
++#ifndef _OBJLAYOUT_H
++#define _OBJLAYOUT_H
++
++#include
++#include
++#include
++
++/*
++ * in-core layout segment
++ *
++ * Lives in the layout-driver data area of a pnfs_layout_segment; the
++ * decoded pnfs_osd_layout follows it in the same allocation.
++ */
++struct objlayout_segment {
++	void *internal;    /* for provider internal use */
++	u8 pnfs_osd_layout[];	/* storage for the decoded struct pnfs_osd_layout */
++};
++
++/*
++ * per-inode layout
++ */
++struct objlayout {
++	/* generic layout embedded here; OBJLAYOUT() maps it back to this struct */
++	struct pnfs_layout_type pnfs_layout;
++
++	 /* for layout_commit */
++	enum osd_delta_space_valid_enum {
++		OBJ_DSU_INIT = 0,	/* nothing accumulated since the last commit */
++		OBJ_DSU_VALID,		/* delta_space_used may be reported */
++		OBJ_DSU_INVALID,	/* an I/O failed; do not report */
++	} delta_space_valid;
++	s64 delta_space_used;  /* consumed by write ops */
++
++	 /* for layout_return */
++	spinlock_t lock;		/* taken around the fields above and err_list */
++	struct list_head err_list;	/* failed objlayout_io_state entries */
++};
++
++/* Map the generic per-inode layout back to its enclosing objlayout. */
++static inline struct objlayout *
++OBJLAYOUT(struct pnfs_layout_type *lo)
++{
++	return container_of(lo, struct objlayout, pnfs_layout);
++}
++
++/*
++ * per-I/O operation state
++ * embedded in objects provider io_state data structure
++ */
++struct objlayout_io_state {
++	struct pnfs_layout_segment *lseg;	/* segment the I/O runs against */
++
++	struct page **pages;	/* first page of the request */
++	unsigned pgbase;	/* byte offset into the first page */
++	unsigned nr_pages;
++	unsigned long count;	/* bytes to transfer */
++	loff_t offset;		/* file offset of the request */
++	bool sync;		/* complete inline instead of via a work item */
++
++	void *rpcdata;		/* the nfs_read_data / nfs_write_data of the request */
++	int status;             /* res */
++	int eof;                /* res */
++	int committed;          /* res */
++
++	/* Error reporting (layout_return) */
++	struct list_head err_list;
++	unsigned num_comps;
++	/* Pointer to array of error descriptors of size num_comps.
++	 * It should contain as many entries as devices in the osd_layout
++	 * that participate in the I/O. It is up to the io_engine to allocate
++	 * needed space and set num_comps.
++	 */
++	struct pnfs_osd_ioerr *ioerrs;
++};
++
++/*
++ * Raid engine I/O API
++ *
++ * Implemented by the I/O engine that is linked with this generic code.
++ */
++extern void *objio_init_mt(void);
++extern void objio_fini_mt(void *mt);
++
++extern int objio_alloc_lseg(void **outp,
++	struct pnfs_layout_type *pnfslay,
++	struct pnfs_layout_segment *lseg,
++	struct pnfs_osd_layout *layout);
++extern void objio_free_lseg(void *p);
++
++extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp);
++extern void objio_free_io_state(struct objlayout_io_state *state);
++
++extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
++extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
++				    bool stable);
++
++/*
++ * callback API
++ *
++ * Called by the I/O engine to report results back to the generic code.
++ */
++extern void objlayout_io_set_result(struct objlayout_io_state *state,
++			unsigned index, int osd_error,
++			u64 offset, u64 length, bool is_write);
++
++/*
++ * Add @space_used to the layout's pending delta-space total, unless the
++ * total has already been marked invalid.
++ */
++static inline void
++objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
++{
++	struct objlayout *objlay = OBJLAYOUT(state->lseg->layout);
++
++	/* If one of the I/Os errored out and the delta_space_used was
++	 * invalid we render the complete report as invalid. Protocol mandate
++	 * the DSU be accurate or not reported.
++	 */
++	spin_lock(&objlay->lock);
++	if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
++		objlay->delta_space_valid = OBJ_DSU_VALID;
++		objlay->delta_space_used += space_used;
++	}
++	spin_unlock(&objlay->lock);
++}
++
++extern void objlayout_read_done(struct objlayout_io_state *state,
++				ssize_t status, bool sync);
++extern void objlayout_write_done(struct objlayout_io_state *state,
++				 ssize_t status, bool sync);
++
++extern int objlayout_get_deviceinfo(struct pnfs_layout_type *pnfslay,
++	struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr);
++extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
++
++/*
++ * exported generic objects function vectors
++ */
++extern struct layoutdriver_io_operations objlayout_io_operations;
++extern struct pnfs_client_operations *pnfs_client_ops;
++
++#endif /* _OBJLAYOUT_H */
+diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c
+--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-23 12:09:03.352501716 -0400
++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-23 12:09:03.352501716 -0400
+@@ -0,0 +1,734 @@
++/*
++ * panfs_shim.c
++ *
++ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack
++ *
++ * Copyright (C) 2007-2009 Panasas Inc.
++ * All rights reserved.
++ *
++ * Benny Halevy
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ *    notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ *    notice, this list of conditions and the following disclaimer in the
++ *    documentation and/or other materials provided with the distribution.
++ * 3.
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "objlayout.h" ++#include "panfs_shim.h" ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct panfs_export_operations *panfs_export_ops; ++ ++void * ++objio_init_mt(void) ++{ ++ return panfs_export_ops == NULL ? 
ERR_PTR(-EAGAIN) : NULL; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++} ++ ++static int ++panfs_shim_conv_raid01(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) { ++ hdr->type = PAN_AGG_RAID1; ++ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1; ++ } else if (layout->olo_num_comps > 1) { ++ hdr->type = PAN_AGG_RAID0; ++ hdr->hdr.raid0.num_comps = layout->olo_num_comps; ++ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit; ++ } else ++ hdr->type = PAN_AGG_SIMPLE; ++ return 0; ++} ++ ++static int ++panfs_shim_conv_raid5(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) ++ goto err; ++ ++ if (lo_map->odm_group_width || lo_map->odm_group_depth) { ++ if (!lo_map->odm_group_width || !lo_map->odm_group_depth) ++ goto err; ++ ++ hdr->type = PAN_AGG_GRP_RAID5_LEFT; ++ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit; ++ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width; ++ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth; ++ /* this is a guess, panasas server is not supposed to ++ hand out layout otherwise */ ++ hdr->hdr.grp_raid5_left.group_layout_policy = ++ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN; ++ } else { ++ hdr->type = PAN_AGG_RAID5_LEFT; ++ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.raid5_left.stripe_unit2 = ++ hdr->hdr.raid5_left.stripe_unit1 = ++ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit; ++ } ++ ++ return 0; ++err: ++ return -EINVAL; ++} ++ ++/* ++ * Convert a pnfs_osd data map into Panasas aggregation layout header ++ */ ++static int ++panfs_shim_conv_pnfs_osd_data_map( ++ struct pnfs_osd_layout 
*layout, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ int status = -EINVAL; ++ struct pnfs_osd_data_map *lo_map = &layout->olo_map; ++ ++ if (!layout->olo_num_comps) { ++ dprintk("%s: !!layout.n_comps(%u)\n", __func__, ++ layout->olo_num_comps); ++ goto err; ++ } ++ ++ switch (lo_map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_0 " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid01(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_5: ++ if (!lo_map->odm_group_width) { ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width " ++ "layout.n_comps(%u)!=map.n_comps(%u) " ++ "|| comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ } else if ((layout->olo_num_comps != lo_map->odm_num_comps && ++ layout->olo_num_comps > lo_map->odm_group_width) || ++ (layout->olo_comps_index % lo_map->odm_group_width)){ ++ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ lo_map->odm_group_width, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid5(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_PQ: ++ default: ++ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__, ++ lo_map->odm_raid_algorithm); ++ goto err; ++ } ++ ++ return status; /* propagate the raid converter's result, do not mask it */ ++ ++err: ++ return status; ++} ++ ++/* ++ * Convert pnfs_osd layout into Panasas map and caps type ++ */ ++int ++objio_alloc_lseg(void **outp, ++ struct pnfs_layout_type *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) 
++{ ++ int i, total_comps; ++ int status; ++ struct pnfs_osd_object_cred *lo_comp; ++ pan_size_t alloc_sz, local_sz; ++ pan_sm_map_cap_t *mcs = NULL; ++ u8 *buf; ++ pan_agg_comp_obj_t *pan_comp; ++ pan_sm_sec_t *pan_sec; ++ ++ status = -EINVAL; ++ if (layout->olo_num_comps < layout->olo_map.odm_group_width) { ++ total_comps = layout->olo_comps_index + layout->olo_num_comps; ++ } else { ++ /* allocate full map, otherwise SAM gets confused */ ++ total_comps = layout->olo_map.odm_num_comps; ++ } ++ alloc_sz = total_comps * ++ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t)); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p = layout->olo_comps[i].oc_cap.cred; ++ if (panfs_export_ops->sm_sec_t_get_size_otw( ++ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL)) ++ goto err; ++ alloc_sz += local_sz; ++ } ++ ++ status = -ENOMEM; ++ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL); ++ if (!mcs) ++ goto err; ++ buf = (u8 *)&mcs[1]; ++ ++ mcs->offset = lseg->range.offset; ++ mcs->length = lseg->range.length; ++#if 0 ++ /* FIXME: for now */ ++ mcs->expiration_time.ts_sec = 0; ++ mcs->expiration_time.ts_nsec = 0; ++#endif ++ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL; ++ status = panfs_shim_conv_pnfs_osd_data_map(layout, ++ &mcs->full_map.layout_hdr); ++ if (status) ++ goto err; ++ ++ mcs->full_map.components.size = total_comps; ++ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf; ++ buf += total_comps * sizeof(pan_agg_comp_obj_t); ++ ++ mcs->secs.size = total_comps; ++ mcs->secs.data = (pan_sm_sec_t *)buf; ++ buf += total_comps * sizeof(pan_sm_sec_t); ++ ++ lo_comp = layout->olo_comps; ++ pan_comp = mcs->full_map.components.data + layout->olo_comps_index; ++ pan_sec = mcs->secs.data + layout->olo_comps_index; ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p; ++ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id; ++ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id; ++ u64 dev_id = __be64_to_cpup( ++ (__be64 
*)oc_obj_id->oid_device_id.data + 1); ++ ++ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n", ++ __func__, i, ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data), ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1), ++ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id); ++ ++ if (i == 0) { ++ /* make up mgr_id to calm sam down */ ++ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0, ++ &obj_id->dev_id); ++ obj_id->grp_id = oc_obj_id->oid_partition_id; ++ obj_id->obj_id = oc_obj_id->oid_object_id; ++ } ++ ++ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) { ++ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n", ++ __func__, i, (u64)obj_id->grp_id, ++ lo_comp->oc_object_id.oid_partition_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) { ++ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n", ++ __func__, i, obj_id->obj_id, ++ lo_comp->oc_object_id.oid_object_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ pan_comp->dev_id = dev_id; ++ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) { ++ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n", ++ __func__, i, obj_id->dev_id); ++ status = -EINVAL; ++ goto err; ++ } ++ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) { ++ dprintk("%s: degraded maps not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL; ++ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) { ++ dprintk("%s: cap key security not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ ++ p = lo_comp->oc_cap.cred; ++ panfs_export_ops->sm_sec_t_unmarshall( ++ (pan_sm_sec_otw_t *)&p, ++ pan_sec, ++ buf, ++ alloc_sz, ++ NULL, ++ &local_sz); ++ buf += local_sz; ++ alloc_sz -= local_sz; ++ ++ lo_comp++; ++ pan_comp++; ++ pan_sec++; ++ } ++ ++ *outp = mcs; ++ dprintk("%s:Return mcs=%p\n", __func__, mcs); ++ return 0; ++ ++err: ++ 
objio_free_lseg(mcs); ++ dprintk("%s:Error %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Free a Panasas map and caps type ++ */ ++void ++objio_free_lseg(void *p) ++{ ++ kfree(p); ++} ++ ++/* ++ * I/O routines ++ */ ++int ++objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct panfs_shim_io_state *p; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ p = kzalloc(sizeof(*p), GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ *outp = &p->ol_state; ++ return 0; ++} ++ ++/* ++ * Free an I/O state ++ */ ++void ++objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ int i; ++ ++ dprintk("%s: freeing io_state\n", __func__); ++ for (i = 0; i < state->ol_state.nr_pages; i++) ++ kunmap(state->ol_state.pages[i]); ++ ++ if (state->ucreds) ++ panfs_export_ops->ucreds_put(state->ucreds); ++ kfree(state->sg_list); ++ kfree(state); ++} ++ ++static int ++panfs_shim_pages_to_sg( ++ struct panfs_shim_io_state *state, ++ struct page **pages, ++ unsigned int pgbase, ++ unsigned nr_pages, ++ size_t count) ++{ ++ unsigned i, n; ++ pan_sg_entry_t *sg; ++ ++ dprintk("%s pgbase %u nr_pages %u count %d " ++ "pg0 %p flags 0x%x index %llu\n", ++ __func__, pgbase, nr_pages, (int)count, pages[0], ++ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index); ++ ++ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL); ++ if (sg == NULL) ++ return -ENOMEM; ++ ++ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n", ++ __func__, sg, pages, pgbase, nr_pages); ++ ++ for (i = 0; i < nr_pages; i++) { ++ sg[i].buffer = (char *)kmap(pages[i]) + pgbase; ++ n = PAGE_SIZE - pgbase; ++ pgbase = 0; ++ if (n > count) ++ n = count; ++ sg[i].chunk_size = n; ++ count -= n; ++ if (likely(count)) { ++ sg[i].next = &sg[i+1]; ++ } else { ++ /* we're done */ ++ sg[i].next = NULL; ++ break; ++ } ++ } ++ BUG_ON(count); ++ ++ state->sg_list = sg; ++ return 0; 
++} ++ ++/* ++ * Callback function for async reads ++ */ ++static void ++panfs_shim_read_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.read.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++ status = res_p->length; ++ WARN_ON(status < 0); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_read rc %d: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_read_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.read.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP, ++ &state->u.read.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? 
++ NULL : panfs_shim_read_done, ++ state, NULL, ++ &state->u.read.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_read_done(state, NULL, &state->u.read.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Callback function for async writes ++ */ ++static void ++panfs_shim_write_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.write.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++/* state->ol_state.committed = NFS_FILE_SYNC;*/ ++ state->ol_state.committed = NFS_UNSTABLE; ++ status = res_p->length; ++ WARN_ON(status < 0); ++ ++ objlayout_add_delta_space_used(&state->ol_state, ++ res_p->delta_capacity_used); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_write rc %u: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_write_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable /* unused, PanOSD writes are stable */) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ 
state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.write.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE, ++ &state->u.write.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? ++ NULL : panfs_shim_write_done, ++ state, ++ NULL, ++ &state->u.write.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_write_done(state, NULL, &state->u.write.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++int ++panfs_shim_register(struct panfs_export_operations *ops) ++{ ++ if (panfs_export_ops) { ++ printk(KERN_INFO ++ "%s: panfs already registered (panfs ops %p)\n", ++ __func__, panfs_export_ops); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: registering panfs ops %p\n", ++ __func__, ops); ++ ++ panfs_export_ops = ops; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_register); ++ ++int ++panfs_shim_unregister(void) ++{ ++ if (!panfs_export_ops) { ++ printk(KERN_INFO "%s: panfs is not registered\n", __func__); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: unregistering panfs ops %p\n", ++ __func__, panfs_export_ops); ++ ++ panfs_export_ops = NULL; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_unregister); ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++panlayout_get_stripesize(struct pnfs_layout_type *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *panlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)panlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: 
++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ n *= 8; /* FIXME: until we have 2-D coalescing */ ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zd\n", __func__, maxsz); ++ return maxsz; ++} ++ ++#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024) ++#define PANLAYOUT_DEF_STRIPE_WIDTH 9 ++#define PANLAYOUT_MAX_STRIPE_WIDTH 11 ++#define PANLAYOUT_MAX_GATHER_STRIPES 8 ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++panlayout_get_blocksize(void) ++{ ++ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) * ++ PANLAYOUT_DEF_STRIPE_UNIT * ++ PANLAYOUT_MAX_GATHER_STRIPES; ++ dprintk("%s: Return %Zd\n", __func__, sz); ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations panlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. ++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = panlayout_get_stripesize, ++ .get_blocksize = panlayout_get_blocksize, ++}; ++ ++#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS) ++ ++static struct pnfs_layoutdriver_type panlayout_type = { ++ .id = PNFS_LAYOUT_PANOSD, ++ .name = "PNFS_LAYOUT_PANOSD", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &panlayout_policy_operations, ++}; ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++panlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++panlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++} ++ 
++module_init(panlayout_init); ++module_exit(panlayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-23 12:09:03.353501685 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-23 12:09:03.353501685 -0400 +@@ -0,0 +1,482 @@ ++/* ++ * panfs_shim.h ++ * ++ * Data types and external function declarations for interfacing with ++ * panfs (Panasas DirectFlow) I/O stack ++ * ++ * Copyright (C) 2007 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#ifndef _PANLAYOUT_PANFS_SHIM_H ++#define _PANLAYOUT_PANFS_SHIM_H ++ ++typedef s8 pan_int8_t; ++typedef u8 pan_uint8_t; ++typedef s16 pan_int16_t; ++typedef u16 pan_uint16_t; ++typedef s32 pan_int32_t; ++typedef u32 pan_uint32_t; ++typedef s64 pan_int64_t; ++typedef u64 pan_uint64_t; ++ ++/* ++ * from pan_base_types.h ++ */ ++typedef pan_uint64_t pan_rpc_none_t; ++typedef pan_uint32_t pan_rpc_arrdim_t; ++typedef pan_uint32_t pan_status_t; ++typedef pan_uint8_t pan_otw_t; ++typedef pan_uint8_t pan_pad_t; ++ ++typedef pan_uint32_t pan_timespec_sec_t; ++typedef pan_uint32_t pan_timespec_nsec_t; ++ ++typedef struct pan_timespec_s pan_timespec_t; ++struct pan_timespec_s { ++ pan_timespec_sec_t ts_sec; ++ pan_timespec_nsec_t ts_nsec; ++}; ++ ++/* ++ * from pan_std_types.h ++ */ ++typedef pan_uint32_t pan_size_t; ++typedef int pan_bool_t; ++ ++/* ++ * from pan_common_error.h ++ */ ++#define PAN_SUCCESS ((pan_status_t)0) ++#define PAN_ERR_IN_PROGRESS ((pan_status_t)55) ++ ++/* ++ * from pan_sg.h ++ */ ++typedef struct pan_sg_entry_s pan_sg_entry_t; ++struct pan_sg_entry_s { ++ void *buffer; /* pointer to memory */ ++ pan_uint32_t chunk_size; /* size of each chunk (bytes) */ ++ pan_sg_entry_t *next; ++}; ++ ++/* ++ * from pan_storage.h ++ */ ++typedef pan_uint64_t pan_stor_dev_id_t; ++typedef pan_uint32_t pan_stor_obj_grp_id_t; ++typedef 
pan_uint64_t pan_stor_obj_uniq_t; ++typedef pan_uint32_t pan_stor_action_t; ++typedef pan_uint8_t pan_stor_cap_key_t[20]; ++ ++typedef pan_uint8_t pan_stor_key_type_t; ++typedef pan_uint64_t pan_stor_len_t; ++typedef pan_int64_t pan_stor_delta_len_t; ++typedef pan_uint64_t pan_stor_offset_t; ++typedef pan_uint16_t pan_stor_op_t; ++ ++typedef pan_uint16_t pan_stor_sec_level_t; ++ ++struct pan_stor_obj_id_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_obj_uniq_t obj_id; ++ pan_stor_obj_grp_id_t grp_id; ++}; ++ ++typedef struct pan_stor_obj_id_s pan_stor_obj_id_t; ++ ++#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U) ++#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U) ++#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U) ++#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U) ++#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U) ++#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U) ++#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U) ++#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U) ++ ++/* ++ * from pan_aggregation_map.h ++ */ ++typedef pan_uint8_t pan_agg_type_t; ++typedef pan_uint64_t pan_agg_map_version_t; ++typedef pan_uint8_t pan_agg_obj_state_t; ++typedef pan_uint8_t pan_agg_comp_state_t; ++typedef pan_uint8_t pan_agg_comp_flag_t; ++ ++#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00) ++#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01) ++#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02) ++#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03) ++#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04) ++#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05) ++#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06) ++#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07) ++#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00) ++#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01) ++#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02) ++#define 
PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03) ++#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00) ++#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01) ++#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02) ++#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04) ++ ++struct pan_aggregation_map_s { ++ pan_agg_map_version_t version; ++ pan_agg_obj_state_t avail_state; ++ pan_stor_obj_id_t obj_id; ++}; ++ ++typedef struct pan_aggregation_map_s pan_aggregation_map_t; ++ ++struct pan_agg_comp_obj_s { ++ pan_stor_dev_id_t dev_id; ++ pan_agg_comp_state_t avail_state; ++ pan_agg_comp_flag_t comp_flags; ++}; ++ ++typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t; ++ ++struct pan_agg_simple_header_s { ++ pan_uint8_t unused; ++}; ++ ++typedef struct pan_agg_simple_header_s pan_agg_simple_header_t; ++ ++struct pan_agg_raid1_header_s { ++ pan_uint16_t num_comps; ++}; ++ ++typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t; ++ ++struct pan_agg_raid0_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++}; ++ ++typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t; ++ ++struct pan_agg_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit0; ++ pan_uint32_t stripe_unit1; ++ pan_uint32_t stripe_unit2; ++}; ++ ++typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t; ++ ++typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t; ++ ++struct pan_agg_grp_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++ pan_uint16_t rg_width; ++ pan_uint16_t rg_depth; ++ pan_uint8_t group_layout_policy; ++}; ++ ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00) ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01) ++ ++#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00) ++#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02) ++#define PAN_AGG_RAID0 
((pan_agg_type_t) 0x03) ++#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04) ++#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06) ++#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06) ++ ++struct pan_agg_layout_hdr_s { ++ pan_agg_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_uint64_t null; ++ pan_agg_simple_header_t simple; ++ pan_agg_raid1_header_t raid1; ++ pan_agg_raid0_header_t raid0; ++ pan_agg_raid5_left_header_t raid5_left; ++ pan_agg_grp_raid5_left_header_t grp_raid5_left; ++ } hdr; ++}; ++ ++typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t; ++ ++struct pan_agg_comp_obj_a_s { ++ pan_rpc_arrdim_t size; ++ pan_agg_comp_obj_t *data; ++}; ++typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a; ++ ++struct pan_agg_full_map_s { ++ pan_aggregation_map_t map_hdr; ++ pan_agg_layout_hdr_t layout_hdr; ++ pan_agg_comp_obj_a components; ++}; ++ ++typedef struct pan_agg_full_map_s pan_agg_full_map_t; ++ ++/* ++ * from pan_obsd_rpc_types.h ++ */ ++typedef pan_uint8_t pan_obsd_security_key_a[16]; ++ ++typedef pan_uint8_t pan_obsd_capability_key_a[20]; ++ ++typedef pan_uint8_t pan_obsd_key_holder_id_t; ++ ++#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01) ++#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02) ++ ++struct pan_obsd_key_holder_s { ++ pan_obsd_key_holder_id_t select; ++ pan_pad_t pad[3]; ++ union { ++ pan_obsd_security_key_a basis_key; ++ pan_obsd_capability_key_a cap_key; ++ } key; ++}; ++ ++typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t; ++ ++/* ++ * from pan_sm_sec.h ++ */ ++typedef pan_uint8_t pan_sm_sec_type_t; ++typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t; ++ ++struct pan_obsd_capability_generic_otw_t_s { ++ pan_rpc_arrdim_t size; ++ pan_uint8_t *data; ++}; ++typedef struct pan_obsd_capability_generic_otw_t_s ++ pan_obsd_capability_generic_otw_t; ++ ++struct pan_sm_sec_obsd_s { ++ pan_obsd_key_holder_t key; ++ 
pan_obsd_capability_generic_otw_t cap_otw; ++ pan_sm_sec_otw_allo_mode_t allo_mode; ++}; ++ ++typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t; ++ ++struct pan_sm_sec_s { ++ pan_sm_sec_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_rpc_none_t none; ++ pan_sm_sec_obsd_t obsd; ++ } variant; ++}; ++ ++typedef struct pan_sm_sec_s pan_sm_sec_t; ++ ++struct pan_sm_sec_a_s { ++ pan_rpc_arrdim_t size; ++ pan_sm_sec_t *data; ++}; ++typedef struct pan_sm_sec_a_s pan_sm_sec_a; ++typedef pan_otw_t *pan_sm_sec_otw_t; ++ ++/* ++ * from pan_sm_types.h ++ */ ++typedef pan_uint64_t pan_sm_cap_handle_t; ++ ++struct pan_sm_map_cap_s { ++ pan_agg_full_map_t full_map; ++ pan_stor_offset_t offset; ++ pan_stor_len_t length; ++ pan_sm_sec_a secs; ++ pan_sm_cap_handle_t handle; ++ pan_timespec_t expiration_time; ++ pan_stor_action_t action_mask; ++ pan_uint32_t flags; ++}; ++ ++typedef struct pan_sm_map_cap_s pan_sm_map_cap_t; ++ ++/* ++ * from pan_sm_ops.h ++ */ ++typedef pan_rpc_none_t pan_sm_cache_ptr_t; ++ ++/* ++ * from pan_sam_api.h ++ */ ++typedef pan_uint32_t pan_sam_access_flags_t; ++ ++typedef struct pan_sam_dev_error_s pan_sam_dev_error_t; ++struct pan_sam_dev_error_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_op_t stor_op; ++ pan_status_t error; ++}; ++ ++typedef struct pan_sam_ext_status_s pan_sam_ext_status_t; ++struct pan_sam_ext_status_s { ++ pan_uint32_t available; ++ pan_uint32_t size; ++ pan_sam_dev_error_t *errors; ++}; ++ ++enum pan_sam_rpc_sec_sel_e { ++ PAN_SAM_RPC_SEC_DEFAULT, ++ PAN_SAM_RPC_SEC_ATLEAST, ++ PAN_SAM_RPC_SEC_EXACTLY ++}; ++typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t; ++ ++typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t; ++struct pan_sam_obj_sec_s { ++ pan_stor_sec_level_t min_security; ++ pan_sm_map_cap_t *map_ccaps; ++}; ++ ++typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t; ++struct pan_sam_rpc_sec_s { ++ pan_sam_rpc_sec_sel_t selector; ++}; ++ ++typedef struct pan_sam_read_args_s pan_sam_read_args_t; ++struct 
pan_sam_read_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ void *return_attr; ++ void *checksum; ++ pan_stor_offset_t offset; ++ pan_uint16_t sm_options; ++ void *callout; ++ void *callout_arg; ++}; ++ ++typedef struct pan_sam_read_res_s pan_sam_read_res_t; ++struct pan_sam_read_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ void *attr; ++ void *checksum; ++}; ++ ++typedef void (*pan_sam_read_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t status); ++ ++#define PAN_SAM_ACCESS_NONE 0x0000 ++#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020 ++ ++typedef struct pan_sam_write_args_s pan_sam_write_args_t; ++struct pan_sam_write_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ pan_stor_offset_t offset; ++ void *attr; ++ void *return_attr; ++}; ++ ++typedef struct pan_sam_write_res_s pan_sam_write_res_t; ++struct pan_sam_write_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ pan_stor_delta_len_t delta_capacity_used; ++ pan_bool_t parity_dirty; ++ void *attr; ++}; ++ ++typedef void (*pan_sam_write_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t status); ++ ++/* ++ * from pan_mgr_types.h ++ */ ++#define PAN_MGR_ID_TYPE_SHIFT 56 ++#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL) ++#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL) ++ ++typedef pan_uint16_t pan_mgr_type_t; ++typedef pan_uint64_t pan_mgr_id_t; ++ ++#define PAN_MGR_SM ((pan_mgr_type_t) 2U) ++#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U) ++ ++/* ++ * from pan_mgr_types_c.h ++ */ ++#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \ ++ pan_mgr_id_t _id1, _id2; \ ++\ ++ _id1 = (_mgr_type_); \ ++ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \ ++ _id1 &= PAN_MGR_ID_TYPE_MASK; \ ++ _id2 = (_mgr_uniq_); \ ++ _id2 &= PAN_MGR_ID_UNIQ_MASK; \ 
++ _id1 |= _id2; \ ++ *(_mgr_id_p_) = _id1; \ ++} ++ ++/* ++ * from pan_storage_c.h ++ */ ++#define pan_stor_is_device_id_an_obsd_id(_device_id_) \ ++ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \ ++ == PAN_MGR_OBSD) ++ ++/* ++ * pnfs_shim internal definitions ++ */ ++ ++struct panfs_shim_io_state { ++ struct objlayout_io_state ol_state; ++ ++ pan_sg_entry_t *sg_list; ++ pan_sam_obj_sec_t obj_sec; ++ void *ucreds; ++ union { ++ struct { ++ pan_sam_read_args_t args; ++ pan_sam_read_res_t res; ++ } read; ++ struct { ++ pan_sam_write_args_t args; ++ pan_sam_write_res_t res; ++ } write; ++ } u; ++}; ++ ++#endif /* _PANLAYOUT_PANFS_SHIM_H */ +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-23 12:09:03.354501721 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-23 12:09:03.354501721 -0400 +@@ -0,0 +1,435 @@ ++/* ++ * pnfs_osd_xdr.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* ++ * The following implementation is based on these Internet Drafts: ++ * ++ * draft-ietf-nfsv4-minorversion-21 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid) ++{ ++ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data)); ++ READ64(objid->oid_partition_id); ++ READ64(objid->oid_object_id); ++ return p; ++} ++ ++static inline u32 * ++pnfs_osd_xdr_decode_opaque_cred(u32 *p, ++ struct pnfs_osd_opaque_cred *opaque_cred) ++{ ++ READ32(opaque_cred->cred_len); ++ COPYMEM(opaque_cred->cred, opaque_cred->cred_len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct 
pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp, ++ u8 **credp) ++{ ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id); ++ READ32(comp->oc_osd_version); ++ READ32(comp->oc_cap_key_sec); ++ ++ cred = *credp; ++ comp->oc_cap_key.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len)); ++ comp->oc_cap.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len)); ++ *credp = cred; ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map) ++{ ++ READ32(data_map->odm_num_comps); ++ READ64(data_map->odm_stripe_unit); ++ READ32(data_map->odm_group_width); ++ READ32(data_map->odm_group_depth); ++ READ32(data_map->odm_mirror_cnt); ++ READ32(data_map->odm_raid_algorithm); ++ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " ++ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", ++ __func__, ++ data_map->odm_num_comps, ++ (unsigned long long)data_map->odm_stripe_unit, ++ data_map->odm_group_width, ++ data_map->odm_group_depth, ++ data_map->odm_mirror_cnt, ++ data_map->odm_raid_algorithm); ++ return p; ++} ++ ++struct pnfs_osd_layout * ++pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p) ++{ ++ int i; ++ u32 *start = p; ++ struct pnfs_osd_object_cred *comp; ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map); ++ READ32(layout->olo_comps_index); ++ READ32(layout->olo_num_comps); ++ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1); ++ comp = 
layout->olo_comps; ++ cred = (u8 *)(comp + layout->olo_num_comps); ++ dprintk("%s: comps_index=%u num_comps=%u\n", ++ __func__, layout->olo_comps_index, layout->olo_num_comps); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred); ++ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "key_len=%u cap_len=%u\n", ++ __func__, i, ++ _DEVID_LO(&comp->oc_object_id.oid_device_id), ++ _DEVID_HI(&comp->oc_object_id.oid_device_id), ++ comp->oc_object_id.oid_partition_id, ++ comp->oc_object_id.oid_object_id, ++ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); ++ comp++; ++ } ++ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__, ++ (char *)p - (char *)start, cred, (char *)cred - (char *)layout); ++ return layout; ++} ++ ++/* ++ * Get Device Information Decoding ++ * ++ * Note: since Device Information is currently done synchronously, most ++ * of the actual fields are left inside the rpc buffer and are only ++ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer ++ * should not be freed while the returned information is in use. 
++ */ ++ ++u32 *__xdr_read_calc_nfs4_string( ++ u32 *p, struct nfs4_string *str, u8 **freespace) ++{ ++ u32 len; ++ char *data; ++ bool need_copy; ++ ++ READ32(len); ++ data = (char *)p; ++ ++ if (data[len]) { /* Not null terminated we'll need extra space */ ++ data = *freespace; ++ *freespace += len + 1; ++ need_copy = true; ++ } else { ++ need_copy = false; ++ } ++ ++ if (str) { ++ str->len = len; ++ str->data = data; ++ if (need_copy) { ++ memcpy(data, p, len); ++ data[len] = 0; ++ } ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++u32 *__xdr_read_calc_u8_opaque( ++ u32 *p, struct nfs4_string *str) ++{ ++ u32 len; ++ ++ READ32(len); ++ ++ if (str) { ++ str->len = len; ++ str->data = (char *)p; ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetid { ++ * u32 oti_type; ++ * struct nfs4_string oti_scsi_device_id; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetid( ++ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace) ++{ ++ u32 oti_type; ++ ++ READ32(oti_type); ++ if (targetid) ++ targetid->oti_type = oti_type; ++ ++ switch (oti_type) { ++ case OBJ_TARGET_SCSI_NAME: ++ case OBJ_TARGET_SCSI_DEVICE_ID: ++ p = __xdr_read_calc_u8_opaque(p, ++ targetid ? &targetid->oti_scsi_device_id : NULL); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_net_addr { ++ * struct nfs4_string r_netid; ++ * struct nfs4_string r_addr; ++ * }; ++ */ ++u32 *__xdr_read_calc_net_addr( ++ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace) ++{ ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_netid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? 
&netaddr->r_addr : NULL, ++ freespace); ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetaddr { ++ * u32 ota_available; ++ * struct pnfs_osd_net_addr ota_netaddr; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetaddr( ++ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace) ++{ ++ u32 ota_available; ++ ++ READ32(ota_available); ++ if (targetaddr) ++ targetaddr->ota_available = ota_available; ++ ++ if (ota_available) { ++ p = __xdr_read_calc_net_addr(p, ++ targetaddr ? &targetaddr->ota_netaddr : NULL, ++ freespace); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++u32 *__xdr_read_calc_deviceaddr( ++ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace) ++{ ++ p = __xdr_read_calc_targetid(p, ++ deviceaddr ? &deviceaddr->oda_targetid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_targetaddr(p, ++ deviceaddr ? &deviceaddr->oda_targetaddr : NULL, ++ freespace); ++ ++ if (deviceaddr) ++ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun)); ++ else ++ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun)); ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? &deviceaddr->oda_systemid : NULL); ++ ++ if (deviceaddr) { ++ p = pnfs_osd_xdr_decode_object_cred(p, ++ &deviceaddr->oda_root_obj_cred, freespace); ++ } else { ++ *freespace += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? 
&deviceaddr->oda_osdname : NULL); ++ ++ return p; ++} ++ ++size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p) ++{ ++ u8 *null_freespace = NULL; ++ size_t sz; ++ ++ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace); ++ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace; ++ ++ return sz; ++} ++ ++void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p) ++{ ++ u8 *freespace = (u8 *)(deviceaddr + 1); ++ ++ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace); ++} ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; -- encoded only when dsu_valid is set ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou) ++{ ++ __be32 *p = xdr_reserve_space(xdr, lou->dsu_valid ? 16 : 8); ++ ++ if (!p) ++ return -E2BIG; ++ ++ *p++ = cpu_to_be32(lou->dsu_valid); ++ if (lou->dsu_valid) ++ p = xdr_encode_hyper(p, lou->dsu_delta); ++ *p++ = cpu_to_be32(lou->olu_ioerr_flag); ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ */ ++static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p; ++ ++ p = xdr_reserve_space(xdr, 32); ++ if (!p) ++ return -E2BIG; ++ ++ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, ++ sizeof(object_id->oid_device_id.data)); ++ p = xdr_encode_hyper(p, object_id->oid_partition_id); ++ p = xdr_encode_hyper(p, object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, ++ struct pnfs_osd_ioerr *ioerr) ++{ ++ __be32 *p; ++ int ret; ++ ++ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component); ++ if (ret) ++ return ret; ++ ++ p = xdr_reserve_space(xdr, 24); ++ if
(!p) ++ return -E2BIG; ++ ++ p = xdr_encode_hyper(p, ioerr->oer_comp_offset); ++ p = xdr_encode_hyper(p, ioerr->oer_comp_length); ++ *p++ = cpu_to_be32(ioerr->oer_iswrite); ++ *p = cpu_to_be32(ioerr->oer_errno); ++ ++ return 0; ++} +diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-23 12:08:29.056411363 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-23 12:09:03.355511659 -0400 +@@ -20,6 +20,7 @@ + #include + + #include "internal.h" ++#include "pnfs.h" + + static struct kmem_cache *nfs_page_cachep; + +@@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p) + struct nfs_page * + nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + +@@ -79,7 +81,11 @@ nfs_create_request(struct nfs_open_conte + req->wb_pgbase = offset; + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); ++ req->wb_lock_context = nfs_get_lock_context(ctx); + kref_init(&req->wb_kref); ++ req->wb_lseg = lseg; ++ if (lseg) ++ get_lseg(lseg); + return req; + } + +@@ -141,18 +147,26 @@ void nfs_clear_request(struct nfs_page * + { + struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; ++ struct nfs_lock_context *l_ctx = req->wb_lock_context; + + if (page != NULL) { + page_cache_release(page); + req->wb_page = NULL; + } ++ if (l_ctx != NULL) { ++ nfs_put_lock_context(l_ctx); ++ req->wb_lock_context = NULL; ++ } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } ++ if (req->wb_lseg != NULL) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } + } + +- + /** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release +@@ -231,11 +245,12 @@ void nfs_pageio_init(struct nfs_pageio_d + * Return 'true' 
if this is the case, else return 'false'. + */ + static int nfs_can_coalesce_requests(struct nfs_page *prev, +- struct nfs_page *req) ++ struct nfs_page *req, ++ struct nfs_pageio_descriptor *pgio) + { + if (req->wb_context->cred != prev->wb_context->cred) + return 0; +- if (req->wb_context->lockowner != prev->wb_context->lockowner) ++ if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) + return 0; + if (req->wb_context->state != prev->wb_context->state) + return 0; +@@ -245,6 +260,12 @@ static int nfs_can_coalesce_requests(str + return 0; + if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return 0; ++ if (req->wb_lseg != prev->wb_lseg) ++ return 0; ++#ifdef CONFIG_NFS_V4_1 ++ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) ++ return 0; ++#endif /* CONFIG_NFS_V4_1 */ + return 1; + } + +@@ -277,7 +298,7 @@ static int nfs_pageio_do_add_request(str + if (newlen > desc->pg_bsize) + return 0; + prev = nfs_list_entry(desc->pg_list.prev); +- if (!nfs_can_coalesce_requests(prev, req)) ++ if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + } else + desc->pg_base = req->wb_pgbase; +@@ -366,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for ++ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver + * + * Moves elements from one of the inode request lists. 
+ * If the number of requests is set to 0, the entire address_space +@@ -375,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs + */ + int nfs_scan_list(struct nfs_inode *nfsi, + struct list_head *dst, pgoff_t idx_start, +- unsigned int npages, int tag) ++ unsigned int npages, int tag, int *use_pnfs) + { + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; +@@ -406,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); + nfs_list_add_request(req, dst); ++ if (req->wb_lseg) ++ *use_pnfs = 1; + res++; + if (res == INT_MAX) + goto out; +diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-23 12:09:03.356501413 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-23 12:09:03.357481204 -0400 +@@ -0,0 +1,2027 @@ ++/* ++ * linux/fs/nfs/pnfs.c ++ * ++ * pNFS functions to call and manage layout drivers. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "internal.h" ++#include "nfs4_fs.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS ++ ++#define MIN_POOL_LC (4) ++ ++static int pnfs_initialized; ++ ++static void pnfs_free_layout(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range); ++static inline void get_layout(struct pnfs_layout_type *lo); ++ ++/* Locking: ++ * ++ * pnfs_spinlock: ++ * protects pnfs_modules_tbl. 
++ */ ++static spinlock_t pnfs_spinlock = __SPIN_LOCK_UNLOCKED(pnfs_spinlock); ++ ++/* ++ * pnfs_modules_tbl holds all pnfs modules ++ */ ++static struct list_head pnfs_modules_tbl; ++static struct kmem_cache *pnfs_cachep; ++static mempool_t *pnfs_layoutcommit_mempool; ++ ++static inline struct pnfs_layoutcommit_data *pnfs_layoutcommit_alloc(void) ++{ ++ struct pnfs_layoutcommit_data *p = ++ mempool_alloc(pnfs_layoutcommit_mempool, GFP_NOFS); ++ if (p) ++ memset(p, 0, sizeof(*p)); ++ ++ return p; ++} ++ ++void pnfs_layoutcommit_free(struct pnfs_layoutcommit_data *p) ++{ ++ mempool_free(p, pnfs_layoutcommit_mempool); ++} ++ ++/* ++ * struct pnfs_module - One per pNFS device module. ++ */ ++struct pnfs_module { ++ struct pnfs_layoutdriver_type *pnfs_ld_type; ++ struct list_head pnfs_tblid; ++}; ++ ++int ++pnfs_initialize(void) ++{ ++ INIT_LIST_HEAD(&pnfs_modules_tbl); ++ ++ pnfs_cachep = kmem_cache_create("pnfs_layoutcommit_data", ++ sizeof(struct pnfs_layoutcommit_data), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (pnfs_cachep == NULL) ++ return -ENOMEM; ++ ++ pnfs_layoutcommit_mempool = mempool_create(MIN_POOL_LC, ++ mempool_alloc_slab, ++ mempool_free_slab, ++ pnfs_cachep); ++ if (pnfs_layoutcommit_mempool == NULL) { ++ kmem_cache_destroy(pnfs_cachep); ++ return -ENOMEM; ++ } ++ ++ pnfs_initialized = 1; ++ return 0; ++} ++ ++void pnfs_uninitialize(void) ++{ ++ mempool_destroy(pnfs_layoutcommit_mempool); ++ kmem_cache_destroy(pnfs_cachep); ++} ++ ++/* search pnfs_modules_tbl for right pnfs module */ ++static int ++find_pnfs(u32 id, struct pnfs_module **module) { ++ struct pnfs_module *local = NULL; ++ ++ dprintk("PNFS: %s: Searching for %u\n", __func__, id); ++ list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) { ++ if (local->pnfs_ld_type->id == id) { ++ *module = local; ++ return(1); ++ } ++ } ++ return 0; ++} ++ ++/* Set lo_cred to indicate we require a layoutcommit ++ * If we don't even have a layout, we don't need to commit it. 
++ */ ++void ++pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx) ++{ ++ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (has_layout(nfsi) && ++ !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->pnfs_layout_state)) { ++ nfsi->layout->lo_cred = get_rpccred(ctx->state->owner->so_cred); ++ __set_bit(NFS_INO_LAYOUTCOMMIT, ++ &nfsi->layout->pnfs_layout_state); ++ nfsi->change_attr++; ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s: Set layoutcommit\n", __func__); ++ return; ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Update last_write_offset for layoutcommit. ++ * TODO: We should only use committed extents, but the current nfs ++ * implementation does not calculate the written range in nfs_commit_done. ++ * We therefore update this field in writeback_done. ++ */ ++void ++pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent) ++{ ++ loff_t end_pos; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (offset < nfsi->layout->pnfs_write_begin_pos) ++ nfsi->layout->pnfs_write_begin_pos = offset; ++ end_pos = offset + extent - 1; /* inclusive: last byte written */ ++ if (end_pos > nfsi->layout->pnfs_write_end_pos) ++ nfsi->layout->pnfs_write_end_pos = end_pos; ++ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n", ++ __func__, ++ (unsigned long) extent, ++ (unsigned long) offset , ++ (unsigned long) nfsi->layout->pnfs_write_begin_pos, ++ (unsigned long) nfsi->layout->pnfs_write_end_pos); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Uninitialize a mountpoint in a layout driver */ ++void ++unmount_pnfs_layoutdriver(struct nfs_server *nfss) ++{ ++ if (PNFS_EXISTS_LDIO_OP(nfss, uninitialize_mountpoint)) ++ nfss->pnfs_curr_ld->ld_io_ops->uninitialize_mountpoint(nfss); ++} ++ ++/* ++ * Set the server pnfs module to the registered pnfs_type matching id. ++ * Only one pNFS layout driver is supported.
++ */ ++void ++set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, ++ u32 id) ++{ ++ struct pnfs_module *mod = NULL; ++ ++ if (server->pnfs_curr_ld) ++ return; ++ ++ if (!find_pnfs(id, &mod)) { ++ request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); ++ find_pnfs(id, &mod); ++ } ++ ++ if (!mod) { ++ dprintk("%s: No pNFS module found for %u. ", __func__, id); ++ goto out_err; ++ } ++ ++ server->pnfs_curr_ld = mod->pnfs_ld_type; ++ if (mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint( ++ server, mntfh)) { ++ printk(KERN_ERR "%s: Error initializing mount point " ++ "for layout driver %u. ", __func__, id); ++ goto out_err; ++ } ++ ++ dprintk("%s: pNFS module for %u set\n", __func__, id); ++ return; ++ ++out_err: ++ dprintk("Using NFSv4 I/O\n"); ++ server->pnfs_curr_ld = NULL; ++} ++ ++/* Allow I/O module to set its functions structure */ ++struct pnfs_client_operations* ++pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) ++{ ++ struct pnfs_module *pnfs_mod; ++ struct layoutdriver_io_operations *io_ops = ld_type->ld_io_ops; ++ ++ if (!pnfs_initialized) { ++ printk(KERN_ERR "%s Registration failure. 
" ++ "pNFS not initialized.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops || !io_ops->alloc_layout || !io_ops->free_layout) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_layout and free_layout.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->alloc_lseg || !io_ops->free_lseg) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_lseg and free_lseg.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->read_pagelist || !io_ops->write_pagelist || ++ !io_ops->commit) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "read_pagelist, write_pagelist, and commit.\n", ++ __func__); ++ return NULL; ++ } ++ ++ pnfs_mod = kmalloc(sizeof(struct pnfs_module), GFP_KERNEL); ++ if (pnfs_mod != NULL) { ++ dprintk("%s Registering id:%u name:%s\n", ++ __func__, ++ ld_type->id, ++ ld_type->name); ++ pnfs_mod->pnfs_ld_type = ld_type; ++ INIT_LIST_HEAD(&pnfs_mod->pnfs_tblid); ++ ++ spin_lock(&pnfs_spinlock); ++ list_add(&pnfs_mod->pnfs_tblid, &pnfs_modules_tbl); ++ spin_unlock(&pnfs_spinlock); ++ } ++ ++ return &pnfs_ops; ++} ++ ++/* Allow I/O module to set its functions structure */ ++void ++pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) ++{ ++ struct pnfs_module *pnfs_mod; ++ ++ if (find_pnfs(ld_type->id, &pnfs_mod)) { ++ dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); ++ spin_lock(&pnfs_spinlock); ++ list_del(&pnfs_mod->pnfs_tblid); ++ spin_unlock(&pnfs_spinlock); ++ kfree(pnfs_mod); ++ } ++} ++ ++/* ++ * pNFS client layout cache ++ */ ++#if defined(CONFIG_SMP) ++#define BUG_ON_UNLOCKED_INO(ino) \ ++ BUG_ON(!spin_is_locked(&ino->i_lock)) ++#define BUG_ON_UNLOCKED_LO(lo) \ ++ BUG_ON_UNLOCKED_INO(PNFS_INODE(lo)) ++#else /* CONFIG_SMP */ ++#define BUG_ON_UNLOCKED_INO(lo) do {} while (0) ++#define BUG_ON_UNLOCKED_LO(lo) do {} while (0) ++#endif /* CONFIG_SMP */ ++ ++static inline void ++get_layout(struct pnfs_layout_type *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ lo->refcount++; ++} ++ ++static inline void 
++put_layout_locked(struct pnfs_layout_type *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ BUG_ON(lo->refcount <= 0); ++ ++ lo->refcount--; ++ if (!lo->refcount) { ++ struct layoutdriver_io_operations *io_ops = PNFS_LD_IO_OPS(lo); ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ dprintk("%s: freeing layout cache %p\n", __func__, lo); ++ WARN_ON(!list_empty(&lo->lo_layouts)); ++ io_ops->free_layout(lo); ++ nfsi->layout = NULL; ++ } ++} ++ ++void ++put_layout(struct inode *inode) ++{ ++ spin_lock(&inode->i_lock); ++ put_layout_locked(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++ ++} ++ ++void ++pnfs_layout_release(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (range) ++ pnfs_free_layout(lo, range); ++ /* ++ * Matched in _pnfs_update_layout for layoutget ++ * and by get_layout in _pnfs_return_layout for layoutreturn ++ */ ++ put_layout_locked(lo); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ wake_up_all(&nfsi->lo_waitq); ++} ++ ++void ++pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++ struct pnfs_layout_type *lo; ++ struct nfs4_pnfs_layout_segment range = { ++ .iomode = IOMODE_ANY, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ lo = nfsi->layout; ++ if (lo) { ++ pnfs_free_layout(lo, &range); ++ WARN_ON(!list_empty(&nfsi->layout->segs)); ++ WARN_ON(!list_empty(&nfsi->layout->lo_layouts)); ++ ++ if (nfsi->layout->refcount != 1) ++ printk(KERN_WARNING "%s: layout refcount not=1 %d\n", ++ __func__, nfsi->layout->refcount); ++ WARN_ON(nfsi->layout->refcount != 1); ++ ++ /* Matched by refcount set to 1 in alloc_init_layout */ ++ put_layout_locked(lo); ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* ++ * Called by the state manager to remove all layouts established under an ++ * expired lease.
++ */ ++void ++pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++ struct pnfs_layout_type *lo; ++ ++ while (!list_empty(&clp->cl_layouts)) { ++ lo = list_entry(clp->cl_layouts.next, struct pnfs_layout_type, ++ lo_layouts); ++ dprintk("%s freeing layout for inode %lu\n", __func__, ++ lo->lo_inode->i_ino); ++ pnfs_destroy_layout(NFS_I(lo->lo_inode)); ++ } ++} ++ ++static inline void ++init_lseg(struct pnfs_layout_type *lo, struct pnfs_layout_segment *lseg) ++{ ++ INIT_LIST_HEAD(&lseg->fi_list); ++ kref_init(&lseg->kref); ++ lseg->valid = true; ++ lseg->layout = lo; ++} ++ ++static void ++destroy_lseg(struct kref *kref) ++{ ++ struct pnfs_layout_segment *lseg = ++ container_of(kref, struct pnfs_layout_segment, kref); ++ struct pnfs_layout_type *lo = lseg->layout; ++ ++ dprintk("--> %s\n", __func__); ++ PNFS_LD_IO_OPS(lo)->free_lseg(lseg); /* before the put: it may free lo */ ++ put_layout_locked(lo); /* pairs with get_layout in pnfs_insert_layout */ ++} ++ ++static void ++put_lseg_locked(struct pnfs_layout_segment *lseg) ++{ ++ bool do_wake_up; ++ struct nfs_inode *nfsi; ++ ++ if (!lseg) ++ return; ++ ++ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, ++ atomic_read(&lseg->kref.refcount), lseg->valid); ++ do_wake_up = !lseg->valid; ++ nfsi = PNFS_NFS_INODE(lseg->layout); ++ kref_put(&lseg->kref, destroy_lseg); ++ if (do_wake_up) ++ wake_up(&nfsi->lo_waitq); ++} ++ ++void ++put_lseg(struct pnfs_layout_segment *lseg) ++{ ++ bool do_wake_up; ++ struct nfs_inode *nfsi; ++ ++ if (!lseg) ++ return; ++ ++ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, ++ atomic_read(&lseg->kref.refcount), lseg->valid); ++ do_wake_up = !lseg->valid; ++ nfsi = PNFS_NFS_INODE(lseg->layout); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ kref_put(&lseg->kref, destroy_lseg); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ if (do_wake_up) ++ wake_up(&nfsi->lo_waitq); ++} ++EXPORT_SYMBOL(put_lseg); ++ ++void get_lseg(struct pnfs_layout_segment *lseg) ++{ ++ kref_get(&lseg->kref); ++} ++EXPORT_SYMBOL(get_lseg); ++ ++static inline u64
++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end: NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1: NFS4_MAX_UINT64; ++} ++ ++/* ++ * is l2 fully contained in l1? ++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_contained(struct nfs4_pnfs_layout_segment *l1, ++ struct nfs4_pnfs_layout_segment *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (start1 <= start2) && (end1 >= end2); ++} ++ ++/* ++ * is l1 and l2 intersecting? ++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_intersecting(struct nfs4_pnfs_layout_segment *l1, ++ struct nfs4_pnfs_layout_segment *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (end1 == NFS4_MAX_UINT64 || end1 > start2) && ++ (end2 == NFS4_MAX_UINT64 || end2 > start1); ++} ++ ++void ++pnfs_set_layout_stateid(struct pnfs_layout_type *lo, ++ const nfs4_stateid *stateid) ++{ ++ write_seqlock(&lo->seqlock); ++ memcpy(lo->stateid.u.data, stateid->u.data, sizeof(lo->stateid.u.data)); ++ write_sequnlock(&lo->seqlock); ++} ++ ++void ++pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_type *lo) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ do { ++ seq = read_seqbegin(&lo->seqlock); ++ memcpy(dst->u.data, lo->stateid.u.data, ++ sizeof(lo->stateid.u.data)); ++ } while (read_seqretry(&lo->seqlock, seq)); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void ++pnfs_layout_from_open_stateid(struct pnfs_layout_type 
*lo, ++ struct nfs4_state *state) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ write_seqlock(&lo->seqlock); ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) ++ do { ++ seq = read_seqbegin(&state->seqlock); ++ memcpy(lo->stateid.u.data, state->stateid.u.data, ++ sizeof(state->stateid.u.data)); ++ } while (read_seqretry(&state->seqlock, seq)); ++ write_sequnlock(&lo->seqlock); ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++* Get layout from server. ++* for now, assume that whole file layouts are requested. ++* arg->offset: 0 ++* arg->length: all ones ++*/ ++static int ++send_layoutget(struct inode *ino, ++ struct nfs_open_context *ctx, ++ struct nfs4_pnfs_layout_segment *range, ++ struct pnfs_layout_segment **lsegpp, ++ struct pnfs_layout_type *lo) ++{ ++ int status; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct nfs4_pnfs_layoutget *lgp; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); ++ if (lgp == NULL) { ++ pnfs_layout_release(lo, NULL); ++ return -ENOMEM; ++ } ++ lgp->args.minlength = NFS4_MAX_UINT64; ++ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; ++ lgp->args.lseg.iomode = range->iomode; ++ lgp->args.lseg.offset = 0; ++ lgp->args.lseg.length = NFS4_MAX_UINT64; ++ lgp->args.type = server->pnfs_curr_ld->id; ++ lgp->args.inode = ino; ++ lgp->lsegpp = lsegpp; ++ ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) { ++ struct nfs_open_context *oldctx = ctx; ++ ++ if (!oldctx) { ++ ctx = nfs_find_open_context(ino, NULL, ++ (range->iomode == IOMODE_READ) ? 
++ FMODE_READ: FMODE_WRITE); ++ BUG_ON(!ctx); ++ } ++ /* Set the layout stateid from the open stateid */ ++ pnfs_layout_from_open_stateid(NFS_I(ino)->layout, ctx->state); ++ if (!oldctx) ++ put_nfs_open_context(ctx); ++ } ++ ++ /* Retrieve layout information from server */ ++ status = pnfs4_proc_layoutget(lgp); ++ ++ dprintk("<-- %s status %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW false ++ */ ++static inline int ++should_free_lseg(struct pnfs_layout_segment *lseg, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ return (range->iomode == IOMODE_ANY || ++ lseg->range.iomode == range->iomode) && ++ lo_seg_intersecting(&lseg->range, range); ++} ++ ++static struct pnfs_layout_segment * ++has_layout_to_return(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct pnfs_layout_segment *out = NULL, *lseg; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) ++ if (should_free_lseg(lseg, range)) { ++ out = lseg; ++ break; ++ } ++ ++ dprintk("%s:Return lseg=%p\n", __func__, out); ++ return out; ++} ++ ++static inline bool ++_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg) ++{ ++ return atomic_read(&lseg->kref.refcount) == 1; ++} ++ ++ ++static void ++pnfs_free_layout(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct pnfs_layout_segment *lseg, *next; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry_safe (lseg, next, &lo->segs, fi_list) { ++ if (!should_free_lseg(lseg, range) || ++ !_pnfs_can_return_lseg(lseg)) ++ continue; ++ 
dprintk("%s: freeing lseg %p iomode %d " ++ "offset %llu length %llu\n", __func__, ++ lseg, lseg->range.iomode, lseg->range.offset, ++ lseg->range.length); ++ list_del(&lseg->fi_list); ++ put_lseg_locked(lseg); ++ } ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp; ++ ++ clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ spin_lock(&clp->cl_lock); ++ list_del_init(&lo->lo_layouts); ++ spin_unlock(&clp->cl_lock); ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ } ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++static bool ++pnfs_return_layout_barrier(struct nfs_inode *nfsi, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct pnfs_layout_segment *lseg; ++ bool ret = false; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) { ++ if (!should_free_lseg(lseg, range)) ++ continue; ++ lseg->valid = false; ++ if (!_pnfs_can_return_lseg(lseg)) { ++ dprintk("%s: wait on lseg %p refcount %d\n", ++ __func__, lseg, ++ atomic_read(&lseg->kref.refcount)); ++ ret = true; ++ } ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s:Return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, ++ enum pnfs_layoutreturn_type type, struct pnfs_layout_type *lo, ++ bool wait) ++{ ++ struct nfs4_pnfs_layoutreturn *lrp; ++ struct nfs_server *server = NFS_SERVER(ino); ++ int status = -ENOMEM; ++ ++ dprintk("--> %s\n", __func__); ++ ++ BUG_ON(type != RETURN_FILE); ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (lrp == NULL) { ++ if (lo && (type == RETURN_FILE)) ++ pnfs_layout_release(lo, NULL); ++ goto out; ++ } ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = server->pnfs_curr_ld->id; ++ lrp->args.return_type = type; ++ lrp->args.lseg = *range; ++ lrp->args.inode = ino; ++ ++ status = pnfs4_proc_layoutreturn(lrp, wait); ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++} ++ ++int ++_pnfs_return_layout(struct inode 
*ino, struct nfs4_pnfs_layout_segment *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct pnfs_layout_type *lo = NULL; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs4_pnfs_layout_segment arg; ++ int status = 0; ++ ++ dprintk("--> %s type %d\n", __func__, type); ++ ++ ++ arg.iomode = range ? range->iomode : IOMODE_ANY; ++ arg.offset = 0; ++ arg.length = NFS4_MAX_UINT64; ++ ++ if (type == RETURN_FILE) { ++ spin_lock(&ino->i_lock); ++ lo = nfsi->layout; ++ if (lo && !has_layout_to_return(lo, &arg)) { ++ lo = NULL; ++ } ++ if (!lo) { ++ spin_unlock(&ino->i_lock); ++ dprintk("%s: no layout segments to return\n", __func__); ++ goto out; ++ } ++ ++ /* Reference for layoutreturn matched in pnfs_layout_release */ ++ get_layout(lo); ++ ++ spin_unlock(&ino->i_lock); ++ ++ if (pnfs_return_layout_barrier(nfsi, &arg)) { ++ if (stateid) { /* callback */ ++ status = -EAGAIN; ++ goto out_put; ++ } ++ dprintk("%s: waiting\n", __func__); ++ wait_event(nfsi->lo_waitq, ++ !pnfs_return_layout_barrier(nfsi, &arg)); ++ } ++ ++ if (layoutcommit_needed(nfsi)) { ++ if (stateid && !wait) { /* callback */ ++ dprintk("%s: layoutcommit pending\n", __func__); ++ status = -EAGAIN; ++ goto out_put; ++ } ++ status = pnfs_layoutcommit_inode(ino, wait); ++ if (status) { ++ /* Return layout even if layoutcommit fails */ ++ dprintk("%s: layoutcommit failed, status=%d. 
" ++ "Returning layout anyway\n", ++ __func__, status); ++ } ++ } ++ ++ if (!stateid) ++ status = return_layout(ino, &arg, type, lo, wait); ++ else ++ pnfs_layout_release(lo, &arg); ++ } ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++out_put: ++ put_layout(ino); ++ goto out; ++} ++ ++/* ++ * cmp two layout segments for sorting into layout cache ++ */ ++static inline s64 ++cmp_layout(struct nfs4_pnfs_layout_segment *l1, ++ struct nfs4_pnfs_layout_segment *l2) ++{ ++ s64 d; ++ ++ /* higher offset > lower offset */ ++ d = l1->offset - l2->offset; ++ if (d) ++ return d; ++ ++ /* longer length > shorter length */ ++ d = l1->length - l2->length; ++ if (d) ++ return d; ++ ++ /* read > read/write */ ++ return (int)(l1->iomode == IOMODE_READ) - ++ (int)(l2->iomode == IOMODE_READ); ++} ++ ++static void ++pnfs_insert_layout(struct pnfs_layout_type *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct pnfs_layout_segment *lp; ++ int found = 0; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ ++ spin_lock(&clp->cl_lock); ++ BUG_ON(!list_empty(&lo->lo_layouts)); ++ list_add_tail(&lo->lo_layouts, &clp->cl_layouts); ++ spin_unlock(&clp->cl_lock); ++ } ++ list_for_each_entry (lp, &lo->segs, fi_list) { ++ if (cmp_layout(&lp->range, &lseg->range) > 0) ++ continue; ++ list_add_tail(&lseg->fi_list, &lp->fi_list); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu before " ++ "lp %p iomode %d offset %llu length %llu\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length, ++ lp, lp->range.iomode, lp->range.offset, ++ lp->range.length); ++ found = 1; ++ break; ++ } ++ if (!found) { ++ list_add_tail(&lseg->fi_list, &lo->segs); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu at tail\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, 
lseg->range.length); ++ } ++ get_layout(lo); ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++/* ++ * Each layoutdriver embeds pnfs_layout_type as the first field in it's ++ * per-layout type layout cache structure and returns it ZEROed ++ * from layoutdriver_io_ops->alloc_layout ++ */ ++static struct pnfs_layout_type * ++alloc_init_layout(struct inode *ino) ++{ ++ struct pnfs_layout_type *lo; ++ struct layoutdriver_io_operations *io_ops; ++ ++ io_ops = NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops; ++ lo = io_ops->alloc_layout(ino); ++ if (!lo) { ++ printk(KERN_ERR ++ "%s: out of memory: io_ops->alloc_layout failed\n", ++ __func__); ++ return NULL; ++ } ++ lo->refcount = 1; ++ INIT_LIST_HEAD(&lo->lo_layouts); ++ INIT_LIST_HEAD(&lo->segs); ++ seqlock_init(&lo->seqlock); ++ lo->lo_inode = ino; ++ return lo; ++} ++ ++/* ++ * Retrieve and possibly allocate the inode layout ++ * ++ * ino->i_lock must be taken by the caller. ++ */ ++static struct pnfs_layout_type * ++pnfs_alloc_layout(struct inode *ino) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_type *new = NULL; ++ ++ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); ++ ++ BUG_ON_UNLOCKED_INO(ino); ++ if (likely(nfsi->layout)) ++ return nfsi->layout; ++ ++ spin_unlock(&ino->i_lock); ++ new = alloc_init_layout(ino); ++ spin_lock(&ino->i_lock); ++ ++ if (likely(nfsi->layout == NULL)) { /* Won the race? 
*/ ++ nfsi->layout = new; ++ } else if (new) { ++ /* Reference the layout accross i_lock release and grab */ ++ get_layout(nfsi->layout); ++ spin_unlock(&ino->i_lock); ++ NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops->free_layout(new); ++ spin_lock(&ino->i_lock); ++ put_layout_locked(nfsi->layout); ++ } ++ return nfsi->layout; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW true ++ */ ++static inline int ++has_matching_lseg(struct pnfs_layout_segment *lseg, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct nfs4_pnfs_layout_segment range1; ++ ++ if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) || ++ !lo_seg_intersecting(&lseg->range, range)) ++ return 0; ++ ++ /* range1 covers only the first byte in the range */ ++ range1 = *range; ++ range1.length = 1; ++ return lo_seg_contained(&lseg->range, &range1); ++} ++ ++/* ++ * lookup range in layout ++ */ ++static struct pnfs_layout_segment * ++pnfs_has_layout(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range, ++ bool take_ref, ++ bool only_valid) ++{ ++ struct pnfs_layout_segment *lseg, *ret = NULL; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) { ++ if (has_matching_lseg(lseg, range) && ++ (lseg->valid || !only_valid)) { ++ ret = lseg; ++ if (take_ref) ++ get_lseg(ret); ++ break; ++ } ++ if (cmp_layout(range, &lseg->range) > 0) ++ break; ++ } ++ ++ dprintk("%s:Return lseg %p take_ref %d ref %d valid %d\n", ++ __func__, ret, take_ref, ++ ret ? atomic_read(&ret->kref.refcount) : 0, ++ ret ? ret->valid : 0); ++ return ret; ++} ++ ++/* Update the file's layout for the given range and iomode. ++ * Layout is retreived from the server if needed. ++ * If lsegpp is given, the appropriate layout segment is referenced and ++ * returned to the caller. 
++ */ ++void ++_pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, ++ u64 count, ++ enum pnfs_iomode iomode, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct nfs4_pnfs_layout_segment arg = { ++ .iomode = iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_type *lo; ++ struct pnfs_layout_segment *lseg = NULL; ++ bool take_ref = (lsegpp != NULL); ++ ++ if (take_ref) ++ *lsegpp = NULL; ++ spin_lock(&ino->i_lock); ++ lo = pnfs_alloc_layout(ino); ++ if (lo == NULL) { ++ dprintk("%s ERROR: can't get pnfs_layout_type\n", __func__); ++ goto out_unlock; ++ } ++ ++ /* Check to see if the layout for the given range already exists */ ++ lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref); ++ if (lseg && !lseg->valid) { ++ if (take_ref) ++ put_lseg_locked(lseg); ++ /* someone is cleaning the layout */ ++ lseg = NULL; ++ goto out_unlock; ++ } ++ ++ if (lseg) { ++ dprintk("%s: Using cached lseg %p for %llu@%llu iomode %d)\n", ++ __func__, ++ lseg, ++ arg.length, ++ arg.offset, ++ arg.iomode); ++ ++ goto out_unlock; ++ } ++ ++ /* if get layout already failed once goto out */ ++ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->pnfs_layout_state)) { ++ if (unlikely(nfsi->pnfs_layout_suspend && ++ get_seconds() >= nfsi->pnfs_layout_suspend)) { ++ dprintk("%s: layout_get resumed\n", __func__); ++ clear_bit(lo_fail_bit(iomode), ++ &nfsi->layout->pnfs_layout_state); ++ nfsi->pnfs_layout_suspend = 0; ++ } else ++ goto out_unlock; ++ } ++ ++ /* Reference the layout for layoutget matched in pnfs_layout_release */ ++ get_layout(lo); ++ spin_unlock(&ino->i_lock); ++ ++ send_layoutget(ino, ctx, &arg, lsegpp, lo); ++out: ++ dprintk("%s end, state 0x%lx lseg %p\n", __func__, ++ nfsi->layout->pnfs_layout_state, lseg); ++ return; ++out_unlock: ++ if (lsegpp) ++ *lsegpp = lseg; ++ spin_unlock(&ino->i_lock); ++ goto out; ++} ++ ++void ++pnfs_get_layout_done(struct nfs4_pnfs_layoutget 
*lgp, int rpc_status) ++{ ++ struct pnfs_layout_segment *lseg = NULL; ++ struct nfs_inode *nfsi = NFS_I(lgp->args.inode); ++ time_t suspend = 0; ++ ++ dprintk("-->%s\n", __func__); ++ ++ lgp->status = rpc_status; ++ if (likely(!rpc_status)) { ++ if (unlikely(lgp->res.layout.len < 0)) { ++ printk(KERN_ERR ++ "%s: ERROR Returned layout size is ZERO\n", __func__); ++ lgp->status = -EIO; ++ } ++ goto out; ++ } ++ ++ dprintk("%s: ERROR retrieving layout %d\n", __func__, rpc_status); ++ switch (rpc_status) { ++ case -NFS4ERR_BADLAYOUT: ++ lgp->status = -ENOENT; ++ /* FALLTHROUGH */ ++ case -EACCES: /* NFS4ERR_ACCESS */ ++ /* transient error, don't mark with NFS_INO_LAYOUT_FAILED */ ++ goto out; ++ ++ case -NFS4ERR_LAYOUTTRYLATER: ++ case -NFS4ERR_RECALLCONFLICT: ++ case -NFS4ERR_OLD_STATEID: ++ case -EAGAIN: /* NFS4ERR_LOCKED */ ++ lgp->status = -NFS4ERR_DELAY; /* for nfs4_handle_exception */ ++ /* FALLTHROUGH */ ++ case -NFS4ERR_GRACE: ++ case -NFS4ERR_DELAY: ++ goto out; ++ ++ case -NFS4ERR_ADMIN_REVOKED: ++ case -NFS4ERR_DELEG_REVOKED: ++ /* The layout is expected to be returned at this point. 
++ * This should clear the layout stateid as well */ ++ suspend = get_seconds() + 1; ++ break; ++ ++ case -NFS4ERR_LAYOUTUNAVAILABLE: ++ lgp->status = -ENOTSUPP; ++ break; ++ ++ case -NFS4ERR_REP_TOO_BIG: ++ case -NFS4ERR_REP_TOO_BIG_TO_CACHE: ++ lgp->status = -E2BIG; ++ break; ++ ++ /* Leave the following errors untranslated */ ++ case -NFS4ERR_DEADSESSION: ++ case -NFS4ERR_DQUOT: ++ case -EINVAL: /* NFS4ERR_INVAL */ ++ case -EIO: /* NFS4ERR_IO */ ++ case -NFS4ERR_FHEXPIRED: ++ case -NFS4ERR_MOVED: ++ case -NFS4ERR_NOSPC: ++ case -ESERVERFAULT: /* NFS4ERR_SERVERFAULT */ ++ case -ESTALE: /* NFS4ERR_STALE */ ++ case -ETOOSMALL: /* NFS4ERR_TOOSMALL */ ++ break; ++ ++ /* The following errors are our fault and should never happen */ ++ case -NFS4ERR_BADIOMODE: ++ case -NFS4ERR_BADXDR: ++ case -NFS4ERR_REQ_TOO_BIG: ++ case -NFS4ERR_UNKNOWN_LAYOUTTYPE: ++ case -NFS4ERR_WRONG_TYPE: ++ lgp->status = -EINVAL; ++ /* FALLTHROUGH */ ++ case -NFS4ERR_BAD_STATEID: ++ case -NFS4ERR_NOFILEHANDLE: ++ case -ENOTSUPP: /* NFS4ERR_NOTSUPP */ ++ case -NFS4ERR_OPENMODE: ++ case -NFS4ERR_OP_NOT_IN_SESSION: ++ case -NFS4ERR_TOO_MANY_OPS: ++ dprintk("%s: error %d: should never happen\n", __func__, ++ rpc_status); ++ break; ++ ++ /* The following errors are the server's fault */ ++ default: ++ dprintk("%s: illegal error %d\n", __func__, rpc_status); ++ lgp->status = -EIO; ++ break; ++ } ++ ++ /* remember that get layout failed and suspend trying */ ++ nfsi->pnfs_layout_suspend = suspend; ++ set_bit(lo_fail_bit(lgp->args.lseg.iomode), ++ &nfsi->layout->pnfs_layout_state); ++ dprintk("%s: layout_get suspended until %ld\n", ++ __func__, suspend); ++out: ++ dprintk("%s end (err:%d) state 0x%lx lseg %p\n", ++ __func__, lgp->status, nfsi->layout->pnfs_layout_state, lseg); ++ return; ++} ++ ++int ++pnfs_layout_process(struct nfs4_pnfs_layoutget *lgp) ++{ ++ struct pnfs_layout_type *lo = NFS_I(lgp->args.inode)->layout; ++ struct nfs4_pnfs_layoutget_res *res = &lgp->res; ++ struct pnfs_layout_segment 
*lseg; ++ struct inode *ino = PNFS_INODE(lo); ++ int status = 0; ++ ++ /* Inject layout blob into I/O device driver */ ++ lseg = PNFS_LD_IO_OPS(lo)->alloc_lseg(lo, res); ++ if (!lseg || IS_ERR(lseg)) { ++ if (!lseg) ++ status = -ENOMEM; ++ else ++ status = PTR_ERR(lseg); ++ dprintk("%s: Could not allocate layout: error %d\n", ++ __func__, status); ++ goto out; ++ } ++ ++ spin_lock(&ino->i_lock); ++ init_lseg(lo, lseg); ++ lseg->range = res->lseg; ++ if (lgp->lsegpp) { ++ get_lseg(lseg); ++ *lgp->lsegpp = lseg; ++ } ++ pnfs_insert_layout(lo, lseg); ++ ++ if (res->return_on_close) { ++ lo->roc_iomode |= res->lseg.iomode; ++ if (!lo->roc_iomode) ++ lo->roc_iomode = IOMODE_ANY; ++ } ++ ++ /* Done processing layoutget. Set the layout stateid */ ++ pnfs_set_layout_stateid(lo, &res->stateid); ++ spin_unlock(&ino->i_lock); ++out: ++ return status; ++} ++ ++void ++readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset, ++ size_t *count) ++{ ++ struct page *first, *last; ++ loff_t foff, i_size = i_size_read(inode); ++ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; ++ size_t range; ++ ++ ++ first = list_entry((pages)->prev, struct page, lru); ++ last = list_entry((pages)->next, struct page, lru); ++ ++ foff = (loff_t)first->index << PAGE_CACHE_SHIFT; ++ ++ range = (last->index - first->index) * PAGE_CACHE_SIZE; ++ if (last->index == end_index) ++ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; ++ else ++ range += PAGE_CACHE_SIZE; ++ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff, ++ range); ++ *offset = foff; ++ *count = range; ++} ++ ++void ++pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) ++{ ++ struct pnfs_layout_type *laytype; ++ struct pnfs_layoutdriver_type *ld; ++ ++ pgio->pg_test = NULL; ++ ++ laytype = NFS_I(inode)->layout; ++ ld = NFS_SERVER(inode)->pnfs_curr_ld; ++ if (!pnfs_enabled_sb(NFS_SERVER(inode)) || !laytype) ++ return; ++ ++ if (ld->ld_policy_ops) ++ pgio->pg_test = 
ld->ld_policy_ops->pg_test; ++} ++ ++static u32 ++pnfs_getboundary(struct inode *inode) ++{ ++ u32 stripe_size = 0; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct layoutdriver_policy_operations *policy_ops; ++ ++ if (!nfss->pnfs_curr_ld) ++ goto out; ++ ++ policy_ops = nfss->pnfs_curr_ld->ld_policy_ops; ++ if (!policy_ops || !policy_ops->get_stripesize) ++ goto out; ++ ++ /* The default is to not gather across stripes */ ++ if (pnfs_ld_gather_across_stripes(nfss->pnfs_curr_ld)) ++ goto out; ++ ++ spin_lock(&inode->i_lock); ++ if (NFS_I(inode)->layout) ++ stripe_size = policy_ops->get_stripesize(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++out: ++ return stripe_size; ++} ++ ++/* ++ * rsize is already set by caller to MDS rsize. ++ */ ++void ++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, ++ struct inode *inode, ++ struct nfs_open_context *ctx, ++ struct list_head *pages, ++ size_t *rsize) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ size_t count = 0; ++ loff_t loff; ++ ++ pgio->pg_iswrite = 0; ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ pgio->pg_lseg = NULL; ++ ++ if (!pnfs_enabled_sb(nfss)) ++ return; ++ ++ /* Calculate the total read-ahead count */ ++ readahead_range(inode, pages, &loff, &count); ++ ++ if (count > 0) { ++ _pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ, ++ &pgio->pg_lseg); ++ if (!pgio->pg_lseg) ++ return; ++ ++ *rsize = NFS_SERVER(inode)->ds_rsize; ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ if (pgio->pg_boundary) ++ pnfs_set_pg_test(inode, pgio); ++ } ++} ++ ++void ++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, ++ size_t *wsize) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ pgio->pg_iswrite = 1; ++ if (!pnfs_enabled_sb(server)) { ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ return; ++ } ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ pnfs_set_pg_test(inode, pgio); ++ *wsize = server->ds_wsize; ++} ++ ++/* Return I/O 
buffer size for a layout driver ++ * This value will determine what size reads and writes ++ * will be gathered into and sent to the data servers. ++ * blocksize must be a multiple of the page cache size. ++ */ ++unsigned int ++pnfs_getiosize(struct nfs_server *server) ++{ ++ if (!PNFS_EXISTS_LDPOLICY_OP(server, get_blocksize)) ++ return 0; /* no get_blocksize policy op: 0 makes pnfs_set_ds_iosize fall back to the server sizes */ ++ return server->pnfs_curr_ld->ld_policy_ops->get_blocksize(); ++} ++/* Choose ds_rsize/ds_wsize: the driver block size passed through nfs_block_size when pnfs_getiosize reports one, else server->rsize and server->wsize. */ ++void ++pnfs_set_ds_iosize(struct nfs_server *server) ++{ ++ unsigned dssize = pnfs_getiosize(server); ++ ++ /* Set buffer size for data servers */ ++ if (dssize > 0) { ++ server->ds_rsize = server->ds_wsize = ++ nfs_block_size(dssize, NULL); ++ } else { ++ server->ds_wsize = server->wsize; ++ server->ds_rsize = server->rsize; ++ } ++} ++/* Shared completion step for read, write and commit: drops the lseg reference kept in pdata, runs the saved rpc_call_done, and returns -EAGAIN when pnfs_error or tk_status asks for a retry; otherwise returns 0 after releasing or restoring the call ops. */ ++static int ++pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data) ++{ ++ put_lseg(pdata->lseg); ++ pdata->lseg = NULL; ++ pdata->call_ops->rpc_call_done(task, data); ++ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN) ++ return -EAGAIN; /* returned before any release or tk_ops restore; the caller reissues the I/O */ ++ if (pdata->pnfsflags & PNFS_NO_RPC) { ++ pdata->call_ops->rpc_release(data); /* PNFS_NO_RPC: rpc_release is invoked directly here */ ++ } else { ++ /* ++ * just restore original rpc call ops ++ * rpc_release will be called later by the rpc scheduling layer. ++ */ ++ task->tk_ops = pdata->call_ops; ++ } ++ return 0; ++} ++ ++/* Post-write completion function ++ * Invoked by all layout drivers when write_pagelist is done. ++ * ++ * NOTE: callers set data->pnfsflags PNFS_NO_RPC ++ * so that the NFS cleanup routines perform only the page cache ++ * cleanup.
++ */ ++static void ++pnfs_write_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ struct nfs4_pnfs_layout_segment range; ++ /* Work handler queued by pnfs_writeback_done on -EAGAIN: return the layout, then reissue the write through pnfs_initiate_write. */ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); /* the work item is the u.tk_work member of the rpc_task */ ++ wdata = container_of(task, struct nfs_write_data, task); /* and the rpc_task is the task member of the nfs_write_data */ ++ range.iomode = IOMODE_RW; ++ range.offset = wdata->args.offset; ++ range.length = wdata->args.count; ++ _pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true); /* NULL stateid, wait == true; as written, _pnfs_return_layout uses only range->iomode and returns offset 0, length NFS4_MAX_UINT64 */ ++ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode), ++ wdata->pdata.call_ops, wdata->pdata.how); ++} ++/* Write completion callback; installed as pnfs_ops.nfs_writelist_complete. */ ++static void ++pnfs_writeback_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ /* update last write offset and need layout commit ++ * for non-files layout types (files layout calls ++ * pnfs4_write_done for this) ++ */ ++ if ((pdata->pnfsflags & PNFS_NO_RPC) && ++ data->task.tk_status >= 0 && data->res.count > 0) { ++ struct nfs_inode *nfsi = NFS_I(data->inode); ++ ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++ } ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); /* the retry runs later from nfsiod_workqueue, not in this callback */ ++ } ++} ++/* Drop the lseg reference of every request on the list and clear its wb_lseg pointer. */ ++static void _pnfs_clear_lseg_from_pages(struct list_head *head) ++{ ++ struct nfs_page *req; ++ ++ list_for_each_entry(req, head, wb_list) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem write function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. ++ * ++ * TODO: Is wdata->how and wdata->args.stable always the same value?
++ * TODO: It seems in NFS, the server may not do a stable write even ++ * though it was requested (and vice-versa?). To check, it looks ++ * in data->res.verf->committed. Do we need this ability ++ * for non-file layout drivers? ++ */ ++enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *wdata, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ struct inode *inode = wdata->inode; ++ enum pnfs_try_status trypnfs; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg; ++ /* Stash the caller ops and the how flag in pdata; pnfs_call_done and pnfs_write_retry read them back on completion. */ ++ wdata->pdata.call_ops = call_ops; ++ wdata->pdata.pnfs_error = 0; ++ wdata->pdata.how = how; ++ ++ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, ++ inode->i_ino, wdata->args.count, wdata->args.offset, how); ++ ++ get_lseg(lseg); /* reference kept in pdata.lseg; dropped by pnfs_call_done, or below when the driver declines */ ++ ++ if (!pnfs_use_rpc(nfss)) ++ wdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ wdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->write_pagelist(wdata, ++ nfs_page_array_len(wdata->args.pgbase, wdata->args.count), ++ how); ++ /* PNFS_NOT_ATTEMPTED: undo the pNFS setup done above (flag, pdata.lseg, lseg reference, per-page lsegs). */ ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ wdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&wdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); /* counted for every status other than PNFS_NOT_ATTEMPTED */ ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* Post-read completion function.
Invoked by all layout drivers when ++ * read_pagelist is done ++ */ ++static void ++pnfs_read_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ struct nfs4_pnfs_layout_segment range; ++ /* Work handler queued by pnfs_read_done on -EAGAIN: return the layout, then reissue the read through pnfs_initiate_read. */ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); /* the work item is the u.tk_work member of the rpc_task */ ++ rdata = container_of(task, struct nfs_read_data, task); /* and the rpc_task is the task member of the nfs_read_data */ ++ range.iomode = IOMODE_RW; /* NOTE(review): a read retry asks for IOMODE_RW; should_free_lseg matches only equal iomodes (or IOMODE_ANY), so READ segments are not returned - confirm this is intended */ ++ range.offset = rdata->args.offset; ++ range.length = rdata->args.count; ++ _pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true); /* NULL stateid, wait == true; as written, only range->iomode is used by _pnfs_return_layout */ ++ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode), ++ rdata->pdata.call_ops); ++} ++/* Read completion callback; installed as pnfs_ops.nfs_readlist_complete. */ ++static void ++pnfs_read_done(struct nfs_read_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); /* the retry runs later from nfsiod_workqueue, not in this callback */ ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem read function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing.
++ */ ++enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *rdata, ++ const struct rpc_call_ops *call_ops) ++{ ++ struct inode *inode = rdata->inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg; ++ enum pnfs_try_status trypnfs; ++ /* Stash the caller ops in pdata; pnfs_call_done and pnfs_read_retry read them back on completion. */ ++ rdata->pdata.call_ops = call_ops; ++ rdata->pdata.pnfs_error = 0; ++ ++ dprintk("%s: Reading ino:%lu %u@%llu\n", ++ __func__, inode->i_ino, rdata->args.count, rdata->args.offset); ++ ++ get_lseg(lseg); /* reference kept in pdata.lseg; dropped by pnfs_call_done, or below when the driver declines */ ++ ++ if (!pnfs_use_rpc(nfss)) ++ rdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ rdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->read_pagelist(rdata, ++ nfs_page_array_len(rdata->args.pgbase, rdata->args.count)); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC; /* undo the pNFS setup done above */ ++ rdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&rdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_READ); /* counted for every status other than PNFS_NOT_ATTEMPTED */ ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* ++ * This gives the layout driver an opportunity to read in page "around" ++ * the data to be written. It returns 0 on success, otherwise an error code ++ * which will either be passed up to user, or ignored if ++ * some previous part of write succeeded. ++ * Note the range [pos, pos+len-1] is entirely within the page.
++ */ ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata) ++{ ++ struct pnfs_fsdata *data; ++ int status = 0; ++ ++ dprintk("--> %s: pos=%llu len=%u\n", ++ __func__, (unsigned long long)pos, len); ++ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL); ++ if (!data) { ++ status = -ENOMEM; ++ goto out; ++ } ++ data->lseg = lseg; /* refcount passed into data to be managed there */ ++ status = NFS_SERVER(inode)->pnfs_curr_ld->ld_io_ops->write_begin( ++ lseg, page, pos, len, data); ++ if (status) { ++ kfree(data); /* driver write_begin failed: the fsdata is not handed back */ ++ data = NULL; ++ } ++out: ++ *fsdata = data; /* NULL on allocation failure or driver error, else the new fsdata */ ++ dprintk("<-- %s: status=%d\n", __func__, status); ++ return status; ++} ++ ++/* Return 0 on success, negative on failure */ ++/* CAREFUL - what happens if copied < len??? */ ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status; ++ ++ status = nfss->pnfs_curr_ld->ld_io_ops->write_end(inode, page, ++ pos, len, copied, lseg); /* pure pass-through: the driver status is returned unchanged */ ++ return status; ++} ++ ++/* pNFS Commit callback function for all layout drivers */ ++static void ++pnfs_commit_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ struct nfs4_pnfs_layout_segment range = { ++ .iomode = IOMODE_RW, ++ .offset = data->args.offset, ++ .length = data->args.count, ++ }; ++ dprintk("%s: retrying\n", __func__); ++ _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, ++ true); /* unlike the read and write paths, the retry is issued inline here rather than from a work item */ ++ pnfs_initiate_commit(data, NFS_CLIENT(data->inode), ++ pdata->call_ops, pdata->how, 1); ++ } ++} ++/* Offer a commit to the layout driver commit op; on PNFS_NOT_ATTEMPTED clear PNFS_NO_RPC and drop the per-page lsegs, otherwise count it in NFSIOS_PNFS_COMMIT. */ ++enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int sync) ++{ ++ struct inode
*inode = data->inode; ++ struct nfs_server *nfss = NFS_SERVER(data->inode); ++ enum pnfs_try_status trypnfs; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ /* We need to account for possibility that ++ * each nfs_page can point to a different lseg (or be NULL). ++ * For the immediate case of whole-file-only layouts, we at ++ * least know there can be only a single lseg. ++ * We still have to account for the possibility of some being NULL. ++ * This will be done by passing the buck to the layout driver. ++ */ ++ data->pdata.call_ops = call_ops; ++ data->pdata.pnfs_error = 0; ++ data->pdata.how = sync; ++ data->pdata.lseg = NULL; /* NOTE(review): unless the driver sets pdata.lseg, pnfs_call_done will call put_lseg on NULL; presumably put_lseg accepts NULL - confirm */ ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->commit(data, sync); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ data->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ _pnfs_clear_lseg_from_pages(&data->pages); ++ } else ++ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT); ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++/* Invoke the optional layout driver cleanup_layoutcommit hook with the inode layout, the layoutcommit args and the recorded status. */ ++void pnfs_cleanup_layoutcommit(struct pnfs_layoutcommit_data *data) ++{ ++ struct nfs_server *nfss = NFS_SERVER(data->args.inode); ++ ++ /* TODO: Maybe we should avoid this by allowing the layout driver ++ * to directly xdr its layout on the wire. ++ */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit) ++ nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit( ++ NFS_I(data->args.inode)->layout, ++ &data->args, data->status); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call.
++ */ ++static int ++pnfs_layoutcommit_setup(struct inode *inode, ++ struct pnfs_layoutcommit_data *data, ++ loff_t write_begin_pos, loff_t write_end_pos) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int result = 0; /* stays 0 when the driver has no setup_layoutcommit hook */ ++ ++ dprintk("--> %s\n", __func__); ++ ++ data->args.inode = inode; ++ data->args.fh = NFS_FH(inode); ++ data->args.layout_type = nfss->pnfs_curr_ld->id; ++ data->res.fattr = &data->fattr; /* the result attributes land in the fattr embedded in data */ ++ nfs_fattr_init(&data->fattr); ++ ++ /* TODO: Need to determine the correct values */ ++ data->args.time_modify_changed = 0; ++ ++ /* Set values from inode so it can be reset ++ */ ++ data->args.lseg.iomode = IOMODE_RW; ++ data->args.lseg.offset = write_begin_pos; ++ data->args.lseg.length = write_end_pos - write_begin_pos + 1; /* the + 1 treats write_end_pos as an inclusive offset */ ++ data->args.lastbytewritten = min(write_end_pos, ++ i_size_read(inode) - 1); /* never beyond the last byte of the current i_size */ ++ data->args.bitmask = nfss->attr_bitmask; ++ data->res.server = nfss; ++ ++ /* Call layout driver to set the arguments */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit) ++ result = nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit( ++ NFS_I(inode)->layout, &data->args); ++ ++ dprintk("<-- %s Status %d\n", __func__, result); ++ return result; ++} ++ ++/* Issue a async layoutcommit for an inode.
++ * Returns 0, -ENOMEM, or the setup/pnfs4_proc_layoutcommit() status. */
++int
++pnfs_layoutcommit_inode(struct inode *inode, int sync)
++{
++	struct pnfs_layoutcommit_data *data;
++	struct nfs_inode *nfsi = NFS_I(inode);
++	loff_t write_begin_pos;
++	loff_t write_end_pos;
++
++	int status = 0;
++
++	dprintk("%s Begin (sync:%d)\n", __func__, sync);
++
++	BUG_ON(!has_layout(nfsi));	/* inode must already have a layout */
++
++	data = pnfs_layoutcommit_alloc();	/* done before taking i_lock */
++	if (!data)
++		return -ENOMEM;
++
++	spin_lock(&inode->i_lock);
++	if (!layoutcommit_needed(nfsi)) {
++		spin_unlock(&inode->i_lock);
++		goto out_free;	/* nothing pending: status is still 0 */
++	}
++
++	/* Snapshot the pending write range and credential under i_lock, then
++	 * clear them in the inode so new lc info can be generated
++	 */
++	write_begin_pos = nfsi->layout->pnfs_write_begin_pos;
++	write_end_pos = nfsi->layout->pnfs_write_end_pos;
++	data->cred = nfsi->layout->lo_cred;	/* cred moves from layout to data */
++	nfsi->layout->pnfs_write_begin_pos = 0;
++	nfsi->layout->pnfs_write_end_pos = 0;
++	nfsi->layout->lo_cred = NULL;
++	__clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->pnfs_layout_state);
++	pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout);
++
++	/* Reference for layoutcommit matched in pnfs_layoutcommit_release */
++	get_layout(NFS_I(inode)->layout);
++
++	spin_unlock(&inode->i_lock);
++
++	/* Set up layout commit args */
++	status = pnfs_layoutcommit_setup(inode, data, write_begin_pos,
++					 write_end_pos);
++	if (status) {
++		/* The layout driver failed to setup the layoutcommit */
++		put_rpccred(data->cred);
++		put_layout(inode);	/* presumably undoes get_layout() above */
++		goto out_free;	/* NOTE(review): cleared range is not restored */
++	}
++	status = pnfs4_proc_layoutcommit(data, sync);
++out:
++	dprintk("%s end (err:%d)\n", __func__, status);
++	return status;
++out_free:
++	pnfs_layoutcommit_free(data);
++	goto out;
++}
++/* Free a pnfs_fsdata allocation; a NULL @fsdata is accepted. */
++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata)
++{
++	if (fsdata) {
++		/* lseg refcounting handled directly in nfs_write_end */
++		kfree(fsdata);
++	}
++}
++
++/* Callback operations for layout drivers.
++ */
++struct pnfs_client_operations pnfs_ops = {
++	.nfs_getdevicelist = nfs4_pnfs_getdevicelist,
++	.nfs_getdeviceinfo = nfs4_pnfs_getdeviceinfo,
++	.nfs_readlist_complete = pnfs_read_done,
++	.nfs_writelist_complete = pnfs_writeback_done,
++	.nfs_commit_complete = pnfs_commit_done,
++};
++
++EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
++EXPORT_SYMBOL(pnfs_register_layoutdriver);
++
++/* Device ID cache. Supports one layout type per struct nfs_client.
++ * Create clp's cache or take a ref on the existing one; 0 or -ENOMEM. */
++int
++nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
++			       void (*free_callback)(struct kref *))
++{
++	struct nfs4_deviceid_cache *c;
++
++	c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
++	if (!c)
++		return -ENOMEM;
++	spin_lock(&clp->cl_lock);
++	if (clp->cl_devid_cache != NULL) {
++		kref_get(&clp->cl_devid_cache->dc_kref);
++		spin_unlock(&clp->cl_lock);
++		dprintk("%s [kref [%d]]\n", __func__,
++			atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
++		kfree(c);	/* cache already existed; ours is unused */
++	} else {
++		int i;
++
++		spin_lock_init(&c->dc_lock);
++		for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++)
++			INIT_HLIST_HEAD(&c->dc_deviceids[i]);
++		kref_init(&c->dc_kref);
++		c->dc_free_callback = free_callback;
++		clp->cl_devid_cache = c;	/* published under cl_lock */
++		spin_unlock(&clp->cl_lock);
++		dprintk("%s [new]\n", __func__);
++	}
++	return 0;
++}
++EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
++/* Initialise a deviceid's hash node and its kref. */
++void
++nfs4_init_deviceid_node(struct nfs4_deviceid *d)
++{
++	INIT_HLIST_NODE(&d->de_node);
++	kref_init(&d->de_kref);
++}
++EXPORT_SYMBOL(nfs4_init_deviceid_node);
++
++/* Called from layoutdriver_io_operations->alloc_lseg */
++void
++nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d)
++{
++	dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
++	l->deviceid = d;
++	kref_get(&d->de_kref);	/* the segment now holds a reference */
++}
++EXPORT_SYMBOL(nfs4_set_layout_deviceid);
++
++/* Called from layoutdriver_io_operations->free_lseg */
++void
++nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l,
++			   struct nfs4_deviceid *d,
++			   void (*free_callback)(struct kref *))
++{
++	dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
++	l->deviceid = NULL;
++	kref_put(&d->de_kref, free_callback);	/* drop the segment's reference */
++}
++EXPORT_SYMBOL(nfs4_unset_layout_deviceid);
++/* RCU lookup by id. Returns the entry or NULL; no reference is taken. */
++struct nfs4_deviceid *
++nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
++{
++	struct nfs4_deviceid *d;
++	struct hlist_node *n;
++	long hash = nfs4_deviceid_hash(id);
++
++	dprintk("--> %s hash %ld\n", __func__, hash);
++	rcu_read_lock();
++	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
++		if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
++			rcu_read_unlock();
++			return d; /* NOTE(review): unreferenced past the unlock */
++		}
++	}
++	rcu_read_unlock();
++	return NULL;
++}
++EXPORT_SYMBOL(nfs4_find_deviceid);
++
++/*
++ * Add @new, or return the already-cached entry (no kref_get is done here).
++ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
++ */
++struct nfs4_deviceid *
++nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
++{
++	struct nfs4_deviceid *d;
++	struct hlist_node *n;
++	long hash = nfs4_deviceid_hash(&new->de_id);
++
++	dprintk("--> %s hash %ld\n", __func__, hash);
++	spin_lock(&c->dc_lock);
++	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
++		if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
++			spin_unlock(&c->dc_lock);
++			dprintk("%s [discard]\n", __func__);
++			c->dc_free_callback(&new->de_kref); /* release @new */
++			return d;
++		}
++	}
++	hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
++	spin_unlock(&c->dc_lock);
++	dprintk("%s [new]\n", __func__);
++	return new;
++}
++EXPORT_SYMBOL(nfs4_add_deviceid);
++/* Unhash and put one entry of bucket @hash (any if !@id); 1 if removed */
++static int
++nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash,
++		     struct pnfs_deviceid *id)
++{
++	struct nfs4_deviceid *d;
++	struct hlist_node *n;
++
++	dprintk("--> %s hash %ld\n", __func__, hash);
++	spin_lock(&c->dc_lock);
++	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
++		if (id && memcmp(id, &d->de_id, NFS4_PNFS_DEVICEID4_SIZE))
++			continue;
++		hlist_del_rcu(&d->de_node);
++		spin_unlock(&c->dc_lock);
++		synchronize_rcu();	/* wait out RCU readers before the put */
++		dprintk("%s [%d]\n", __func__,
++			atomic_read(&d->de_kref.refcount));
++		kref_put(&d->de_kref, c->dc_free_callback);
++		return 1;
++	}
++	spin_unlock(&c->dc_lock);
++	return 0;
++}
++/* Remove the deviceid matching @id from the cache, if it is hashed. */
++void
++nfs4_delete_device(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
++{
++	long hash = nfs4_deviceid_hash(id);
++
++	nfs4_remove_deviceid(c, hash, id);
++}
++EXPORT_SYMBOL(nfs4_delete_device);
++/* Release function: empty every hash bucket, then free the cache itself. */
++static void
++nfs4_free_deviceid_cache(struct kref *kref)
++{
++	struct nfs4_deviceid_cache *cache =
++		container_of(kref, struct nfs4_deviceid_cache, dc_kref);
++	long i;
++
++	for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
++		while (nfs4_remove_deviceid(cache, i, NULL))
++			;	/* each call removes at most one entry */
++	kfree(cache);
++}
++/* Drop one reference on clp's deviceid cache; the last put frees it. */
++void
++nfs4_put_deviceid_cache(struct nfs_client *clp)
++{
++	struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache;
++
++	dprintk("--> %s cl_devid_cache %p\n", __func__, tmp);
++	/* Drop our ref; only when it is the last one is cl_lock taken, so the
++	 * pointer is cleared before any other put or get can see the cache.
++	 */
++	if (!atomic_dec_and_lock(&tmp->dc_kref.refcount, &clp->cl_lock))
++		return;
++	clp->cl_devid_cache = NULL;
++	spin_unlock(&clp->cl_lock);
++	nfs4_free_deviceid_cache(&tmp->dc_kref); /* sleeps, so after unlock */
++}
++EXPORT_SYMBOL(nfs4_put_deviceid_cache);
+diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h
+--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig	2010-08-23 12:09:03.358501440 -0400
++++ linux-2.6.34.noarch/fs/nfs/pnfs.h	2010-08-23 12:09:03.358501440 -0400
+@@ -0,0 +1,355 @@
++/*
++ * fs/nfs/pnfs.h
++ *
++ * pNFS client data structures.
++ *
++ * Copyright (c) 2002 The Regents of the University of Michigan.
++ * All rights reserved.
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_PNFS_H ++#define FS_NFS_PNFS_H ++ ++#include ++ ++#ifdef CONFIG_NFS_V4_1 ++ ++#include ++#include ++#include ++#include "iostat.h" ++ ++/* nfs4proc.c */ ++extern int nfs4_pnfs_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++extern int nfs4_pnfs_getdeviceinfo(struct nfs_server *server, ++ struct pnfs_device *dev); ++extern int pnfs4_proc_layoutget(struct nfs4_pnfs_layoutget *lgp); ++extern int pnfs4_proc_layoutcommit(struct pnfs_layoutcommit_data *data, ++ int issync); ++extern int pnfs4_proc_layoutreturn(struct nfs4_pnfs_layoutreturn *lrp, bool wait); ++ ++/* pnfs.c */ ++extern const nfs4_stateid zero_stateid; ++ ++void _pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp); ++ ++int _pnfs_return_layout(struct inode *, struct nfs4_pnfs_layout_segment *, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type, bool wait); ++void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); ++void unmount_pnfs_layoutdriver(struct nfs_server *); ++enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, ++ const struct rpc_call_ops *); ++int pnfs_initialize(void); ++void pnfs_uninitialize(void); ++void pnfs_layoutcommit_free(struct pnfs_layoutcommit_data *data); ++void pnfs_cleanup_layoutcommit(struct pnfs_layoutcommit_data *data); ++int pnfs_layoutcommit_inode(struct inode *inode, int sync); ++void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent); ++void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx); ++unsigned int pnfs_getiosize(struct nfs_server *server); ++void pnfs_set_ds_iosize(struct nfs_server *server); ++enum pnfs_try_status 
pnfs_try_to_commit(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, ++ struct nfs_open_context *, struct list_head *, ++ size_t *); ++void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, ++ size_t *); ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); ++void pnfs_get_layout_done(struct nfs4_pnfs_layoutget *, int rpc_status); ++int pnfs_layout_process(struct nfs4_pnfs_layoutget *lgp); ++void pnfs_layout_release(struct pnfs_layout_type *, struct nfs4_pnfs_layout_segment *range); ++void pnfs_set_layout_stateid(struct pnfs_layout_type *lo, ++ const nfs4_stateid *stateid); ++void pnfs_destroy_layout(struct nfs_inode *); ++void pnfs_destroy_all_layouts(struct nfs_client *); ++void put_layout(struct inode *inode); ++void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_type *lo); ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata); ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ ++#define PNFS_EXISTS_LDIO_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_io_ops && \ ++ (srv)->pnfs_curr_ld->ld_io_ops->opname) ++#define PNFS_EXISTS_LDPOLICY_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops->opname) ++ ++#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" ++ ++static inline int lo_fail_bit(u32 iomode) ++{ ++ return iomode == IOMODE_RW ? 
++ NFS_INO_RW_LAYOUT_FAILED : NFS_INO_RO_LAYOUT_FAILED; ++} ++ ++/* Return true if a layout driver is being used for this mountpoint */ ++static inline int pnfs_enabled_sb(struct nfs_server *nfss) ++{ ++ return nfss->pnfs_curr_ld != NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || ++ !fsdata->bypass_eof; ++} ++ ++/* Should the pNFS client commit and return the layout upon a setattr */ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ if (!pnfs_enabled_sb(NFS_SERVER(inode))) ++ return false; ++ return NFS_SERVER(inode)->pnfs_curr_ld->ld_policy_ops->flags & ++ PNFS_LAYOUTRET_ON_SETATTR; ++} ++ ++/* Should the pNFS client commit and return the layout on close ++ */ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout->roc_iomode; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status = 0; ++ ++ *fsdata = lseg; ++ if (lseg && PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ status = _pnfs_write_begin(inode, page, pos, len, lseg, ++ (struct pnfs_fsdata **) fsdata); ++ return status; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_end)) ++ return _pnfs_write_end(inode, page, pos, len, copied, lseg); ++ else ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ 
if (PNFS_EXISTS_LDIO_OP(nfss, write_end_cleanup)) ++ nfss->pnfs_curr_ld->ld_io_ops->write_end_cleanup(filp, fsdata); ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ pnfs_free_fsdata(fsdata); ++ } ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct nfs4_pnfs_layout_segment *lseg, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss) && ++ (type != RETURN_FILE || has_layout(nfsi))) ++ return _pnfs_return_layout(ino, lseg, stateid, type, wait); ++ ++ return 0; ++} ++ ++static inline void pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss)) ++ _pnfs_update_layout(ino, ctx, pos, count, access_type, lsegpp); ++ else { ++ if (lsegpp) ++ *lsegpp = NULL; ++ } ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ if (pnfs_enabled_sb(nfss)) ++ return pnfs_ld_use_rpc_code(nfss->pnfs_curr_ld); ++ ++ return 1; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ return ((struct pnfs_fsdata *) fsdata)->lseg; ++ } ++ return fsdata; ++} ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++} ++ ++static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++} ++ ++static inline void get_lseg(struct 
pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void put_lseg(struct pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void ++pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ if (lsegpp) ++ *lsegpp = NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return 1; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ *fsdata = NULL; ++ return 0; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ return 1; ++} ++ ++static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ return 0; ++} ++ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ return false; ++} ++ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode 
*nfsi) ++{ ++ return 0; ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct nfs4_pnfs_layout_segment *lseg, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ return 0; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ return NULL; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++#endif /* FS_NFS_PNFS_H */ +diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-23 12:08:29.057511533 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-23 12:09:03.359501471 -0400 +@@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) +- goto out; ++ goto out_free; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); +@@ -455,7 +455,7 @@ nfs_proc_symlink(struct inode *dir, stru + */ + if (status == 0) + status = nfs_instantiate(dentry, fh, fattr); +- ++out_free: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); + out: +@@ -694,6 +694,7 @@ const struct nfs_rpc_ops nfs_v2_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, + .file_inode_ops = &nfs_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-23 12:08:29.057511533 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-23 12:09:03.359501471 -0400 +@@ -18,8 +18,12 @@ + #include + #include + #include ++#include ++#include + + #include ++#include ++#include "pnfs.h" + + #include "nfs4_fs.h" + #include "internal.h" +@@ -117,11 +121,14 @@ int nfs_readpage_async(struct nfs_open_c + LIST_HEAD(one_request); + struct nfs_page 
*new; + unsigned int len; ++ struct pnfs_layout_segment *lseg; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); +- new = nfs_create_request(ctx, inode, page, 0, len); ++ pnfs_update_layout(inode, ctx, 0, NFS4_MAX_UINT64, IOMODE_READ, &lseg); ++ new = nfs_create_request(ctx, inode, page, 0, len, lseg); ++ put_lseg(lseg); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); +@@ -155,24 +162,20 @@ static void nfs_readpage_release(struct + nfs_release_request(req); + } + +-/* +- * Set up the NFS read request struct +- */ +-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset) ++int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, +@@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_ + .flags = RPC_TASK_ASYNC | swap_flags, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->read_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_read); ++ ++int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) ++ return pnfs_get_read_status(data); ++ ++ return nfs_initiate_read(data, clnt, call_ops); ++} ++ ++/* ++ * Set up the NFS read request struct ++ */ ++static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + data->req = req; + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -190,27 +230,14 @@ static int nfs_read_rpcsetup(struct nfs_ + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops); + } + + static void +@@ -354,7 +381,14 @@ static void nfs_readpage_retry(struct rp + { + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; ++ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client; + ++#ifdef CONFIG_NFS_V4_1 ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + if (resp->eof || resp->count == argp->count) + return; + +@@ -368,7 +402,10 @@ static void nfs_readpage_retry(struct rp + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; +- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + } + + /* +@@ -409,13 +446,19 @@ static void nfs_readpage_release_partial + void nfs_read_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_read_data *data = calldata; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, + &data->args.seq_args, &data->res.seq_res, + 0, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_read_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_read_partial_ops = { +@@ -568,7 +611,8 
@@ readpage_async_filler(void *data, struct + if (len == 0) + return nfs_return_empty_page(page); + +- new = nfs_create_request(desc->ctx, inode, page, 0, len); ++ new = nfs_create_request(desc->ctx, inode, page, 0, len, ++ desc->pgio->pg_lseg); + if (IS_ERR(new)) + goto out_error; + +@@ -624,6 +668,9 @@ int nfs_readpages(struct file *filp, str + if (ret == 0) + goto read_complete; /* all pages were read */ + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize); ++#endif /* CONFIG_NFS_V4_1 */ + if (rsize < PAGE_CACHE_SIZE) + nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); + else +@@ -632,6 +679,7 @@ int nfs_readpages(struct file *filp, str + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + + nfs_pageio_complete(&pgio); ++ put_lseg(pgio.pg_lseg); + npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); + read_complete: +diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-23 12:08:29.059491391 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-23 12:09:03.361501458 -0400 +@@ -64,6 +64,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -669,6 +670,28 @@ static int nfs_show_options(struct seq_f + + return 0; + } ++#ifdef CONFIG_NFS_V4_1 ++void show_sessions(struct seq_file *m, struct nfs_server *server) ++{ ++ if (nfs4_has_session(server->nfs_client)) ++ seq_printf(m, ",sessions"); ++} ++#else ++void show_sessions(struct seq_file *m, struct nfs_server *server) {} ++#endif ++ ++#ifdef CONFIG_NFS_V4_1 ++void show_pnfs(struct seq_file *m, struct nfs_server *server) ++{ ++ seq_printf(m, ",pnfs="); ++ if (server->pnfs_curr_ld) ++ seq_printf(m, "%s", server->pnfs_curr_ld->name); ++ else ++ seq_printf(m, "not configured"); ++} ++#else /* CONFIG_NFS_V4_1 */ ++void 
show_pnfs(struct seq_file *m, struct nfs_server *server) {} ++#endif /* CONFIG_NFS_V4_1 */ + + /* + * Present statistical information for this VFS mountpoint +@@ -707,6 +730,8 @@ static int nfs_show_stats(struct seq_fil + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); ++ show_sessions(m, nfss); ++ show_pnfs(m, nfss); + } + #endif + +diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-23 12:08:29.060501485 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-23 12:09:03.362419975 -0400 +@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task + struct nfs_unlinkdata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->dir); + +- if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-23 12:08:27.630563929 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-23 12:09:03.364491337 -0400 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + +@@ -28,6 +29,7 @@ + #include "iostat.h" + #include "nfs4_fs.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PAGECACHE + +@@ -59,6 +61,7 @@ struct nfs_write_data *nfs_commitdata_al + } + return p; + } ++EXPORT_SYMBOL(nfs_commitdata_alloc); + + void nfs_commit_free(struct nfs_write_data *p) + { +@@ -66,6 +69,7 @@ void nfs_commit_free(struct nfs_write_da + kfree(p->pagevec); + mempool_free(p, nfs_commit_mempool); + } ++EXPORT_SYMBOL(nfs_commit_free); + + struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) + { +@@ -418,6 +422,17 @@ static void nfs_inode_remove_request(str + 
nfs_clear_request(req); + nfs_release_request(req); + } ++static void ++nfs_mark_request_nopnfs(struct nfs_page *req) ++{ ++ struct pnfs_layout_segment *lseg = req->wb_lseg; ++ ++ if (req->wb_lseg == NULL) ++ return; ++ req->wb_lseg = NULL; ++ put_lseg(lseg); ++ dprintk(" retry through MDS\n"); ++} + + static void + nfs_mark_request_dirty(struct nfs_page *req) +@@ -523,7 +538,7 @@ nfs_need_commit(struct nfs_inode *nfsi) + * The requests are *not* checked to ensure that they form a contiguous set. + */ + static int +-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) ++nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs) + { + struct nfs_inode *nfsi = NFS_I(inode); + int ret; +@@ -531,7 +546,8 @@ nfs_scan_commit(struct inode *inode, str + if (!nfs_need_commit(nfsi)) + return 0; + +- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); ++ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT, ++ use_pnfs); + if (ret > 0) + nfsi->ncommit -= ret; + if (nfs_need_commit(NFS_I(inode))) +@@ -560,7 +576,8 @@ static inline int nfs_scan_commit(struct + static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int bytes) ++ unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + unsigned int rqend; +@@ -585,8 +602,8 @@ static struct nfs_page *nfs_try_to_updat + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ +- if (offset > rqend +- || end < req->wb_offset) ++ if (offset > rqend || end < req->wb_offset || ++ req->wb_lseg != lseg) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) +@@ -634,16 +651,17 @@ out_err: + * already called nfs_flush_incompatible() if necessary. 
+ */ + static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, +- struct page *page, unsigned int offset, unsigned int bytes) ++ struct page *page, unsigned int offset, unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + +- req = nfs_try_to_update_request(inode, page, offset, bytes); ++ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg); + if (req != NULL) + goto out; +- req = nfs_create_request(ctx, inode, page, offset, bytes); ++ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); +@@ -656,23 +674,27 @@ out: + } + + static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, ++ void *fsdata) + { + struct nfs_page *req; + +- req = nfs_setup_write_request(ctx, page, offset, count); ++ req = nfs_setup_write_request(ctx, page, offset, count, lseg); + if (IS_ERR(req)) + return PTR_ERR(req); + nfs_mark_request_dirty(req); + /* Update file length */ +- nfs_grow_file(page, offset, count); ++ if (pnfs_grow_ok(lseg, fsdata)) ++ nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + return 0; + } + +-int nfs_flush_incompatible(struct file *file, struct page *page) ++int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_page *req; +@@ -689,7 +711,10 @@ int nfs_flush_incompatible(struct file * + req = nfs_page_find_request(page); + if (req == NULL) + return 0; +- do_flush = req->wb_page != page || req->wb_context != ctx; ++ do_flush = req->wb_page != page || req->wb_context != ctx || ++ 
req->wb_lock_context->lockowner != current->files || ++ req->wb_lock_context->pid != current->tgid || ++ req->wb_lseg != lseg; + nfs_release_request(req); + if (!do_flush) + return 0; +@@ -716,7 +741,8 @@ static int nfs_write_pageuptodate(struct + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ + int nfs_updatepage(struct file *file, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page->mapping->host; +@@ -741,7 +767,7 @@ int nfs_updatepage(struct file *file, st + offset = 0; + } + +- status = nfs_writepage_setup(ctx, page, offset, count); ++ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); + if (status < 0) + nfs_set_pageerror(page); + +@@ -771,25 +797,21 @@ static int flush_task_priority(int how) + return RPC_PRIORITY_NORMAL; + } + +-/* +- * Set up the argument/result storage required for the RPC call. 
+- */ +-static int nfs_write_rpcsetup(struct nfs_page *req, +- struct nfs_write_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset, +- int how) ++int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, +@@ -800,12 +822,62 @@ static int nfs_write_rpcsetup(struct nfs + }; + int ret = 0; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->write_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ if (how & FLUSH_SYNC) { ++ ret = rpc_wait_for_completion_task(task); ++ if (ret == 0) ++ ret = task->tk_status; ++ } ++ rpc_put_task(task); ++out: ++ return ret; ++} ++EXPORT_SYMBOL(nfs_initiate_write); ++ ++int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_write(data, clnt, call_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int nfs_write_rpcsetup(struct nfs_page *req, ++ struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset, ++ int how) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + data->req = req; + data->inode = inode = req->wb_context->path.dentry->d_inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -813,6 +885,7 @@ static int nfs_write_rpcsetup(struct nfs + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + if (how & FLUSH_STABLE) { + data->args.stable = NFS_DATA_SYNC; +@@ -825,30 +898,7 @@ static int nfs_write_rpcsetup(struct nfs + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- dprintk("NFS: %5u initiated write call " +- "(req %s/%lld, %u bytes @ offset %llu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) { +- ret = PTR_ERR(task); +- goto out; +- } +- if (how & FLUSH_SYNC) { +- ret = rpc_wait_for_completion_task(task); +- if (ret == 0) +- ret = task->tk_status; +- } +- rpc_put_task(task); +-out: +- return ret; ++ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); + } + + /* If a nfs_flush_* function fails, it should remove reqs from @head and +@@ -859,6 +909,7 @@ static void nfs_redirty_request(struct n + { + struct page *page = req->wb_page; + ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +@@ -971,6 +1022,10 @@ static void nfs_pageio_init_write(struct + { + size_t wsize = NFS_SERVER(inode)->wsize; + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_write(pgio, inode, &wsize); ++#endif /* CONFIG_NFS_V4_1 */ ++ + if (wsize < PAGE_CACHE_SIZE) + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); + else +@@ -1036,13 +1091,27 @@ out: + void nfs_write_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_write_data *data = calldata; +- struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(clp, &data->args.seq_args, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) { ++ /* retrying via MDS? 
*/ ++ data->pdata.orig_count = data->args.count; ++ data->args.count = NFS_SERVER(data->inode)->wsize; ++ dprintk("%s: trimmed count %u to wsize %u\n", __func__, ++ data->pdata.orig_count, data->args.count); ++ } else ++ data->pdata.orig_count = 0; ++ ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, ++ &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_write_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_write_partial_ops = { +@@ -1126,10 +1195,11 @@ int nfs_writeback_done(struct rpc_task * + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *clp = server->nfs_client; + int status; + +- dprintk("NFS: %5u nfs_writeback_done (status %d)\n", +- task->tk_pid, task->tk_status); ++ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n", ++ task->tk_pid, task->tk_status, resp->count); + + /* + * ->write_done will attempt to use post-op attributes to detect +@@ -1142,6 +1212,13 @@ int nfs_writeback_done(struct rpc_task * + if (status != 0) + return status; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); ++#ifdef CONFIG_NFS_V4_1 ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && task->tk_status >= 0) { +@@ -1158,7 +1235,7 @@ int nfs_writeback_done(struct rpc_task * + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", +- server->nfs_client->cl_hostname, ++ clp->cl_hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } +@@ -1168,6 +1245,9 @@ int nfs_writeback_done(struct rpc_task * + if (task->tk_status >= 
0 && resp->count < argp->count) { + static unsigned long complain; + ++ dprintk("NFS: short write:" ++ " (resp->count %u) < (argp->count = %u)\n", ++ resp->count, argp->count); + nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + + /* Has the server at least made some progress? */ +@@ -1184,7 +1264,10 @@ int nfs_writeback_done(struct rpc_task * + */ + argp->stable = NFS_FILE_SYNC; + } +- nfs_restart_rpc(task, server->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + return -EAGAIN; + } + if (time_before(complain, jiffies)) { +@@ -1228,40 +1311,73 @@ static void nfs_commitdata_release(void + nfs_commit_free(wdata); + } + +-/* +- * Set up the argument/result storage required for the RPC call. +- */ +-static int nfs_commit_rpcsetup(struct list_head *head, +- struct nfs_write_data *data, +- int how) ++int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct nfs_page *first = nfs_list_entry(head->next); +- struct inode *inode = first->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = first->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, +- .callback_ops = &nfs_commit_ops, ++ .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->commit_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_commit); ++ ++ ++int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs) ++{ ++ if (pnfs && ++ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. ++ */ ++static int nfs_commit_rpcsetup(struct list_head *head, ++ struct nfs_write_data *data, ++ int how, int pnfs) ++{ ++ struct nfs_page *first = nfs_list_entry(head->next); ++ struct inode *inode = first->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + list_splice_init(head, &data->pages); + + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = first->wb_context->cred; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ +@@ -1272,45 +1388,47 @@ static int nfs_commit_rpcsetup(struct li + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); ++ kref_init(&data->refcount); ++ data->parent = NULL; ++ data->args.context = first->wb_context; /* used by commit done */ + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->commit_setup(data, &msg); ++ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops, ++ how, pnfs); ++} + +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++/* Handle memory error during commit */ ++void nfs_mark_list_commit(struct list_head *head) ++{ ++ struct nfs_page *req; + +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ while (!list_empty(head)) { ++ req = nfs_list_entry(head->next); ++ nfs_list_remove_request(req); ++ nfs_mark_request_commit(req); ++ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); ++ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, ++ BDI_RECLAIMABLE); ++ nfs_clear_page_tag_locked(req); ++ } + } ++EXPORT_SYMBOL(nfs_mark_list_commit); + + /* + * Commit dirty pages + */ + static int +-nfs_commit_list(struct inode *inode, struct list_head *head, int how) ++nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs) + { + struct nfs_write_data *data; +- struct nfs_page *req; + + data = nfs_commitdata_alloc(); +- + if (!data) + goto out_bad; + + /* Set up the argument struct */ +- return nfs_commit_rpcsetup(head, data, how); ++ return nfs_commit_rpcsetup(head, data, how, pnfs); + out_bad: +- while (!list_empty(head)) { +- req = nfs_list_entry(head->next); +- nfs_list_remove_request(req); +- nfs_mark_request_commit(req); +- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); +- dec_bdi_stat(req->wb_page->mapping->backing_dev_info, +- BDI_RECLAIMABLE); +- nfs_clear_page_tag_locked(req); +- } ++ nfs_mark_list_commit(head); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; + } +@@ -1330,6 +1448,19 @@ static void nfs_commit_done(struct rpc_t + return; + } + ++static inline void nfs_commit_cleanup(struct kref *kref) ++{ ++ struct nfs_write_data *data; ++ ++ data = container_of(kref, struct nfs_write_data, refcount); ++ /* Clear lock only when all cloned commits are finished */ ++ if 
(data->parent) ++ kref_put(&data->parent->refcount, nfs_commit_cleanup); ++ else ++ nfs_commit_clear_lock(NFS_I(data->inode)); ++ nfs_commitdata_release(data); ++} ++ + static void nfs_commit_release(void *calldata) + { + struct nfs_write_data *data = calldata; +@@ -1347,6 +1478,11 @@ static void nfs_commit_release(void *cal + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { ++ if (req->wb_lseg) { ++ nfs_mark_request_nopnfs(req); ++ nfs_mark_request_dirty(req); ++ goto next; ++ } + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); +@@ -1363,12 +1499,12 @@ static void nfs_commit_release(void *cal + } + /* We have a mismatch. Write the page again */ + dprintk(" mismatch\n"); ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + next: + nfs_clear_page_tag_locked(req); + } +- nfs_commit_clear_lock(NFS_I(data->inode)); +- nfs_commitdata_release(calldata); ++ kref_put(&data->refcount, nfs_commit_cleanup); + } + + static const struct rpc_call_ops nfs_commit_ops = { +@@ -1384,21 +1520,22 @@ int nfs_commit_inode(struct inode *inode + LIST_HEAD(head); + int may_wait = how & FLUSH_SYNC; + int res = 0; ++ int use_pnfs = 0; + + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out_mark_dirty; + spin_lock(&inode->i_lock); +- res = nfs_scan_commit(inode, &head, 0, 0); ++ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs); + spin_unlock(&inode->i_lock); + if (res) { +- int error = nfs_commit_list(inode, &head, how); ++ int error = nfs_commit_list(inode, &head, how, use_pnfs); + if (error < 0) + return error; +- if (may_wait) ++ if (may_wait) { + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); +- else ++ } else + goto out_mark_dirty; + } else + nfs_commit_clear_lock(NFS_I(inode)); +@@ -1451,7 +1588,18 @@ static int nfs_commit_unstable_pages(str + + int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) + { +- 
return nfs_commit_unstable_pages(inode, wbc); ++ int ret; ++ ret = nfs_commit_unstable_pages(inode, wbc); ++ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) { ++ int err, sync = wbc->sync_mode; ++ ++ if (wbc->nonblocking || wbc->for_background) ++ sync = 0; ++ err = pnfs_layoutcommit_inode(inode, sync); ++ if (err < 0) ++ ret = err; ++ } ++ return ret; + } + + /* +@@ -1459,6 +1607,7 @@ int nfs_write_inode(struct inode *inode, + */ + int nfs_wb_all(struct inode *inode) + { ++ int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, +@@ -1466,7 +1615,8 @@ int nfs_wb_all(struct inode *inode) + .range_end = LLONG_MAX, + }; + +- return sync_inode(inode, &wbc); ++ ret = sync_inode(inode, &wbc); ++ return ret; + } + + int nfs_wb_page_cancel(struct inode *inode, struct page *page) +diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/include/linux/exportfs.h +--- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-23 12:09:03.365501459 -0400 +@@ -2,6 +2,7 @@ + #define LINUX_EXPORTFS_H 1 + + #include ++#include + + struct dentry; + struct inode; +@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_pare + struct fid *fid, int fh_len, int fh_type, + struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); + ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct pnfs_filelayout_device; ++struct pnfs_filelayout_layout; ++ ++extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev); ++extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct list_head; ++ ++extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes); ++ ++extern enum nfsstat4 
blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *layouts); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_PNFSD) ++#include ++ ++struct pnfsd_cb_operations; ++ ++struct pnfsd_cb_ctl { ++ spinlock_t lock; ++ struct module *module; ++ const struct pnfsd_cb_operations *cb_op; ++}; ++ ++/* in expfs.c so that file systems can depend on it */ ++extern struct pnfsd_cb_ctl pnfsd_cb_ctl; ++ ++static inline int ++pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ int ret = -ENOENT; ++ ++ spin_lock(&pnfsd_cb_ctl.lock); ++ if (!pnfsd_cb_ctl.cb_op) ++ goto out; ++ if (!try_module_get(pnfsd_cb_ctl.module)) ++ goto out; ++ ctl->cb_op = pnfsd_cb_ctl.cb_op; ++ ctl->module = pnfsd_cb_ctl.module; ++ ret = 0; ++out: ++ spin_unlock(&pnfsd_cb_ctl.lock); ++ return ret; ++} ++ ++static inline void ++pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ module_put(ctl->module); ++} ++#endif /* CONFIG_PNFSD */ + #endif /* LINUX_EXPORTFS_H */ +diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-23 12:09:03.367491365 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-23 12:09:03.367491365 -0400 +@@ -0,0 +1,141 @@ ++#ifndef _LINUX_EXP_XDR_H ++#define _LINUX_EXP_XDR_H ++ ++#include ++#include ++#include ++ ++struct exp_xdr_stream { ++ __be32 *p; ++ __be32 *end; ++}; ++ ++/** ++ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline size_t ++exp_xdr_qwords(__u32 nbytes) ++{ ++ return DIV_ROUND_UP(nbytes, 4); ++} ++ ++/** ++ * exp_xdr_qbytes - Calculate the number of bytes holding qwords ++ * @qwords: number of quad-words to encode ++ */ ++static inline size_t ++exp_xdr_qbytes(size_t qwords) ++{ ++ return qwords << 2; ++} ++ ++/** ++ * exp_xdr_reserve_space - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nbytes: 
number of bytes to reserve ++ * ++ * Checks that we have enough buffer space to encode 'nbytes' more ++ * bytes of data. If so, update the xdr stream. ++ */ ++static inline __be32 * ++exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes) ++{ ++ __be32 *p = xdr->p; ++ __be32 *q; ++ ++ /* align nbytes on the next 32-bit boundary */ ++ q = p + exp_xdr_qwords(nbytes); ++ if (unlikely(q > xdr->end || q < p)) ++ return NULL; ++ xdr->p = q; ++ return p; ++} ++ ++/** ++ * exp_xdr_reserve_qwords - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nwords: number of quad words (u32's) to reserve ++ */ ++static inline __be32 * ++exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords) ++{ ++ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords)); ++} ++ ++/** ++ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u32(__be32 *p, __u32 val) ++{ ++ *p = cpu_to_be32(val); ++ return p + 1; ++} ++ ++/** ++ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u64(__be32 *p, __u64 val) ++{ ++ put_unaligned_be64(val, p); ++ return p + 2; ++} ++ ++/** ++ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the array of bytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ if (likely(nbytes != 0)) { ++ unsigned int qwords = exp_xdr_qwords(nbytes); ++ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes; ++ ++ memcpy(p, ptr, nbytes); ++ if (padding != 0) ++ memset((char *)p + nbytes, 0, padding); ++ p += qwords; ++ } ++ return p; ++} ++ ++/** ++ * exp_xdr_encode_opaque - Encode an opaque type onto a 
xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the opaque array ++ * @nbytes: number of bytes to encode ++ * ++ * Encodes the 32-bit opaque size in bytes followed by the opaque value. ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ p = exp_xdr_encode_u32(p, nbytes); ++ return exp_xdr_encode_bytes(p, ptr, nbytes); ++} ++ ++/** ++ * exp_xdr_encode_opaque_qlen - Encode the opaque length onto a xdr stream ++ * @lenp: pointer to the opaque length destination ++ * @endp: pointer to the end of the opaque array ++ * ++ * Encodes the 32-bit opaque size in bytes given the start and end pointers ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp) ++{ ++ size_t nbytes = (char *)endp - (char *)(lenp + 1); ++ ++ exp_xdr_encode_u32(lenp, nbytes); ++ return lenp + 1 + exp_xdr_qwords(nbytes); ++} ++#endif /* _LINUX_EXP_XDR_H */ +diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-23 12:08:29.021511898 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-23 12:09:03.369481147 -0400 +@@ -387,6 +387,7 @@ struct inodes_stat_t { + #include + + struct export_operations; ++struct pnfs_export_operations; + struct hd_geometry; + struct iovec; + struct nameidata; +@@ -1329,6 +1330,7 @@ struct super_block { + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; ++ const struct pnfs_export_operations *s_pnfs_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; +diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h +--- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-23 12:09:03.371491472 -0400 +@@ -17,7 +17,10 @@ + + #define NFS4_BITMAP_SIZE 2 + #define 
NFS4_VERIFIER_SIZE 8 +-#define NFS4_STATEID_SIZE 16 ++#define NFS4_CLIENTID_SIZE 8 ++#define NFS4_STATEID_SEQID_SIZE 4 ++#define NFS4_STATEID_OTHER_SIZE 12 ++#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE) + #define NFS4_FHSIZE 128 + #define NFS4_MAXPATHLEN PATH_MAX + #define NFS4_MAXNAMLEN NAME_MAX +@@ -119,6 +122,13 @@ + #define EXCHGID4_FLAG_MASK_A 0x40070003 + #define EXCHGID4_FLAG_MASK_R 0x80070003 + ++static inline bool ++is_ds_only_session(u32 exchange_flags) ++{ ++ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS; ++ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS; ++} ++ + #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 +@@ -166,8 +176,25 @@ struct nfs4_acl { + struct nfs4_ace aces[0]; + }; + ++struct nfs4_fsid { ++ u64 major; ++ u64 minor; ++}; ++ + typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; +-typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid; ++typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid; ++ ++struct nfs41_stateid { ++ __be32 seqid; ++ char other[NFS4_STATEID_OTHER_SIZE]; ++} __attribute__ ((packed)); ++ ++typedef struct { ++ union { ++ char data[NFS4_STATEID_SIZE]; ++ struct nfs41_stateid stateid; ++ } u; ++} nfs4_stateid; + + enum nfs_opnum4 { + OP_ACCESS = 3, +@@ -471,6 +498,8 @@ enum lock_type4 { + #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) + #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) + #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) ++#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) ++#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) + + #define NFSPROC4_NULL 0 + #define NFSPROC4_COMPOUND 1 +@@ -523,6 +552,7 @@ enum { + NFSPROC4_CLNT_GETACL, + NFSPROC4_CLNT_SETACL, + NFSPROC4_CLNT_FS_LOCATIONS, ++ NFSPROC4_CLNT_RELEASE_LOCKOWNER, + + /* nfs41 */ + NFSPROC4_CLNT_EXCHANGE_ID, +@@ -531,6 +561,13 @@ enum { + 
NFSPROC4_CLNT_SEQUENCE, + NFSPROC4_CLNT_GET_LEASE_TIME, + NFSPROC4_CLNT_RECLAIM_COMPLETE, ++ NFSPROC4_CLNT_PNFS_LAYOUTGET, ++ NFSPROC4_CLNT_PNFS_LAYOUTCOMMIT, ++ NFSPROC4_CLNT_PNFS_LAYOUTRETURN, ++ NFSPROC4_CLNT_PNFS_GETDEVICELIST, ++ NFSPROC4_CLNT_PNFS_GETDEVICEINFO, ++ NFSPROC4_CLNT_PNFS_WRITE, ++ NFSPROC4_CLNT_PNFS_COMMIT, + }; + + /* nfs41 types */ +@@ -549,6 +586,43 @@ enum state_protect_how4 { + SP4_SSV = 2 + }; + ++enum pnfs_layouttype { ++ LAYOUT_NFSV4_1_FILES = 1, ++ LAYOUT_OSD2_OBJECTS = 2, ++ LAYOUT_BLOCK_VOLUME = 3, ++}; ++ ++/* used for both layout return and recall */ ++enum pnfs_layoutreturn_type { ++ RETURN_FILE = 1, ++ RETURN_FSID = 2, ++ RETURN_ALL = 3 ++}; ++ ++enum pnfs_iomode { ++ IOMODE_READ = 1, ++ IOMODE_RW = 2, ++ IOMODE_ANY = 3, ++}; ++ ++enum pnfs_notify_deviceid_type4 { ++ NOTIFY_DEVICEID4_CHANGE = 1 << 1, ++ NOTIFY_DEVICEID4_DELETE = 1 << 2, ++}; ++ ++#define NFL4_UFLG_MASK 0x0000003F ++#define NFL4_UFLG_DENSE 0x00000001 ++#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 ++#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0 ++ ++/* Encoded in the loh_body field of type layouthint4 */ ++enum filelayout_hint_care4 { ++ NFLH4_CARE_DENSE = NFL4_UFLG_DENSE, ++ NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS, ++ NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040, ++ NFLH4_CARE_STRIPE_COUNT = 0x00000080 ++}; ++ + #endif + #endif + +diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-23 12:09:03.372501550 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-23 12:09:03.372501550 -0400 +@@ -0,0 +1,330 @@ ++/* ++ * include/linux/nfs4_pnfs.h ++ * ++ * Common data structures needed by the pnfs client and pnfs layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef LINUX_NFS4_PNFS_H ++#define LINUX_NFS4_PNFS_H ++ ++#include ++#include ++ ++enum pnfs_try_status { ++ PNFS_ATTEMPTED = 0, ++ PNFS_NOT_ATTEMPTED = 1, ++}; ++ ++#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 ++ ++/* Per-layout driver specific registration structure */ ++struct pnfs_layoutdriver_type { ++ const u32 id; ++ const char *name; ++ struct layoutdriver_io_operations *ld_io_ops; ++ struct layoutdriver_policy_operations *ld_policy_ops; ++}; ++ ++struct pnfs_fsdata { ++ int bypass_eof; ++ struct pnfs_layout_segment *lseg; ++ void *private; ++}; ++ ++#if defined(CONFIG_NFS_V4_1) ++ ++static inline struct nfs_inode * ++PNFS_NFS_INODE(struct pnfs_layout_type *lo) ++{ ++ return NFS_I(lo->lo_inode); ++} ++ ++static inline struct inode * ++PNFS_INODE(struct pnfs_layout_type *lo) ++{ ++ return lo->lo_inode; ++} ++ ++static inline struct nfs_server * ++PNFS_NFS_SERVER(struct pnfs_layout_type *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo)); ++} ++ ++static inline struct pnfs_layoutdriver_type * ++PNFS_LD(struct pnfs_layout_type *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo))->pnfs_curr_ld; ++} ++ ++static inline struct layoutdriver_io_operations * ++PNFS_LD_IO_OPS(struct pnfs_layout_type *lo) ++{ ++ return PNFS_LD(lo)->ld_io_ops; ++} ++ ++static inline struct layoutdriver_policy_operations * ++PNFS_LD_POLICY_OPS(struct pnfs_layout_type *lo) ++{ ++ return PNFS_LD(lo)->ld_policy_ops; ++} ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout != NULL; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return has_layout(nfsi) && ++ test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->pnfs_layout_state); ++} ++ ++extern void put_lseg(struct pnfs_layout_segment *lseg); ++extern void get_lseg(struct pnfs_layout_segment *lseg); ++ ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return false; ++} ++ ++static inline bool 
++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++struct pnfs_layout_segment { ++ struct list_head fi_list; ++ struct nfs4_pnfs_layout_segment range; ++ struct kref kref; ++ bool valid; ++ struct pnfs_layout_type *layout; ++ struct nfs4_deviceid *deviceid; ++ u8 ld_data[]; /* layout driver private data */ ++}; ++ ++static inline void * ++LSEG_LD_DATA(struct pnfs_layout_segment *lseg) ++{ ++ return lseg->ld_data; ++} ++ ++/* Layout driver I/O operations. ++ * Either the pagecache or non-pagecache read/write operations must be implemented ++ */ ++struct layoutdriver_io_operations { ++ /* Functions that use the pagecache. ++ * If use_pagecache == 1, then these functions must be implemented. ++ */ ++ /* read and write pagelist should return just 0 (to indicate that ++ * the layout code has taken control) or 1 (to indicate that the ++ * layout code wishes to fall back to normal nfs.) If 0 is returned, ++ * information can be passed back through nfs_data->res and ++ * nfs_data->task.tk_status, and the appropriate pnfs done function ++ * MUST be called. ++ */ ++ enum pnfs_try_status ++ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages); ++ enum pnfs_try_status ++ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how); ++ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page, ++ loff_t pos, unsigned count, ++ struct pnfs_fsdata *fsdata); ++ int (*write_end)(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ void (*write_end_cleanup)(struct file *filp, ++ struct pnfs_fsdata *fsdata); ++ ++ /* Consistency ops */ ++ /* 2 problems: ++ * 1) the page list contains nfs_pages, NOT pages ++ * 2) currently the NFS code doesn't create a page array (as it does with read/write) ++ */ ++ enum pnfs_try_status ++ (*commit) (struct nfs_write_data *nfs_data, int how); ++ ++ /* Layout information. 
For each inode, alloc_layout is executed once to retrieve an ++ * inode specific layout structure. Each subsequent layoutget operation results in ++ * a set_layout call to set the opaque layout in the layout driver.*/ ++ struct pnfs_layout_type * (*alloc_layout) (struct inode *inode); ++ void (*free_layout) (struct pnfs_layout_type *); ++ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_type *layoutid, struct nfs4_pnfs_layoutget_res *lgr); ++ void (*free_lseg) (struct pnfs_layout_segment *lseg); ++ ++ int (*setup_layoutcommit) (struct pnfs_layout_type *layoutid, ++ struct pnfs_layoutcommit_arg *args); ++ ++ void (*encode_layoutcommit) (struct pnfs_layout_type *layoutid, ++ struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *args); ++ void (*cleanup_layoutcommit) (struct pnfs_layout_type *layoutid, ++ struct pnfs_layoutcommit_arg *args, ++ int status); ++ void (*encode_layoutreturn) (struct pnfs_layout_type *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_pnfs_layoutreturn_arg *args); ++ ++ /* Registration information for a new mounted file system ++ */ ++ int (*initialize_mountpoint) (struct nfs_server *, ++ const struct nfs_fh * mntfh); ++ int (*uninitialize_mountpoint) (struct nfs_server *server); ++}; ++ ++enum layoutdriver_policy_flags { ++ /* Should the full nfs rpc cleanup code be used after io */ ++ PNFS_USE_RPC_CODE = 1 << 0, ++ ++ /* Should the NFS req. gather algorithm cross stripe boundaries? */ ++ PNFS_GATHER_ACROSS_STRIPES = 1 << 1, ++ ++ /* Should the pNFS client commit and return the layout upon a setattr */ ++ PNFS_LAYOUTRET_ON_SETATTR = 1 << 3, ++}; ++ ++struct layoutdriver_policy_operations { ++ unsigned flags; ++ ++ /* The stripe size of the file system */ ++ ssize_t (*get_stripesize) (struct pnfs_layout_type *layoutid); ++ ++ /* test for nfs page cache coalescing */ ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++ ++ /* Retreive the block size of the file system. 
++ * If gather_across_stripes == 1, then the file system will gather ++ * requests into the block size. ++ * TODO: Where will the layout driver get this info? It is hard ++ * coded in PVFS2. ++ */ ++ ssize_t (*get_blocksize) (void); ++}; ++ ++/* Should the full nfs rpc cleanup code be used after io */ ++static inline int ++pnfs_ld_use_rpc_code(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_USE_RPC_CODE; ++} ++ ++/* Should the NFS req. gather algorithm cross stripe boundaries? */ ++static inline int ++pnfs_ld_gather_across_stripes(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_GATHER_ACROSS_STRIPES; ++} ++ ++struct pnfs_device { ++ struct pnfs_deviceid dev_id; ++ unsigned int layout_type; ++ unsigned int mincount; ++ struct page **pages; ++ void *area; ++ unsigned int pgbase; ++ unsigned int pglen; ++ unsigned int dev_notify_types; ++}; ++ ++struct pnfs_devicelist { ++ unsigned int eof; ++ unsigned int num_devs; ++ struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; ++}; ++ ++/* ++ * Device ID RCU cache. A device ID is unique per client ID and layout type. 
++ */ ++#define NFS4_DEVICE_ID_HASH_BITS 5 ++#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) ++#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) ++ ++static inline u32 ++nfs4_deviceid_hash(struct pnfs_deviceid *id) ++{ ++ unsigned char *cptr = (unsigned char *)id->data; ++ unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE; ++ u32 x = 0; ++ ++ while (nbytes--) { ++ x *= 37; ++ x += *cptr++; ++ } ++ return x & NFS4_DEVICE_ID_HASH_MASK; ++} ++ ++struct nfs4_deviceid_cache { ++ spinlock_t dc_lock; ++ struct kref dc_kref; ++ void (*dc_free_callback)(struct kref *); ++ struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; ++}; ++ ++/* Device ID cache node */ ++struct nfs4_deviceid { ++ struct hlist_node de_node; ++ struct pnfs_deviceid de_id; ++ struct kref de_kref; ++}; ++ ++extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_put_deviceid_cache(struct nfs_client *); ++extern void nfs4_init_deviceid_node(struct nfs4_deviceid *); ++extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid_cache *, ++ struct nfs4_deviceid *); ++extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *); ++extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_delete_device(struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++ ++/* pNFS client callback functions. ++ * These operations allow the layout driver to access pNFS client ++ * specific information or call pNFS client->server operations. 
++ * E.g., getdeviceinfo, I/O callbacks, etc ++ */ ++struct pnfs_client_operations { ++ int (*nfs_getdevicelist) (struct nfs_server *, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++ int (*nfs_getdeviceinfo) (struct nfs_server *, ++ struct pnfs_device *dev); ++ ++ /* Post read callback. */ ++ void (*nfs_readlist_complete) (struct nfs_read_data *nfs_data); ++ ++ /* Post write callback. */ ++ void (*nfs_writelist_complete) (struct nfs_write_data *nfs_data); ++ ++ /* Post commit callback. */ ++ void (*nfs_commit_complete) (struct nfs_write_data *nfs_data); ++ void (*nfs_return_layout) (struct inode *); ++}; ++ ++extern struct pnfs_client_operations pnfs_ops; ++ ++extern struct pnfs_client_operations *pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); ++extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); ++ ++#define NFS4_PNFS_MAX_LAYOUTS 4 ++#define NFS4_PNFS_PRIVATE_LAYOUT 0x80000000 ++ ++#endif /* LINUX_NFS4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-23 12:09:03.373491892 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-23 12:09:03.374491393 -0400 +@@ -0,0 +1,101 @@ ++#ifndef NFSD4_BLOCK ++#define NFSD4_BLOCK ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_BLOCK_SUCCESS 1 ++#define PNFS_BLOCK_FAILURE 0 ++ ++#define PNFS_BLOCK_CTL_START 1 ++#define PNFS_BLOCK_CTL_STOP 2 ++#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current ++ * version from kernel via an upcall. 
++ */ ++ ++#define PNFS_UPCALL_MSG_STOP 0 ++#define PNFS_UPCALL_MSG_GETSIG 1 ++#define PNFS_UPCALL_MSG_GETSLICE 2 ++#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume ++#define PNFS_UPCALL_MSG_DMGET 4 ++#define PNFS_UPCALL_MSG_VERS 5 ++ ++#define PNFS_UPCALL_VERS 8 ++ ++typedef struct stripe_dev { ++ int major, ++ minor, ++ offset; ++} stripe_dev_t; ++ ++typedef struct bl_comm_res { ++ int res_status; ++ union { ++ struct { ++ long long start, ++ length; ++ } slice; ++ struct { ++ int num_stripes, ++ stripe_size; ++ stripe_dev_t devs[]; ++ } stripe; ++ struct { ++ long long sector; ++ int offset, ++ len; ++ char sig[]; ++ } sig; ++ int vers, ++ dm_vol; ++ } u; ++} bl_comm_res_t; ++ ++typedef struct bl_comm_msg { ++ int msg_type, ++ msg_status; ++ union { ++ dev_t msg_dev; ++ int msg_vers; ++ } u; ++ bl_comm_res_t *msg_res; ++} bl_comm_msg_t; ++ ++#ifdef __KERNEL__ ++ ++typedef struct bl_comm { ++ /* ---- protects access to this structure ---- */ ++ struct mutex lock; ++ /* ---- protects access to rpc pipe ---- */ ++ struct mutex pipe_lock; ++ struct dentry *pipe_dentry; ++ wait_queue_head_t pipe_wq; ++ bl_comm_msg_t msg; ++} bl_comm_t; ++ ++int pnfs_block_enabled(struct inode *, int); ++int bl_layout_type(struct super_block *sb); ++int bl_getdeviceiter(struct super_block *, u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int bl_layoutcommit(struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++int bl_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len); ++int bl_init_proc(void); ++int bl_upcall(bl_comm_t *, 
bl_comm_msg_t *, bl_comm_res_t **); ++ ++extern bl_comm_t *bl_comm_global; // Ugly... ++#endif /* __KERNEL__ */ ++ ++#endif /* NFSD4_BLOCK */ ++ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-23 12:09:03.375501481 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-23 12:09:03.375501481 -0400 +@@ -0,0 +1,345 @@ ++/* ++ * include/linux/nfsd4_spnfs.h ++ * ++ * spNFS - simple pNFS implementation with userspace daemon ++ * ++ */ ++ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++******************************************************************************/ ++ ++#ifndef NFS_SPNFS_H ++#define NFS_SPNFS_H ++ ++ ++#ifdef __KERNEL__ ++#include "exportfs.h" ++#include "sunrpc/svc.h" ++#include "nfsd/nfsfh.h" ++#else ++#include ++#endif /* __KERNEL__ */ ++ ++#define SPNFS_STATUS_INVALIDMSG 0x01 ++#define SPNFS_STATUS_AGAIN 0x02 ++#define SPNFS_STATUS_FAIL 0x04 ++#define SPNFS_STATUS_SUCCESS 0x08 ++ ++#define SPNFS_TYPE_LAYOUTGET 0x01 ++#define SPNFS_TYPE_LAYOUTCOMMIT 0x02 ++#define SPNFS_TYPE_LAYOUTRETURN 0x03 ++#define SPNFS_TYPE_GETDEVICEITER 0x04 ++#define SPNFS_TYPE_GETDEVICEINFO 0x05 ++#define SPNFS_TYPE_SETATTR 0x06 ++#define SPNFS_TYPE_OPEN 0x07 ++#define SPNFS_TYPE_CLOSE 0x08 ++#define SPNFS_TYPE_CREATE 0x09 ++#define SPNFS_TYPE_REMOVE 0x0a ++#define SPNFS_TYPE_COMMIT 0x0b ++#define SPNFS_TYPE_READ 0x0c ++#define SPNFS_TYPE_WRITE 0x0d ++ ++#define SPNFS_MAX_DEVICES 1 ++#define SPNFS_MAX_DATA_SERVERS 16 ++#define SPNFS_MAX_IO 512 ++ ++/* layout */ ++struct spnfs_msg_layoutget_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_filelayout_list { ++ u_int32_t fh_len; ++ unsigned char fh_val[128]; /* DMXXX fix this const */ ++}; ++ ++struct spnfs_msg_layoutget_res { ++ int status; ++ u_int64_t devid; ++ u_int64_t stripe_size; ++ u_int32_t stripe_type; ++ u_int32_t stripe_count; ++ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++/* layoutcommit */ ++struct spnfs_msg_layoutcommit_args { ++ unsigned long inode; ++ unsigned long generation; ++ u_int64_t file_size; ++}; ++ ++struct spnfs_msg_layoutcommit_res { ++ int status; ++}; ++ ++/* layoutreturn */ ++/* No op for the daemon */ ++/* ++struct spnfs_msg_layoutreturn_args { ++}; ++ ++struct spnfs_msg_layoutreturn_res { ++}; ++*/ ++ ++/* getdeviceiter */ ++struct spnfs_msg_getdeviceiter_args { ++ unsigned long inode; ++ u_int64_t cookie; ++ u_int64_t verf; ++}; ++ ++struct spnfs_msg_getdeviceiter_res { ++ int status; ++ u_int64_t devid; ++ 
u_int64_t cookie; ++ u_int64_t verf; ++ u_int32_t eof; ++}; ++ ++/* getdeviceinfo */ ++struct spnfs_data_server { ++ u_int32_t dsid; ++ char netid[5]; ++ char addr[29]; ++}; ++ ++struct spnfs_device { ++ u_int64_t devid; ++ int dscount; ++ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++struct spnfs_msg_getdeviceinfo_args { ++ u_int64_t devid; ++}; ++ ++struct spnfs_msg_getdeviceinfo_res { ++ int status; ++ struct spnfs_device devinfo; ++}; ++ ++/* setattr */ ++struct spnfs_msg_setattr_args { ++ unsigned long inode; ++ unsigned long generation; ++ int file_size; ++}; ++ ++struct spnfs_msg_setattr_res { ++ int status; ++}; ++ ++/* open */ ++struct spnfs_msg_open_args { ++ unsigned long inode; ++ unsigned long generation; ++ int create; ++ int createmode; ++ int truncate; ++}; ++ ++struct spnfs_msg_open_res { ++ int status; ++}; ++ ++/* close */ ++/* No op for daemon */ ++struct spnfs_msg_close_args { ++ int x; ++}; ++ ++struct spnfs_msg_close_res { ++ int y; ++}; ++ ++/* create */ ++/* ++struct spnfs_msg_create_args { ++ int x; ++}; ++ ++struct spnfs_msg_create_res { ++ int y; ++}; ++*/ ++ ++/* remove */ ++struct spnfs_msg_remove_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_msg_remove_res { ++ int status; ++}; ++ ++/* commit */ ++/* ++struct spnfs_msg_commit_args { ++ int x; ++}; ++ ++struct spnfs_msg_commit_res { ++ int y; ++}; ++*/ ++ ++/* read */ ++struct spnfs_msg_read_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++}; ++ ++struct spnfs_msg_read_res { ++ int status; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++/* write */ ++struct spnfs_msg_write_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++struct spnfs_msg_write_res { ++ int status; ++}; ++ ++/* bundle args and responses */ ++union spnfs_msg_args { ++ struct spnfs_msg_layoutget_args layoutget_args; ++ struct 
spnfs_msg_layoutcommit_args layoutcommit_args; ++/* ++ struct spnfs_msg_layoutreturn_args layoutreturn_args; ++*/ ++ struct spnfs_msg_getdeviceiter_args getdeviceiter_args; ++ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args; ++ struct spnfs_msg_setattr_args setattr_args; ++ struct spnfs_msg_open_args open_args; ++ struct spnfs_msg_close_args close_args; ++/* ++ struct spnfs_msg_create_args create_args; ++*/ ++ struct spnfs_msg_remove_args remove_args; ++/* ++ struct spnfs_msg_commit_args commit_args; ++*/ ++ struct spnfs_msg_read_args read_args; ++ struct spnfs_msg_write_args write_args; ++}; ++ ++union spnfs_msg_res { ++ struct spnfs_msg_layoutget_res layoutget_res; ++ struct spnfs_msg_layoutcommit_res layoutcommit_res; ++/* ++ struct spnfs_msg_layoutreturn_res layoutreturn_res; ++*/ ++ struct spnfs_msg_getdeviceiter_res getdeviceiter_res; ++ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res; ++ struct spnfs_msg_setattr_res setattr_res; ++ struct spnfs_msg_open_res open_res; ++ struct spnfs_msg_close_res close_res; ++/* ++ struct spnfs_msg_create_res create_res; ++*/ ++ struct spnfs_msg_remove_res remove_res; ++/* ++ struct spnfs_msg_commit_res commit_res; ++*/ ++ struct spnfs_msg_read_res read_res; ++ struct spnfs_msg_write_res write_res; ++}; ++ ++/* a spnfs message, args and response */ ++struct spnfs_msg { ++ unsigned char im_type; ++ unsigned char im_status; ++ union spnfs_msg_args im_args; ++ union spnfs_msg_res im_res; ++}; ++ ++/* spnfs configuration info */ ++struct spnfs_config { ++ unsigned char dense_striping; ++ int stripe_size; ++ int num_ds; ++ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */ ++}; ++ ++#if defined(__KERNEL__) && defined(CONFIG_SPNFS) ++ ++#include ++ ++/* pipe mgmt structure. 
messages flow through here */ ++struct spnfs { ++ struct dentry *spnfs_dentry; /* dentry for pipe */ ++ wait_queue_head_t spnfs_wq; ++ struct spnfs_msg spnfs_im; /* spnfs message */ ++ struct mutex spnfs_lock; /* Serializes upcalls */ ++ struct mutex spnfs_plock; ++}; ++ ++struct nfsd4_open; ++ ++int spnfs_layout_type(struct super_block *); ++enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int spnfs_layoutcommit(void); ++int spnfs_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int spnfs_getdeviceiter(struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++int spnfs_setattr(void); ++int spnfs_open(struct inode *, struct nfsd4_open *); ++int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *); ++int spnfs_remove(unsigned long, unsigned long); ++__be32 spnfs_read(struct inode *, loff_t, unsigned long *, ++ int, struct svc_rqst *); ++__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *); ++int spnfs_getfh(int, struct nfs_fh *); ++int spnfs_test_layoutrecall(char *, u64, u64); ++int spnfs_layoutrecall(struct inode *, int, u64, u64); ++ ++int nfsd_spnfs_new(void); ++void nfsd_spnfs_delete(void); ++int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *); ++int spnfs_enabled(void); ++int spnfs_init_proc(void); ++ ++extern struct spnfs_config *spnfs_config; ++ ++#endif /* __KERNEL__ && CONFIG_SPNFS */ ++ ++#endif /* NFS_SPNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h +--- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-23 12:09:03.376401789 
-0400 +@@ -29,6 +29,7 @@ + #ifdef __KERNEL__ + + #include ++#include + + /* + * Largest number of bytes we need to allocate for an NFS +diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h +--- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-23 12:09:03.376401789 -0400 +@@ -32,6 +32,8 @@ + #define NFSDDBG_REPCACHE 0x0080 + #define NFSDDBG_XDR 0x0100 + #define NFSDDBG_LOCKD 0x0200 ++#define NFSDDBG_PNFS 0x0400 ++#define NFSDDBG_FILELAYOUT 0x0800 + #define NFSDDBG_ALL 0x7FFF + #define NFSDDBG_NOCHANGE 0xFFFF + +diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h +--- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-23 12:09:03.377481954 -0400 +@@ -100,6 +100,7 @@ struct svc_export { + uid_t ex_anon_uid; + gid_t ex_anon_gid; + int ex_fsid; ++ int ex_pnfs; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + int ex_nflavors; +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-23 12:09:03.377481954 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-23 12:09:03.378501747 -0400 +@@ -0,0 +1,132 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef NFSD_NFS4LAYOUTXDR_H ++#define NFSD_NFS4LAYOUTXDR_H ++ ++#include ++#include ++ ++/* the nfsd4_pnfs_devlist dev_addr for the file layout type */ ++struct pnfs_filelayout_devaddr { ++ struct xdr_netobj r_netid; ++ struct xdr_netobj r_addr; ++}; ++ ++/* list of multipath servers */ ++struct pnfs_filelayout_multipath { ++ u32 fl_multipath_length; ++ struct pnfs_filelayout_devaddr *fl_multipath_list; ++}; ++ ++struct pnfs_filelayout_device { ++ u32 fl_stripeindices_length; ++ u32 *fl_stripeindices_list; ++ u32 fl_device_length; ++ struct pnfs_filelayout_multipath *fl_device_list; ++}; ++ ++struct pnfs_filelayout_layout { ++ u32 lg_layout_type; /* response */ ++ u32 lg_stripe_type; /* response */ ++ u32 lg_commit_through_mds; /* response */ ++ u64 lg_stripe_unit; /* response */ ++ u64 lg_pattern_offset; /* response */ ++ u32 lg_first_stripe_index; /* response */ ++ struct nfsd4_pnfs_deviceid device_id; /* response */ ++ u32 lg_fh_length; /* response */ ++ struct knfsd_fh *lg_fh_list; /* response */ ++}; ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++enum pnfs_block_extent_state4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, ++ PNFS_BLOCK_NONE_DATA = 3 ++}; ++ ++enum pnfs_block_volume_type4 { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, ++ PNFS_BLOCK_VOLUME_SLICE = 1, ++ PNFS_BLOCK_VOLUME_CONCAT = 2, ++ PNFS_BLOCK_VOLUME_STRIPE = 3, ++}; ++typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4; ++ ++enum bl_cache_state { ++ BLOCK_LAYOUT_NEW = 0, ++ BLOCK_LAYOUT_CACHE = 1, ++ BLOCK_LAYOUT_UPDATE = 2, ++}; ++ ++typedef struct pnfs_blocklayout_layout { ++ struct list_head bll_list; ++ struct nfsd4_pnfs_deviceid bll_vol_id; ++ u64 bll_foff; // file offset ++ u64 bll_len; ++ u64 bll_soff; // storage offset ++ int bll_recalled; ++ enum pnfs_block_extent_state4 bll_es; ++ enum bl_cache_state bll_cache_state; ++} pnfs_blocklayout_layout_t; ++ ++typedef struct 
pnfs_blocklayout_devinfo { ++ struct list_head bld_list; ++ pnfs_block_volume_type4 bld_type; ++ struct nfsd4_pnfs_deviceid bld_devid; ++ int bld_index_loc; ++ union { ++ struct { ++ u64 bld_offset; ++ u32 bld_sig_len, ++ *bld_sig; ++ } simple; ++ struct { ++ u64 bld_start, ++ bld_len; ++ u32 bld_index; /* Index of Simple Volume */ ++ } slice; ++ struct { ++ u32 bld_stripes; ++ u64 bld_chunk_size; ++ u32 *bld_stripe_indexs; ++ } stripe; ++ } u; ++} pnfs_blocklayout_devinfo_t; ++ ++#endif /* NFSD_NFS4LAYOUTXDR_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-23 12:09:03.378501747 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-23 12:09:03.378501747 -0400 +@@ -0,0 +1,54 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++#include ++ ++/* ++ * Length of comma separated pnfs data server IPv4 addresses. Enough room for ++ * 32 addresses. ++ */ ++#define NFSD_DLM_DS_LIST_MAX 512 ++/* ++ * Length of colon separated pnfs dlm device of the form ++ * disk_name:comma separated data server IPv4 address ++ */ ++#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1) ++ ++#ifdef CONFIG_PNFSD ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++extern const struct pnfs_export_operations pnfs_dlm_export_ops; ++ ++int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len); ++ ++void nfsd4_pnfs_dlm_shutdown(void); ++ ++ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen); ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-23 12:09:03.379487099 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-23 12:09:03.379487099 -0400 +@@ -0,0 +1,271 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef _LINUX_NFSD_NFSD4_PNFS_H ++#define _LINUX_NFSD_NFSD4_PNFS_H ++ ++#include ++#include ++#include ++ ++struct nfsd4_pnfs_deviceid { ++ u64 sbid; /* per-superblock unique ID */ ++ u64 devid; /* filesystem-wide unique device ID */ ++}; ++ ++struct nfsd4_pnfs_dev_iter_res { ++ u64 gd_cookie; /* request/response */ ++ u64 gd_verf; /* request/response */ ++ u64 gd_devid; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++/* Arguments for set_device_notify */ ++struct pnfs_devnotify_arg { ++ struct nfsd4_pnfs_deviceid dn_devid; /* request */ ++ u32 dn_layout_type; /* request */ ++ u32 dn_notify_types; /* request/response */ ++}; ++ ++struct nfsd4_layout_seg { ++ u64 clientid; ++ u32 layout_type; ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++/* Used by layout_get to encode layout (loc_body var in spec) ++ * Args: ++ * minlength - min number of accessible bytes given by layout ++ * fsid - Major part of struct pnfs_deviceid. File system uses this ++ * to build the deviceid returned in the layout.
++ * fh - fs can modify the file handle for use on data servers ++ * seg - layout info requested and layout info returned ++ * xdr - xdr info ++ * return_on_close - true if layout to be returned on file close ++ */ ++ ++struct nfsd4_pnfs_layoutget_arg { ++ u64 lg_minlength; ++ u64 lg_sbid; ++ const struct knfsd_fh *lg_fh; ++}; ++ ++struct nfsd4_pnfs_layoutget_res { ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_return_on_close; ++}; ++ ++struct nfsd4_pnfs_layoutcommit_arg { ++ struct nfsd4_layout_seg lc_seg; /* request */ ++ u32 lc_reclaim; /* request */ ++ u32 lc_newoffset; /* request */ ++ u64 lc_last_wr; /* request */ ++ struct nfstime4 lc_mtime; /* request */ ++ u32 lc_up_len; /* layout length */ ++ void *lc_up_layout; /* decoded by callback */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit_res { ++ u32 lc_size_chg; /* boolean for response */ ++ u64 lc_newsize; /* response */ ++}; ++ ++#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */ ++ ++struct nfsd4_pnfs_layoutreturn_arg { ++ u32 lr_return_type; /* request */ ++ struct nfsd4_layout_seg lr_seg; /* request */ ++ u32 lr_reclaim; /* request */ ++ u32 lrf_body_len; /* request */ ++ void *lrf_body; /* request */ ++ void *lr_cookie; /* fs private */ ++}; ++ ++/* pNFS Metadata to Data server state communication */ ++struct pnfs_get_state { ++ u32 dsid; /* request */ ++ u64 ino; /* request */ ++ nfs4_stateid stid; /* request;response */ ++ nfs4_clientid clid; /* response */ ++ u32 access; /* response */ ++ u32 stid_gen; /* response */ ++ u32 verifier[2]; /* response */ ++}; ++ ++/* ++ * pNFS export operations vector. ++ * ++ * The filesystem must implement the following methods: ++ * layout_type ++ * get_device_info ++ * layout_get ++ * ++ * All other methods are optional and can be set to NULL if not implemented. ++ */ ++struct pnfs_export_operations { ++ /* Returns the supported pnfs_layouttype4.
*/ ++ int (*layout_type) (struct super_block *); ++ ++ /* Encode device info onto the xdr stream. */ ++ int (*get_device_info) (struct super_block *, ++ struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++ ++ /* Retrieve all available devices via an iterator. ++ * arg->cookie == 0 indicates the beginning of the list, ++ * otherwise arg->verf is used to verify that the list hasn't changed ++ * while retrieved. ++ * ++ * On output, the filesystem sets the devid based on the current cookie ++ * and sets res->cookie and res->verf corresponding to the next entry. ++ * When the last entry in the list is retrieved, res->eof is set to 1. ++ */ ++ int (*get_device_iter) (struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++ ++ int (*set_device_notify) (struct super_block *, ++ struct pnfs_devnotify_arg *); ++ ++ /* Retrieve and encode a layout for inode onto the xdr stream. ++ * arg->minlength is the minimum number of accessible bytes required ++ * by the client. ++ * The maximum number of bytes to encode the layout is given by ++ * the xdr stream end pointer. ++ * arg->fsid contains the major part of struct pnfs_deviceid. ++ * The file system uses this to build the deviceid returned ++ * in the layout. ++ * res->seg - layout segment requested and layout info returned. 
++ * res->fh - fs can modify the file handle for use on data servers ++ * res->return_on_close - true if layout to be returned on file close ++ * ++ * return one of the following nfs errors: ++ * NFS_OK Success ++ * NFS4ERR_ACCESS Permission error ++ * NFS4ERR_BADIOMODE Server does not support requested iomode ++ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules ++ * NFS4ERR_INVAL Parameter other than layout is invalid ++ * NFS4ERR_IO I/O error ++ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later ++ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file ++ * NFS4ERR_LOCKED Lock conflict ++ * NFS4ERR_NOSPC Out-of-space error occurred ++ * NFS4ERR_RECALLCONFLICT Layout currently unavailable due to ++ * a conflicting CB_LAYOUTRECALL ++ * NFS4ERR_SERVERFAULT Server went berserk ++ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout ++ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file) ++ */ ++ enum nfsstat4 (*layout_get) (struct inode *, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++ ++ /* Commit changes to layout */ ++ int (*layout_commit) (struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++ ++ /* Returns the layout */ ++ int (*layout_return) (struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++ ++ /* Can layout segments be merged for this layout type?
*/ ++ int (*can_merge_layouts) (u32 layout_type); ++ ++ /* pNFS Files layout specific operations */ ++ ++ /* Get the write verifier for DS (called on MDS only) */ ++ void (*get_verifier) (struct super_block *, u32 *p); ++ /* Call fs on DS only */ ++ int (*get_state) (struct inode *, struct knfsd_fh *, ++ struct pnfs_get_state *); ++}; ++ ++struct nfsd4_pnfs_cb_layout { ++ u32 cbl_recall_type; /* request */ ++ struct nfsd4_layout_seg cbl_seg; /* request */ ++ u32 cbl_layoutchanged; /* request */ ++ nfs4_stateid cbl_sid; /* request */ ++ struct nfs4_fsid cbl_fsid; ++ void *cbl_cookie; /* fs private */ ++}; ++ ++/* layoutrecall request (from exported filesystem) */ ++struct nfs4_layoutrecall { ++ struct kref clr_ref; ++ struct nfsd4_pnfs_cb_layout cb; /* request */ ++ struct list_head clr_perclnt; /* on cl_layoutrecalls */ ++ struct nfs4_client *clr_client; ++ struct nfs4_file *clr_file; ++ struct timespec clr_time; /* last activity */ ++ struct super_block *clr_sb; /* We might not have a file */ ++ struct nfs4_layoutrecall *parent; /* The initiating recall */ ++ ++ void *clr_args; /* nfsd internal */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_item { ++ u32 cbd_notify_type; /* request */ ++ u32 cbd_layout_type; /* request */ ++ struct nfsd4_pnfs_deviceid cbd_devid; /* request */ ++ u32 cbd_immediate; /* request */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_list { ++ u32 cbd_len; /* request */ ++ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */ ++}; ++ ++/* ++ * callbacks provided by the nfsd ++ */ ++struct pnfsd_cb_operations { ++ /* Generic callbacks */ ++ int (*cb_layout_recall) (struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++ int (*cb_device_notify) (struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++ ++ /* pNFS Files layout specific callbacks */ ++ ++ /* Callback from fs on MDS only */ ++ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *); ++ /* Callback from fs on DS only */ ++ int (*cb_change_state) (struct 
pnfs_get_state *); ++}; ++ ++#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h +--- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-23 12:09:03.380502500 -0400 +@@ -29,6 +29,7 @@ + /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ + #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ + #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ ++#define NFSCTL_FD2FH 9 /* get a fh from a fd */ + + /* SVC */ + struct nfsctl_svc { +@@ -71,6 +72,11 @@ struct nfsctl_fsparm { + int gd_maxlen; + }; + ++/* FD2FH */ ++struct nfsctl_fd2fh { ++ int fd; ++}; ++ + /* + * This is the argument union. + */ +@@ -82,6 +88,7 @@ struct nfsctl_arg { + struct nfsctl_export u_export; + struct nfsctl_fdparm u_getfd; + struct nfsctl_fsparm u_getfs; ++ struct nfsctl_fd2fh u_fd2fh; + /* + * The following dummy member is needed to preserve binary compatibility + * on platforms where alignof(void*)>alignof(int). 
It's needed because +@@ -95,6 +102,7 @@ struct nfsctl_arg { + #define ca_export u.u_export + #define ca_getfd u.u_getfd + #define ca_getfs u.u_getfs ++#define ca_fd2fh u.u_fd2fh + }; + + union nfsctl_res { +diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-23 12:08:29.061494081 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-23 12:09:03.381511751 -0400 +@@ -72,13 +72,20 @@ struct nfs_access_entry { + int mask; + }; + ++struct nfs_lock_context { ++ atomic_t count; ++ struct list_head list; ++ struct nfs_open_context *open_context; ++ fl_owner_t lockowner; ++ pid_t pid; ++}; ++ + struct nfs4_state; + struct nfs_open_context { +- atomic_t count; ++ struct nfs_lock_context lock_context; + struct path path; + struct rpc_cred *cred; + struct nfs4_state *state; +- fl_owner_t lockowner; + fmode_t mode; + + unsigned long flags; +@@ -97,6 +104,26 @@ struct nfs_delegation; + + struct posix_acl; + ++struct pnfs_layout_type { ++ int refcount; ++ struct list_head lo_layouts; /* other client layouts */ ++ struct list_head segs; /* layout segments list */ ++ int roc_iomode; /* iomode to return on close, 0=none */ ++ seqlock_t seqlock; /* Protects the stateid */ ++ nfs4_stateid stateid; ++ unsigned long pnfs_layout_state; ++ #define NFS_INO_RO_LAYOUT_FAILED 0 /* get ro layout failed stop trying */ ++ #define NFS_INO_RW_LAYOUT_FAILED 1 /* get rw layout failed stop trying */ ++ #define NFS_INO_LAYOUTCOMMIT 3 /* LAYOUTCOMMIT needed */ ++ struct rpc_cred *lo_cred; /* layoutcommit credential */ ++ /* DH: These vars keep track of the maximum write range ++ * so the values can be used for layoutcommit. 
++ */ ++ loff_t pnfs_write_begin_pos; ++ loff_t pnfs_write_end_pos; ++ struct inode *lo_inode; ++}; ++ + /* + * nfs fs inode data in memory + */ +@@ -181,6 +208,13 @@ struct nfs_inode { + struct nfs_delegation *delegation; + fmode_t delegation_state; + struct rw_semaphore rwsem; ++ ++ /* pNFS layout information */ ++#if defined(CONFIG_NFS_V4_1) ++ wait_queue_head_t lo_waitq; ++ struct pnfs_layout_type *layout; ++ time_t pnfs_layout_suspend; ++#endif /* CONFIG_NFS_V4_1 */ + #endif /* CONFIG_NFS_V4*/ + #ifdef CONFIG_NFS_FSCACHE + struct fscache_cookie *fscache; +@@ -353,6 +387,8 @@ extern void nfs_setattr_update_inode(str + extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); + extern void put_nfs_open_context(struct nfs_open_context *ctx); + extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); ++extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); ++extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); + extern u64 nfs_compat_user_ino64(u64 fileid); + extern void nfs_fattr_init(struct nfs_fattr *fattr); + +@@ -481,8 +517,12 @@ extern void nfs_unblock_sillyrename(stru + extern int nfs_congestion_kb; + extern int nfs_writepage(struct page *page, struct writeback_control *wbc); + extern int nfs_writepages(struct address_space *, struct writeback_control *); +-extern int nfs_flush_incompatible(struct file *file, struct page *page); +-extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); ++struct pnfs_layout_segment; ++extern int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg); ++extern int nfs_updatepage(struct file *, struct page *, ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata); + extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); + + /* +@@ -604,6 +644,8 @@ extern void * 
nfs_root_data(void); + #define NFSDBG_CLIENT 0x0200 + #define NFSDBG_MOUNT 0x0400 + #define NFSDBG_FSCACHE 0x0800 ++#define NFSDBG_PNFS 0x1000 ++#define NFSDBG_PNFS_LD 0x2000 + #define NFSDBG_ALL 0xFFFF + + #ifdef __KERNEL__ +diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-23 12:08:29.062501618 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-23 12:09:03.383491395 -0400 +@@ -15,6 +15,7 @@ struct nlm_host; + struct nfs4_sequence_args; + struct nfs4_sequence_res; + struct nfs_server; ++struct nfs4_minor_version_ops; + + /* + * The nfs_client identifies our client state to the server. +@@ -70,11 +71,7 @@ struct nfs_client { + */ + char cl_ipaddr[48]; + unsigned char cl_id_uniquifier; +- int (* cl_call_sync)(struct nfs_server *server, +- struct rpc_message *msg, +- struct nfs4_sequence_args *args, +- struct nfs4_sequence_res *res, +- int cache_reply); ++ const struct nfs4_minor_version_ops *cl_mvops; + #endif /* CONFIG_NFS_V4 */ + + #ifdef CONFIG_NFS_V4_1 +@@ -85,6 +82,8 @@ struct nfs_client { + /* The flags used for obtaining the clientid during EXCHANGE_ID */ + u32 cl_exchange_flags; + struct nfs4_session *cl_session; /* sharred session */ ++ struct list_head cl_layouts; ++ struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ + #endif /* CONFIG_NFS_V4_1 */ + + #ifdef CONFIG_NFS_FSCACHE +@@ -92,6 +91,16 @@ struct nfs_client { + #endif + }; + ++static inline bool ++is_ds_only_client(struct nfs_client *clp) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ return is_ds_only_session(clp->cl_exchange_flags); ++#else ++ return false; ++#endif ++} ++ + /* + * NFS client parameters stored in the superblock. 
+ */ +@@ -136,7 +145,7 @@ struct nfs_server { + #endif + + #ifdef CONFIG_NFS_V4 +- u32 attr_bitmask[2];/* V4 bitmask representing the set ++ u32 attr_bitmask[3];/* V4 bitmask representing the set + of attributes supported on this + filesystem */ + u32 cache_consistency_bitmask[2]; +@@ -148,6 +157,15 @@ struct nfs_server { + that are supported on this + filesystem */ + #endif ++ ++#ifdef CONFIG_NFS_V4_1 ++ u32 pnfs_blksize; /* layout_blksize attr */ ++ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ ++ void *pnfs_ld_data; /* Per-mount data */ ++ unsigned int ds_rsize; /* Data server read size */ ++ unsigned int ds_wsize; /* Data server write size */ ++#endif /* CONFIG_NFS_V4_1 */ ++ + void (*destroy)(struct nfs_server *); + + atomic_t active; /* Keep trace of any activity to this server */ +diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h +--- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-23 12:09:03.384501540 -0400 +@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, ++ NFSIOS_PNFS_READ, ++ NFSIOS_PNFS_WRITE, ++ NFSIOS_PNFS_COMMIT, + __NFSIOS_COUNTSMAX, + }; + +diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h +--- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-23 12:09:03.385491518 -0400 +@@ -39,6 +39,7 @@ struct nfs_page { + struct list_head wb_list; /* Defines state of page: */ + struct page *wb_page; /* page to read in/write out */ + struct nfs_open_context *wb_context; /* File state context info */ ++ struct nfs_lock_context *wb_lock_context; /* lock context info */ + atomic_t wb_complete; /* i/os we're waiting for */ + pgoff_t wb_index; /* Offset >> PAGE_CACHE_SHIFT */ + 
unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ +@@ -47,6 +48,7 @@ struct nfs_page { + struct kref wb_kref; /* reference count */ + unsigned long wb_flags; + struct nfs_writeverf wb_verf; /* Commit cookie */ ++ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */ + }; + + struct nfs_pageio_descriptor { +@@ -60,6 +62,12 @@ struct nfs_pageio_descriptor { + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); + int pg_ioflags; + int pg_error; ++ struct pnfs_layout_segment *pg_lseg; ++#ifdef CONFIG_NFS_V4_1 ++ int pg_iswrite; ++ int pg_boundary; ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++#endif /* CONFIG_NFS_V4_1 */ + }; + + #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) +@@ -68,13 +76,15 @@ extern struct nfs_page *nfs_create_reque + struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int count); ++ unsigned int count, ++ struct pnfs_layout_segment *lseg); + extern void nfs_clear_request(struct nfs_page *req); + extern void nfs_release_request(struct nfs_page *req); + + + extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, +- pgoff_t idx_start, unsigned int npages, int tag); ++ pgoff_t idx_start, unsigned int npages, int tag, ++ int *use_pnfs); + extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), +diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-23 12:08:29.062501618 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-23 12:09:03.387491422 -0400 +@@ -3,6 +3,8 @@ + + #include + #include ++#include ++#include + + /* + * To change the maximum rsize and wsize supported by the NFS client, adjust +@@ -10,7 +12,7 @@ + * support a megabyte or more. 
The default is left at 4096 bytes, which is + * reasonable for NFS over UDP. + */ +-#define NFS_MAX_FILE_IO_SIZE (1048576U) ++#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U) + #define NFS_DEF_FILE_IO_SIZE (4096U) + #define NFS_MIN_FILE_IO_SIZE (1024U) + +@@ -113,6 +115,10 @@ struct nfs_fsinfo { + __u32 dtpref; /* pref. readdir transfer size */ + __u64 maxfilesize; + __u32 lease_time; /* in seconds */ ++#if defined(CONFIG_NFS_V4_1) ++ __u32 layouttype; /* supported pnfs layout driver */ ++ __u32 blksize; /* preferred pnfs io block size */ ++#endif + }; + + struct nfs_fsstat { +@@ -196,8 +202,10 @@ struct nfs_openargs { + __u64 clientid; + __u64 id; + union { +- struct iattr * attrs; /* UNCHECKED, GUARDED */ +- nfs4_verifier verifier; /* EXCLUSIVE */ ++ struct { ++ struct iattr * attrs; /* UNCHECKED, GUARDED */ ++ nfs4_verifier verifier; /* EXCLUSIVE */ ++ }; + nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ + fmode_t delegation_type; /* CLAIM_PREVIOUS */ + } u; +@@ -313,6 +321,10 @@ struct nfs_lockt_res { + struct nfs4_sequence_res seq_res; + }; + ++struct nfs_release_lockowner_args { ++ struct nfs_lowner lock_owner; ++}; ++ + struct nfs4_delegreturnargs { + const struct nfs_fh *fhandle; + const nfs4_stateid *stateid; +@@ -332,6 +344,7 @@ struct nfs4_delegreturnres { + struct nfs_readargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + unsigned int pgbase; +@@ -352,6 +365,7 @@ struct nfs_readres { + struct nfs_writeargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + enum nfs3_stable_how stable; +@@ -846,7 +860,7 @@ struct nfs4_server_caps_arg { + }; + + struct nfs4_server_caps_res { +- u32 attr_bitmask[2]; ++ u32 attr_bitmask[3]; + u32 acl_bitmask; + u32 has_links; + u32 has_symlinks; +@@ -961,6 +975,27 @@ struct nfs_page; + + #define NFS_PAGEVEC_SIZE (8U) + ++#if defined(CONFIG_NFS_V4_1) ++/* 
pnfsflag values */ ++#define PNFS_NO_RPC 0x0001 /* non rpc result callback switch */ ++ ++/* pnfs-specific data needed for read, write, and commit calls */ ++struct pnfs_call_data { ++ struct pnfs_layout_segment *lseg; ++ const struct rpc_call_ops *call_ops; ++ u32 orig_count; /* for retry via MDS */ ++ int pnfs_error; ++ u8 pnfsflags; ++ u8 how; /* for FLUSH_STABLE */ ++}; ++ ++/* files layout-type specific data for read, write, and commit */ ++struct pnfs_fl_call_data { ++ struct nfs_client *ds_nfs_client; ++ __u64 orig_offset; ++}; ++#endif /* CONFIG_NFS_V4_1 */ ++ + struct nfs_read_data { + int flags; + struct rpc_task task; +@@ -976,10 +1011,16 @@ struct nfs_read_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + + struct nfs_write_data { ++ struct kref refcount; /* For pnfs commit splitting */ ++ struct nfs_write_data *parent; /* For pnfs commit splitting */ + int flags; + struct rpc_task task; + struct inode *inode; +@@ -995,6 +1036,10 @@ struct nfs_write_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + +@@ -1008,6 +1053,7 @@ struct nfs_rpc_ops { + const struct dentry_operations *dentry_ops; + const struct inode_operations *dir_inode_ops; + const struct inode_operations *file_inode_ops; ++ const struct file_operations *file_ops; + + int (*getroot) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); +@@ -1072,6 +1118,7 @@ struct nfs_rpc_ops { + extern const struct nfs_rpc_ops nfs_v2_clientops; + extern const struct nfs_rpc_ops nfs_v3_clientops; + extern const struct nfs_rpc_ops nfs_v4_clientops; ++extern const 
struct nfs_rpc_ops pnfs_v4_clientops; + extern struct rpc_version nfs_version2; + extern struct rpc_version nfs_version3; + extern struct rpc_version nfs_version4; +diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-23 12:09:03.388491527 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-23 12:09:03.388491527 -0400 +@@ -0,0 +1,57 @@ ++#ifndef _PANFS_SHIM_API_H ++#define _PANFS_SHIM_API_H ++ ++/* ++ * imported panfs functions ++ */ ++struct panfs_export_operations { ++ int (*convert_rc)(pan_status_t rc); ++ ++ int (*sm_sec_t_get_size_otw)( ++ pan_sm_sec_otw_t *var, ++ pan_size_t *core_sizep, ++ pan_size_t *wire_size, ++ void *buf_end); ++ ++ int (*sm_sec_t_unmarshall)( ++ pan_sm_sec_otw_t *in, ++ pan_sm_sec_t *out, ++ void *buf, ++ pan_size_t size, ++ pan_size_t *otw_consumed, ++ pan_size_t *in_core_consumed); ++ ++ int (*ucreds_get)(void **ucreds_pp); ++ ++ void (*ucreds_put)(void *ucreds); ++ ++ int (*sam_read)( ++ pan_sam_access_flags_t flags, ++ pan_sam_read_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_read_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p); ++ ++ int (*sam_write)( ++ pan_sam_access_flags_t flags, ++ pan_sam_write_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_write_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p); ++}; ++ ++extern int ++panfs_shim_register(struct panfs_export_operations *ops); ++ ++extern int ++panfs_shim_unregister(void); ++ ++#endif /* _PANFS_SHIM_API_H */ +diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-23 12:09:03.390501461 -0400 ++++ 
linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-23 12:09:03.390501461 -0400 +@@ -0,0 +1,440 @@ ++/* ++ * pnfs_osd_xdr.h ++ * ++ * pNFS-osd on-the-wire data structures ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++#ifndef __PNFS_OSD_XDR_H__ ++#define __PNFS_OSD_XDR_H__ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#define PNFS_OSD_OSDNAME_MAXSIZE 256 ++ ++/* ++ * START OF "GENERIC" DECODE ROUTINES. ++ * These may look a little ugly since they are imported from a "generic" ++ * set of XDR encode/decode routines which are intended to be shared by ++ * all of our NFSv4 implementations (OpenBSD, MacOS X...). ++ * ++ * If the pain of reading these is too great, it should be a straightforward ++ * task to translate them into Linux-specific versions which are more ++ * consistent with the style used in NFSv2/v3... ++ */ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (u64)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++ ++/* ++ * draft-ietf-nfsv4-minorversion-22 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* Layout Structure */ ++ ++enum pnfs_osd_raid_algorithm4 { ++ PNFS_OSD_RAID_0 = 1, ++ PNFS_OSD_RAID_4 = 2, ++ PNFS_OSD_RAID_5 = 3, ++ PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ ++}; ++ ++/* struct pnfs_osd_data_map4 { ++ * uint32_t odm_num_comps; ++ * length4 odm_stripe_unit; ++ * uint32_t odm_group_width; ++ * uint32_t odm_group_depth; ++ * uint32_t odm_mirror_cnt; ++ * pnfs_osd_raid_algorithm4 odm_raid_algorithm; ++ * }; ++ */ ++struct pnfs_osd_data_map { ++ u32 odm_num_comps; ++ u64 odm_stripe_unit; ++ u32 odm_group_width; ++ u32 odm_group_depth; ++ u32 odm_mirror_cnt; ++ u32 odm_raid_algorithm; ++}; ++ ++static inline int ++pnfs_osd_data_map_xdr_sz(void) ++{ ++ return 1 + 2 + 1 + 1 + 1 + 1; ++} ++ ++static inline size_t ++pnfs_osd_data_map_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_data_map); ++} ++ ++/* struct pnfs_osd_objid4 { ++ * deviceid4 oid_device_id; ++ * uint64_t oid_partition_id; ++ * uint64_t oid_object_id; ++ * }; ++ */ ++struct pnfs_osd_objid { ++ struct pnfs_deviceid 
oid_device_id; ++ u64 oid_partition_id; ++ u64 oid_object_id; ++}; ++ ++/* For printout. I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */ ++#define _DEVID_LO(oid_device_id) \ ++ (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data) ++ ++#define _DEVID_HI(oid_device_id) \ ++ (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1) ++ ++static inline int ++pnfs_osd_objid_xdr_sz(void) ++{ ++ return (NFS4_PNFS_DEVICEID4_SIZE / 4) + 2 + 2; ++} ++ ++static inline size_t ++pnfs_osd_objid_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_objid); ++} ++ ++enum pnfs_osd_version { ++ PNFS_OSD_MISSING = 0, ++ PNFS_OSD_VERSION_1 = 1, ++ PNFS_OSD_VERSION_2 = 2 ++}; ++ ++struct pnfs_osd_opaque_cred { ++ u32 cred_len; ++ u8 *cred; ++}; ++ ++static inline int ++pnfs_osd_opaque_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ READ32(n); ++ p += XDR_QUADLEN(n); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_opaque_cred_incore_sz(u32 *p) ++{ ++ u32 n; ++ ++ READ32(n); ++ return XDR_QUADLEN(n) * 4; ++} ++ ++enum pnfs_osd_cap_key_sec { ++ PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ PNFS_OSD_CAP_KEY_SEC_SSV = 1, ++}; ++ ++/* struct pnfs_osd_object_cred4 { ++ * pnfs_osd_objid4 oc_object_id; ++ * pnfs_osd_version4 oc_osd_version; ++ * pnfs_osd_cap_key_sec4 oc_cap_key_sec; ++ * opaque oc_capability_key<>; ++ * opaque oc_capability<>; ++ * }; ++ */ ++struct pnfs_osd_object_cred { ++ struct pnfs_osd_objid oc_object_id; ++ u32 oc_osd_version; ++ u32 oc_cap_key_sec; ++ struct pnfs_osd_opaque_cred oc_cap_key; ++ struct pnfs_osd_opaque_cred oc_cap; ++}; ++ ++static inline int ++pnfs_osd_object_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_object_cred_incore_sz(u32 *p) ++{ ++ size_t sz = sizeof(struct pnfs_osd_object_cred); ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ sz += 
pnfs_osd_opaque_cred_incore_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ sz += pnfs_osd_opaque_cred_incore_sz(p); ++ return sz; ++} ++ ++/* struct pnfs_osd_layout4 { ++ * pnfs_osd_data_map4 olo_map; ++ * uint32_t olo_comps_index; ++ * pnfs_osd_object_cred4 olo_components<>; ++ * }; ++ */ ++struct pnfs_osd_layout { ++ struct pnfs_osd_data_map olo_map; ++ u32 olo_comps_index; ++ u32 olo_num_comps; ++ struct pnfs_osd_object_cred *olo_comps; ++}; ++ ++static inline int ++pnfs_osd_layout_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ while ((int)(n--) > 0) ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_layout_incore_sz(u32 *p) ++{ ++ u32 n; ++ size_t sz; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ sz = sizeof(struct pnfs_osd_layout); ++ while ((int)(n--) > 0) { ++ sz += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ return sz; ++} ++ ++/* Device Address */ ++ ++enum pnfs_osd_targetid_type { ++ OBJ_TARGET_ANON = 1, ++ OBJ_TARGET_SCSI_NAME = 2, ++ OBJ_TARGET_SCSI_DEVICE_ID = 3, ++}; ++ ++/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) { ++ * case OBJ_TARGET_SCSI_NAME: ++ * string oti_scsi_name<>; ++ * ++ * case OBJ_TARGET_SCSI_DEVICE_ID: ++ * opaque oti_scsi_device_id<>; ++ * ++ * default: ++ * void; ++ * }; ++ * ++ * union pnfs_osd_targetaddr4 switch (bool ota_available) { ++ * case TRUE: ++ * netaddr4 ota_netaddr; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_deviceaddr4 { ++ * pnfs_osd_targetid4 oda_targetid; ++ * pnfs_osd_targetaddr4 oda_targetaddr; ++ * uint64_t oda_lun; ++ * opaque oda_systemid<>; ++ * pnfs_osd_object_cred4 oda_root_obj_cred; ++ * opaque oda_osdname<>; ++ * }; ++ */ ++struct pnfs_osd_targetid { ++ u32 oti_type; ++ struct nfs4_string oti_scsi_device_id; ++}; ++ ++enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 }; ++ ++/* struct 
netaddr4 { ++ * // see struct rpcb in RFC1833 ++ * string r_netid<>; // network id ++ * string r_addr<>; // universal address ++ * }; ++ */ ++struct pnfs_osd_net_addr { ++ struct nfs4_string r_netid; ++ struct nfs4_string r_addr; ++}; ++ ++struct pnfs_osd_targetaddr { ++ u32 ota_available; ++ struct pnfs_osd_net_addr ota_netaddr; ++}; ++ ++enum { ++ NETWORK_ID_MAX = 16 / 4, ++ UNIVERSAL_ADDRESS_MAX = 64 / 4, ++ PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX, ++}; ++ ++struct pnfs_osd_deviceaddr { ++ struct pnfs_osd_targetid oda_targetid; ++ struct pnfs_osd_targetaddr oda_targetaddr; ++ u8 oda_lun[8]; ++ struct nfs4_string oda_systemid; ++ struct pnfs_osd_object_cred oda_root_obj_cred; ++ struct nfs4_string oda_osdname; ++}; ++ ++enum { ++ ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4, ++ PNFS_OSD_DEVICEADDR_MAX = ++ PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX + ++ 2 /*oda_lun*/ + ++ 1 + OSD_SYSTEMID_LEN + ++ 1 + ODA_OSDNAME_MAX, ++}; ++ ++/* LAYOUTCOMMIT: layoutupdate */ ++ ++/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) { ++ * case TRUE: ++ * int64_t dsu_delta; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_layoutupdate4 { ++ * pnfs_osd_deltaspaceused4 olu_delta_space_used; ++ * bool olu_ioerr_flag; ++ * }; ++ */ ++struct pnfs_osd_layoutupdate { ++ u32 dsu_valid; ++ s64 dsu_delta; ++ u32 olu_ioerr_flag; ++}; ++ ++/* LAYOUTRETURN: I/O Error Report */ ++ ++enum pnfs_osd_errno { ++ PNFS_OSD_ERR_EIO = 1, ++ PNFS_OSD_ERR_NOT_FOUND = 2, ++ PNFS_OSD_ERR_NO_SPACE = 3, ++ PNFS_OSD_ERR_BAD_CRED = 4, ++ PNFS_OSD_ERR_NO_ACCESS = 5, ++ PNFS_OSD_ERR_UNREACHABLE = 6, ++ PNFS_OSD_ERR_RESOURCE = 7 ++}; ++ ++/* struct pnfs_osd_ioerr4 { ++ * pnfs_osd_objid4 oer_component; ++ * length4 oer_comp_offset; ++ * length4 oer_comp_length; ++ * bool oer_iswrite; ++ * pnfs_osd_errno4 oer_errno; ++ * }; ++ */ ++struct pnfs_osd_ioerr { ++ struct pnfs_osd_objid oer_component; ++ u64 oer_comp_offset; ++ u64 oer_comp_length; ++ u32 oer_iswrite;
++ u32 oer_errno; ++}; ++ ++static inline unsigned ++pnfs_osd_ioerr_xdr_sz(void) ++{ ++ return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1; ++} ++ ++/* OSD XDR API */ ++ ++/* Layout helpers */ ++extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout( ++ struct pnfs_osd_layout *layout, u32 *p); ++ ++extern int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *layout); ++ ++/* Device Info helpers */ ++ ++/* First pass calculate total size for space needed */ ++extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p); ++ ++/* Note: some strings pointed to inside @deviceaddr might point ++ * to space inside @p. @p should stay valid while @deviceaddr ++ * is in use. ++ * It is assumed that @deviceaddr points to bigger memory of size ++ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz() ++ */ ++extern void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p); ++ ++/* For Servers */ ++extern int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr); ++ ++/* layoutupdate (layout_commit) xdr helpers */ ++extern int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou); ++extern __be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p); ++ ++/* osd_ioerror encoding/decoding (layout_return) */ ++extern int ++pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr); ++extern __be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p); ++ ++#endif /* __PNFS_OSD_XDR_H__ */ +diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_xdr.h +--- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-23 12:09:03.391491550 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-23 12:09:03.391491550 -0400 +@@ -0,0 +1,134 @@ ++/* ++ * include/linux/pnfs_xdr.h ++ * ++ * Common xdr data structures 
needed by pnfs client. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef LINUX_PNFS_XDR_H ++#define LINUX_PNFS_XDR_H ++ ++#define PNFS_LAYOUT_MAXSIZE 4096 ++#define NFS4_PNFS_DEVICEID4_SIZE 16 ++ ++struct pnfs_deviceid { ++ char data[NFS4_PNFS_DEVICEID4_SIZE]; ++}; ++ ++struct nfs4_pnfs_layout { ++ __u32 len; ++ void *buf; ++}; ++ ++struct nfs4_pnfs_layout_segment { ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++struct nfs4_pnfs_layoutget_arg { ++ __u32 type; ++ struct nfs4_pnfs_layout_segment lseg; ++ __u64 minlength; ++ __u32 maxcount; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_layoutget_res { ++ __u32 return_on_close; ++ struct nfs4_pnfs_layout_segment lseg; ++ __u32 type; ++ nfs4_stateid stateid; ++ struct nfs4_pnfs_layout layout; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_pnfs_layoutget { ++ struct nfs4_pnfs_layoutget_arg args; ++ struct nfs4_pnfs_layoutget_res res; ++ struct pnfs_layout_segment **lsegpp; ++ int status; ++}; ++ ++struct pnfs_layoutcommit_arg { ++ nfs4_stateid stateid; ++ __u64 lastbytewritten; ++ __u32 time_modify_changed; ++ struct timespec time_modify; ++ const u32 *bitmask; ++ struct nfs_fh *fh; ++ struct inode *inode; ++ ++ /* Values set by layout driver */ ++ struct nfs4_pnfs_layout_segment lseg; ++ __u32 layout_type; ++ void *layoutdriver_data; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct pnfs_layoutcommit_res { ++ __u32 sizechanged; ++ __u64 newsize; ++ struct nfs_fattr *fattr; ++ const struct nfs_server *server; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct pnfs_layoutcommit_data { ++ struct rpc_task task; ++ struct rpc_cred *cred; ++ struct nfs_fattr fattr; ++ struct pnfs_layoutcommit_arg args; ++ struct pnfs_layoutcommit_res res; ++ int status; ++}; ++ ++struct nfs4_pnfs_layoutreturn_arg { ++ __u32 reclaim; ++ __u32 layout_type; ++ __u32 return_type; ++ 
struct nfs4_pnfs_layout_segment lseg; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_layoutreturn_res { ++ struct nfs4_sequence_res seq_res; ++ u32 lrs_present; ++ nfs4_stateid stateid; ++}; ++ ++struct nfs4_pnfs_layoutreturn { ++ struct nfs4_pnfs_layoutreturn_arg args; ++ struct nfs4_pnfs_layoutreturn_res res; ++ struct rpc_cred *cred; ++ int rpc_status; ++}; ++ ++struct nfs4_pnfs_getdevicelist_arg { ++ const struct nfs_fh *fh; ++ u32 layoutclass; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_getdevicelist_res { ++ struct pnfs_devicelist *devlist; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_pnfs_getdeviceinfo_arg { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_getdeviceinfo_res { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++#endif /* LINUX_PNFS_XDR_H */ +diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h +--- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-23 12:09:03.393501437 -0400 +@@ -8,6 +8,7 @@ + #ifndef __LINUX_POSIX_ACL_H + #define __LINUX_POSIX_ACL_H + ++#include + #include + + #define ACL_UNDEFINED_ID (-1) +diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h +--- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-23 12:09:03.393501437 -0400 +@@ -14,6 +14,8 @@ + /* size of an XDR encoding unit in bytes, i.e. 
32bit */ + #define XDR_UNIT (4) + ++#include ++ + /* spec defines authentication flavor as an unsigned 32 bit integer */ + typedef u32 rpc_authflavor_t; + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-23 12:09:03.394512138 -0400 +@@ -3,6 +3,7 @@ + + #ifdef __KERNEL__ + ++#include + #include + + struct rpc_pipe_msg { +@@ -11,6 +12,10 @@ struct rpc_pipe_msg { + size_t len; + size_t copied; + int errno; ++#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */ ++#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */ ++#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA ++ u8 flags; + }; + + struct rpc_pipe_ops { +diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-23 12:09:03.394512138 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-23 12:09:03.395501822 -0400 +@@ -0,0 +1,111 @@ ++/* ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#ifndef _SIMPLE_RPC_PIPEFS_H_ ++#define _SIMPLE_RPC_PIPEFS_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#define payload_of(headerp) ((void *)(headerp + 1)) ++ ++/* ++ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs. ++ * Messages may simply be the header itself, although having an optional ++ * data payload follow the header allows much more flexibility. ++ * ++ * Messages are created using pipefs_alloc_init_msg() and ++ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an ++ * (optional) data payload. 
++ * ++ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data ++ * can be accessed using: struct foo *foop = payload_of(msg) ++ */ ++struct pipefs_hdr { ++ u32 msgid; ++ u8 type; ++ u8 flags; ++ u16 totallen; /* length of entire message, including hdr itself */ ++ u32 status; ++}; ++ ++/* ++ * struct pipefs_list -- a type of list used for tracking callers who've made an ++ * upcall and are blocked waiting for a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply(). ++ */ ++struct pipefs_list { ++ struct list_head list; ++ spinlock_t list_lock; ++}; ++ ++ ++/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions. */ ++extern struct dentry *pipefs_mkpipe(const char *name, ++ const struct rpc_pipe_ops *ops, ++ int wait_for_open); ++extern void pipefs_closepipe(struct dentry *pipe); ++extern void pipefs_init_list(struct pipefs_list *list); ++extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen); ++extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, ++ u8 flags, void *data, ++ u16 datalen, u16 padlen); ++extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list ++ *uplist, u8 upflags, ++ u32 timeout); ++extern int pipefs_queue_upcall_noreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, u8 upflags); ++extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist); ++extern struct pipefs_hdr *pipefs_readmsg(struct file *filp, ++ const char __user *src, size_t len); ++extern ssize_t pipefs_generic_upcall(struct file *filp, ++ struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen); ++extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg); ++ ++#endif /* _SIMPLE_RPC_PIPEFS_H_ */ +diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h +--- 
linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-23 12:09:03.395501822 -0400 +@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con + + return buf; + } ++ ++/* ++ * Print a network address in a universal format (see rfc1833 and nfsv4.1) ++ */ ++static inline int __svc_print_netaddr(struct sockaddr *addr, ++ struct xdr_netobj *na) ++{ ++ u16 port; ++ ssize_t len; ++ ++ switch (addr->sa_family) { ++ case AF_INET: { ++ struct sockaddr_in *sin = (struct sockaddr_in *)addr; ++ port = ntohs(sin->sin_port); ++ ++ len = snprintf(na->data, na->len, "%pI4.%u.%u", ++ &sin->sin_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ case AF_INET6: { ++ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; ++ port = ntohs(sin6->sin6_port); ++ ++ len = snprintf(na->data, na->len, "%pI6.%u.%u", ++ &sin6->sin6_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ default: ++ snprintf(na->data, na->len, "unknown address type: %d", ++ addr->sa_family); ++ len = -EINVAL; ++ break; ++ } ++ return len; ++} + #endif /* SUNRPC_SVC_XPRT_H */ +diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-23 12:08:29.066475323 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-23 12:09:03.396464612 -0400 +@@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) + return p + 2; + } + ++static inline __be32 * ++xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len) ++{ ++ memcpy(ptr, p, len); ++ return p + XDR_QUADLEN(len); ++} ++ + /* + * Adjust kvec to reflect end of xdr'ed data (RPC client XDR) + */ +@@ -197,6 +204,7 @@ struct xdr_stream { + + extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); + extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); ++extern __be32 
*xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q); + extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, + unsigned int base, unsigned int len); + extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs +--- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-23 12:09:03.396464612 -0400 ++++ linux-2.6.34.noarch/localversion-pnfs 2010-08-23 12:09:03.396464612 -0400 +@@ -0,0 +1 @@ ++-pnfs +diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile +--- linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-23 12:09:03.397501662 -0400 +@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt + svc.o svcsock.o svcauth.o svcauth_unix.o \ + addr.o rpcb_clnt.o timer.o xdr.o \ + sunrpc_syms.o cache.o rpc_pipe.o \ +- svc_xprt.o ++ svc_xprt.o simple_rpc_pipefs.o + sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o + sunrpc-$(CONFIG_PROC_FS) += stats.o + sunrpc-$(CONFIG_SYSCTL) += sysctl.o +diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c +--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-23 12:09:03.398522348 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-23 12:09:03.398522348 -0400 +@@ -0,0 +1,424 @@ ++/* ++ * net/sunrpc/simple_rpc_pipefs.c ++ * ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. 
Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs ++ * filesystem. ++ * ++ * If @wait_for_open is non-zero and an upcall is later queued but the userland ++ * end of the pipe has not yet been opened, the upcall will remain queued until ++ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE. 
++ */ ++struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops, ++ int wait_for_open) ++{ ++ struct dentry *dir, *pipe; ++ struct vfsmount *mnt; ++ ++ mnt = rpc_get_mount(); ++ if (IS_ERR(mnt)) { ++ pipe = ERR_CAST(mnt); ++ goto out; ++ } ++ dir = mnt->mnt_root; ++ if (!dir) { ++ pipe = ERR_PTR(-ENOENT); ++ goto out; ++ } ++ pipe = rpc_mkpipe(dir, name, NULL, ops, ++ wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0); ++out: ++ return pipe; ++} ++EXPORT_SYMBOL(pipefs_mkpipe); ++ ++/* ++ * Shutdown a pipe made by pipefs_mkpipe(). ++ * XXX: do we need to retain an extra reference on the mount? ++ */ ++void pipefs_closepipe(struct dentry *pipe) ++{ ++ rpc_unlink(pipe); ++ rpc_put_mount(); ++} ++EXPORT_SYMBOL(pipefs_closepipe); ++ ++/* ++ * Initialize a struct pipefs_list -- which are a way to keep track of callers ++ * who're blocked having made an upcall and are awaiting a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_find_upcall_msgid() for how ++ * to use them. ++ */ ++inline void pipefs_init_list(struct pipefs_list *list) ++{ ++ INIT_LIST_HEAD(&list->list); ++ spin_lock_init(&list->list_lock); ++} ++EXPORT_SYMBOL(pipefs_init_list); ++ ++/* ++ * Alloc/init a generic pipefs message header and copy into its message body ++ * an arbitrary data payload. ++ * ++ * struct pipefs_hdr's are meant to serve as generic, general-purpose message ++ * headers for easy rpc_pipefs I/O. When an upcall is made, the ++ * struct pipefs_hdr is assigned to a struct rpc_pipe_msg and delivered ++ * therein. --And yes, the naming can seem a little confusing at first: ++ * ++ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a ++ * struct pipefs_hdr (possibly with an attached message body). A ++ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real" ++ * message is delivered and processed. 
++ */ ++struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen, u16 padlen) ++{ ++ u16 totallen; ++ struct pipefs_hdr *msg = NULL; ++ ++ totallen = sizeof(*msg) + datalen + padlen; ++ if (totallen > PAGE_SIZE) { ++ msg = ERR_PTR(-E2BIG); ++ goto out; ++ } ++ ++ msg = kzalloc(totallen, GFP_KERNEL); ++ if (!msg) { ++ msg = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ msg->msgid = msgid; ++ msg->type = type; ++ msg->flags = flags; ++ msg->totallen = totallen; ++ memcpy(payload_of(msg), data, datalen); ++out: ++ return msg; ++} ++EXPORT_SYMBOL(pipefs_alloc_init_msg_padded); ++ ++/* ++ * See the description of pipefs_alloc_init_msg_padded(). ++ */ ++struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen) ++{ ++ return pipefs_alloc_init_msg_padded(msgid, type, flags, data, ++ datalen, 0); ++} ++EXPORT_SYMBOL(pipefs_alloc_init_msg); ++ ++ ++static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ memset(rpcmsg, 0, sizeof(*rpcmsg)); ++ rpcmsg->data = msg; ++ rpcmsg->len = msg->totallen; ++ rpcmsg->flags = upflags; ++} ++ ++static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg, ++ u8 upflags) ++{ ++ struct rpc_pipe_msg *rpcmsg; ++ ++ rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL); ++ if (!rpcmsg) ++ return ERR_PTR(-ENOMEM); ++ ++ pipefs_init_rpcmsg(rpcmsg, msg, upflags); ++ return rpcmsg; ++} ++ ++ ++/* represents an upcall that'll block and wait for a reply */ ++struct pipefs_upcall { ++ u32 msgid; ++ struct rpc_pipe_msg rpcmsg; ++ struct list_head list; ++ wait_queue_head_t waitq; ++ struct pipefs_hdr *reply; ++}; ++ ++ ++static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ upcall->reply = NULL; ++ upcall->msgid = msg->msgid; ++ INIT_LIST_HEAD(&upcall->list); ++ init_waitqueue_head(&upcall->waitq); ++ pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags); 
++} ++ ++static int __pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_upcall *upcall, ++ struct pipefs_list *uplist, ++ u32 timeout) ++{ ++ int err = 0; ++ DECLARE_WAITQUEUE(wq, current); ++ ++ add_wait_queue(&upcall->waitq, &wq); ++ spin_lock(&uplist->list_lock); ++ list_add(&upcall->list, &uplist->list); ++ spin_unlock(&uplist->list_lock); ++ ++ err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg); ++ if (err < 0) ++ goto out; ++ ++ if (timeout) { ++ /* retval of 0 means timer expired */ ++ err = schedule_timeout_uninterruptible(timeout); ++ if (err == 0 && upcall->reply == NULL) ++ err = -ETIMEDOUT; ++ } else { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ } ++ ++out: ++ spin_lock(&uplist->list_lock); ++ list_del_init(&upcall->list); ++ spin_unlock(&uplist->list_lock); ++ remove_wait_queue(&upcall->waitq, &wq); ++ return err; ++} ++ ++/* ++ * Queue a pipefs msg for an upcall to userspace, place the calling thread ++ * on @uplist, and block the thread to wait for a reply. If @timeout is ++ * nonzero, the thread will be blocked for at most @timeout jiffies. ++ * ++ * (To convert time units into jiffies, consider the functions ++ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and ++ * timespec_to_jiffies().) ++ * ++ * Once a reply is received by your downcall handler, call ++ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall, ++ * assign the reply, and wake the waiting thread. ++ * ++ * This function's return value pointer may be an error and should be checked ++ * with IS_ERR() before attempting to access the reply message. ++ * ++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() ++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG ++ * flag is set in @upflags. See also rpc_pipe_fs.h. 
++ */ ++struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list *uplist, ++ u8 upflags, u32 timeout) ++{ ++ int err = 0; ++ struct pipefs_upcall upcall; ++ ++ pipefs_init_upcall_waitreply(&upcall, msg, upflags); ++ err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout); ++ if (err < 0) { ++ kfree(upcall.reply); ++ upcall.reply = ERR_PTR(err); ++ } ++ ++ return upcall.reply; ++} ++EXPORT_SYMBOL(pipefs_queue_upcall_waitreply); ++ ++/* ++ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e., ++ * no reply is expected). ++ * ++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() ++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG ++ * flag is set in @upflags. See also rpc_pipe_fs.h. ++ */ ++int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg, ++ u8 upflags) ++{ ++ int err = 0; ++ struct rpc_pipe_msg *rpcmsg; ++ ++ upflags |= PIPEFS_AUTOFREE_RPCMSG; ++ rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags); ++ if (IS_ERR(rpcmsg)) { ++ err = PTR_ERR(rpcmsg); ++ goto out; ++ } ++ err = rpc_queue_upcall(pipe->d_inode, rpcmsg); ++out: ++ return err; ++} ++EXPORT_SYMBOL(pipefs_queue_upcall_noreply); ++ ++ ++static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid, ++ struct pipefs_list *uplist) ++{ ++ struct pipefs_upcall *upcall; ++ ++ spin_lock(&uplist->list_lock); ++ list_for_each_entry(upcall, &uplist->list, list) ++ if (upcall->msgid == msgid) ++ goto out; ++ upcall = NULL; ++out: ++ spin_unlock(&uplist->list_lock); ++ return upcall; ++} ++ ++/* ++ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall ++ * message and have determined that it is a reply to a waiting upcall, ++ * you can use this function to find the appropriate upcall, assign the result, ++ * and wake the upcall thread. ++ * ++ * The reply message must have the same msgid as the original upcall message's. 
++ * ++ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg(). ++ */ ++int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist) ++{ ++ int err = 0; ++ struct pipefs_upcall *upcall; ++ ++ upcall = pipefs_find_upcall_msgid(reply->msgid, uplist); ++ if (!upcall) { ++ printk(KERN_ERR "%s: ERROR: have reply but no matching upcall " ++ "for msgid %d\n", __func__, reply->msgid); ++ err = -ENOENT; ++ goto out; ++ } ++ upcall->reply = reply; ++ wake_up(&upcall->waitq); ++out: ++ return err; ++} ++EXPORT_SYMBOL(pipefs_assign_upcall_reply); ++ ++/* ++ * Generic method to read-in and return a newly-allocated message which begins ++ * with a struct pipefs_hdr. ++ */ ++struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src, ++ size_t len) ++{ ++ int err = 0, hdrsize; ++ struct pipefs_hdr *msg = NULL; ++ ++ hdrsize = sizeof(*msg); ++ if (len < hdrsize) { ++ printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n", ++ __func__, (int) len, hdrsize); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ msg = kzalloc(len, GFP_KERNEL); ++ if (!msg) { ++ err = -ENOMEM; ++ goto out; ++ } ++ if (copy_from_user(msg, src, len)) ++ err = -EFAULT; ++out: ++ if (err) { ++ kfree(msg); ++ msg = ERR_PTR(err); ++ } ++ return msg; ++} ++EXPORT_SYMBOL(pipefs_readmsg); ++ ++/* ++ * Generic rpc_pipe_ops->upcall() handler implementation. ++ * ++ * Don't call this directly: to make an upcall, use ++ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply(). 
++ */ ++ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen) ++{ ++ char *data; ++ ssize_t len, left; ++ ++ data = (char *)rpcmsg->data + rpcmsg->copied; ++ len = rpcmsg->len - rpcmsg->copied; ++ if (len > buflen) ++ len = buflen; ++ ++ left = copy_to_user(dst, data, len); ++ if (left < 0) { ++ rpcmsg->errno = left; ++ return left; ++ } ++ ++ len -= left; ++ rpcmsg->copied += len; ++ rpcmsg->errno = 0; ++ return len; ++} ++EXPORT_SYMBOL(pipefs_generic_upcall); ++ ++/* ++ * Generic rpc_pipe_ops->destroy_msg() handler implementation. ++ * ++ * Items are only freed if @rpcmsg->flags has been set appropriately. ++ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h. ++ */ ++void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg) ++{ ++ if (rpcmsg->flags & PIPEFS_AUTOFREE_UPCALL_MSG) ++ kfree(rpcmsg->data); ++ if (rpcmsg->flags & PIPEFS_AUTOFREE_RPCMSG) ++ kfree(rpcmsg); ++} ++EXPORT_SYMBOL(pipefs_generic_destroy_msg); +diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-23 12:08:29.081501640 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-23 12:09:03.399443371 -0400 +@@ -403,16 +403,14 @@ xdr_shrink_pagelen(struct xdr_buf *buf, + + /* Shift the tail first */ + if (tail->iov_len != 0) { +- p = (char *)tail->iov_base + len; +- if (tail->iov_len > len) { +- copy = tail->iov_len - len; +- memmove(p, tail->iov_base, copy); +- } else +- buf->buflen -= len; +- /* Copy from the inlined pages into the tail */ + copy = len; +- if (copy > tail->iov_len) ++ if (tail->iov_len > len) { ++ p = (char *)tail->iov_base + len; ++ memmove(p, tail->iov_base, tail->iov_len - len); ++ } else { + copy = tail->iov_len; ++ } ++ /* Copy from the inlined pages into the tail */ + _copy_from_pages((char *)tail->iov_base, + buf->pages, buf->page_base + pglen - len, + copy); +@@ -496,6 +494,27 @@ __be32 * xdr_reserve_space(struct xdr_st 
+ EXPORT_SYMBOL_GPL(xdr_reserve_space); + + /** ++ * xdr_rewind_stream - rewind a stream back to some checkpoint ++ * @xdr: pointer to xdr_stream ++ * @q: some checkpoint at historical place of @xdr ++ * ++ * Restores an xdr stream to some historical point. @q must be ++ * a logical xdr point in the past that was sampled by @q = @xdr->p. ++ */ ++__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q) ++{ ++ size_t nbytes = (xdr->p - q) << 2; ++ ++ BUG_ON(xdr->p < q); ++ BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len); ++ xdr->p = q; ++ xdr->iov->iov_len -= nbytes; ++ xdr->buf->len -= nbytes; ++ return q; ++} ++EXPORT_SYMBOL_GPL(xdr_rewind_stream); ++ ++/** + * xdr_write_pages - Insert a list of pages into an XDR buffer for sending + * @xdr: pointer to xdr_stream + * @pages: list of pages From b07c836a880dbed3d7509ad31b42a05c8270cac3 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Mon, 23 Aug 2010 14:15:46 -0400 Subject: [PATCH 06/20] Fixed a couple compile errors in the server code. 
Signed-off-by: Steve Dickson --- nfsd-35-fc.patch | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/nfsd-35-fc.patch b/nfsd-35-fc.patch index ef99b4995..2825464af 100644 --- a/nfsd-35-fc.patch +++ b/nfsd-35-fc.patch @@ -1,6 +1,6 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt --- linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 09:57:18.233564439 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 14:12:24.165356789 -0400 @@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | | READ | REQ | | Section 18.22 | | READDIR | REQ | | Section 18.23 | @@ -12,7 +12,7 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig | RENAME | REQ | | Section 18.26 | diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c --- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 09:57:18.234564075 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 14:12:24.519356675 -0400 @@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_ca .alloc = expkey_alloc, }; @@ -108,7 +108,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e out_put_clp: diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c --- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 10:00:37.257414684 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 14:12:52.625429773 -0400 @@ -79,11 +79,6 @@ enum nfs_cb_opnum4 { cb_sequence_dec_sz + \ op_dec_sz) @@ -211,7 +211,7 @@ diff 
-up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ int status; - status = rpc_call_async(cb->cb_client, &msg, -+ status = rpc_call_async(cb->cl_cb_client, &msg, ++ status = rpc_call_async(clp->cl_cb_client, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, &nfsd4_cb_probe_ops, (void *)clp); - if (status) { @@ -402,7 +402,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c --- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 09:57:18.237376763 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 14:12:25.698356909 -0400 @@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[ static const char *nfsd4_op_name(unsigned opnum); @@ -490,7 +490,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c --- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 09:57:18.240356512 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 14:12:25.700356284 -0400 @@ -45,8 +45,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -1280,9 +1280,21 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs -{ - user_lease_time = leasetime; -} +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-23 14:14:22.882428704 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 14:14:33.418376589 -0400 +@@ -1900,7 +1900,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s + if (bmval0 & FATTR4_WORD0_LEASE_TIME) { + if ((buflen -= 4) < 0) + goto out_resource; +- WRITE32(NFSD_LEASE_TIME); 
++ WRITE32(nfsd4_lease); + } + if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { + if ((buflen -= 4) < 0) diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c --- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 09:57:20.629370282 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 14:12:25.821359224 -0400 @@ -46,6 +46,7 @@ enum { */ #ifdef CONFIG_NFSD_V4 @@ -1403,7 +1415,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n /* last one */ {""} diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h --- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 09:57:20.629370282 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 14:12:25.835418441 -0400 @@ -82,7 +82,6 @@ int nfs4_state_init(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); @@ -1440,7 +1452,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs /* diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h --- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 09:57:21.807501619 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 14:12:25.836366516 -0400 @@ -70,6 +70,16 @@ struct nfsd4_cb_sequence { struct nfs4_client *cbs_clp; }; @@ -1558,7 +1570,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st nfs4_put_stateowner(struct nfs4_stateowner *so) diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h --- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 09:57:23.994379831 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 14:12:25.837387292 -0400 @@ -381,6 
+381,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; @@ -1600,7 +1612,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h --- linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 09:57:23.994379831 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 14:12:25.838377224 -0400 @@ -40,12 +40,12 @@ struct nfs_fhbase_old { * This is the new flexible, extensible style NFSv2/v3 file handle. * by Neil Brown - March 2000 @@ -1619,7 +1631,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch * This might allow a file to be confirmed to be in a writable part of a diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sunrpc/cache.c --- linux-2.6.34.noarch/net/sunrpc/cache.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 09:57:23.995376793 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 14:12:25.839376838 -0400 @@ -49,11 +49,17 @@ static void cache_init(struct cache_head h->last_refresh = now; } @@ -1686,7 +1698,7 @@ diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sun /* entry is valid */ diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/sunrpc/svcsock.c --- linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 09:57:23.997368707 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 14:12:25.840384371 -0400 @@ -547,7 +547,6 @@ static int svc_udp_recvfrom(struct svc_r dprintk("svc: recvfrom returned error %d\n", -err); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); @@ -1753,7 +1765,7 
@@ diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/s error: diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/sunrpc/svc_xprt.c --- linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 09:57:23.996377209 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 14:12:25.841371223 -0400 @@ -744,8 +744,10 @@ int svc_recv(struct svc_rqst *rqstp, lon if (rqstp->rq_deferred) { svc_xprt_received(xprt); @@ -1782,7 +1794,7 @@ diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/ void svc_close_xprt(struct svc_xprt *xprt) diff -up linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c --- linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 09:57:23.998377481 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 14:12:25.842376584 -0400 @@ -566,7 +566,6 @@ static int rdma_read_complete(struct svc ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, rqstp->rq_arg.head[0].iov_len); From 2121c4cc7dfeed915de92d51ac364c76bf4b5b6e Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Tue, 24 Aug 2010 14:49:23 -0400 Subject: [PATCH 07/20] Removed the localversion-pnfs file from the pnfs patch Signed-off-by: Steve Dickson --- kernel.spec | 2 +- pnfs-all-2.6.35-2010-08-19-f13.patch | 395 +++++++++++++-------------- 2 files changed, 196 insertions(+), 201 deletions(-) diff --git a/kernel.spec b/kernel.spec index 14956777b..4fb3481f3 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) 
# -%define buildid .pnfs_all_2.6.35_2010_08_19 +%define buildid .pnfs34.2010.08.19 ################################################################### # The buildid can also be specified on the rpmbuild command line diff --git a/pnfs-all-2.6.35-2010-08-19-f13.patch b/pnfs-all-2.6.35-2010-08-19-f13.patch index a9d78ba0e..10df9b15c 100644 --- a/pnfs-all-2.6.35-2010-08-19-f13.patch +++ b/pnfs-all-2.6.35-2010-08-19-f13.patch @@ -1,6 +1,6 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c ---- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-23 12:08:27.310584826 -0400 -+++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-23 12:09:03.273553977 -0400 +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-24 14:14:03.643355000 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-24 14:17:48.415730000 -0400 @@ -13,6 +13,7 @@ #include #include @@ -11,7 +11,7 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arc #include "os.h" diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c --- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/block/genhd.c 2010-08-23 12:09:03.273553977 -0400 ++++ linux-2.6.34.noarch/block/genhd.c 2010-08-24 14:17:48.421730000 -0400 @@ -1009,6 +1009,7 @@ static void disk_release(struct device * struct class block_class = { .name = "block", @@ -21,8 +21,8 @@ diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd. 
static char *block_devnode(struct device *dev, mode_t *mode) { diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt ---- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-23 12:09:03.274563927 -0400 -+++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-23 12:09:03.274563927 -0400 +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-24 14:17:48.423729000 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-24 14:17:48.425730000 -0400 @@ -0,0 +1,211 @@ +(c) 2007 Network Appliance Inc. + @@ -237,7 +237,7 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6. + diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c --- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-23 12:09:03.275584050 -0400 ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-24 14:17:48.430730000 -0400 @@ -651,6 +651,12 @@ static int dev_create(struct dm_ioctl *p return r; } @@ -292,7 +292,7 @@ diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/driv int r; diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c --- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-23 12:09:03.276563906 -0400 ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-24 14:17:48.435733000 -0400 @@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct put_device(&class_to_shost(dev)->shost_gendev); } @@ -304,7 +304,7 @@ diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drive }; diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h --- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 
-0400 -+++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-23 12:09:03.277563890 -0400 ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-24 14:17:48.440733000 -0400 @@ -36,13 +36,9 @@ #include #include @@ -360,8 +360,8 @@ diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/ + #endif diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c ---- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-23 12:09:03.278386746 -0400 -+++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-23 12:09:03.278386746 -0400 +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-24 14:17:48.444731000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-24 14:17:48.446730000 -0400 @@ -0,0 +1,396 @@ +/* + * export.c - Implementation of the pnfs_export_operations @@ -761,7 +761,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs +} diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c --- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-23 12:09:03.279502002 -0400 ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-24 14:17:48.452730000 -0400 @@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) if (unlikely(wait_obj_created(oi))) goto fail; @@ -781,7 +781,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/ * Fill in an inode read from the OSD and set it up for use diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild --- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-23 12:09:03.279502002 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-24 14:17:48.457733000 -0400 @@ -13,4 +13,5 @@ # @@ -790,7 +790,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/K obj-$(CONFIG_EXOFS_FS) += exofs.o diff -up 
linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig --- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-23 12:09:03.280553663 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-24 14:17:48.462739000 -0400 @@ -1,6 +1,7 @@ config EXOFS_FS tristate "exofs: OSD based file system support" @@ -801,7 +801,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/ as its backing storage. diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c --- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-23 12:09:03.281511951 -0400 ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-24 14:17:48.468730000 -0400 @@ -621,6 +621,7 @@ static int exofs_fill_super(struct super sb->s_fs_info = sbi; sb->s_op = &exofs_sops; @@ -812,7 +812,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/ EXOFS_ERR("ERROR: exofs_iget failed\n"); diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c --- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-23 12:09:03.282511528 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-24 14:17:48.473730000 -0400 @@ -16,6 +16,13 @@ #include #include @@ -829,7 +829,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exp diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile --- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-23 12:09:03.282511528 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-24 14:17:48.478733000 -0400 @@ -3,4 +3,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs.o @@ -840,8 +840,8 @@ diff 
-up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/ex +exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o +exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c ---- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +--- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-24 14:17:48.482731000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-24 14:17:48.484734000 -0400 @@ -0,0 +1,158 @@ +/* + * linux/fs/nfsd/nfs4blocklayoutxdr.c @@ -1002,8 +1002,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34. +} +EXPORT_SYMBOL_GPL(blocklayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c ---- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-24 14:17:48.487733000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-24 14:17:48.489734000 -0400 @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. 
@@ -1224,8 +1224,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.n +} +EXPORT_SYMBOL(filelayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c ---- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-23 12:09:03.284511493 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-23 12:09:03.284511493 -0400 +--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-24 14:17:48.493729000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-24 14:17:48.494735000 -0400 @@ -0,0 +1,289 @@ +/* + * pnfs_osd_xdr_enc.c @@ -1518,7 +1518,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.no +EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c --- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-23 12:09:03.285539075 -0400 ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-24 14:17:48.499730000 -0400 @@ -19,6 +19,7 @@ #include #include @@ -1539,7 +1539,7 @@ diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gf sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig --- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/Kconfig 2010-08-23 12:09:03.286512316 -0400 ++++ linux-2.6.34.noarch/fs/Kconfig 2010-08-24 14:17:48.505733000 -0400 @@ -224,6 +224,31 @@ config LOCKD_V4 config EXPORTFS tristate @@ -1573,8 +1573,8 @@ diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig tristate select FS_POSIX_ACL diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 
---- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-23 12:09:03.287381619 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-23 12:09:03.287381619 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-24 14:17:48.509734000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-24 14:17:48.511732000 -0400 @@ -0,0 +1,66 @@ +#include +#include @@ -1643,8 +1643,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.or + return; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-23 12:09:03.288501648 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-23 12:09:03.288501648 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-24 14:17:48.514733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-24 14:17:48.516731000 -0400 @@ -0,0 +1,1160 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c @@ -2807,8 +2807,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34. 
+module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-23 12:09:03.289501933 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-23 12:09:03.289501933 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-24 14:17:48.519731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-24 14:17:48.521730000 -0400 @@ -0,0 +1,335 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdev.c @@ -3146,8 +3146,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6. + goto out; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-23 12:09:03.290395707 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-23 12:09:03.290395707 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-24 14:17:48.523733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-24 14:17:48.525730000 -0400 @@ -0,0 +1,120 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c @@ -3270,8 +3270,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.3 + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-23 12:09:03.290395707 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-23 12:09:03.291501560 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-24 14:17:48.528729000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-24 14:17:48.529735000 -0400 @@ -0,0 +1,303 
@@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -3577,8 +3577,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34. + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-23 12:09:03.292511531 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-23 12:09:03.292511531 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-24 14:17:48.532731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-24 14:17:48.534734000 -0400 @@ -0,0 +1,948 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -4529,8 +4529,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noar + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile ---- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-23 12:09:03.292511531 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-23 12:09:03.293491476 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-24 14:17:48.537729000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-24 14:17:48.538739000 -0400 @@ -0,0 +1,6 @@ +# +# Makefile for the pNFS block layout driver kernel module @@ -4540,7 +4540,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarc + extents.o block-device-discovery-pipe.o diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h --- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-23 12:09:03.293491476 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-24 14:17:48.544730000 -0400 @@ -8,6 +8,8 @@ #ifndef __LINUX_FS_NFS_CALLBACK_H #define __LINUX_FS_NFS_CALLBACK_H @@ -4613,7 +4613,7 @@ diff -up 
linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/c extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c --- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-23 12:09:03.294522414 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-24 14:17:48.562731000 -0400 @@ -8,10 +8,15 @@ #include #include @@ -5096,7 +5096,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/ return status; diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c --- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-23 12:09:03.295502055 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-24 14:17:48.568730000 -0400 @@ -22,6 +22,8 @@ #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) @@ -5298,8 +5298,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/n .process_op = (callback_process_op_t)nfs4_callback_sequence, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c ---- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-23 12:08:29.037481540 -0400 -+++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-23 12:09:03.297501650 -0400 +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-24 14:14:13.062705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-24 14:17:48.575730000 -0400 @@ -39,6 +39,7 @@ #include #include @@ -5508,8 +5508,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/cli goto error; diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c ---- 
linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-23 12:09:03.297501650 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-23 12:09:03.298501447 -0400 +--- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-24 14:17:48.578729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-24 14:17:48.579735000 -0400 @@ -0,0 +1,292 @@ +#if defined(CONFIG_SPNFS_BLOCK) + @@ -5804,8 +5804,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/b +} +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c ---- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-23 12:09:03.299501445 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-23 12:09:03.299501445 -0400 +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-24 14:17:48.584729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-24 14:17:48.586730000 -0400 @@ -0,0 +1,1672 @@ +/* + * bl_ops.c @@ -7480,8 +7480,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/b + +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c ---- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-23 12:08:29.037481540 -0400 -+++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-23 12:09:03.300491952 -0400 +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-24 14:14:13.068705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-24 14:17:48.592730000 -0400 @@ -104,7 +104,8 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) @@ -7558,7 +7558,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs rcu_read_unlock(); diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h --- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-23 12:09:03.301431797 -0400 ++++ 
linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-24 14:17:48.597733000 -0400 @@ -34,9 +34,7 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -7571,8 +7571,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c ---- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-23 12:08:29.089481525 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 12:09:03.302511603 -0400 +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-24 14:14:13.612707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-24 14:17:48.604730000 -0400 @@ -17,11 +17,19 @@ #include #include @@ -7750,7 +7750,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e svcauth_unix_purge(); diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/direct.c --- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-23 12:09:03.303491500 -0400 ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-24 14:17:48.610730000 -0400 @@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea .rpc_release = nfs_direct_read_release, }; @@ -7996,7 +7996,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/dir user_addr += bytes; diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig --- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-23 12:09:03.304505472 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-24 14:17:48.616730000 -0400 @@ -79,3 +79,52 @@ config NFSD_V4 available from 
http://linux-nfs.org/. @@ -8052,7 +8052,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kc + If unsure, say N. diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile --- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-23 12:09:03.304505472 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-24 14:17:48.621733000 -0400 @@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ @@ -8062,8 +8062,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/M +nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o +nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-23 12:08:29.090501507 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 12:09:03.306491345 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-24 14:14:13.618705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-24 14:17:48.628730000 -0400 @@ -40,7 +40,6 @@ #define NFSPROC4_CB_NULL 0 @@ -8603,8 +8603,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ +} +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-23 12:09:03.307491492 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-23 12:09:03.308491262 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-24 14:17:48.633729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-24 14:17:48.641730000 -0400 @@ -0,0 +1,1679 @@ +/****************************************************************************** + 
* @@ -10286,8 +10286,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfs + return status; +} diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-23 12:09:03.309501439 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-23 12:09:03.309501439 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-24 14:17:48.645731000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-24 14:17:48.647730000 -0400 @@ -0,0 +1,461 @@ +/****************************************************************************** + * @@ -10751,8 +10751,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/n +}; +EXPORT_SYMBOL(pnfs_dlm_export_ops); diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-23 12:09:03.310501390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-23 12:09:03.310501390 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-24 14:17:48.651729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-24 14:17:48.652735000 -0400 @@ -0,0 +1,620 @@ +/* +* linux/fs/nfsd/nfs4pnfsds.c @@ -11375,8 +11375,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nf + +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-23 12:08:29.091491685 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 12:09:03.311501496 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-24 14:14:13.623707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-24 14:17:48.658733000 -0400 @@ -34,10 +34,14 @@ */ #include @@ -11851,8 +11851,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd 
static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-23 12:08:29.093491375 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 12:09:03.313491310 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-24 14:14:13.632707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-24 14:17:48.667732000 -0400 @@ -42,6 +42,8 @@ #include "xdr4.h" #include "vfs.h" @@ -12368,8 +12368,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 12:09:03.315491356 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-24 14:14:13.639707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-24 14:17:48.675730000 -0400 @@ -47,9 +47,14 @@ #include #include @@ -12988,8 +12988,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/ [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c ---- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-23 12:08:29.094491943 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 12:09:03.317501495 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-24 14:14:13.645705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-24 14:17:48.681730000 -0400 @@ -13,10 +13,15 @@ #include #include @@ -13166,8 +13166,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 
linux-2.6.34.noarch/fs/nfsd/nfsd.h ---- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-23 12:08:29.095491390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 12:09:03.318355741 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-24 14:14:13.651705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-24 14:17:48.687730000 -0400 @@ -285,11 +285,17 @@ extern time_t nfsd4_grace; #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ NFSD4_SUPPORTED_ATTRS_WORD0 @@ -13189,7 +13189,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs { diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c --- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-23 12:09:03.319511586 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-24 14:17:48.693730000 -0400 @@ -10,6 +10,7 @@ #include @@ -13227,7 +13227,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nf __u32 tfh[2]; diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h --- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-23 12:09:03.319511586 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-24 14:17:48.698733000 -0400 @@ -14,6 +14,7 @@ enum nfsd_fsid { FSID_UUID8, FSID_UUID16, @@ -13280,8 +13280,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nf + #endif /* _LINUX_NFSD_FH_INT_H */ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c ---- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-23 12:08:27.631563969 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-23 12:09:03.320416974 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-24 14:14:06.365163000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-24 14:17:48.704731000 -0400 @@ -115,7 +115,7 @@ struct 
svc_program nfsd_program = { }; @@ -13292,8 +13292,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/n int nfsd_vers(int vers, enum vers_op change) { diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h ---- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-23 12:09:03.321376171 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-23 12:09:03.321376171 -0400 +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-24 14:17:48.708729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-24 14:17:48.710730000 -0400 @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2005 The Regents of the University of Michigan. @@ -13439,8 +13439,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pn + +#endif /* LINUX_NFSD_PNFSD_H */ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c ---- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-23 12:09:03.321376171 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-23 12:09:03.322501672 -0400 +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-24 14:17:48.713731000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-24 14:17:48.715730000 -0400 @@ -0,0 +1,225 @@ +/* + * linux/fs/nfsd/pnfs_lexp.c @@ -13668,8 +13668,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nf + inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; +} diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c ---- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-23 12:09:03.322501672 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-23 12:09:03.323511608 -0400 +--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-24 14:17:48.719729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-24 14:17:48.720735000 -0400 @@ -0,0 +1,535 @@ +/* + * fs/nfsd/spnfs_com.c @@ -14207,8 +14207,8 @@ diff -up 
linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfs +} +#endif /* CONFIG_PROC_FS */ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c ---- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-23 12:09:03.324501390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-23 12:09:03.324501390 -0400 +--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-24 14:17:48.724733000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-24 14:17:48.726730000 -0400 @@ -0,0 +1,878 @@ +/* + * fs/nfsd/spnfs_ops.c @@ -15089,8 +15089,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfs + return 0; +} diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h ---- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-23 12:08:29.096512142 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 12:09:03.325501424 -0400 +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-24 14:14:13.656705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-24 14:17:48.731738000 -0400 @@ -242,6 +242,12 @@ struct nfs4_client { u32 cl_cb_seq_nr; struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ @@ -15207,8 +15207,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st + #endif /* NFSD4_STATE_H */ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c ---- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-23 12:08:27.632564132 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-23 12:09:03.326501490 -0400 +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-24 14:14:06.371160000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-24 14:17:48.737742000 -0400 @@ -37,7 +37,12 @@ #ifdef CONFIG_NFSD_V4 #include @@ -15335,8 +15335,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs. 
out_nfserr: err = nfserrno(host_err); diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h ---- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-23 12:08:29.097425997 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 12:09:03.327451643 -0400 +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-24 14:14:13.661705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-24 14:17:48.743747000 -0400 @@ -37,6 +37,8 @@ #ifndef _LINUX_NFSD_XDR4_H #define _LINUX_NFSD_XDR4_H @@ -15413,8 +15413,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr struct nfs4_replay * replay; }; diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c ---- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-23 12:08:29.039491912 -0400 -+++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-23 12:09:03.328501680 -0400 +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-24 14:14:13.079708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-24 14:17:48.749746000 -0400 @@ -28,6 +28,7 @@ #include #include @@ -15540,8 +15540,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file. 
if (!ret) return VM_FAULT_LOCKED; diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c ---- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-23 12:08:29.042511552 -0400 -+++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-23 12:09:03.329501644 -0400 +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-24 14:14:13.095705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-24 14:17:48.757730000 -0400 @@ -48,6 +48,7 @@ #include "internal.h" #include "fscache.h" @@ -15755,8 +15755,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inod nfs_fs_proc_exit(); nfsiod_stop(); diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h ---- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-23 12:08:29.042511552 -0400 -+++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-23 12:09:03.330502148 -0400 +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-24 14:14:13.100708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-24 14:17:48.763734000 -0400 @@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv struct nfs_fattr *); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); @@ -15817,7 +15817,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/i struct page *, struct page *); diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig --- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-23 12:09:03.331395814 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-24 14:17:48.769730000 -0400 @@ -79,10 +79,48 @@ config NFS_V4_1 depends on NFS_V4 && EXPERIMENTAL help @@ -15870,7 +15870,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kcon depends on NFS_FS=y && IP_PNP diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile --- linux-2.6.34.noarch/fs/nfs/Makefile.orig 
2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-23 12:09:03.331395814 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-24 14:17:48.774730000 -0400 @@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ @@ -15885,8 +15885,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Mak +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c ---- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-23 12:08:29.045525837 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-23 12:09:03.332511640 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-24 14:14:13.119708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-24 14:17:48.780730000 -0400 @@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, @@ -15896,8 +15896,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/n .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-23 12:09:03.333512111 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-23 12:09:03.334491472 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-24 14:17:48.784731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-24 14:17:48.786730000 -0400 @@ -0,0 +1,765 @@ +/* + * linux/fs/nfs/nfs4filelayout.c @@ -16665,8 +16665,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 
linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-23 12:09:03.334491472 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-23 12:09:03.335501543 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-24 14:17:48.790731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-24 14:17:48.792730000 -0400 @@ -0,0 +1,636 @@ +/* + * linux/fs/nfs/nfs4filelayoutdev.c @@ -17305,8 +17305,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch +} + diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-23 12:09:03.335501543 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-23 12:09:03.335501543 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-24 14:17:48.795731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-24 14:17:48.796742000 -0400 @@ -0,0 +1,97 @@ +/* + * pnfs_nfs4filelayout.h @@ -17406,8 +17406,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h ---- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-23 12:08:29.047512264 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-23 12:09:03.336490079 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-24 14:14:13.130705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-24 14:17:48.802730000 -0400 @@ -45,8 +45,28 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, @@ -17556,8 +17556,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nf /* nfs4xdr.c */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c ---- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-23 
12:08:29.050481368 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-23 12:09:03.339481253 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-24 14:14:13.143709000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-24 14:17:48.811734000 -0400 @@ -49,12 +49,15 @@ #include #include @@ -19223,7 +19223,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/n .setattr = nfs4_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c --- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-23 12:09:03.341491726 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-24 14:17:48.818733000 -0400 @@ -54,17 +54,17 @@ void nfs4_renew_state(struct work_struct *work) @@ -19246,8 +19246,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c ---- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-23 12:08:29.052491341 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-23 12:09:03.342373443 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-24 14:14:13.150705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-24 14:17:48.825730000 -0400 @@ -53,6 +53,9 @@ #include "callback.h" #include "delegation.h" @@ -19566,8 +19566,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/ test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c ---- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-23 12:08:29.054481400 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-23 12:09:03.346481283 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 
2010-08-24 14:14:13.159705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-24 14:17:48.834738000 -0400 @@ -50,8 +50,11 @@ #include #include @@ -21078,8 +21078,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nf }; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild ---- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-23 12:09:03.348511665 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-23 12:09:03.348511665 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-24 14:17:48.839734000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-24 14:17:48.840742000 -0400 @@ -0,0 +1,11 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module @@ -21093,8 +21093,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs +panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o +obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-23 12:09:03.349501459 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-23 12:09:03.349501459 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-24 14:17:48.843735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-24 14:17:48.845739000 -0400 @@ -0,0 +1,1087 @@ +/* + * objio_osd.c @@ -22184,8 +22184,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noar +module_init(objlayout_init); +module_exit(objlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-23 12:09:03.350491564 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-23 12:09:03.350491564 -0400 
+--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-24 14:17:48.848735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-24 14:17:48.851730000 -0400 @@ -0,0 +1,790 @@ +/* + * objlayout.c @@ -22978,8 +22978,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noar + .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, +}; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-23 12:09:03.351434439 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-23 12:09:03.351434439 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-24 14:17:48.852735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-24 14:17:48.854746000 -0400 @@ -0,0 +1,171 @@ +/* + * objlayout.h @@ -23153,8 +23153,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noar + +#endif /* _OBJLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-23 12:09:03.352501716 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-23 12:09:03.352501716 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-24 14:17:48.857735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-24 14:17:48.860740000 -0400 @@ -0,0 +1,734 @@ +/* + * panfs_shim.c @@ -23891,8 +23891,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noa +module_init(panlayout_init); +module_exit(panlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-23 12:09:03.353501685 -0400 -+++ 
linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-23 12:09:03.353501685 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-24 14:17:48.863734000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-24 14:17:48.864730000 -0400 @@ -0,0 +1,482 @@ +/* + * panfs_shim.h @@ -24377,8 +24377,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noa + +#endif /* _PANLAYOUT_PANFS_SHIM_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-23 12:09:03.354501721 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-23 12:09:03.354501721 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-24 14:17:48.868731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-24 14:17:48.869739000 -0400 @@ -0,0 +1,435 @@ +/* + * pnfs_osd_xdr.c @@ -24816,8 +24816,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6. 
+ return 0; +} diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c ---- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-23 12:08:29.056411363 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-23 12:09:03.355511659 -0400 +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-24 14:14:13.169705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-24 14:17:48.875733000 -0400 @@ -20,6 +20,7 @@ #include @@ -24940,8 +24940,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/p if (res == INT_MAX) goto out; diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c ---- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-23 12:09:03.356501413 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-23 12:09:03.357481204 -0400 +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-24 14:17:48.880733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-24 14:17:48.883730000 -0400 @@ -0,0 +1,2027 @@ +/* + * linux/fs/nfs/pnfs.c @@ -26971,8 +26971,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs. +} +EXPORT_SYMBOL(nfs4_put_deviceid_cache); diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h ---- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-23 12:09:03.358501440 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-23 12:09:03.358501440 -0400 +--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-24 14:17:48.886733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-24 14:17:48.887735000 -0400 @@ -0,0 +1,355 @@ +/* + * fs/nfs/pnfs.h @@ -27330,8 +27330,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs. 
+ +#endif /* FS_NFS_PNFS_H */ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c ---- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-23 12:08:29.057511533 -0400 -+++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-23 12:09:03.359501471 -0400 +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-24 14:14:13.174707000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-24 14:17:48.893730000 -0400 @@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru fattr = nfs_alloc_fattr(); status = -ENOMEM; @@ -27359,8 +27359,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc. .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c ---- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-23 12:08:29.057511533 -0400 -+++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-23 12:09:03.359501471 -0400 +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-24 14:14:13.179708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-24 14:17:48.899733000 -0400 @@ -18,8 +18,12 @@ #include #include @@ -27575,8 +27575,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read. 
nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c ---- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-23 12:08:29.059491391 -0400 -+++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-23 12:09:03.361501458 -0400 +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-24 14:14:13.186707000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-24 14:17:48.907729000 -0400 @@ -64,6 +64,7 @@ #include "iostat.h" #include "internal.h" @@ -27624,8 +27624,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/supe #endif diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c ---- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-23 12:08:29.060501485 -0400 -+++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-23 12:09:03.362419975 -0400 +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-24 14:14:13.192705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-24 14:17:48.913730000 -0400 @@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task struct nfs_unlinkdata *data = calldata; struct nfs_server *server = NFS_SERVER(data->dir); @@ -27636,8 +27636,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unl return; rpc_call_start(task); diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c ---- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-23 12:08:27.630563929 -0400 -+++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-23 12:09:03.364491337 -0400 +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-24 14:14:06.360160000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-24 14:17:48.921712000 -0400 @@ -20,6 +20,7 @@ #include #include @@ -28326,7 +28326,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/writ int nfs_wb_page_cancel(struct inode *inode, struct page *page) diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig 
linux-2.6.34.noarch/include/linux/exportfs.h --- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-23 12:09:03.365501459 -0400 ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-24 14:17:48.933713000 -0400 @@ -2,6 +2,7 @@ #define LINUX_EXPORTFS_H 1 @@ -28399,8 +28399,8 @@ diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/i +#endif /* CONFIG_PNFSD */ #endif /* LINUX_EXPORTFS_H */ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h ---- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-23 12:09:03.367491365 -0400 -+++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-23 12:09:03.367491365 -0400 +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-24 14:17:48.945690000 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-24 14:17:48.946693000 -0400 @@ -0,0 +1,141 @@ +#ifndef _LINUX_EXP_XDR_H +#define _LINUX_EXP_XDR_H @@ -28544,8 +28544,8 @@ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/in +} +#endif /* _LINUX_EXP_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h ---- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-23 12:08:29.021511898 -0400 -+++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-23 12:09:03.369481147 -0400 +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-24 14:14:13.014707000 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-24 14:17:48.961675000 -0400 @@ -387,6 +387,7 @@ struct inodes_stat_t { #include @@ -28564,7 +28564,7 @@ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include struct dentry *s_root; diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h --- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ 
linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-23 12:09:03.371491472 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-24 14:17:48.974681000 -0400 @@ -17,7 +17,10 @@ #define NFS4_BITMAP_SIZE 2 @@ -28694,8 +28694,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/inclu #endif diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h ---- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-23 12:09:03.372501550 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-23 12:09:03.372501550 -0400 +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-24 14:17:48.986670000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-24 14:17:48.989666000 -0400 @@ -0,0 +1,330 @@ +/* + * include/linux/nfs4_pnfs.h @@ -29028,8 +29028,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/ + +#endif /* LINUX_NFS4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h ---- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-23 12:09:03.373491892 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-23 12:09:03.374491393 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-24 14:17:48.998668000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-24 14:17:49.000665000 -0400 @@ -0,0 +1,101 @@ +#ifndef NFSD4_BLOCK +#define NFSD4_BLOCK @@ -29133,8 +29133,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarc +#endif /* NFSD4_BLOCK */ + diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h ---- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-23 12:09:03.375501481 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-23 12:09:03.375501481 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 
2010-08-24 14:17:49.012664000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-24 14:17:49.013671000 -0400 @@ -0,0 +1,345 @@ +/* + * include/linux/nfsd4_spnfs.h @@ -29483,7 +29483,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarc +#endif /* NFS_SPNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h --- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-23 12:09:03.376401789 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-24 14:17:49.018668000 -0400 @@ -29,6 +29,7 @@ #ifdef __KERNEL__ @@ -29494,7 +29494,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch * Largest number of bytes we need to allocate for an NFS diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h --- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-23 12:09:03.376401789 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-24 14:17:49.024673000 -0400 @@ -32,6 +32,8 @@ #define NFSDDBG_REPCACHE 0x0080 #define NFSDDBG_XDR 0x0100 @@ -29506,7 +29506,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h --- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-23 12:09:03.377481954 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-24 14:17:49.030665000 -0400 @@ -100,6 +100,7 @@ struct svc_export { uid_t ex_anon_uid; gid_t ex_anon_gid; @@ -29516,8 +29516,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 
linux-2.6.34.noarc struct nfsd4_fs_locations ex_fslocs; int ex_nflavors; diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-23 12:09:03.377481954 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-23 12:09:03.378501747 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-24 14:17:49.033666000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-24 14:17:49.034665000 -0400 @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29652,8 +29652,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.3 + +#endif /* NFSD_NFS4LAYOUTXDR_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-23 12:09:03.378501747 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-23 12:09:03.378501747 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-24 14:17:49.037666000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-24 14:17:49.039665000 -0400 @@ -0,0 +1,54 @@ +/****************************************************************************** + * @@ -29710,8 +29710,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34. 
+ +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-23 12:09:03.379487099 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-23 12:09:03.379487099 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-24 14:17:49.042666000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-24 14:17:49.044665000 -0400 @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29986,7 +29986,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.n +#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h --- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-23 12:09:03.380502500 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-24 14:17:49.049665000 -0400 @@ -29,6 +29,7 @@ /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ @@ -30024,8 +30024,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noar union nfsctl_res { diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h ---- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-23 12:08:29.061494081 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-23 12:09:03.381511751 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-24 14:14:13.201710000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-24 14:17:49.063666000 -0400 @@ -72,13 +72,20 @@ struct nfs_access_entry { int mask; }; @@ -30124,8 +30124,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 
linux-2.6.34.noarch/inc #ifdef __KERNEL__ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h ---- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-23 12:08:29.062501618 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-23 12:09:03.383491395 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-24 14:14:13.206708000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-24 14:17:49.077665000 -0400 @@ -15,6 +15,7 @@ struct nlm_host; struct nfs4_sequence_args; struct nfs4_sequence_res; @@ -30200,7 +30200,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/ atomic_t active; /* Keep trace of any activity to this server */ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h --- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-23 12:09:03.384501540 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-24 14:17:49.089668000 -0400 @@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { NFSIOS_SHORTREAD, NFSIOS_SHORTWRITE, @@ -30213,7 +30213,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h --- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-23 12:09:03.385491518 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-24 14:17:49.103665000 -0400 @@ -39,6 +39,7 @@ struct nfs_page { struct list_head wb_list; /* Defines state of page: */ struct page *wb_page; /* page to read in/write out */ @@ -30262,8 +30262,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/i struct inode *inode, int (*doio)(struct inode *, 
struct list_head *, unsigned int, size_t, int), diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h ---- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-23 12:08:29.062501618 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-23 12:09:03.387491422 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-24 14:14:13.211708000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-24 14:17:49.116665000 -0400 @@ -3,6 +3,8 @@ #include @@ -30415,8 +30415,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/in extern struct rpc_version nfs_version3; extern struct rpc_version nfs_version4; diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h ---- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-23 12:09:03.388491527 -0400 -+++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-23 12:09:03.388491527 -0400 +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-24 14:17:49.128664000 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-24 14:17:49.129670000 -0400 @@ -0,0 +1,57 @@ +#ifndef _PANFS_SHIM_API_H +#define _PANFS_SHIM_API_H @@ -30476,8 +30476,8 @@ diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.no + +#endif /* _PANFS_SHIM_API_H */ diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h ---- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-23 12:09:03.390501461 -0400 -+++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-23 12:09:03.390501461 -0400 +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-24 14:17:49.141664000 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-24 14:17:49.142670000 -0400 @@ -0,0 +1,440 @@ +/* + * pnfs_osd_xdr.h @@ -30920,8 +30920,8 @@ diff -up 
linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noar + +#endif /* __PNFS_OSD_XDR_H__ */ diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_xdr.h ---- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-23 12:09:03.391491550 -0400 -+++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-23 12:09:03.391491550 -0400 +--- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-24 14:17:49.153666000 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-24 14:17:49.155665000 -0400 @@ -0,0 +1,134 @@ +/* + * include/linux/pnfs_xdr.h @@ -31059,7 +31059,7 @@ diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/i +#endif /* LINUX_PNFS_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h --- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-23 12:09:03.393501437 -0400 ++++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-24 14:17:49.168668000 -0400 @@ -8,6 +8,7 @@ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H @@ -31070,7 +31070,7 @@ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/ #define ACL_UNDEFINED_ID (-1) diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h --- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-23 12:09:03.393501437 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-24 14:17:49.174665000 -0400 @@ -14,6 +14,8 @@ /* size of an XDR encoding unit in bytes, i.e. 
32bit */ #define XDR_UNIT (4) @@ -31082,7 +31082,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.n diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h --- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-23 12:09:03.394512138 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-24 14:17:49.179667000 -0400 @@ -3,6 +3,7 @@ #ifdef __KERNEL__ @@ -31103,8 +31103,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.3 struct rpc_pipe_ops { diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h ---- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-23 12:09:03.394512138 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-23 12:09:03.395501822 -0400 +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-24 14:17:49.183664000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-24 14:17:49.184674000 -0400 @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2008 The Regents of the University of Michigan. 
@@ -31219,7 +31219,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux +#endif /* _SIMPLE_RPC_PIPEFS_H_ */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h --- linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-23 12:09:03.395501822 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-24 14:17:49.190665000 -0400 @@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con return buf; @@ -31263,8 +31263,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.n +} #endif /* SUNRPC_SVC_XPRT_H */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h ---- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-23 12:08:29.066475323 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-23 12:09:03.396464612 -0400 +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-24 14:14:13.258707000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-24 14:17:49.195672000 -0400 @@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) return p + 2; } @@ -31287,14 +31287,9 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); -diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs ---- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-23 12:09:03.396464612 -0400 -+++ linux-2.6.34.noarch/localversion-pnfs 2010-08-23 12:09:03.396464612 -0400 -@@ -0,0 +1 @@ -+-pnfs diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile --- 
linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-23 12:09:03.397501662 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-24 14:17:49.204668000 -0400 @@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ @@ -31305,8 +31300,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/su sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c ---- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-23 12:09:03.398522348 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-23 12:09:03.398522348 -0400 +--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-24 14:17:49.208664000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-24 14:17:49.209670000 -0400 @@ -0,0 +1,424 @@ +/* + * net/sunrpc/simple_rpc_pipefs.c @@ -31733,8 +31728,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.no +} +EXPORT_SYMBOL(pipefs_generic_destroy_msg); diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c ---- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-23 12:08:29.081501640 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-23 12:09:03.399443371 -0400 +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-24 14:14:13.447705000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-24 14:17:49.215665000 -0400 @@ -403,16 +403,14 @@ xdr_shrink_pagelen(struct xdr_buf *buf, /* Shift the tail first */ From f578792412bcedf3ba25d53774b683fc57dfbcdb Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Tue, 24 Aug 2010 15:13:05 -0400 Subject: [PATCH 08/20] set the kernel flags --with firmware --with debuginfo --without vdso_install --without 
debug --without headers Signed-off-by: Steve Dickson --- kernel.spec | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel.spec b/kernel.spec index 4fb3481f3..48f821659 100644 --- a/kernel.spec +++ b/kernel.spec @@ -101,23 +101,23 @@ Summary: The Linux kernel # kernel-smp (only valid for ppc 32-bit) %define with_smp %{?_without_smp: 0} %{?!_without_smp: 1} # kernel-debug -%define with_debug %{?_without_debug: 0} %{?!_without_debug: 1} +%define with_debug %{?_without_debug: 0} %{?!_without_debug: 0} # kernel-doc -%define with_doc %{?_without_doc: 0} %{?!_without_doc: 1} +%define with_doc %{?_without_doc: 0} %{?!_without_doc: 0} # kernel-headers -%define with_headers %{?_without_headers: 0} %{?!_without_headers: 1} +%define with_headers %{?_without_headers: 0} %{?!_without_headers: 0} # kernel-firmware %define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 1} # tools/perf -%define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 1} +%define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 0} # perf noarch subpkg -%define with_perf %{?_without_perf: 0} %{?!_without_perf: 1} +%define with_perf %{?_without_perf: 0} %{?!_without_perf: 0} # kernel-debuginfo -%define with_debuginfo %{?_without_debuginfo: 0} %{?!_without_debuginfo: 1} +%define with_debuginfo %{?_without_debuginfo: 1} %{?!_without_debuginfo: 1} # kernel-bootwrapper (for creating zImages from kernel + initrd) %define with_bootwrapper %{?_without_bootwrapper: 0} %{?!_without_bootwrapper: 1} # Want to build a the vsdo directories installed -%define with_vdso_install %{?_without_vdso_install: 0} %{?!_without_vdso_install: 1} +%define with_vdso_install %{?_without_vdso_install: 0} %{?!_without_vdso_install: 0} # Build the kernel-doc package, but don't fail the build if it botches. # Here "true" means "continue" and "false" means "fail the build". 
From dcf28a529829080f523c5c4b5b01b25720a928ea Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Tue, 31 Aug 2010 20:57:01 -0400 Subject: [PATCH 09/20] - Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-24 Signed-off-by: Steve Dickson --- kernel.spec | 9 +- pnfs-all-2.6.35-2010-08-24-f13.patch | 31778 +++++++++++++++++++++++++ 2 files changed, 31784 insertions(+), 3 deletions(-) create mode 100644 pnfs-all-2.6.35-2010-08-24-f13.patch diff --git a/kernel.spec b/kernel.spec index 48f821659..094922072 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) # -%define buildid .pnfs34.2010.08.19 +%define buildid .pnfs34.2010.08.24 ################################################################### # The buildid can also be specified on the rpmbuild command line @@ -768,7 +768,7 @@ Patch12480: kprobes-x86-fix-kprobes-to-skip-prefixes-correctly.patch Patch30000: nfs-35-fc.patch Patch30001: nfsd-35-fc.patch -Patch30002: pnfs-all-2.6.35-2010-08-19-f13.patch +Patch30002: pnfs-all-2.6.35-2010-08-24-f13.patch Patch30003: linux-2.6-pnfs-compile.patch Patch30004: linux-2.6.35-inline.patch @@ -1432,7 +1432,7 @@ ApplyPatch kprobes-x86-fix-kprobes-to-skip-prefixes-correctly.patch ApplyPatch nfs-35-fc.patch ApplyPatch nfsd-35-fc.patch -ApplyPatch pnfs-all-2.6.35-2010-08-19-f13.patch +ApplyPatch pnfs-all-2.6.35-2010-08-24-f13.patch ApplyPatch linux-2.6-pnfs-compile.patch ApplyPatch linux-2.6.35-inline.patch # END OF PATCH APPLICATIONS @@ -2056,6 +2056,9 @@ fi %changelog +* Tue Aug 31 2010 Steve Dickson +- Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-24 + * Fri Aug 27 2010 Chuck Ebbert 2.6.34.6-47 - Linux 2.6.34.6 - drivers-hwmon-coretemp-c-detect-the-thermal-sensors-by-cpuid.patch (#625734) diff --git a/pnfs-all-2.6.35-2010-08-24-f13.patch b/pnfs-all-2.6.35-2010-08-24-f13.patch new file mode 100644 index 000000000..17d1c844d --- /dev/null +++ 
b/pnfs-all-2.6.35-2010-08-24-f13.patch @@ -0,0 +1,31778 @@ +diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-31 20:41:16.924243041 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-31 20:42:05.486160576 -0400 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include "init.h" + #include "kern_constants.h" + #include "os.h" +diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c +--- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/block/genhd.c 2010-08-31 20:42:05.487160201 -0400 +@@ -1009,6 +1009,7 @@ static void disk_release(struct device * + struct class block_class = { + .name = "block", + }; ++EXPORT_SYMBOL(block_class); + + static char *block_devnode(struct device *dev, mode_t *mode) + { +diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-31 20:42:05.486160576 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-31 20:42:05.486160576 -0400 +@@ -0,0 +1,211 @@ ++(c) 2007 Network Appliance Inc. ++ ++spNFS ++----- ++ ++An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS). ++ ++A file system is mounted by the clients from the MDS, and all file data ++is striped across the DSs. ++ ++Identify the machines that will be filling each of these roles. ++ ++The spnfs kernel will be installed on all machines: clients, the MDS and DSs. 
++ ++ ++Building and installing the spNFS kernel ++---------------------------------------- ++ ++Get the spNFS kernel from: ++ ++ git://linux-nfs.org/~bhalevy/linux-pnfs.git ++ ++Use the pnfs-all-latest branch and add these options to your .config file ++ ++ CONFIG_NETWORK_FILESYSTEMS=y ++ CONFIG_NFS_FS=m ++ CONFIG_NFS_V4=y ++ CONFIG_NFS_V4_1=y ++ CONFIG_PNFS=y ++ CONFIG_NFSD=m ++ CONFIG_PNFSD=y ++ # CONFIG_PNFSD_LOCAL_EXPORT is not set ++ CONFIG_SPNFS=y ++ ++By default, spNFS uses whole-file layouts. Layout segments can be enabled ++by adding: ++ ++ CONFIG_SPNFS_LAYOUTSEGMENTS=y ++ ++to your .config file. ++ ++Building and installation of kernel+modules is as usual. ++This kernel should be installed and booted on the client, MDS and DSs. ++ ++Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it ++takes over the pnfs export interface. ++ ++Building nfs-utils ++------------------ ++ ++Get the nfs-utils package containing spnfsd from: ++ ++ git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git ++ ++Follow the standard instructions for building nfs-utils. ++ ++After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd ++daemon will only be needed on the MDS. ++ ++ ++Installation ++------------ ++ ++The nfs-utils package contains a default spnfsd.conf file in ++utils/spnfsd/spnfsd.conf. Copy this file to /etc/spnfsd.conf. ++ ++By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under ++this directory, mount points must be created for each DS to ++be used for pNFS data stripes. These mount points are named by the ip address ++of the corresponding DS. In the sample spnfsd.conf, there are two ++DSs defined (172.16.28.134 and 172.16.28.141). ++ ++Following the sample spnfsd.conf, ++ ++ mkdir /spnfs ++ ++on the MDS (corresponding to DS-Mount-Directory). Then ++ ++ mkdir /spnfs/172.16.28.134 ++ mkdir /spnfs/172.16.28.141 ++ ++to create the mount points for the DSs. 
++ ++On the DSs, choose a directory where data stripes will be created by the MDS. ++For the sample file, this directory is /pnfs, so on each DS execute: ++ ++ mkdir /pnfs ++ ++This directory is specified in the spnfsd.conf file by the DS*_ROOT option ++(where * is replaced by the DS number). DS_ROOT is specified relative to ++the directory being exported by the DSs. In our example, our DSs are exporting ++the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have ++the following entry in /etc/exports: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check) ++ ++N.B. If we had created a /exports directory and a /pnfs directory under ++/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs ++(not /exports/pnfs). ++ ++It may be useful to add entries to /etc/fstab on the MDS to automatically ++mount the DS_ROOT file systems. For this example, our MDS fstab would ++contain: ++ ++ 172.16.28.134:/pnfs /spnfs/172.16.28.134 nfs defaults 1 2 ++ 172.16.28.141:/pnfs /spnfs/172.16.28.141 nfs defaults 1 2 ++ ++The DS mounts must be performed manually or via fstab at this time (automatic ++mounting, directory creation, etc. are on the todo list). To perform I/O ++through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction ++will eventually be removed). ++ ++ ++On the MDS, choose a file system to use with spNFS and export it, e.g.: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs) ++ ++Make sure nfsd and all supporting processes are running on the MDS and DSs. ++ ++ ++Running ++------- ++ ++If rpc_pipefs is not already mounted (if you're running idmapd it probably is), ++you may want to add the following line to /etc/fstab: ++ ++ rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0 ++ ++to automatically mount rpc_pipefs. ++ ++With spnfsd.conf configured for your environment and the mounts mounted as ++described above, spnfsd can now be started. 
++ ++On the MDS, execute spnfsd: ++ ++ spnfsd ++ ++The executable is located in the directory where it was built, and ++may also have been installed elsewhere depending on how you built nfs-utils. ++It will run in the foreground by default, and in fact will do so despite ++any options suggesting the contrary (it's still a debugging build). ++ ++On the client, make sure the nfslayoutdriver module is loaded: ++ ++ modprobe nfslayoutdriver ++ ++Then mount the file system from the MDS: ++ ++ mount -t nfs4 -o minorversion=1 mds:/ /mnt ++ ++I/O through the MDS is now supported. To use it, do not load the ++nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1 ++(NFSv2 and v3 are not yet supported). ++ ++You may now use spNFS by performing file system activities in /mnt. ++If you create files in /mnt, you should see stripe files corresponding to ++new files being created on the DSs. The current implementation names the ++stripe files based on the inode number of the file on the MDS. For example, ++if you create a file foo in /mnt and do an 'ls -li /mnt/foo': ++ ++ # ls -li foo ++ 1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo ++ ++You should see stripe files on each DS under /pnfs (per the sample) named ++1233. The file /pnfs/1233 on DS1 will contain the first <stripe size> bytes ++of data written to foo, DS2 will contain the next <stripe size> bytes, etc. ++Removing /mnt/foo will remove the corresponding stripe files on the DSs. ++Other file system operations should behave (mostly :-) as expected. ++ ++ ++Layout Segments ++--------------- ++ ++If the kernel is compiled to support layout segments, there will ++be two files created under /proc/fs/spnfs for controlling layout ++segment functionality. 
++ ++To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.: ++ ++ echo 1 > /proc/fs/spnfs/layoutseg ++ ++Layout segments can be disabled (returning to whole-file layouts) by ++writing a '0' to /proc/fs/spnfs/layoutseg: ++ ++ echo 0 > /proc/fs/spnfs/layoutseg ++ ++When layout segments are enabled, the size of the layouts returned can ++be specified by writing a decimal number (ascii representation) to ++/proc/fs/spnfs/layoutsegsize: ++ ++ echo 1024 > /proc/fs/spnfs/layoutsegsize ++ ++The value'0' has a special meaning--it causes the server to return a ++layout that is exactly the size requested by the client: ++ ++ echo 0 > /proc/fs/spnfs/layoutsegsize ++ ++ ++Troubleshooting ++--------------- ++ ++If you see data being written to the files on the MDS rather than ++the stripe files, make sure the nfslayoutdriver is loaded on the client ++(see above). ++ ++If you get a "permission denied" error, make sure mountd is running on the mds ++(it occasionally fails to start). ++ ++Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com ++ ++ +diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c +--- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-08-31 20:41:17.063232968 -0400 ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-31 20:42:05.488160560 -0400 +@@ -657,6 +657,12 @@ static int dev_create(struct dm_ioctl *p + return r; + } + ++int dm_dev_create(struct dm_ioctl *param) ++{ ++ return dev_create(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_create); ++ + /* + * Always use UUID for lookups if it's present, otherwise use name or dev. + */ +@@ -751,6 +757,12 @@ static int dev_remove(struct dm_ioctl *p + return 0; + } + ++int dm_dev_remove(struct dm_ioctl *param) ++{ ++ return dev_remove(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_remove); ++ + /* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. 
+@@ -923,6 +935,12 @@ static int do_resume(struct dm_ioctl *pa + return r; + } + ++int dm_do_resume(struct dm_ioctl *param) ++{ ++ return do_resume(param); ++} ++EXPORT_SYMBOL(dm_do_resume); ++ + /* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. +@@ -1200,6 +1218,12 @@ out: + return r; + } + ++int dm_table_load(struct dm_ioctl *param, size_t param_size) ++{ ++ return table_load(param, param_size); ++} ++EXPORT_SYMBOL(dm_table_load); ++ + static int table_clear(struct dm_ioctl *param, size_t param_size) + { + int r; +diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c +--- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-31 20:42:05.489160594 -0400 +@@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct + put_device(&class_to_shost(dev)->shost_gendev); + } + +-static struct class shost_class = { ++struct class shost_class = { + .name = "scsi_host", + .dev_release = scsi_host_cls_release, + }; +diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h +--- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-31 20:42:05.492243039 -0400 +@@ -36,13 +36,9 @@ + #include + #include + #include ++#include + #include "common.h" + +-/* FIXME: Remove once pnfs hits mainline +- * #include +- */ +-#include "pnfs.h" +- + #define EXOFS_ERR(fmt, a...) 
printk(KERN_ERR "exofs: " fmt, ##a) + + #ifdef CONFIG_EXOFS_DEBUG +@@ -103,6 +99,7 @@ struct exofs_sb_info { + struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ ++ spinlock_t i_layout_lock; /* lock for layout/return/recall */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ +@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si + */ + #define OBJ_2BCREATED 0 /* object will be created soon*/ + #define OBJ_CREATED 1 /* object has been created on the osd*/ ++/* Below are not used atomic but reuse the same i_flags */ ++#define OBJ_LAYOUT_IS_GIVEN 2 /* inode has given layouts to clients*/ ++#define OBJ_IN_LAYOUT_RECALL 3 /* inode is in the middle of a layout recall*/ + + static inline int obj_2bcreated(struct exofs_i_info *oi) + { +@@ -304,4 +304,20 @@ extern const struct inode_operations exo + extern const struct inode_operations exofs_symlink_inode_operations; + extern const struct inode_operations exofs_fast_symlink_inode_operations; + ++/* export.c */ ++typedef int (exofs_recall_fn)(struct inode *inode); ++#ifdef CONFIG_PNFSD ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo); ++void exofs_init_export(struct super_block *sb); ++#else ++static inline int exofs_inode_recall_layout(struct inode *inode, ++ enum pnfs_iomode iomode, exofs_recall_fn todo) ++{ ++ return todo(inode); ++} ++ ++static inline void exofs_init_export(struct super_block *sb) {} ++#endif ++ + #endif +diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-31 20:42:05.493222759 -0400 ++++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-31 20:42:05.493222759 -0400 +@@ -0,0 +1,396 @@ ++/* ++ * export.c - Implementation of the 
pnfs_export_operations ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Boaz Harrosh ++ * ++ * This file is part of exofs. ++ * ++ * exofs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation. Since it is based on ext2, and the only ++ * valid version of GPL for the Linux kernel is version 2, the only valid ++ * version of GPL for exofs is version 2. ++ * ++ * exofs is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with exofs; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include "exofs.h" ++ ++static int exofs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_OSD2_OBJECTS; ++} ++ ++static void set_dev_id(struct pnfs_deviceid *pnfs_devid, u64 sbid, u64 devid) ++{ ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)pnfs_devid; ++ ++ dev_id->sbid = sbid; ++ dev_id->devid = devid; ++} ++ ++static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode, ++ u64 offset, u64 length, void *cookie) ++{ ++ struct nfsd4_pnfs_cb_layout cbl; ++ struct pnfsd_cb_ctl cb_ctl; ++ int status; ++ ++ memset(&cb_ctl, 0, sizeof(cb_ctl)); ++ status = pnfsd_get_cb_op(&cb_ctl); ++ if (unlikely(status)) { ++ EXOFS_ERR("%s: nfsd unloaded!! 
inode (0x%lx) status=%d\n", ++ __func__, inode->i_ino, status); ++ goto err; ++ } ++ ++ memset(&cbl, 0, sizeof(cbl)); ++ cbl.cbl_recall_type = RETURN_FILE; ++ cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS; ++ cbl.cbl_seg.iomode = iomode; ++ cbl.cbl_seg.offset = offset; ++ cbl.cbl_seg.length = length; ++ cbl.cbl_cookie = cookie; ++ ++ status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl); ++ pnfsd_put_cb_op(&cb_ctl); ++ ++err: ++ return status; ++} ++ ++static enum nfsstat4 exofs_layout_get( ++ struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; ++ struct exofs_layout *el = &sbi->layout; ++ struct pnfs_osd_object_cred *creds = NULL; ++ struct pnfs_osd_layout layout; ++ __be32 *start; ++ bool in_recall; ++ int i, err; ++ enum nfsstat4 nfserr; ++ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ res->lg_seg.iomode = IOMODE_RW; ++ res->lg_return_on_close = true; /* TODO: unused but will be soon */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL); ++ if (!creds) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto out; ++ } ++ ++ /* Fill in a pnfs_osd_layout struct */ ++ layout.olo_map = sbi->data_map; ++ ++ for (i = 0; i < el->s_numdevs; i++) { ++ struct pnfs_osd_object_cred *cred = &creds[i]; ++ osd_id id = exofs_oi_objno(oi); ++ unsigned dev = exofs_layout_od_id(el, id, i); ++ ++ set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid, ++ dev); ++ cred->oc_object_id.oid_partition_id = el->s_pid; ++ cred->oc_object_id.oid_object_id = id; ++ cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ? 
++ PNFS_OSD_VERSION_1 : ++ PNFS_OSD_VERSION_2; ++ cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE; ++ ++ cred->oc_cap_key.cred_len = 0; ++ cred->oc_cap_key.cred = NULL; ++ ++ cred->oc_cap.cred_len = OSD_CAP_LEN; ++ cred->oc_cap.cred = oi->i_cred; ++ } ++ ++ layout.olo_comps_index = 0; ++ layout.olo_num_comps = el->s_numdevs; ++ layout.olo_comps = creds; ++ ++ err = pnfs_osd_xdr_encode_layout(xdr, &layout); ++ if (err) { ++ nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */ ++ goto out; ++ } ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ if (!in_recall) { ++ __set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ nfserr = NFS4_OK; ++ } else { ++ nfserr = NFS4ERR_RECALLCONFLICT; ++ } ++ spin_unlock(&oi->i_layout_lock); ++ ++out: ++ kfree(creds); ++ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n", ++ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start)); ++ return nfserr; ++} ++ ++/* NOTE: inode mutex must NOT be held */ ++static int exofs_layout_commit( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct timespec mtime; ++ loff_t i_size; ++ int in_recall; ++ ++ /* In case of a recall we ignore the new size and mtime since they ++ * are going to be changed again by truncate, and since we cannot take ++ * the inode lock in that case. ++ */ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ if (in_recall) { ++ EXOFS_DBGMSG("(0x%lx) commit was called during recall\n", ++ inode->i_ino); ++ return 0; ++ } ++ ++ /* NOTE: I would love to call inode_setattr here ++ * but i cannot since this will cause an eventual vmtruncate, ++ * which will cause a layout_recall. So open code the i_size ++ * and mtime/atime changes under i_mutex.
++ */ ++ mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL); ++ ++ if (args->lc_mtime.seconds) { ++ mtime.tv_sec = args->lc_mtime.seconds; ++ mtime.tv_nsec = args->lc_mtime.nseconds; ++ ++ /* layout commit may only make time bigger, since there might ++ * be reordering of the notifications and it might arrive after ++ * A local change: a client mtime that is not newer is ignored. ++ * TODO: if mtime > ctime then we know set_attr did an mtime ++ * in the future. and we can let this update through ++ */ ++ if (timespec_compare(&mtime, &inode->i_mtime) <= 0) ++ mtime = inode->i_mtime; ++ } else { ++ mtime = current_fs_time(inode->i_sb); ++ } ++ ++ /* TODO: Will below work? since mark_inode_dirty has its own ++ * Time handling ++ */ ++ inode->i_atime = inode->i_mtime = mtime; ++ ++ i_size = i_size_read(inode); ++ if (args->lc_newoffset) { ++ loff_t new_size = args->lc_last_wr + 1; ++ ++ if (i_size < new_size) { ++ i_size_write(inode, i_size = new_size); ++ res->lc_size_chg = 1; ++ res->lc_newsize = new_size; ++ } ++ } ++ /* TODO: else { i_size = osd_get_object_length() } */ ++ ++/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */ ++ ++ mark_inode_dirty_sync(inode); ++ ++ mutex_unlock(&inode->i_mutex); ++ EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n", ++ inode->i_ino, i_size, args->lc_last_wr); ++ return 0; ++} ++ ++static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr) ++{ ++ EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ ioerr->oer_errno, ioerr->oer_iswrite, ++ _LLU(ioerr->oer_component.oid_object_id), ++ _LLU(ioerr->oer_comp_offset), ++ _LLU(ioerr->oer_comp_length)); ++} ++ ++static int exofs_layout_return( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ __be32 *p = args->lrf_body; ++ unsigned len = exp_xdr_qwords(args->lrf_body_len); ++ ++ EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n", ++ inode->i_ino, args->lr_cookie, len); ++ ++ while (len >= 
pnfs_osd_ioerr_xdr_sz()) { ++
struct pnfs_osd_ioerr ioerr; ++ ++ p = pnfs_osd_xdr_decode_ioerr(&ioerr, p); ++ len -= pnfs_osd_ioerr_xdr_sz(); ++ exofs_handle_error(&ioerr); ++ } ++ ++ if (args->lr_cookie) { ++ struct exofs_i_info *oi = exofs_i(inode); ++ bool in_recall; ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ __clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ /* TODO: how to communicate cookie with the waiter */ ++ if (in_recall) ++ wake_up(&oi->i_wq); /* wakeup any recalls */ ++ } ++ ++ return 0; ++} ++ ++int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct exofs_sb_info *sbi = sb->s_fs_info; ++ struct pnfs_osd_deviceaddr devaddr; ++ const struct osd_dev_info *odi; ++ u64 devno = devid->devid; ++ __be32 *start; ++ int err; ++ ++ memset(&devaddr, 0, sizeof(devaddr)); ++ ++ if (unlikely(devno >= sbi->layout.s_numdevs)) ++ return -ENODEV; ++ ++ odi = osduld_device_info(sbi->layout.s_ods[devno]); ++ ++ devaddr.oda_systemid.len = odi->systemid_len; ++ devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */ ++ ++ devaddr.oda_osdname.len = odi->osdname_len ; ++ devaddr.oda_osdname.data = (void *)odi->osdname;/* !const cast */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ err = -E2BIG; ++ goto err; ++ } ++ ++ err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr); ++ if (err) ++ goto err; ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n", ++ exp_xdr_qbytes(xdr->p - start), devno, odi->osdname); ++ return 0; ++ ++err: ++ EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n", ++ err, exp_xdr_qbytes(xdr->p - start)); ++ return err; ++} ++ ++struct pnfs_export_operations exofs_pnfs_ops = { ++ .layout_type = exofs_layout_type, ++ .layout_get = exofs_layout_get, ++ .layout_commit = 
exofs_layout_commit, ++ .layout_return = exofs_layout_return, ++ .get_device_info = exofs_get_device_info, ++}; ++ ++static bool is_layout_returned(struct exofs_i_info *oi) ++{ ++ bool layout_given; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ return !layout_given; ++} ++ ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ int layout_given; ++ int error = 0; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ __set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ if (!layout_given) ++ goto exec; ++ ++ for (;;) { ++ EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n", ++ inode->i_ino); ++ error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64, ++ &oi->i_wq); ++ switch (error) { ++ case 0: ++ case -EAGAIN: ++ break; ++ case -ENOENT: ++ goto exec; ++ default: ++ goto err; ++ } ++ ++ error = wait_event_interruptible(oi->i_wq, ++ is_layout_returned(oi)); ++ if (error) ++ goto err; ++ } ++ ++exec: ++ error = todo(inode); ++ ++err: ++ spin_lock(&oi->i_layout_lock); ++ __clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error); ++ return error; ++} ++ ++void exofs_init_export(struct super_block *sb) ++{ ++ sb->s_pnfs_op = &exofs_pnfs_ops; ++} +diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c +--- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-31 20:42:05.494222756 -0400 +@@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) + if (unlikely(wait_obj_created(oi))) + goto fail; + +- ret = _do_truncate(inode); ++ ret = exofs_inode_recall_layout(inode, IOMODE_ANY, _do_truncate); + if 
(ret) + goto fail; + +@@ -964,6 +964,7 @@ static void __oi_init(struct exofs_i_inf + { + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; ++ spin_lock_init(&oi->i_layout_lock); + } + /* + * Fill in an inode read from the OSD and set it up for use +diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild +--- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-31 20:42:05.490222933 -0400 +@@ -13,4 +13,5 @@ + # + + exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o ++exofs-$(CONFIG_PNFSD) += export.o + obj-$(CONFIG_EXOFS_FS) += exofs.o +diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig +--- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-31 20:42:05.491232880 -0400 +@@ -1,6 +1,7 @@ + config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD ++ select EXPORTFS_OSD_LAYOUT if PNFSD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. 
+diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c +--- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-31 20:42:05.496073173 -0400 +@@ -621,6 +621,7 @@ static int exofs_fill_super(struct super + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; ++ exofs_init_export(sb); + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); +diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c +--- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-31 20:42:05.497212975 -0400 +@@ -16,6 +16,13 @@ + #include + #include + ++#if defined(CONFIG_PNFSD) ++struct pnfsd_cb_ctl pnfsd_cb_ctl = { ++ .lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock) ++}; ++EXPORT_SYMBOL(pnfsd_cb_ctl); ++#endif /* CONFIG_PNFSD */ ++ + #define dprintk(fmt, args...) 
do{}while(0) + + +diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile +--- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-31 20:42:05.496073173 -0400 +@@ -3,4 +3,7 @@ + + obj-$(CONFIG_EXPORTFS) += exportfs.o + +-exportfs-objs := expfs.o ++exportfs-y := expfs.o ++exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o ++exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o ++exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-31 20:42:05.497212975 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-31 20:42:05.498113655 -0400 +@@ -0,0 +1,158 @@ ++/* ++ * linux/fs/nfsd/nfs4blocklayoutxdr.c ++ * ++ * ++ * Created by Rick McNeal on 3/31/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. 
++ * ++ */ ++#include ++#include ++#include ++#include ++ ++static int ++bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 12 + 4 + bld->u.simple.bld_sig_len); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u32(p, 1); ++ p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset); ++ exp_xdr_encode_opaque(p, bld->u.simple.bld_sig, ++ bld->u.simple.bld_sig_len); ++ ++ return 0; ++} ++ ++static int ++bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_start); ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_len); ++ exp_xdr_encode_u32(p, bld->u.slice.bld_index); ++ ++ return 0; ++} ++ ++static int ++bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ return -ENOTSUPP; ++} ++ ++static int ++bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ int i; ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 1 + bld->u.stripe.bld_stripes); ++ if (!p) /* 2 + 1 + bld_stripes is a count of 4-byte words, not bytes */ ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size); ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes); ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]); ++ ++ return 0; ++} ++ ++int ++blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes) ++{ ++ u32 num_vols = 0, ++ *layoutlen_p = xdr->p; ++ pnfs_blocklayout_devinfo_t *bld; ++ int status = 0; ++ __be32 *p; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -ETOOSMALL; ++ p += 2; ++ ++ /* ++ * All simple volumes with their signature are required to be listed ++ * first.
++ */ ++ list_for_each_entry(bld, volumes, bld_list) { ++ num_vols++; ++ p = exp_xdr_reserve_qwords(xdr, 1); ++ if (!p) ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u32(p, bld->bld_type); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ status = bl_encode_simple(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_SLICE: ++ status = bl_encode_slice(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ status = bl_encode_concat(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ status = bl_encode_stripe(xdr, bld); ++ break; ++ default: ++ BUG(); ++ } ++ if (status) ++ goto error; ++ } ++ ++ /* ---- Fill in the overall length and number of volumes ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, num_vols); ++ ++error: ++ return status; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo); ++ ++enum nfsstat4 ++blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *bl_head) ++{ ++ struct pnfs_blocklayout_layout *b; ++ u32 *layoutlen_p = xdr->p, ++ extents = 0; ++ __be32 *p; ++ ++ /* ++ * Save spot for opaque block layout length and number of extents, ++ * fill-in later. 
++ */ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p += 2; ++ ++ list_for_each_entry(b, bl_head, bll_list) { ++ extents++; ++ p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid); ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.devid); ++ p = exp_xdr_encode_u64(p, b->bll_foff); ++ p = exp_xdr_encode_u64(p, b->bll_len); ++ p = exp_xdr_encode_u64(p, b->bll_soff); ++ p = exp_xdr_encode_u32(p, b->bll_es); ++ } ++ ++ /* ---- Fill in the overall length and number of extents ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, extents); ++ ++ return NFS4_OK; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_layout); +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-31 20:42:05.498113655 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-31 20:42:05.498113655 -0400 +@@ -0,0 +1,218 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++/* We do our-own dprintk so filesystems are not dependent on sunrpc */ ++#ifdef dprintk ++#undef dprintk ++#endif ++#define dprintk(fmt, args, ...) do { } while (0) ++ ++/* Calculate the XDR length of the GETDEVICEINFO4resok structure ++ * excluding the gdir_notification and the gdir_device_addr da_layout_type. ++ * Result is in 4-byte XDR words; every address of every path is counted. */ ++static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev) ++{ ++ struct pnfs_filelayout_devaddr *fl_addr; ++ struct pnfs_filelayout_multipath *mp; ++ int i, j, nwords; ++ ++ /* da_addr_body length, indice length, indices, ++ * multipath_list4 length */ ++ nwords = 1 + 1 + fdev->fl_stripeindices_length + 1; ++ for (i = 0; i < fdev->fl_device_length; i++) { ++ mp = &fdev->fl_device_list[i]; ++ nwords++; /* multipath list length */ ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ fl_addr = &mp->fl_multipath_list[j]; ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len); ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len); ++ } ++ } ++ dprintk("<-- %s nwords %d\n", __func__, nwords); ++ return nwords; ++} ++ ++/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13 ++ * on the response stream.
++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. ++ */ ++int ++filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ unsigned int i, j, len = 0, opaque_words; ++ u32 *p_in; ++ u32 index_count = fdev->fl_stripeindices_length; ++ u32 dev_count = fdev->fl_device_length; ++ int error = 0; ++ __be32 *p; ++ ++ opaque_words = fl_devinfo_xdr_words(fdev); ++ dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n", ++ __func__, ++ index_count, ++ dev_count, ++ opaque_words*4); ++ ++ /* check space for opaque length */ ++ p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words); ++ if (!p) { ++ error = -ETOOSMALL; ++ goto out; ++ } ++ ++ /* Fill in length later */ ++ p++; ++ ++ /* encode device list indices */ ++ p = exp_xdr_encode_u32(p, index_count); ++ for (i = 0; i < index_count; i++) ++ p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]); ++ ++ /* encode device list */ ++ p = exp_xdr_encode_u32(p, dev_count); ++ for (i = 0; i < dev_count; i++) { ++ struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i]; ++ ++ p = exp_xdr_encode_u32(p, mp->fl_multipath_length); ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ struct pnfs_filelayout_devaddr *da = ++ &mp->fl_multipath_list[j]; ++ ++ /* Encode device info */ ++ p = exp_xdr_encode_opaque(p, da->r_netid.data, ++ da->r_netid.len); ++ p = exp_xdr_encode_opaque(p, da->r_addr.data, ++ da->r_addr.len); ++ } ++ } ++ ++ /* backfill in length. Subtract 4 for da_addr_body size */ ++ len = (char *)p - (char *)p_in; ++ exp_xdr_encode_u32(p_in, len - 4); ++ ++ error = 0; ++out: ++ dprintk("%s: End err %d xdrlen %d\n", ++ __func__, error, len); ++ return error; ++} ++EXPORT_SYMBOL(filelayout_encode_devinfo); ++ ++/* Encodes the loc_body structure from draft 13 ++ * on the response stream. ++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. 
++ */ ++enum nfsstat4 ++filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp) ++{ ++ u32 len = 0, nfl_util, fhlen, i; ++ u32 *layoutlen_p; ++ enum nfsstat4 nfserr; ++ __be32 *p; ++ ++ dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n", ++ __func__, ++ flp->device_id.pnfs_fsid, ++ flp->device_id.pnfs_devid, ++ flp->lg_first_stripe_index, ++ flp->lg_fh_length); ++ ++ /* Ensure file system added at least one file handle */ ++ if (flp->lg_fh_length <= 0) { ++ dprintk("%s: File Layout has no file handles!!\n", __func__); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto out; ++ } ++ ++ /* Ensure room for len, devid, util, first_stripe_index, ++ * pattern_offset, number of filehandles */ ++ p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ /* save spot for opaque file layout length, fill-in later*/ ++ p++; ++ ++ /* encode device id */ ++ p = exp_xdr_encode_u64(p, flp->device_id.sbid); ++ p = exp_xdr_encode_u64(p, flp->device_id.devid); ++ ++ /* set and encode flags */ ++ nfl_util = flp->lg_stripe_unit; ++ if (flp->lg_commit_through_mds) ++ nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS; ++ if (flp->lg_stripe_type == STRIPE_DENSE) ++ nfl_util |= NFL4_UFLG_DENSE; ++ p = exp_xdr_encode_u32(p, nfl_util); ++ ++ /* encode first stripe index */ ++ p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index); ++ ++ /* encode striping pattern start */ ++ p = exp_xdr_encode_u64(p, flp->lg_pattern_offset); ++ ++ /* encode number of file handles */ ++ p = exp_xdr_encode_u32(p, flp->lg_fh_length); ++ ++ /* encode file handles */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ fhlen = flp->lg_fh_list[i].fh_size; ++ p = exp_xdr_reserve_space(xdr, 4 + fhlen); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen); ++ } ++ ++ /* Set number of bytes encoded = total_bytes_encoded - length var */ ++ len = (char 
*)p - (char *)layoutlen_p; ++ exp_xdr_encode_u32(layoutlen_p, len - 4); ++ ++ nfserr = NFS4_OK; ++out: ++ dprintk("%s: End err %u xdrlen %d\n", ++ __func__, nfserr, len); ++ return nfserr; ++} ++EXPORT_SYMBOL(filelayout_encode_layout); +diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c +--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-31 20:42:05.499125509 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-31 20:42:05.499125509 -0400 +@@ -0,0 +1,289 @@ ++/* ++ * pnfs_osd_xdr_enc.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_data_map( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_data_map *data_map) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1); ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, data_map->odm_num_comps); ++ p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_width); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_depth); ++ p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt); ++ p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline int pnfs_osd_xdr_encode_objid( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2); ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id; ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u64(p, dev_id->sbid); ++ p = exp_xdr_encode_u64(p, dev_id->devid); ++ p = exp_xdr_encode_u64(p, object_id->oid_partition_id); ++ p = exp_xdr_encode_u64(p, 
object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * enum pnfs_osd_cap_key_sec4 { ++ * PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ * PNFS_OSD_CAP_KEY_SEC_SSV = 1 ++ * }; ++ * ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_object_cred( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_object_cred *olo_comp) ++{ ++ __be32 *p; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version); ++ ++ /* No sec for now */ ++ p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE); ++ p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */ ++ ++ exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred, ++ olo_comp->oc_cap.cred_len); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_layout { ++ * struct pnfs_osd_data_map olo_map; ++ * u32 olo_comps_index; ++ * u32 olo_num_comps; ++ * struct pnfs_osd_object_cred *olo_comps; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *pol) ++{ ++ __be32 *p; ++ u32 i; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, pol->olo_comps_index); ++ p = exp_xdr_encode_u32(p, pol->olo_num_comps); ++ ++ for (i = 0; i < pol->olo_num_comps; i++) { ++ err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout); ++ ++static int _encode_string(struct exp_xdr_stream *xdr, ++ const struct nfs4_string *str) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, 4 
+ str->len); ++ ++ if (!p) ++ return -E2BIG; ++ exp_xdr_encode_opaque(p, str->data, str->len); ++ return 0; ++} ++ ++/* struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr) ++{ ++ __be32 *p; ++ int err; ++ ++ p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun)); ++ if (!p) ++ return -E2BIG; ++ ++ /* Empty oda_targetid */ ++ p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON); ++ ++ /* Empty oda_targetaddr for now */ ++ p = exp_xdr_encode_u32(p, 0); ++ ++ /* oda_lun */ ++ exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun)); ++ ++ err = _encode_string(xdr, &devaddr->oda_systemid); ++ if (err) ++ return err; ++ ++ err = pnfs_osd_xdr_encode_object_cred(xdr, ++ &devaddr->oda_root_obj_cred); ++ if (err) ++ return err; ++ ++ err = _encode_string(xdr, &devaddr->oda_osdname); ++ if (err) ++ return err; ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr); ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p) ++{ ++ lou->dsu_valid = be32_to_cpu(*p++); ++ if (lou->dsu_valid) ++ p = xdr_decode_hyper(p, &lou->dsu_delta); ++ lou->olu_ioerr_flag = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate); ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline __be32 * ++pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) ++{ ++ /* FIXME: p = xdr_decode_fixed(...) 
*/ ++ memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data)); ++ p += XDR_QUADLEN(sizeof(objid->oid_device_id.data)); ++ ++ p = xdr_decode_hyper(p, &objid->oid_partition_id); ++ p = xdr_decode_hyper(p, &objid->oid_object_id); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p) ++{ ++ p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_offset); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_length); ++ ioerr->oer_iswrite = be32_to_cpu(*p++); ++ ioerr->oer_errno = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); +diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c +--- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-31 20:42:05.500123860 -0400 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include "gfs2.h" + #include "incore.h" +@@ -1146,6 +1147,9 @@ static int fill_super(struct super_block + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_export_op = &gfs2_export_ops; ++#if defined(CONFIG_PNFSD) ++ sb->s_pnfs_op = &pnfs_dlm_export_ops; ++#endif /* CONFIG_PNFSD */ + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; +diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig +--- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/Kconfig 2010-08-31 20:42:05.490222933 -0400 +@@ -224,6 +224,31 @@ config LOCKD_V4 + config EXPORTFS + tristate + ++config EXPORTFS_FILE_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 
files layout type. ++ Must be automatically selected by supporting filesystems. ++ ++config EXPORTFS_OSD_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 objects layout type. ++ Must be automatically selected by supporting osd ++ filesystems. ++ ++ If unsure, say N. ++ ++config EXPORTFS_BLOCK_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 blocks layout type. ++ Must be automatically selected by supporting filesystems. ++ ++ + config NFS_ACL_SUPPORT + tristate + select FS_POSIX_ACL +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-31 20:42:05.503222878 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-31 20:42:05.503222878 -0400 +@@ -0,0 +1,66 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pipefs_list bl_device_list; ++struct dentry *bl_device_pipe; ++ ++ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len) ++{ ++ int err; ++ struct pipefs_hdr *msg; ++ ++ dprintk("Entering %s...\n", __func__); ++ ++ msg = pipefs_readmsg(filp, src, len); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: unable to read pipefs message.\n"); ++ return PTR_ERR(msg); ++ } ++ ++ /* now assign the result, which wakes the blocked thread */ ++ err = pipefs_assign_upcall_reply(msg, &bl_device_list); ++ if (err) { ++ dprintk("ERROR: failed to assign upcall with id %u\n", ++ msg->msgid); ++ kfree(msg); ++ } ++ return len; ++} ++ ++static const struct rpc_pipe_ops bl_pipe_ops = { ++ .upcall = pipefs_generic_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = pipefs_generic_destroy_msg, ++}; ++ ++int bl_pipe_init(void) ++{ ++ dprintk("%s: block_device 
pipefs registering...\n", __func__); ++ bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1); ++ if (IS_ERR(bl_device_pipe)) ++ dprintk("ERROR, unable to make block_device pipe\n"); ++ ++ if (!bl_device_pipe) ++ dprintk("bl_device_pipe is NULL!\n"); ++ else ++ dprintk("bl_device_pipe created!\n"); ++ pipefs_init_list(&bl_device_list); ++ return 0; ++} ++ ++void bl_pipe_exit(void) ++{ ++ dprintk("%s: block_device pipefs unregistering...\n", __func__); ++ if (IS_ERR(bl_device_pipe)) ++ return ; ++ pipefs_closepipe(bl_device_pipe); ++ return; ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-31 20:42:05.504232855 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-31 20:42:05.504232855 -0400 +@@ -0,0 +1,1160 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. 
++ *
++ * this software is provided as is, without representation from the
++ * university of michigan as to its fitness for any purpose, and without
++ * warranty by the university of michigan of any kind, either express
++ * or implied, including without limitation the implied warranties of
++ * merchantability and fitness for a particular purpose. the regents
++ * of the university of michigan shall not be liable for any damages,
++ * including special, indirect, incidental, or consequential damages,
++ * with respect to any claim arising out or in connection with the use
++ * of the software, even if it has been or is hereafter advised of the
++ * possibility of such damages.
++ */
++#include
++#include
++
++#include /* various write calls */
++#include /* struct bio */
++#include
++#include "blocklayout.h"
++
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("Andy Adamson ");
++MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
++
++/* Callback operations to the pNFS client */
++static struct pnfs_client_operations *pnfs_block_callback_ops;
++
++static void print_page(struct page *page)	/* debug aid: dump the page pointer and a set of page flags via dprintk */
++{
++	dprintk("PRINTPAGE page %p\n", page);
++	dprintk(" PagePrivate %d\n", PagePrivate(page));
++	dprintk(" PageUptodate %d\n", PageUptodate(page));
++	dprintk(" PageError %d\n", PageError(page));
++	dprintk(" PageDirty %d\n", PageDirty(page));
++	dprintk(" PageReferenced %d\n", PageReferenced(page));
++	dprintk(" PageLocked %d\n", PageLocked(page));
++	dprintk(" PageWriteback %d\n", PageWriteback(page));
++	dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
++	dprintk("\n");
++}
++
++/* Given the be associated with isect, determine if page data needs to be
++ * initialized.
++ */ ++static int is_hole(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return !is_sector_initialized(be->be_inval, isect); ++} ++ ++/* Given the be associated with isect, determine if page data can be ++ * written to disk. ++ */ ++static int is_writable(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return is_sector_initialized(be->be_inval, isect); ++} ++ ++static int ++dont_like_caller(struct nfs_page *req) ++{ ++ if (atomic_read(&req->wb_complete)) { ++ /* Called by _multi */ ++ return 1; ++ } else { ++ /* Called by _one */ ++ return 0; ++ } ++} ++ ++static enum pnfs_try_status ++bl_commit(struct nfs_write_data *nfs_data, ++ int sync) ++{ ++ dprintk("%s enter\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++/* The data we are handed might be spread across several bios. We need ++ * to track when the last one is finished. 
++ */
++struct parallel_io {
++	struct kref refcnt;			/* dropping the last reference runs destroy_parallel() */
++	struct rpc_call_ops call_ops;
++	void (*pnfs_callback) (void *data);	/* called with ->data from destroy_parallel() */
++	void *data;
++};
++
++static inline struct parallel_io *alloc_parallel(void *data)
++{
++	struct parallel_io *rv;
++
++	rv = kmalloc(sizeof(*rv), GFP_KERNEL);	/* not zeroed: call_ops and pnfs_callback are left for the caller to set */
++	if (rv) {
++		rv->data = data;
++		kref_init(&rv->refcnt);
++	}
++	return rv;	/* NULL if the allocation failed */
++}
++
++static inline void get_parallel(struct parallel_io *p)
++{
++	kref_get(&p->refcnt);
++}
++
++static void destroy_parallel(struct kref *kref)	/* kref release function used by put_parallel() */
++{
++	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
++
++	dprintk("%s enter\n", __func__);
++	p->pnfs_callback(p->data);	/* completion hook runs before the tracker is freed */
++	kfree(p);
++}
++
++static inline void put_parallel(struct parallel_io *p)
++{
++	kref_put(&p->refcnt, destroy_parallel);
++}
++
++static struct bio *
++bl_submit_bio(int rw, struct bio *bio)
++{
++	if (bio) {
++		get_parallel(bio->bi_private);	/* bi_private is used as the parallel_io; one reference per submitted bio */
++		dprintk("%s submitting %s bio %u@%llu\n", __func__,
++			rw == READ ? "read" : "write",
++			bio->bi_size, (u64)bio->bi_sector);
++		submit_bio(rw, bio);
++	}
++	return NULL;	/* always NULL, so callers can write "bio = bl_submit_bio(...)" */
++}
++
++static inline void
++bl_done_with_rpage(struct page *page, const int ok)
++{
++	if (ok) {
++		ClearPagePnfsErr(page);
++		SetPageUptodate(page);
++	} else {
++		ClearPageUptodate(page);
++		SetPageError(page);
++		SetPagePnfsErr(page);
++	}
++	/* Page is unlocked via rpc_release. Should really be done here. */
++}
++
++/* This is basically copied from mpage_end_io_read */
++static void bl_end_io_read(struct bio *bio, int err)
++{
++	void *data = bio->bi_private;
++	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
++	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;	/* start at the last vector and walk backwards */
++
++	do {
++		struct page *page = bvec->bv_page;
++
++		if (--bvec >= bio->bi_io_vec)
++			prefetchw(&bvec->bv_page->flags);
++		bl_done_with_rpage(page, uptodate);
++	} while (bvec >= bio->bi_io_vec);
++	bio_put(bio);
++	put_parallel(data);	/* drops the reference taken in bl_submit_bio() */
++}
++
++static void bl_read_cleanup(struct work_struct *work)	/* work item queued by bl_end_par_io_read() */
++{
++	struct rpc_task *task;
++	struct nfs_read_data *rdata;
++	dprintk("%s enter\n", __func__);
++	task = container_of(work, struct rpc_task, u.tk_work);
++	rdata = container_of(task, struct nfs_read_data, task);
++	pnfs_block_callback_ops->nfs_readlist_complete(rdata);
++}
++
++static void
++bl_end_par_io_read(void *data)	/* installed as parallel_io.pnfs_callback by bl_read_pagelist() */
++{
++	struct nfs_read_data *rdata = data;
++
++	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
++	schedule_work(&rdata->task.u.tk_work);	/* completion is deferred to a work item */
++}
++
++/* We don't want normal .rpc_call_done callback used, so we replace it
++ * with this stub.
++ */
++static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
++{
++	return;
++}
++
++static enum pnfs_try_status
++bl_read_pagelist(struct nfs_read_data *rdata,
++		 unsigned nr_pages)
++{
++	int i, hole;
++	struct bio *bio = NULL;	/* bio currently being filled; NULL when none is open */
++	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
++	sector_t isect, extent_length = 0;	/* extent_length: sectors left in the current extent */
++	struct parallel_io *par;
++	loff_t f_offset = rdata->args.offset;
++	size_t count = rdata->args.count;
++	struct page **pages = rdata->args.pages;
++	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
++
++	dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
++		nr_pages, f_offset, count);
++
++	if (dont_like_caller(rdata->req)) {
++		dprintk("%s dont_like_caller failed\n", __func__);
++		goto use_mds;
++	}
++	if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) {
++		/* We want to fall back to mds in case of read_page
++		 * after error on read_pages.
++		 */
++		dprintk("%s PG_pnfserr set\n", __func__);
++		goto use_mds;
++	}
++	par = alloc_parallel(rdata);
++	if (!par)
++		goto use_mds;
++	par->call_ops = *rdata->pdata.call_ops;
++	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
++	par->pnfs_callback = bl_end_par_io_read;
++	/* At this point, we can no longer jump to use_mds */
++
++	isect = (sector_t) (f_offset >> 9);	/* byte offset to 512-byte sector */
++	/* Code assumes extents are page-aligned */
++	for (i = pg_index; i < nr_pages; i++) {
++		if (!extent_length) {
++			/* We've used up the previous extent */
++			put_extent(be);
++			put_extent(cow_read);
++			bio = bl_submit_bio(READ, bio);
++			/* Get the next one */
++			be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg),
++					     isect, &cow_read);
++			if (!be) {
++				/* Error out this page */
++				bl_done_with_rpage(pages[i], 0);
++				break;
++			}
++			extent_length = be->be_length -
++				(isect - be->be_f_offset);
++			if (cow_read) {
++				sector_t cow_length = cow_read->be_length -
++					(isect - cow_read->be_f_offset);
++				extent_length = min(extent_length, cow_length);
++			}
++		}
++		hole = is_hole(be, isect);
++		if (hole && !cow_read) {
++			bio = bl_submit_bio(READ, bio);
++			/* Fill hole w/ zeroes w/o accessing device */
++			dprintk("%s Zeroing page for hole\n", __func__);
++			zero_user(pages[i], 0,
++				  min_t(int, PAGE_CACHE_SIZE, count));
++			print_page(pages[i]);
++			bl_done_with_rpage(pages[i], 1);
++		} else {
++			struct pnfs_block_extent *be_read;
++
++			be_read = (hole && cow_read) ? cow_read : be;	/* a hole with a COW source is read from the COW extent */
++			for (;;) {
++				if (!bio) {
++					bio = bio_alloc(GFP_NOIO, nr_pages - i);
++					if (!bio) {
++						/* Error out this page */
++						bl_done_with_rpage(pages[i], 0);
++						break;
++					}
++					bio->bi_sector = isect -
++						be_read->be_f_offset +
++						be_read->be_v_offset;
++					bio->bi_bdev = be_read->be_mdev;
++					bio->bi_end_io = bl_end_io_read;
++					bio->bi_private = par;
++				}
++				if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
++					break;
++				bio = bl_submit_bio(READ, bio);	/* page did not fit: submit and retry with a fresh bio */
++			}
++		}
++		isect += PAGE_CACHE_SIZE >> 9;
++		extent_length -= PAGE_CACHE_SIZE >> 9;
++	}
++	if ((isect << 9) >= rdata->inode->i_size) {
++		rdata->res.eof = 1;
++		rdata->res.count = rdata->inode->i_size - f_offset;
++	} else {
++		rdata->res.count = (isect << 9) - f_offset;
++	}
++	put_extent(be);
++	put_extent(cow_read);
++	bl_submit_bio(READ, bio);
++	put_parallel(par);	/* drop the initial reference from alloc_parallel(); bl_end_par_io_read() runs once every bio has completed too */
++	return PNFS_ATTEMPTED;
++
++ use_mds:
++	dprintk("Giving up and using normal NFS\n");
++	return PNFS_NOT_ATTEMPTED;
++}
++
++static void mark_extents_written(struct pnfs_block_layout *bl,
++				 __u64 offset, __u32 count)
++{
++	sector_t isect, end;
++	struct pnfs_block_extent *be;
++
++	dprintk("%s(%llu, %u)\n", __func__, offset, count);
++	if (count == 0)
++		return;
++	isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9;	/* round start down to a page, in sectors */
++	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);	/* round end up to a page */
++	end >>= 9;
++	while (isect < end) {
++		sector_t len;
++		be = find_get_extent(bl, isect, NULL);
++		BUG_ON(!be); /* FIXME */
++		len = min(end, be->be_f_offset + be->be_length) - isect;
++		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
++			mark_for_commit(be, isect, len); /* What if fails? */
++		isect += len;
++		put_extent(be);
++	}
++}
++
++/* STUB - this needs thought */
++static inline void
++bl_done_with_wpage(struct page *page, const int ok)
++{
++	if (!ok) {
++		SetPageError(page);
++		SetPagePnfsErr(page);
++		/* This is an inline copy of nfs_zap_mapping */
++		/* This is oh so fishy, and needs deep thought */
++		if (page->mapping->nrpages != 0) {
++			struct inode *inode = page->mapping->host;
++			spin_lock(&inode->i_lock);
++			NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
++			spin_unlock(&inode->i_lock);
++		}
++	}
++	/* end_page_writeback called in rpc_release. Should be done here. */
++}
++
++/* This is basically copied from mpage_end_io_read */
++static void bl_end_io_write(struct bio *bio, int err)
++{
++	void *data = bio->bi_private;
++	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
++	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;	/* start at the last vector and walk backwards */
++
++	do {
++		struct page *page = bvec->bv_page;
++
++		if (--bvec >= bio->bi_io_vec)
++			prefetchw(&bvec->bv_page->flags);
++		bl_done_with_wpage(page, uptodate);
++	} while (bvec >= bio->bi_io_vec);
++	bio_put(bio);
++	put_parallel(data);	/* drops the reference taken in bl_submit_bio() */
++}
++
++/* Function scheduled for call during bl_end_par_io_write,
++ * it marks sectors as written and extends the commitlist.
++ */
++static void bl_write_cleanup(struct work_struct *work)
++{
++	struct rpc_task *task;
++	struct nfs_write_data *wdata;
++	dprintk("%s enter\n", __func__);
++	task = container_of(work, struct rpc_task, u.tk_work);
++	wdata = container_of(task, struct nfs_write_data, task);
++	if (!wdata->task.tk_status) {
++		/* Marks for LAYOUTCOMMIT */
++		/* BUG - this should be called after each bio, not after
++		 * all finish, unless have some way of storing success/failure
++		 */
++		mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg),
++				     wdata->args.offset, wdata->args.count);
++	}
++	pnfs_block_callback_ops->nfs_writelist_complete(wdata);
++}
++
++/* Called when last of bios associated with a bl_write_pagelist call finishes */
++static void
++bl_end_par_io_write(void *data)
++{
++	struct nfs_write_data *wdata = data;
++
++	/* STUB - ignoring error handling */
++	wdata->task.tk_status = 0;	/* success is reported unconditionally here */
++	wdata->verf.committed = NFS_FILE_SYNC;
++	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
++	schedule_work(&wdata->task.u.tk_work);
++}
++
++static enum pnfs_try_status
++bl_write_pagelist(struct nfs_write_data *wdata,
++		  unsigned nr_pages,
++		  int sync)
++{
++	int i;
++	struct bio *bio = NULL;	/* bio currently being filled; NULL when none is open */
++	struct pnfs_block_extent *be = NULL;
++	sector_t isect, extent_length = 0;	/* extent_length: sectors left in the current extent */
++	struct parallel_io *par;
++	loff_t offset = wdata->args.offset;
++	size_t count = wdata->args.count;
++	struct page **pages = wdata->args.pages;
++	int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
++
++	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
++	if (!wdata->req->wb_lseg) {
++		dprintk("%s no lseg, falling back to MDS\n", __func__);
++		return PNFS_NOT_ATTEMPTED;
++	}
++	if (dont_like_caller(wdata->req)) {
++		dprintk("%s dont_like_caller failed\n", __func__);
++		return PNFS_NOT_ATTEMPTED;
++	}
++	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
++	 * We want to write each, and if there is an error remove it from
++	 * list and call
++	 * nfs_retry_request(req) to have it redone using nfs.
++	 * QUEST? Do as block or per req? Think have to do per block
++	 * as part of end_bio
++	 */
++	par = alloc_parallel(wdata);
++	if (!par)
++		return PNFS_NOT_ATTEMPTED;
++	par->call_ops = *wdata->pdata.call_ops;
++	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
++	par->pnfs_callback = bl_end_par_io_write;
++	/* At this point, have to be more careful with error handling */
++
++	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9);	/* page-aligned start, in 512-byte sectors */
++	for (i = pg_index; i < nr_pages; i++) {
++		if (!extent_length) {
++			/* We've used up the previous extent */
++			put_extent(be);
++			bio = bl_submit_bio(WRITE, bio);
++			/* Get the next one */
++			be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg),
++					     isect, NULL);
++			if (!be || !is_writable(be, isect)) {
++				/* FIXME */
++				bl_done_with_wpage(pages[i], 0);
++				break;
++			}
++			extent_length = be->be_length -
++				(isect - be->be_f_offset);
++		}
++		for (;;) {
++			if (!bio) {
++				bio = bio_alloc(GFP_NOIO, nr_pages - i);
++				if (!bio) {
++					/* Error out this page */
++					/* FIXME */
++					bl_done_with_wpage(pages[i], 0);
++					break;
++				}
++				bio->bi_sector = isect - be->be_f_offset +
++					be->be_v_offset;
++				bio->bi_bdev = be->be_mdev;
++				bio->bi_end_io = bl_end_io_write;
++				bio->bi_private = par;
++			}
++			if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
++				break;
++			bio = bl_submit_bio(WRITE, bio);	/* page did not fit: submit and retry with a fresh bio */
++		}
++		isect += PAGE_CACHE_SIZE >> 9;
++		extent_length -= PAGE_CACHE_SIZE >> 9;
++	}
++	wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK);
++	put_extent(be);
++	bl_submit_bio(WRITE, bio);
++	put_parallel(par);	/* drop the initial reference from alloc_parallel(); bl_end_par_io_write() runs once every bio has completed too */
++	return PNFS_ATTEMPTED;
++}
++
++/* FIXME - range ignored */
++static void
++release_extents(struct pnfs_block_layout *bl,
++		struct pnfs_layout_range *range)
++{
++	int i;
++	struct pnfs_block_extent *be;
++
++	spin_lock(&bl->bl_ext_lock);
++	for (i = 0; i < EXTENT_LISTS; i++) {
++		while (!list_empty(&bl->bl_extents[i])) {
++			be = list_first_entry(&bl->bl_extents[i],
++					      struct pnfs_block_extent,
++					      be_node);
++			list_del(&be->be_node);
++			put_extent(be);	/* drops the reference held for list membership */
++		}
++	}
++	spin_unlock(&bl->bl_ext_lock);
++}
++
++static void
++release_inval_marks(struct pnfs_inval_markings *marks)
++{
++	struct pnfs_inval_tracking *pos, *temp;
++
++	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {	/* _safe variant: entries are freed while walking */
++		list_del(&pos->it_link);
++		kfree(pos);
++	}
++	return;
++}
++
++/* Note we are relying on caller locking to prevent nasty races. */
++static void
++bl_free_layout(struct pnfs_layout_hdr *lo)
++{
++	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
++
++	dprintk("%s enter\n", __func__);
++	release_extents(bl, NULL);
++	release_inval_marks(&bl->bl_inval);
++	kfree(bl);
++}
++
++static struct pnfs_layout_hdr *
++bl_alloc_layout(struct inode *inode)
++{
++	struct pnfs_block_layout *bl;
++
++	dprintk("%s enter\n", __func__);
++	bl = kzalloc(sizeof(*bl), GFP_KERNEL);
++	if (!bl)
++		return NULL;
++	spin_lock_init(&bl->bl_ext_lock);
++	INIT_LIST_HEAD(&bl->bl_extents[0]);
++	INIT_LIST_HEAD(&bl->bl_extents[1]);
++	INIT_LIST_HEAD(&bl->bl_commit);
++	bl->bl_count = 0;
++	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9;	/* stored in 512-byte sectors */
++	INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
++	return &bl->bl_layout;	/* the embedded header; bl_free_layout() maps it back with BLK_LO2EXT() */
++}
++
++static void
++bl_free_lseg(struct pnfs_layout_segment *lseg)
++{
++	dprintk("%s enter\n", __func__);
++	kfree(lseg);
++}
++
++/* Because the generic infrastructure does not correctly merge layouts,
++ * we pretty much ignore lseg, and store all data layout wide, so we
++ * can correctly merge. Eventually we should push some correct merge
++ * behavior up to the generic code, as the current behavior tends to
++ * cause lots of unnecessary overlapping LAYOUTGET requests.
++ */ ++static struct pnfs_layout_segment * ++bl_alloc_lseg(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct pnfs_layout_segment *lseg; ++ int status; ++ ++ dprintk("%s enter\n", __func__); ++ lseg = kzalloc(sizeof(*lseg) + 0, GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ status = nfs4_blk_process_layoutget(lo, lgr); ++ if (status) { ++ /* We don't want to call the full-blown bl_free_lseg, ++ * since on error extents were not touched. ++ */ ++ /* STUB - we really want to distinguish between 2 error ++ * conditions here. This lseg failed, but lo data structures ++ * are OK, or we hosed the lo data structures. The calling ++ * code probably needs to distinguish this too. ++ */ ++ kfree(lseg); ++ return ERR_PTR(status); ++ } ++ return lseg; ++} ++ ++static int ++bl_setup_layoutcommit(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutcommit_args *arg) ++{ ++ struct nfs_server *nfss = PNFS_NFS_SERVER(lo); ++ struct bl_layoutupdate_data *layoutupdate_data; ++ ++ dprintk("%s enter\n", __func__); ++ /* Need to ensure commit is block-size aligned */ ++ if (nfss->pnfs_blksize) { ++ u64 mask = nfss->pnfs_blksize - 1; ++ u64 offset = arg->range.offset & mask; ++ ++ arg->range.offset -= offset; ++ arg->range.length += offset + mask; ++ arg->range.length &= ~mask; ++ } ++ ++ layoutupdate_data = kmalloc(sizeof(struct bl_layoutupdate_data), ++ GFP_KERNEL); ++ if (unlikely(!layoutupdate_data)) ++ return -ENOMEM; ++ INIT_LIST_HEAD(&layoutupdate_data->ranges); ++ arg->layoutdriver_data = layoutupdate_data; ++ ++ return 0; ++} ++ ++static void ++bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg) ++{ ++ dprintk("%s enter\n", __func__); ++ encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); ++} ++ ++static void ++bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutcommit_args *arg, int status) ++{ ++ dprintk("%s enter\n", __func__); ++ 
clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), arg, status); ++ kfree(arg->layoutdriver_data); ++} ++ ++static void free_blk_mountid(struct block_mount_id *mid) ++{ ++ if (mid) { ++ struct pnfs_block_dev *dev; ++ spin_lock(&mid->bm_lock); ++ while (!list_empty(&mid->bm_devlist)) { ++ dev = list_first_entry(&mid->bm_devlist, ++ struct pnfs_block_dev, ++ bm_node); ++ list_del(&dev->bm_node); ++ free_block_dev(dev); ++ } ++ spin_unlock(&mid->bm_lock); ++ kfree(mid); ++ } ++} ++ ++/* This is mostly copied form the filelayout's get_device_info function. ++ * It seems much of this should be at the generic pnfs level. ++ */ ++static struct pnfs_block_dev * ++nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, ++ struct pnfs_deviceid *d_id, ++ struct list_head *sdlist) ++{ ++ struct pnfs_device *dev; ++ struct pnfs_block_dev *rv = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ int i, rc; ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s max_resp_sz %u max_pages %d\n", ++ __func__, max_resp_sz, max_pages); ++ ++ dev = kmalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) { ++ dprintk("%s kmalloc failed\n", __func__); ++ return NULL; ++ } ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(dev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set dev->area */ ++ dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!dev->area) ++ goto out_free; ++ ++ memcpy(&dev->dev_id, d_id, sizeof(*d_id)); ++ dev->layout_type = LAYOUT_BLOCK_VOLUME; ++ dev->dev_notify_types = 0; ++ dev->pages = pages; ++ dev->pgbase = 0; ++ dev->pglen = PAGE_SIZE * max_pages; ++ dev->mincount = 0; ++ ++ dprintk("%s: 
dev_id: %s\n", __func__, dev->dev_id.data); ++ rc = pnfs_block_callback_ops->nfs_getdeviceinfo(server, dev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ rv = nfs4_blk_decode_device(server, dev, sdlist); ++ out_free: ++ if (dev->area != NULL) ++ vunmap(dev->area); ++ for (i = 0; i < max_pages; i++) ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(dev); ++ return rv; ++} ++ ++ ++/* ++ * Retrieve the list of available devices for the mountpoint. ++ */ ++static int ++bl_initialize_mountpoint(struct nfs_server *server, const struct nfs_fh *fh) ++{ ++ struct block_mount_id *b_mt_id = NULL; ++ struct pnfs_mount_type *mtype = NULL; ++ struct pnfs_devicelist *dlist = NULL; ++ struct pnfs_block_dev *bdev; ++ LIST_HEAD(block_disklist); ++ int status = 0, i; ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (server->pnfs_blksize == 0) { ++ dprintk("%s Server did not return blksize\n", __func__); ++ return -EINVAL; ++ } ++ b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL); ++ if (!b_mt_id) { ++ status = -ENOMEM; ++ goto out_error; ++ } ++ /* Initialize nfs4 block layout mount id */ ++ spin_lock_init(&b_mt_id->bm_lock); ++ INIT_LIST_HEAD(&b_mt_id->bm_devlist); ++ ++ dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL); ++ if (!dlist) ++ goto out_error; ++ dlist->eof = 0; ++ while (!dlist->eof) { ++ status = pnfs_block_callback_ops->nfs_getdevicelist( ++ server, fh, dlist); ++ if (status) ++ goto out_error; ++ dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", ++ __func__, dlist->num_devs, dlist->eof); ++ /* For each device returned in dlist, call GETDEVICEINFO, and ++ * decode the opaque topology encoding to create a flat ++ * volume topology, matching VOLUME_SIMPLE disk signatures ++ * to disks in the visible block disk list. ++ * Construct an LVM meta device from the flat volume topology. 
++ */ ++ for (i = 0; i < dlist->num_devs; i++) { ++ bdev = nfs4_blk_get_deviceinfo(server, fh, ++ &dlist->dev_id[i], ++ &block_disklist); ++ if (!bdev) ++ goto out_error; ++ spin_lock(&b_mt_id->bm_lock); ++ list_add(&bdev->bm_node, &b_mt_id->bm_devlist); ++ spin_unlock(&b_mt_id->bm_lock); ++ } ++ } ++ dprintk("%s SUCCESS\n", __func__); ++ server->pnfs_ld_data = b_mt_id; ++ ++ out_return: ++ kfree(dlist); ++ return status; ++ ++ out_error: ++ free_blk_mountid(b_mt_id); ++ kfree(mtype); ++ goto out_return; ++} ++ ++static int ++bl_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ struct block_mount_id *b_mt_id = server->pnfs_ld_data; ++ ++ dprintk("%s enter\n", __func__); ++ free_blk_mountid(b_mt_id); ++ dprintk("%s RETURNS\n", __func__); ++ return 0; ++} ++ ++/* STUB - mark intersection of layout and page as bad, so is not ++ * used again. ++ */ ++static void mark_bad_read(void) ++{ ++ return; ++} ++ ++/* Copied from buffer.c */ ++static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) ++{ ++ if (uptodate) { ++ set_buffer_uptodate(bh); ++ } else { ++ /* This happens, due to failed READA attempts. 
*/ ++ clear_buffer_uptodate(bh); ++ } ++ unlock_buffer(bh); ++} ++ ++/* Copied from buffer.c */ ++static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) ++{ ++ __end_buffer_read_notouch(bh, uptodate); ++} ++ ++/* ++ * map_block: map a requested I/0 block (isect) into an offset in the LVM ++ * meta block_device ++ */ ++static void ++map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh) ++{ ++ dprintk("%s enter be=%p\n", __func__, be); ++ ++ set_buffer_mapped(bh); ++ bh->b_bdev = be->be_mdev; ++ bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> ++ (be->be_mdev->bd_inode->i_blkbits - 9); ++ ++ dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n", ++ __func__, (long)isect, ++ (long)bh->b_blocknr, ++ bh->b_size); ++ return; ++} ++ ++/* Given an unmapped page, zero it (or read in page for COW), ++ * and set appropriate flags/markings, but it is safe to not initialize ++ * the range given in [from, to). ++ */ ++/* This is loosely based on nobh_write_begin */ ++static int ++init_page_for_write(struct pnfs_block_layout *bl, struct page *page, ++ unsigned from, unsigned to, sector_t **pages_to_mark) ++{ ++ struct buffer_head *bh; ++ int inval, ret = -EIO; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect; ++ ++ dprintk("%s enter, %p\n", __func__, page); ++ bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); ++ if (!bh) { ++ ret = -ENOMEM; ++ goto cleanup; ++ } ++ ++ isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9); ++ be = find_get_extent(bl, isect, &cow_read); ++ if (!be) ++ goto cleanup; ++ inval = is_hole(be, isect); ++ dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to); ++ if (inval) { ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) { ++ dprintk("%s PANIC - got NONE_DATA extent %p\n", ++ __func__, be); ++ goto cleanup; ++ } ++ map_block(isect, be, bh); ++ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); ++ } ++ if (PageUptodate(page)) { ++ /* Do nothing */ ++ 
} else if (inval & !cow_read) { ++ zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE); ++ } else if (0 < from || PAGE_CACHE_SIZE > to) { ++ struct pnfs_block_extent *read_extent; ++ ++ read_extent = (inval && cow_read) ? cow_read : be; ++ map_block(isect, read_extent, bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_read_nobh; ++ submit_bh(READ, bh); ++ dprintk("%s: Waiting for buffer read\n", __func__); ++ /* XXX Don't really want to hold layout lock here */ ++ wait_on_buffer(bh); ++ if (!buffer_uptodate(bh)) ++ goto cleanup; ++ } ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ /* There is a BUG here if is a short copy after write_begin, ++ * but I think this is a generic fs bug. The problem is that ++ * we have marked the page as initialized, but it is possible ++ * that the section not copied may never get copied. ++ */ ++ ret = mark_initialized_sectors(be->be_inval, isect, ++ PAGE_CACHE_SECTORS, ++ pages_to_mark); ++ /* Want to preallocate mem so above can't fail */ ++ if (ret) ++ goto cleanup; ++ } ++ SetPageMappedToDisk(page); ++ ret = 0; ++ ++cleanup: ++ free_buffer_head(bh); ++ put_extent(be); ++ put_extent(cow_read); ++ if (ret) { ++ /* Need to mark layout with bad read...should now ++ * just use nfs4 for reads and writes. 
++ */ ++ mark_bad_read(); ++ } ++ return ret; ++} ++ ++static int ++bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos, ++ unsigned count, struct pnfs_fsdata *fsdata) ++{ ++ unsigned from, to; ++ int ret; ++ sector_t *pages_to_mark = NULL; ++ struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg); ++ ++ dprintk("%s enter, %u@%lld\n", __func__, count, pos); ++ print_page(page); ++ /* The following code assumes blocksize >= PAGE_CACHE_SIZE */ ++ if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) { ++ dprintk("%s Can't handle blocksize %llu\n", __func__, ++ (u64)bl->bl_blocksize); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ return 0; ++ } ++ if (PageMappedToDisk(page)) { ++ /* Basically, this is a flag that says we have ++ * successfully called write_begin already on this page. ++ */ ++ /* NOTE - there are cache consistency issues here. ++ * For example, what if the layout is recalled, then regained? ++ * If the file is closed and reopened, will the page flags ++ * be reset? If not, we'll have to use layout info instead of ++ * the page flag. ++ */ ++ return 0; ++ } ++ from = pos & (PAGE_CACHE_SIZE - 1); ++ to = from + count; ++ ret = init_page_for_write(bl, page, from, to, &pages_to_mark); ++ if (ret) { ++ dprintk("%s init page failed with %i", __func__, ret); ++ /* Revert back to plain NFS and just continue on with ++ * write. This assumes there is no request attached, which ++ * should be true if we get here. ++ */ ++ BUG_ON(PagePrivate(page)); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ kfree(pages_to_mark); ++ ret = 0; ++ } else { ++ fsdata->private = pages_to_mark; ++ } ++ return ret; ++} ++ ++/* CAREFUL - what happens if copied < count??? 
*/ ++static int ++bl_write_end(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg); ++ print_page(page); ++ if (lseg) ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* Return any memory allocated to fsdata->private, and take advantage ++ * of no page locks to mark pages noted in write_begin as needing ++ * initialization. ++ */ ++static void ++bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata) ++{ ++ struct page *page; ++ pgoff_t index; ++ sector_t *pos; ++ struct address_space *mapping = filp->f_mapping; ++ struct pnfs_fsdata *fake_data; ++ struct pnfs_layout_segment *lseg; ++ ++ if (!fsdata) ++ return; ++ lseg = fsdata->lseg; ++ if (!lseg) ++ return; ++ pos = fsdata->private; ++ if (!pos) ++ return; ++ dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos)); ++ for (; *pos != ~0; pos++) { ++ index = *pos >> (PAGE_CACHE_SHIFT - 9); ++ /* XXX How do we properly deal with failures here??? */ ++ page = grab_cache_page_write_begin(mapping, index, 0); ++ if (!page) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__); ++ continue; ++ } ++ dprintk("%s: Examining block page\n", __func__); ++ print_page(page); ++ if (!PageMappedToDisk(page)) { ++ /* XXX How do we properly deal with failures here??? 
*/ ++ dprintk("%s Marking block page\n", __func__); ++ init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page, ++ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, ++ NULL); ++ print_page(page); ++ fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL); ++ if (!fake_data) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", ++ __func__); ++ unlock_page(page); ++ continue; ++ } ++ get_lseg(lseg); ++ fake_data->lseg = lseg; ++ fake_data->bypass_eof = 1; ++ mapping->a_ops->write_end(filp, mapping, ++ index << PAGE_CACHE_SHIFT, ++ PAGE_CACHE_SIZE, ++ PAGE_CACHE_SIZE, ++ page, fake_data); ++ /* Note fake_data is freed by nfs_write_end */ ++ } else ++ unlock_page(page); ++ } ++ kfree(fsdata->private); ++ fsdata->private = NULL; ++} ++ ++static ssize_t ++bl_get_stripesize(struct pnfs_layout_hdr *lo) ++{ ++ dprintk("%s enter\n", __func__); ++ return 0; ++} ++ ++/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request. ++ * Should return False if there is a reason requests can not be coalesced, ++ * otherwise, should default to returning True. 
++ */ ++static int ++bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ dprintk("%s enter\n", __func__); ++ if (pgio->pg_iswrite) ++ return prev->wb_lseg == req->wb_lseg; ++ else ++ return 1; ++} ++ ++static struct layoutdriver_io_operations blocklayout_io_operations = { ++ .commit = bl_commit, ++ .read_pagelist = bl_read_pagelist, ++ .write_pagelist = bl_write_pagelist, ++ .write_begin = bl_write_begin, ++ .write_end = bl_write_end, ++ .write_end_cleanup = bl_write_end_cleanup, ++ .alloc_layout = bl_alloc_layout, ++ .free_layout = bl_free_layout, ++ .alloc_lseg = bl_alloc_lseg, ++ .free_lseg = bl_free_lseg, ++ .setup_layoutcommit = bl_setup_layoutcommit, ++ .encode_layoutcommit = bl_encode_layoutcommit, ++ .cleanup_layoutcommit = bl_cleanup_layoutcommit, ++ .initialize_mountpoint = bl_initialize_mountpoint, ++ .uninitialize_mountpoint = bl_uninitialize_mountpoint, ++}; ++ ++static struct layoutdriver_policy_operations blocklayout_policy_operations = { ++ .get_stripesize = bl_get_stripesize, ++ .pg_test = bl_pg_test, ++}; ++ ++static struct pnfs_layoutdriver_type blocklayout_type = { ++ .id = LAYOUT_BLOCK_VOLUME, ++ .name = "LAYOUT_BLOCK_VOLUME", ++ .ld_io_ops = &blocklayout_io_operations, ++ .ld_policy_ops = &blocklayout_policy_operations, ++}; ++ ++static int __init nfs4blocklayout_init(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); ++ ++ pnfs_block_callback_ops = pnfs_register_layoutdriver(&blocklayout_type); ++ bl_pipe_init(); ++ return 0; ++} ++ ++static void __exit nfs4blocklayout_exit(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", ++ __func__); ++ ++ pnfs_unregister_layoutdriver(&blocklayout_type); ++ bl_pipe_exit(); ++} ++ ++module_init(nfs4blocklayout_init); ++module_exit(nfs4blocklayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c +--- 
linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-31 20:42:05.506119071 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-31 20:42:05.506119071 -0400 +@@ -0,0 +1,335 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages.
++ */ ++#include ++#include /* __bread */ ++ ++#include ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes) ++{ ++ uint32_t *q = p + XDR_QUADLEN(nbytes); ++ if (unlikely(q > end || q < p)) ++ return NULL; ++ return p; ++} ++EXPORT_SYMBOL(blk_overflow); ++ ++/* Open a block_device by device number. */ ++struct block_device *nfs4_blkdev_get(dev_t dev) ++{ ++ struct block_device *bd; ++ ++ dprintk("%s enter\n", __func__); ++ bd = open_by_devnum(dev, FMODE_READ); ++ if (IS_ERR(bd)) ++ goto fail; ++ return bd; ++fail: ++ dprintk("%s failed to open device : %ld\n", ++ __func__, PTR_ERR(bd)); ++ return NULL; ++} ++ ++/* ++ * Release the block device ++ */ ++int nfs4_blkdev_put(struct block_device *bdev) ++{ ++ dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), ++ MINOR(bdev->bd_dev)); ++ bd_release(bdev); ++ return blkdev_put(bdev, FMODE_READ); ++} ++ ++/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded ++ * in dev->dev_addr_buf. 
++ */ ++struct pnfs_block_dev * ++nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist) ++{ ++ struct pnfs_block_dev *rv = NULL; ++ struct block_device *bd = NULL; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint32_t major, minor; ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return NULL; ++ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); ++ dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, ++ dev->mincount); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area, ++ dev->mincount); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out_err; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ dprintk("%s CALLING USERSPACE DAEMON\n", __func__); ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out_err; ++ } ++ if (reply->status != BL_DEVICE_REQUEST_PROC) { ++ dprintk("%s failed to open device: %ld\n", ++ __func__, PTR_ERR(bd)); ++ goto out_err; ++ } ++ memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t)); ++ memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)), ++ sizeof(uint32_t)); ++ bd = nfs4_blkdev_get(MKDEV(major, minor)); ++ if (!bd) { /* nfs4_blkdev_get() reports failure as NULL, not ERR_PTR */ ++ dprintk("%s failed to open device %u:%u\n", ++ __func__, major, minor); ++ goto out_err; ++ } ++ ++ rv = kzalloc(sizeof(*rv), GFP_KERNEL); ++ if (!rv) ++ goto out_err; ++ ++ rv->bm_mdev = bd; ++ memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct pnfs_deviceid)); ++ dprintk("%s Created device %s with bd_block_size %u\n", ++ __func__, ++ bd->bd_disk->disk_name, ++ bd->bd_block_size); ++ kfree(reply); ++ kfree(msg); ++ return rv; ++ ++out_err: ++ if (bd) /* device was opened but rv could not be allocated */ ++ nfs4_blkdev_put(bd); ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return NULL; ++} ++ ++/* Map
deviceid returned by the server to constructed block_device */ ++static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, ++ struct pnfs_deviceid *id) ++{ ++ struct block_device *rv = NULL; ++ struct block_mount_id *mid; ++ struct pnfs_block_dev *dev; ++ ++ dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); ++ mid = BLK_ID(lo); ++ spin_lock(&mid->bm_lock); ++ list_for_each_entry(dev, &mid->bm_devlist, bm_node) { ++ if (memcmp(id->data, dev->bm_mdevid.data, ++ NFS4_PNFS_DEVICEID4_SIZE) == 0) { ++ rv = dev->bm_mdev; ++ goto out; ++ } ++ } ++ out: ++ spin_unlock(&mid->bm_lock); ++ dprintk("%s returning %p\n", __func__, rv); ++ return rv; ++} ++ ++/* Tracks info needed to ensure extents in layout obey constraints of spec */ ++struct layout_verification { ++ u32 mode; /* R or RW */ ++ u64 start; /* Expected start of next non-COW extent */ ++ u64 inval; /* Start of INVAL coverage */ ++ u64 cowread; /* End of COW read coverage */ ++}; ++ ++/* Verify the extent meets the layout requirements of the pnfs-block draft, ++ * section 2.3.1. 
++ */ ++static int verify_extent(struct pnfs_block_extent *be, ++ struct layout_verification *lv) ++{ ++ if (lv->mode == IOMODE_READ) { ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA || ++ be->be_state == PNFS_BLOCK_INVALID_DATA) ++ return -EIO; ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } ++ /* lv->mode == IOMODE_RW */ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ if (lv->cowread > lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ lv->inval = lv->start; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_READ_DATA) { ++ if (be->be_f_offset > lv->start) ++ return -EIO; ++ if (be->be_f_offset < lv->inval) ++ return -EIO; ++ if (be->be_f_offset < lv->cowread) ++ return -EIO; ++ /* It looks like you might want to min this with lv->start, ++ * but you really don't. 
++ */ ++ lv->inval = lv->inval + be->be_length; ++ lv->cowread = be->be_f_offset + be->be_length; ++ return 0; ++ } else ++ return -EIO; ++} ++ ++/* XDR decode pnfs_block_layout4 structure */ ++int ++nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len); ++ int i, status = -EIO; ++ uint32_t count; ++ struct pnfs_block_extent *be = NULL, *save; ++ uint64_t tmp; /* Used by READSECTOR */ ++ struct layout_verification lv = { ++ .mode = lgr->range.iomode, ++ .start = lgr->range.offset >> 9, ++ .inval = lgr->range.offset >> 9, ++ .cowread = lgr->range.offset >> 9, ++ }; ++ ++ LIST_HEAD(extents); ++ ++ BLK_READBUF(p, end, 4); ++ READ32(count); ++ dprintk("%s enter, number of extents %i\n", __func__, count); ++ /* Bound count by division: extent size * count can wrap in 32 bits */ ++ if (count > (end - p) / XDR_QUADLEN(28 + NFS4_PNFS_DEVICEID4_SIZE)) ++ goto out_err; ++ /* Decode individual extents, putting them in temporary ++ * staging area until whole layout is decoded to make error ++ * recovery easier.
++ */ ++ for (i = 0; i < count; i++) { ++ be = alloc_extent(); ++ if (!be) { ++ status = -ENOMEM; ++ goto out_err; ++ } ++ READ_DEVID(&be->be_devid); ++ be->be_mdev = translate_devid(lo, &be->be_devid); ++ if (!be->be_mdev) ++ goto out_err; ++ /* The next three values are read in as bytes, ++ * but stored as 512-byte sector lengths ++ */ ++ READ_SECTOR(be->be_f_offset); ++ READ_SECTOR(be->be_length); ++ READ_SECTOR(be->be_v_offset); ++ READ32(be->be_state); ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ be->be_inval = &bl->bl_inval; ++ if (verify_extent(be, &lv)) { ++ dprintk("%s verify failed\n", __func__); ++ goto out_err; ++ } ++ list_add_tail(&be->be_node, &extents); ++ } ++ if (p != end) { ++ dprintk("%s Undecoded cruft at end of opaque\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lgr->range.offset + lgr->range.length != lv.start << 9) { ++ dprintk("%s Final length mismatch\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lv.start < lv.cowread) { ++ dprintk("%s Final uncovered COW extent\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ /* Extents decoded properly, now try to merge them in to ++ * existing layout extents. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ list_for_each_entry_safe(be, save, &extents, be_node) { ++ list_del(&be->be_node); ++ status = add_and_merge_extent(bl, be); ++ if (status) { ++ spin_unlock(&bl->bl_ext_lock); ++ /* This is a fairly catastrophic error, as the ++ * entire layout extent lists are now corrupted. ++ * We should have some way to distinguish this. 
++ */ ++ be = NULL; ++ goto out_err; ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ status = 0; ++ out: ++ dprintk("%s returns %i\n", __func__, status); ++ return status; ++ ++ out_err: ++ put_extent(be); ++ while (!list_empty(&extents)) { ++ be = list_first_entry(&extents, struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ put_extent(be); ++ } ++ goto out; ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-31 20:42:05.506119071 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-31 20:42:05.506119071 -0400 +@@ -0,0 +1,120 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdm.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2007 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Fred Isaman ++ * Andy Adamson ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. 
the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include /* gendisk - used in a dprintk*/ ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Defines used for calculating memory usage in nfs4_blk_flatten() */ ++#define ARGSIZE 24 /* Max bytes needed for linear target arg string */ ++#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE) ++#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE) ++#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \ ++ (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE) ++#define roundup8(x) (((x)+7) & ~7) ++#define sizeof8(x) roundup8(sizeof(x)) ++ ++static int dev_remove(dev_t dev) ++{ ++ int ret = 1; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint64_t bl_dev; ++ uint32_t major = MAJOR(dev), minor = MINOR(dev); ++ ++ dprintk("Entering %s\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return ret; ++ ++ memcpy((void *)&bl_dev, &major, sizeof(uint32_t)); ++ memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t)); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev, ++ sizeof(uint64_t)); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out; ++ } ++ ++ if (reply->status == BL_DEVICE_REQUEST_PROC) ++ ret = 0; /*TODO: what to return*/ ++out: ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return ret; ++} ++ ++/* ++ * 
Release meta device ++ */ ++static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) ++{ ++ int rv; ++ dev_t dev = bdev->bm_mdev->bd_dev; /* read before put drops our ref */ ++ ++ dprintk("%s Releasing\n", __func__); ++ /* XXX Check return? */ ++ rv = nfs4_blkdev_put(bdev->bm_mdev); ++ dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv); ++ rv = dev_remove(dev); ++ dprintk("%s Returns %d\n", __func__, rv); ++ return rv; ++} ++ ++void free_block_dev(struct pnfs_block_dev *bdev) ++{ ++ if (bdev) { ++ if (bdev->bm_mdev) { ++ dprintk("%s Removing DM device: %d:%d\n", ++ __func__, ++ MAJOR(bdev->bm_mdev->bd_dev), ++ MINOR(bdev->bm_mdev->bd_dev)); ++ /* XXX Check status ?? */ ++ nfs4_blk_metadev_release(bdev); ++ } ++ kfree(bdev); ++ } ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-31 20:42:05.505169618 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-31 20:42:05.505169618 -0400 +@@ -0,0 +1,302 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.h ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included.
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#ifndef FS_NFS_NFS4BLOCKLAYOUT_H ++#define FS_NFS_NFS4BLOCKLAYOUT_H ++ ++#include ++#include ++#include /* Needed for struct dm_ioctl*/ ++ ++#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9) ++ ++#define PG_pnfserr PG_owner_priv_1 ++#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags) ++#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags) ++#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags) ++ ++extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_do_resume(struct dm_ioctl *param); ++extern int dm_table_load(struct dm_ioctl *param, size_t param_size); ++ ++struct block_mount_id { ++ spinlock_t bm_lock; /* protects list */ ++ struct list_head bm_devlist; /* holds pnfs_block_dev */ ++}; ++ ++struct pnfs_block_dev { ++ struct list_head bm_node; ++ struct pnfs_deviceid bm_mdevid; /* associated devid */ ++ struct block_device *bm_mdev; /* meta device itself */ ++}; ++ ++/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */ ++struct visible_block_device { ++ struct list_head vi_node; ++ struct block_device *vi_bdev; ++ int vi_mapped; ++ int vi_put_done; ++}; ++ ++enum blk_vol_type { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, 
/* maps to a single LU */ ++ PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */ ++ PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */ ++ PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */ ++}; ++ ++/* All disk offset/lengths are stored in 512-byte sectors */ ++struct pnfs_blk_volume { ++ uint32_t bv_type; ++ sector_t bv_size; ++ struct pnfs_blk_volume **bv_vols; ++ int bv_vol_n; ++ union { ++ dev_t bv_dev; ++ sector_t bv_stripe_unit; ++ sector_t bv_offset; ++ }; ++}; ++ ++/* Since components need not be aligned, cannot use sector_t */ ++struct pnfs_blk_sig_comp { ++ int64_t bs_offset; /* In bytes */ ++ uint32_t bs_length; /* In bytes */ ++ char *bs_string; ++}; ++ ++/* Maximum number of signatures components in a simple volume */ ++# define PNFS_BLOCK_MAX_SIG_COMP 16 ++ ++struct pnfs_blk_sig { ++ int si_num_comps; ++ struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP]; ++}; ++ ++enum exstate4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ ++ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ ++}; ++ ++#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ ++ ++struct my_tree_t { ++ sector_t mtt_step_size; /* Internal sector alignment */ ++ struct list_head mtt_stub; /* Should be a radix tree */ ++}; ++ ++struct pnfs_inval_markings { ++ spinlock_t im_lock; ++ struct my_tree_t im_tree; /* Sectors that need LAYOUTCOMMIT */ ++ sector_t im_block_size; /* Server blocksize in sectors */ ++}; ++ ++struct pnfs_inval_tracking { ++ struct list_head it_link; ++ int it_sector; ++ int it_tags; ++}; ++ ++/* sector_t fields are all in 512-byte sectors */ ++struct pnfs_block_extent { ++ struct kref be_refcnt; ++ struct list_head be_node; /* link into lseg list */ ++ struct pnfs_deviceid be_devid; /* STUB - remevable??? 
*/ ++ struct block_device *be_mdev; ++ sector_t be_f_offset; /* the starting offset in the file */ ++ sector_t be_length; /* the size of the extent */ ++ sector_t be_v_offset; /* the starting offset in the volume */ ++ enum exstate4 be_state; /* the state of this extent */ ++ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ ++}; ++ ++/* Shortened extent used by LAYOUTCOMMIT */ ++struct pnfs_block_short_extent { ++ struct list_head bse_node; ++ struct pnfs_deviceid bse_devid; /* STUB - removable??? */ ++ struct block_device *bse_mdev; ++ sector_t bse_f_offset; /* the starting offset in the file */ ++ sector_t bse_length; /* the size of the extent */ ++}; ++ ++static inline void ++INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) ++{ ++ spin_lock_init(&marks->im_lock); ++ INIT_LIST_HEAD(&marks->im_tree.mtt_stub); ++ marks->im_block_size = blocksize; ++ marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, ++ blocksize); ++} ++ ++enum extentclass4 { ++ RW_EXTENT = 0, /* READWRTE and INVAL */ ++ RO_EXTENT = 1, /* READ and NONE */ ++ EXTENT_LISTS = 2, ++}; ++ ++static inline int choose_list(enum exstate4 state) ++{ ++ if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) ++ return RO_EXTENT; ++ else ++ return RW_EXTENT; ++} ++ ++struct pnfs_block_layout { ++ struct pnfs_layout_hdr bl_layout; ++ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ ++ spinlock_t bl_ext_lock; /* Protects list manipulation */ ++ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ ++ struct list_head bl_commit; /* Needs layout commit */ ++ unsigned int bl_count; /* entries in bl_commit */ ++ sector_t bl_blocksize; /* Server blocksize in sectors */ ++}; ++ ++/* this struct is comunicated between: ++ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit ++ */ ++struct bl_layoutupdate_data { ++ struct list_head ranges; ++}; ++ ++#define BLK_ID(lo) ((struct block_mount_id 
*)(PNFS_NFS_SERVER(lo)->pnfs_ld_data)) ++ ++static inline struct pnfs_block_layout * ++BLK_LO2EXT(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct pnfs_block_layout, bl_layout); ++} ++ ++static inline struct pnfs_block_layout * ++BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) ++{ ++ return BLK_LO2EXT(lseg->layout); ++} ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes); ++ ++#define BLK_READBUF(p, e, nbytes) do { \ ++ p = blk_overflow(p, e, nbytes); \ ++ if (!p) { \ ++ printk(KERN_WARNING \ ++ "%s: reply buffer overflowed in line %d.\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++} while (0) ++ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (uint64_t)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++#define READ_DEVID(x) COPYMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++#define READ_SECTOR(x) do { \ ++ READ64(tmp); \ ++ if (tmp & 0x1ff) { \ ++ printk(KERN_WARNING \ ++ "%s Value not 512-byte aligned at line %d\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++ (x) = tmp >> 9; \ ++} while (0) ++ ++#define WRITE32(n) do { \ ++ *p++ = htonl(n); \ ++ } while (0) ++#define WRITE64(n) do { \ ++ *p++ = htonl((uint32_t)((n) >> 32)); \ ++ *p++ = htonl((uint32_t)(n)); \ ++} while (0) ++#define WRITEMEM(ptr, nbytes) do { \ ++ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ ++} while (0) ++#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++ ++/* blocklayoutdev.c */ ++struct block_device *nfs4_blkdev_get(dev_t dev); ++int nfs4_blkdev_put(struct block_device *bdev); ++struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist); ++int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr); ++int nfs4_blk_create_block_disk_list(struct list_head *); ++void 
nfs4_blk_destroy_disk_list(struct list_head *); ++/* blocklayoutdm.c */ ++int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *); ++void free_block_dev(struct pnfs_block_dev *bdev); ++/* extents.c */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read); ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages); ++void put_extent(struct pnfs_block_extent *be); ++struct pnfs_block_extent *alloc_extent(void); ++struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be); ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect); ++int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg); ++void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status); ++int add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new); ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length); ++ ++#include ++ ++extern struct pipefs_list bl_device_list; ++extern struct dentry *bl_device_pipe; ++ ++int bl_pipe_init(void); ++void bl_pipe_exit(void); ++ ++#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ ++#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ ++#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ ++#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ ++#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ ++ ++#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-31 20:42:05.507113260 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-31 20:42:05.508119925 -0400 +@@ -0,0 
+1,948 @@ ++/* ++ * linux/fs/nfs/blocklayout/extents.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include "blocklayout.h" ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Bit numbers */ ++#define EXTENT_INITIALIZED 0 ++#define EXTENT_WRITTEN 1 ++#define EXTENT_IN_COMMIT 2 ++#define INTERNAL_EXISTS MY_MAX_TAGS ++#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) ++ ++/* Returns largest t<=s s.t.
t%base==0 */ ++static inline sector_t normalize(sector_t s, int base) ++{ ++ sector_t tmp = s; /* Since do_div modifies its argument */ ++ return s - do_div(tmp, base); ++} ++ ++static inline sector_t normalize_up(sector_t s, int base) ++{ ++ return normalize(s + base - 1, base); ++} ++ ++/* Complete stub using list while determine API wanted */ ++ ++/* Returns tags, or negative */ ++static int32_t _find_entry(struct my_tree_t *tree, u64 s) ++{ ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu) enter\n", __func__, s); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) ++ return pos->it_tags & INTERNAL_MASK; ++ else ++ break; ++ } ++ return -ENOENT; ++} ++ ++static inline ++int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag) ++{ ++ int32_t tags; ++ ++ dprintk("%s(%llu, %i) enter\n", __func__, s, tag); ++ s = normalize(s, tree->mtt_step_size); ++ tags = _find_entry(tree, s); ++ if ((tags < 0) || !(tags & (1 << tag))) ++ return 0; ++ else ++ return 1; ++} ++ ++/* Creates entry with tag, or if entry already exists, unions tag to it. ++ * If storage is not NULL, newly created entry will use it. ++ * Returns number of entries added, or negative on error. 
++ */ ++static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag, ++ struct pnfs_inval_tracking *storage) ++{ ++ int found = 0; ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) { ++ found = 1; ++ break; ++ } else ++ break; ++ } ++ if (found) { ++ pos->it_tags |= (1 << tag); ++ return 0; ++ } else { ++ struct pnfs_inval_tracking *new; ++ if (storage) ++ new = storage; ++ else { ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ } ++ new->it_sector = s; ++ new->it_tags = (1 << tag); ++ list_add(&new->it_link, &pos->it_link); ++ return 1; ++ } ++} ++ ++/* XXXX Really want option to not create */ ++/* Over range, unions tag with existing entries, else creates entry with tag */ ++static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length) ++{ ++ u64 i; ++ ++ dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); ++ for (i = normalize(s, tree->mtt_step_size); i < s + length; ++ i += tree->mtt_step_size) ++ if (_add_entry(tree, i, tag, NULL)) ++ return -ENOMEM; ++ return 0; ++} ++ ++/* Ensure that future operations on given range of tree will not malloc */ ++static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length) ++{ ++ u64 start, end, s; ++ int count, i, used = 0, status = -ENOMEM; ++ struct pnfs_inval_tracking **storage; ++ ++ dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); ++ start = normalize(offset, tree->mtt_step_size); ++ end = normalize_up(offset + length, tree->mtt_step_size); ++ count = (int)(end - start) / (int)tree->mtt_step_size; ++ ++ /* Pre-malloc what memory we might need */ ++ storage = kmalloc(sizeof(*storage) * count, GFP_KERNEL); ++ if (!storage) ++ return -ENOMEM; ++ for (i = 0; i < count; i++) { ++ storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), ++ 
GFP_KERNEL); ++ if (!storage[i]) ++ goto out_cleanup; ++ } ++ ++ /* Now need lock - HOW??? */ ++ ++ for (s = start; s < end; s += tree->mtt_step_size) ++ used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); ++ ++ /* Unlock - HOW??? */ ++ status = 0; ++ ++ out_cleanup: ++ for (i = used; i < count; i++) { ++ if (!storage[i]) ++ break; ++ kfree(storage[i]); ++ } ++ kfree(storage); ++ return status; ++} ++ ++static void set_needs_init(sector_t *array, sector_t offset) ++{ ++ sector_t *p = array; ++ ++ dprintk("%s enter\n", __func__); ++ if (!p) ++ return; ++ while (*p < offset) ++ p++; ++ if (*p == offset) ++ return; ++ else if (*p == ~0) { ++ *p++ = offset; ++ *p = ~0; ++ return; ++ } else { ++ sector_t *save = p; ++ dprintk("%s Adding %llu\n", __func__, (u64)offset); ++ while (*p != ~0) ++ p++; ++ p++; ++ memmove(save + 1, save, (char *)p - (char *)save); ++ *save = offset; ++ return; ++ } ++} ++ ++/* We are relying on page lock to serialize this */ ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Assume start, end already sector aligned */ ++static int ++_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag) ++{ ++ struct pnfs_inval_tracking *pos; ++ u64 expect = 0; ++ ++ dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector >= end) ++ continue; ++ if (!expect) { ++ if ((pos->it_sector == end - tree->mtt_step_size) && ++ (pos->it_tags & (1 << tag))) { ++ expect = pos->it_sector - tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ continue; ++ } else { ++ return 0; ++ } ++ } ++ if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) ++ return 0; ++ expect -= tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ } ++ return 0; ++} 
++ ++static int is_range_written(struct pnfs_inval_markings *marks, ++ sector_t start, sector_t end) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Marks sectors in [offest, offset_length) as having been initialized. ++ * All lengths are step-aligned, where step is min(pagesize, blocksize). ++ * Notes where partial block is initialized, and helps prepare it for ++ * complete initialization later. ++ */ ++/* Currently assumes offset is page-aligned */ ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages) ++{ ++ sector_t s, start, end; ++ sector_t *array = NULL; /* Pages to mark */ ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", ++ __func__, (u64)offset, (u64)length); ++ s = max((sector_t) 3, ++ 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); ++ dprintk("%s set max=%llu\n", __func__, (u64)s); ++ if (pages) { ++ array = kmalloc(s * sizeof(sector_t), GFP_KERNEL); ++ if (!array) ++ goto outerr; ++ array[0] = ~0; ++ } ++ ++ start = normalize(offset, marks->im_block_size); ++ end = normalize_up(offset + length, marks->im_block_size); ++ if (_preload_range(&marks->im_tree, start, end - start)) ++ goto outerr; ++ ++ spin_lock(&marks->im_lock); ++ ++ for (s = normalize_up(start, PAGE_CACHE_SECTORS); ++ s < offset; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s pre-area pages\n", __func__); ++ /* Portion of used block is not initialized */ ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) ++ goto out_unlock; ++ for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); ++ s < end; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s post-area pages\n", __func__); ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ ++ 
spin_unlock(&marks->im_lock); ++ ++ if (pages) { ++ if (array[0] == ~0) { ++ kfree(array); ++ *pages = NULL; ++ } else ++ *pages = array; ++ } ++ return 0; ++ ++ out_unlock: ++ spin_unlock(&marks->im_lock); ++ outerr: ++ if (pages) { ++ kfree(array); ++ *pages = NULL; ++ } ++ return -ENOMEM; ++} ++ ++/* Marks sectors in [offest, offset+length) as having been written to disk. ++ * All lengths should be block aligned. ++ */ ++int mark_written_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length) ++{ ++ int status; ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, ++ (u64)offset, (u64)length); ++ spin_lock(&marks->im_lock); ++ status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); ++ spin_unlock(&marks->im_lock); ++ return status; ++} ++ ++static void print_short_extent(struct pnfs_block_short_extent *be) ++{ ++ dprintk("PRINT SHORT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->bse_length); ++ } ++} ++ ++void print_clist(struct list_head *list, unsigned int count) ++{ ++ struct pnfs_block_short_extent *be; ++ unsigned int i = 0; ++ ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, bse_node) { ++ i++; ++ print_short_extent(be); ++ } ++ if (i != count) ++ dprintk("\n\nExpected %u entries\n\n\n", count); ++ dprintk("****************\n"); ++} ++ ++/* Note: In theory, we should do more checking that devid's match between ++ * old and new, but if they don't, the lists are too corrupt to salvage anyway. 
++ */ ++/* Note this is very similar to add_and_merge_extent */ ++static void add_to_commitlist(struct pnfs_block_layout *bl, ++ struct pnfs_block_short_extent *new) ++{ ++ struct list_head *clist = &bl->bl_commit; ++ struct pnfs_block_short_extent *old, *save; ++ sector_t end = new->bse_f_offset + new->bse_length; ++ ++ dprintk("%s enter\n", __func__); ++ print_short_extent(new); ++ print_clist(clist, bl->bl_count); ++ bl->bl_count++; ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. ++ */ ++ list_for_each_entry_safe(old, save, clist, bse_node) { ++ if (new->bse_f_offset < old->bse_f_offset) ++ break; ++ if (end <= old->bse_f_offset + old->bse_length) { ++ /* Range is already in list */ ++ bl->bl_count--; ++ kfree(new); ++ return; ++ } else if (new->bse_f_offset <= ++ old->bse_f_offset + old->bse_length) { ++ /* new overlaps or abuts existing be */ ++ if (new->bse_mdev == old->bse_mdev) { ++ /* extend new to fully replace old */ ++ new->bse_length += new->bse_f_offset - ++ old->bse_f_offset; ++ new->bse_f_offset = old->bse_f_offset; ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ } ++ /* Note that if we never hit the above break, old will not point to a ++ * valid extent. However, in that case &old->bse_node==list. ++ */ ++ list_add_tail(&new->bse_node, &old->bse_node); ++ /* Scan forward for overlaps. If we find any, extend new and ++ * remove the overlapped extent. 
++ */ ++ old = list_prepare_entry(new, clist, bse_node); ++ list_for_each_entry_safe_continue(old, save, clist, bse_node) { ++ if (end < old->bse_f_offset) ++ break; ++ /* new overlaps or abuts old */ ++ if (new->bse_mdev == old->bse_mdev) { ++ if (end < old->bse_f_offset + old->bse_length) { ++ /* extend new to fully cover old */ ++ end = old->bse_f_offset + old->bse_length; ++ new->bse_length = end - new->bse_f_offset; ++ } ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ dprintk("%s: after merging\n", __func__); ++ print_clist(clist, bl->bl_count); ++} ++ ++/* Note the range described by offset, length is guaranteed to be contained ++ * within be. ++ */ ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length) ++{ ++ sector_t new_end, end = offset + length; ++ struct pnfs_block_short_extent *new; ++ struct pnfs_block_layout *bl = container_of(be->be_inval, ++ struct pnfs_block_layout, ++ bl_inval); ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ mark_written_sectors(be->be_inval, offset, length); ++ /* We want to add the range to commit list, but it must be ++ * block-normalized, and verified that the normalized range has ++ * been entirely written to disk. 
++ */ ++ new->bse_f_offset = offset; ++ offset = normalize(offset, bl->bl_blocksize); ++ if (offset < new->bse_f_offset) { ++ if (is_range_written(be->be_inval, offset, new->bse_f_offset)) ++ new->bse_f_offset = offset; ++ else ++ new->bse_f_offset = offset + bl->bl_blocksize; ++ } ++ new_end = normalize_up(end, bl->bl_blocksize); ++ if (end < new_end) { ++ if (is_range_written(be->be_inval, end, new_end)) ++ end = new_end; ++ else ++ end = new_end - bl->bl_blocksize; ++ } ++ if (end <= new->bse_f_offset) { ++ kfree(new); ++ return 0; ++ } ++ new->bse_length = end - new->bse_f_offset; ++ new->bse_devid = be->be_devid; ++ new->bse_mdev = be->be_mdev; ++ ++ spin_lock(&bl->bl_ext_lock); ++ /* new will be freed, either by add_to_commitlist if it decides not ++ * to use it, or after LAYOUTCOMMIT uses it in the commitlist. ++ */ ++ add_to_commitlist(bl, new); ++ spin_unlock(&bl->bl_ext_lock); ++ return 0; ++} ++ ++static void print_bl_extent(struct pnfs_block_extent *be) ++{ ++ dprintk("PRINT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->be_length); ++ dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); ++ dprintk(" be_state %d\n", be->be_state); ++ } ++} ++ ++static void ++destroy_extent(struct kref *kref) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = container_of(kref, struct pnfs_block_extent, be_refcnt); ++ dprintk("%s be=%p\n", __func__, be); ++ kfree(be); ++} ++ ++void ++put_extent(struct pnfs_block_extent *be) ++{ ++ if (be) { ++ dprintk("%s enter %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_put(&be->be_refcnt, destroy_extent); ++ } ++} ++ ++struct pnfs_block_extent *alloc_extent(void) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL); ++ if (!be) ++ return NULL; ++ INIT_LIST_HEAD(&be->be_node); ++ kref_init(&be->be_refcnt); ++ be->be_inval = NULL; ++ return be; ++} ++ ++struct 
pnfs_block_extent * ++get_extent(struct pnfs_block_extent *be) ++{ ++ if (be) ++ kref_get(&be->be_refcnt); ++ return be; ++} ++ ++void print_elist(struct list_head *list) ++{ ++ struct pnfs_block_extent *be; ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, be_node) { ++ print_bl_extent(be); ++ } ++ dprintk("****************\n"); ++} ++ ++static inline int ++extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) ++{ ++ /* Note this assumes new->be_f_offset >= old->be_f_offset */ ++ return (new->be_state == old->be_state) && ++ ((new->be_state == PNFS_BLOCK_NONE_DATA) || ++ ((new->be_v_offset - old->be_v_offset == ++ new->be_f_offset - old->be_f_offset) && ++ new->be_mdev == old->be_mdev)); ++} ++ ++/* Adds new to appropriate list in bl, modifying new and removing existing ++ * extents as appropriate to deal with overlaps. ++ * ++ * See find_get_extent for list constraints. ++ * ++ * Refcount on new is already set. If end up not using it, or error out, ++ * need to put the reference. ++ * ++ * Lock is held by caller. ++ */ ++int ++add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new) ++{ ++ struct pnfs_block_extent *be, *tmp; ++ sector_t end = new->be_f_offset + new->be_length; ++ struct list_head *list; ++ ++ dprintk("%s enter with be=%p\n", __func__, new); ++ print_bl_extent(new); ++ list = &bl->bl_extents[choose_list(new->be_state)]; ++ print_elist(list); ++ ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. 
++ */ ++ list_for_each_entry_safe_reverse(be, tmp, list, be_node) { ++ if (new->be_f_offset >= be->be_f_offset + be->be_length) ++ break; ++ if (new->be_f_offset >= be->be_f_offset) { ++ if (end <= be->be_f_offset + be->be_length) { ++ /* new is a subset of existing be*/ ++ if (extents_consistent(be, new)) { ++ dprintk("%s: new is subset, ignoring\n", ++ __func__); ++ put_extent(new); ++ return 0; ++ } else { ++ goto out_err; ++ } ++ } else { ++ /* |<-- be -->| ++ * |<-- new -->| */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ new->be_length += new->be_f_offset - ++ be->be_f_offset; ++ new->be_f_offset = be->be_f_offset; ++ new->be_v_offset = be->be_v_offset; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } else if (end >= be->be_f_offset + be->be_length) { ++ /* new extent overlap existing be */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } else if (end > be->be_f_offset) { ++ /* |<-- be -->| ++ *|<-- new -->| */ ++ if (extents_consistent(new, be)) { ++ /* extend new to fully replace be */ ++ new->be_length += be->be_f_offset + be->be_length - ++ new->be_f_offset - new->be_length; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } ++ /* Note that if we never hit the above break, be will not point to a ++ * valid extent. However, in that case &be->be_node==list. ++ */ ++ list_add(&new->be_node, &be->be_node); ++ dprintk("%s: inserting new\n", __func__); ++ print_elist(list); ++ /* STUB - The per-list consistency checks have all been done, ++ * should now check cross-list consistency. ++ */ ++ return 0; ++ ++ out_err: ++ put_extent(new); ++ return -EIO; ++} ++ ++/* Returns extent, or NULL. 
If a second READ extent exists, it is returned ++ * in cow_read, if given. ++ * ++ * The extents are kept in two seperate ordered lists, one for READ and NONE, ++ * one for READWRITE and INVALID. Within each list, we assume: ++ * 1. Extents are ordered by file offset. ++ * 2. For any given isect, there is at most one extents that matches. ++ */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read) ++{ ++ struct pnfs_block_extent *be, *cow, *ret; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ cow = ret = NULL; ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret && ++ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ if (!ret) ++ ret = be; ++ else if (be->be_state != PNFS_BLOCK_READ_DATA) ++ put_extent(be); ++ else ++ cow = be; ++ break; ++ } ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ if (cow_read) ++ *cow_read = cow; ++ print_bl_extent(ret); ++ return ret; ++} ++ ++/* Similar to find_get_extent, but called with lock held, and ignores cow */ ++static struct pnfs_block_extent * ++find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) ++{ ++ struct pnfs_block_extent *be, *ret = NULL; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ 
atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ ret = be; ++ break; ++ } ++ } ++ } ++ print_bl_extent(ret); ++ return ret; ++} ++ ++int ++encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg) ++{ ++ sector_t start, end; ++ struct pnfs_block_short_extent *lce, *save; ++ unsigned int count = 0; ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct list_head *ranges = &bld->ranges; ++ __be32 *p, *xdr_start; ++ ++ dprintk("%s enter\n", __func__); ++ start = arg->range.offset >> 9; ++ end = start + (arg->range.length >> 9); ++ dprintk("%s set start=%llu, end=%llu\n", ++ __func__, (u64)start, (u64)end); ++ ++ /* BUG - creation of bl_commit is buggy - need to wait for ++ * entire block to be marked WRITTEN before it can be added. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ /* Want to adjust for possible truncate */ ++ /* We now want to adjust argument range */ ++ ++ /* XDR encode the ranges found */ ++ xdr_start = xdr_reserve_space(xdr, 8); ++ if (!xdr_start) ++ goto out; ++ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { ++ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); ++ if (!p) ++ break; ++ WRITE_DEVID(&lce->bse_devid); ++ WRITE64(lce->bse_f_offset << 9); ++ WRITE64(lce->bse_length << 9); ++ WRITE64(0LL); ++ WRITE32(PNFS_BLOCK_READWRITE_DATA); ++ list_del(&lce->bse_node); ++ list_add_tail(&lce->bse_node, ranges); ++ bl->bl_count--; ++ count++; ++ } ++ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); ++ xdr_start[1] = cpu_to_be32(count); ++out: ++ spin_unlock(&bl->bl_ext_lock); ++ dprintk("%s found %i ranges\n", __func__, count); ++ return 0; ++} ++ ++/* Helper function to set_to_rw that initialize a new extent */ ++static void ++_prep_new_extent(struct pnfs_block_extent *new, ++ struct pnfs_block_extent *orig, ++ sector_t offset, sector_t length, int state) ++{ ++ kref_init(&new->be_refcnt); ++ /* don't need to 
INIT_LIST_HEAD(&new->be_node) */ ++ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct pnfs_deviceid)); ++ new->be_mdev = orig->be_mdev; ++ new->be_f_offset = offset; ++ new->be_length = length; ++ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; ++ new->be_state = state; ++ new->be_inval = orig->be_inval; ++} ++ ++/* Tries to merge be with extent in front of it in list. ++ * Frees storage if not used. ++ */ ++static struct pnfs_block_extent * ++_front_merge(struct pnfs_block_extent *be, struct list_head *head, ++ struct pnfs_block_extent *storage) ++{ ++ struct pnfs_block_extent *prev; ++ ++ if (!storage) ++ goto no_merge; ++ if (&be->be_node == head || be->be_node.prev == head) ++ goto no_merge; ++ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); ++ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || ++ !extents_consistent(prev, be)) ++ goto no_merge; ++ _prep_new_extent(storage, prev, prev->be_f_offset, ++ prev->be_length + be->be_length, prev->be_state); ++ list_replace(&prev->be_node, &storage->be_node); ++ put_extent(prev); ++ list_del(&be->be_node); ++ put_extent(be); ++ return storage; ++ ++ no_merge: ++ kfree(storage); ++ return be; ++} ++ ++static u64 ++set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) ++{ ++ u64 rv = offset + length; ++ struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; ++ struct pnfs_block_extent *children[3]; ++ struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; ++ int i = 0, j; ++ ++ dprintk("%s(%llu, %llu)\n", __func__, offset, length); ++ /* Create storage for up to three new extents e1, e2, e3 */ ++ e1 = kmalloc(sizeof(*e1), GFP_KERNEL); ++ e2 = kmalloc(sizeof(*e2), GFP_KERNEL); ++ e3 = kmalloc(sizeof(*e3), GFP_KERNEL); ++ /* BUG - we are ignoring any failure */ ++ if (!e1 || !e2 || !e3) ++ goto out_nosplit; ++ ++ spin_lock(&bl->bl_ext_lock); ++ be = find_get_extent_locked(bl, offset); ++ rv = be->be_f_offset + be->be_length; ++ if (be->be_state != 
PNFS_BLOCK_INVALID_DATA) { ++ spin_unlock(&bl->bl_ext_lock); ++ goto out_nosplit; ++ } ++ /* Add e* to children, bumping e*'s krefs */ ++ if (be->be_f_offset != offset) { ++ _prep_new_extent(e1, be, be->be_f_offset, ++ offset - be->be_f_offset, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e1; ++ print_bl_extent(e1); ++ } else ++ merge1 = e1; ++ _prep_new_extent(e2, be, offset, ++ min(length, be->be_f_offset + be->be_length - offset), ++ PNFS_BLOCK_READWRITE_DATA); ++ children[i++] = e2; ++ print_bl_extent(e2); ++ if (offset + length < be->be_f_offset + be->be_length) { ++ _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, ++ be->be_f_offset + be->be_length - ++ offset - length, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e3; ++ print_bl_extent(e3); ++ } else ++ merge2 = e3; ++ ++ /* Remove be from list, and insert the e* */ ++ /* We don't get refs on e*, since this list is the base reference ++ * set when init'ed. ++ */ ++ if (i < 3) ++ children[i] = NULL; ++ new = children[0]; ++ list_replace(&be->be_node, &new->be_node); ++ put_extent(be); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); ++ for (j = 1; j < i; j++) { ++ old = new; ++ new = children[j]; ++ list_add(&new->be_node, &old->be_node); ++ } ++ if (merge2) { ++ /* This is a HACK, should just create a _back_merge function */ ++ new = list_entry(new->be_node.next, ++ struct pnfs_block_extent, be_node); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ ++ /* Since we removed the base reference above, be is now scheduled for ++ * destruction. 
++ */ ++ put_extent(be); ++ dprintk("%s returns %llu after split\n", __func__, rv); ++ return rv; ++ ++ out_nosplit: ++ kfree(e1); ++ kfree(e2); ++ kfree(e3); ++ dprintk("%s returns %llu without splitting\n", __func__, rv); ++ return rv; ++} ++ ++void ++clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status) ++{ ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct pnfs_block_short_extent *lce, *save; ++ ++ dprintk("%s status %d\n", __func__, status); ++ list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) { ++ if (likely(!status)) { ++ u64 offset = lce->bse_f_offset; ++ u64 end = offset + lce->bse_length; ++ ++ do { ++ offset = set_to_rw(bl, offset, end - offset); ++ } while (offset < end); ++ ++ kfree(lce); ++ } else { ++ spin_lock(&bl->bl_ext_lock); ++ add_to_commitlist(bl, lce); ++ spin_unlock(&bl->bl_ext_lock); ++ } ++ } ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-31 20:42:05.502212803 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-31 20:42:05.502212803 -0400 +@@ -0,0 +1,6 @@ ++# ++# Makefile for the pNFS block layout driver kernel module ++# ++obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o ++blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \ ++ extents.o block-device-discovery-pipe.o +diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h +--- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-31 20:42:05.508119925 -0400 +@@ -111,6 +111,13 @@ extern int nfs41_validate_delegation_sta + + #define RCA4_TYPE_MASK_RDATA_DLG 0 + #define RCA4_TYPE_MASK_WDATA_DLG 1 ++#define RCA4_TYPE_MASK_DIR_DLG 2 ++#define RCA4_TYPE_MASK_FILE_LAYOUT 3 ++#define RCA4_TYPE_MASK_BLK_LAYOUT 4 
++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 ++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 + + struct cb_recallanyargs { + struct sockaddr *craa_addr; +@@ -127,6 +134,39 @@ struct cb_recallslotargs { + extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy); + ++struct cb_layoutrecallargs { ++ struct sockaddr *cbl_addr; ++ struct nfs_fh cbl_fh; ++ struct pnfs_layout_range cbl_seg; ++ struct nfs_fsid cbl_fsid; ++ uint32_t cbl_recall_type; ++ uint32_t cbl_layout_type; ++ uint32_t cbl_layoutchanged; ++ nfs4_stateid cbl_stateid; ++}; ++ ++extern unsigned nfs4_callback_layoutrecall( ++ struct cb_layoutrecallargs *args, ++ void *dummy); ++ ++struct cb_devicenotifyitem { ++ uint32_t cbd_notify_type; ++ uint32_t cbd_layout_type; ++ struct pnfs_deviceid cbd_dev_id; ++ uint32_t cbd_immediate; ++}; ++ ++/* XXX: Should be dynamic up to max compound size */ ++#define NFS4_DEV_NOTIFY_MAXENTRIES 10 ++struct cb_devicenotifyargs { ++ struct sockaddr *addr; ++ int ndevs; ++ struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES]; ++}; ++ ++extern unsigned nfs4_callback_devicenotify( ++ struct cb_devicenotifyargs *args, ++ void *dummy); + #endif /* CONFIG_NFS_V4_1 */ + + extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); +diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c +--- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-31 20:42:05.509093330 -0400 +@@ -8,10 +8,15 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + #include "nfs4_fs.h" + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include "pnfs.h" + + #ifdef NFS_DEBUG + #define NFSDBG_FACILITY NFSDBG_CALLBACK +@@ -62,16 +67,6 @@ out: + return res->status; + } + +-static int 
(*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) +-{ +-#if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion > 0) +- return nfs41_validate_delegation_stateid; +-#endif +- return nfs4_validate_delegation_stateid; +-} +- +- + __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) + { + struct nfs_client *clp; +@@ -92,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_re + inode = nfs_delegation_find_inode(clp, &args->fh); + if (inode != NULL) { + /* Set up a helper thread to actually return the delegation */ +- switch (nfs_async_inode_return_delegation(inode, &args->stateid, +- nfs_validate_delegation_stateid(clp))) { ++ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; +@@ -116,24 +110,364 @@ out: + + int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { +- if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (delegation == NULL || memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data))) + return 0; + return 1; + } + + #if defined(CONFIG_NFS_V4_1) + ++static bool ++pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo, ++ const nfs4_stateid stateid) ++{ ++ int seqlock; ++ bool res; ++ u32 oldseqid, newseqid; ++ ++ do { ++ seqlock = read_seqbegin(&lo->seqlock); ++ oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid); ++ newseqid = be32_to_cpu(stateid.u.stateid.seqid); ++ res = !memcmp(lo->stateid.u.stateid.other, ++ stateid.u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE); ++ if (res) { /* comparing layout stateids */ ++ if (oldseqid == ~0) ++ res = (newseqid == 1); ++ else ++ res = (newseqid == oldseqid + 1); ++ } else { /* open stateid */ ++ res = !memcmp(lo->stateid.u.data, ++ &zero_stateid, ++ NFS4_STATEID_SIZE); ++ if (res) ++ res = (newseqid == 1); ++ } ++ } while 
(read_seqretry(&lo->seqlock, seqlock)); ++ ++ return res; ++} ++ ++/* ++ * Retrieve an inode based on layout recall parameters ++ * ++ * Note: caller must iput(inode) to dereference the inode. ++ */ ++static struct inode * ++nfs_layoutrecall_find_inode(struct nfs_client *clp, ++ const struct cb_layoutrecallargs *args) ++{ ++ struct nfs_inode *nfsi; ++ struct pnfs_layout_hdr *lo; ++ struct nfs_server *server; ++ struct inode *ino = NULL; ++ ++ dprintk("%s: Begin recall_type=%d clp %p\n", ++ __func__, args->cbl_recall_type, clp); ++ ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry(lo, &clp->cl_layouts, layouts) { ++ nfsi = PNFS_NFS_INODE(lo); ++ if (!nfsi) ++ continue; ++ ++ dprintk("%s: Searching inode=%lu\n", ++ __func__, nfsi->vfs_inode.i_ino); ++ ++ if (args->cbl_recall_type == RETURN_FILE) { ++ if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh)) ++ continue; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ server = NFS_SERVER(&nfsi->vfs_inode); ++ if (server->fsid.major != args->cbl_fsid.major || ++ server->fsid.minor != args->cbl_fsid.minor) ++ continue; ++ } ++ ++ /* Make sure client didn't clean up layout without ++ * telling the server */ ++ if (!has_layout(nfsi)) ++ continue; ++ ++ ino = igrab(&nfsi->vfs_inode); ++ dprintk("%s: Found inode=%p\n", __func__, ino); ++ break; ++ } ++ spin_unlock(&clp->cl_lock); ++ return ino; ++} ++ ++struct recall_layout_threadargs { ++ struct inode *inode; ++ struct nfs_client *clp; ++ struct completion started; ++ struct cb_layoutrecallargs *rl; ++ int result; ++}; ++ ++static int pnfs_recall_layout(void *data) ++{ ++ struct inode *inode, *ino; ++ struct nfs_client *clp; ++ struct cb_layoutrecallargs rl; ++ struct nfs4_layoutreturn *lrp; ++ struct recall_layout_threadargs *args = ++ (struct recall_layout_threadargs *)data; ++ int status = 0; ++ ++ daemonize("nfsv4-layoutreturn"); ++ ++ dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n", ++ __func__, args->rl->cbl_recall_type, ++ args->rl->cbl_fsid.major, 
args->rl->cbl_fsid.minor); ++ ++ clp = args->clp; ++ inode = args->inode; ++ rl = *args->rl; ++ ++ /* support whole file layouts only */ ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ ++ if (rl.cbl_recall_type == RETURN_FILE) { ++ if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout, ++ rl.cbl_stateid)) ++ status = pnfs_return_layout(inode, &rl.cbl_seg, ++ &rl.cbl_stateid, RETURN_FILE, ++ false); ++ else ++ status = cpu_to_be32(NFS4ERR_DELAY); ++ if (status) ++ dprintk("%s RETURN_FILE error: %d\n", __func__, status); ++ else ++ status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ args->result = status; ++ complete(&args->started); ++ goto out; ++ } ++ ++ status = cpu_to_be32(NFS4_OK); ++ args->result = status; ++ complete(&args->started); ++ args = NULL; ++ ++ /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */ ++ while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) { ++ /* FIXME: need to check status on pnfs_return_layout */ ++ pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false); ++ iput(ino); ++ } ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (!lrp) { ++ dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n", ++ __func__); ++ goto out; ++ } ++ ++ /* send final layoutreturn */ ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = rl.cbl_layout_type; ++ lrp->args.return_type = rl.cbl_recall_type; ++ lrp->args.range = rl.cbl_seg; ++ lrp->args.inode = inode; ++ nfs4_proc_layoutreturn(lrp, true); ++ ++out: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ nfs_put_client(clp); ++ module_put_and_exit(0); ++ dprintk("%s: exit status %d\n", __func__, 0); ++ return 0; ++} ++ ++/* ++ * Asynchronous layout recall! 
++ */ ++static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, ++ struct cb_layoutrecallargs *rl) ++{ ++ struct recall_layout_threadargs data = { ++ .clp = clp, ++ .inode = inode, ++ .rl = rl, ++ }; ++ struct task_struct *t; ++ int status = -EAGAIN; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* FIXME: do not allow two concurrent layout recalls */ ++ if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) ++ return status; ++ ++ init_completion(&data.started); ++ __module_get(THIS_MODULE); ++ if (!atomic_inc_not_zero(&clp->cl_count)) ++ goto out_put_no_client; ++ ++ t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout"); ++ if (IS_ERR(t)) { ++ printk(KERN_INFO "NFS: Layout recall callback thread failed " ++ "for client (clientid %08x/%08x)\n", ++ (unsigned)(clp->cl_clientid >> 32), ++ (unsigned)(clp->cl_clientid)); ++ status = PTR_ERR(t); ++ goto out_module_put; ++ } ++ wait_for_completion(&data.started); ++ return data.result; ++out_module_put: ++ nfs_put_client(clp); ++out_put_no_client: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ module_put(THIS_MODULE); ++ return status; ++} ++ ++static int pnfs_recall_all_layouts(struct nfs_client *clp) ++{ ++ struct cb_layoutrecallargs rl; ++ struct inode *inode; ++ int status = 0; ++ ++ rl.cbl_recall_type = RETURN_ALL; ++ rl.cbl_seg.iomode = IOMODE_ANY; ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ ++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, &rl); ++ if (!inode) ++ return status; ++ status = pnfs_async_return_layout(clp, inode, &rl); ++ iput(inode); ++ ++ return status; ++} ++ ++__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ struct inode *inode = NULL; ++ __be32 res; ++ int status; ++ unsigned int num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); ++ clp = 
nfs_find_client(args->cbl_addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->cbl_addr)); ++ goto out; ++ } ++ ++ res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ /* the callback must come from the MDS personality */ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) ++ goto loop; ++ if (args->cbl_recall_type == RETURN_FILE) { ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (inode != NULL) { ++ status = pnfs_async_return_layout(clp, inode, ++ args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++ } else { /* _ALL or _FSID */ ++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (!inode) ++ goto loop; ++ status = pnfs_async_return_layout(clp, inode, args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++loop: ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ ++/* Remove the deviceid(s) from the nfs_client deviceid cache */ ++static __be32 pnfs_devicenotify_client(struct nfs_client *clp, ++ struct cb_devicenotifyargs *args) ++{ ++ uint32_t type; ++ int i; ++ ++ dprintk("%s: --> clp %p\n", __func__, clp); ++ ++ for (i = 0; i < args->ndevs; i++) { ++ struct cb_devicenotifyitem *dev = &args->devs[i]; ++ type = dev->cbd_notify_type; ++ if (type == NOTIFY_DEVICEID4_DELETE && clp->cl_devid_cache) ++ nfs4_delete_device(clp->cl_devid_cache, ++ &dev->cbd_dev_id); ++ else if (type == NOTIFY_DEVICEID4_CHANGE) ++ printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE " ++ "not supported\n", __func__); ++ } ++ return 0; ++} ++ ++__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ __be32 res = 0; ++ unsigned int 
num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = __constant_htonl(NFS4ERR_INVAL); ++ clp = nfs_find_client(args->addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->addr)); ++ goto out; ++ } ++ ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ res = pnfs_devicenotify_client(clp, args); ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { + if (delegation == NULL) + return 0; + +- /* seqid is 4-bytes long */ +- if (((u32 *) &stateid->data)[0] != 0) ++ if (stateid->u.stateid.seqid != 0) + return 0; +- if (memcmp(&delegation->stateid.data[4], &stateid->data[4], +- sizeof(stateid->data)-4)) ++ if (memcmp(&delegation->stateid.u.stateid.other, ++ &stateid->u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE)) + return 0; + + return 1; +@@ -335,13 +669,37 @@ out: + return status; + } + ++static inline bool ++validate_bitmap_values(const unsigned long *mask) ++{ ++ int i; ++ ++ if (*mask == 0) ++ return true; ++ if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_WDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_DIR_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, mask) || ++ test_bit(RCA4_TYPE_MASK_BLK_LAYOUT, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OBJ_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OBJ_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OTHER_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OTHER_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ return false; ++} ++ + __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) + { + struct nfs_client *clp; + __be32 status; + fmode_t flags = 0; + +- status = htonl(NFS4ERR_OP_NOT_IN_SESSION); ++ 
status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->craa_addr, 4); + if (clp == NULL) + goto out; +@@ -349,16 +707,25 @@ __be32 nfs4_callback_recallany(struct cb + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + ++ status = cpu_to_be32(NFS4ERR_INVAL); ++ if (!validate_bitmap_values((const unsigned long *) ++ &args->craa_type_mask)) ++ return status; ++ ++ status = cpu_to_be32(NFS4_OK); + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; ++ if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) ++ &args->craa_type_mask)) ++ if (pnfs_recall_all_layouts(clp) == -EAGAIN) ++ status = cpu_to_be32(NFS4ERR_DELAY); + + if (flags) + nfs_expire_all_delegation_types(clp, flags); +- status = htonl(NFS4_OK); + out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c +--- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-31 20:42:05.510143651 -0400 +@@ -22,6 +22,8 @@ + #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + + #if defined(CONFIG_NFS_V4_1) ++#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) ++#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) + #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +@@ -136,7 +138,7 @@ static __be32 decode_stateid(struct xdr_ + p = read_buf(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); +- memcpy(stateid->data, p, 16); ++ memcpy(stateid->u.data, p, 16); + return 0; + } + +@@ -220,6 +222,148 @@ out: + + #if defined(CONFIG_NFS_V4_1) 
+ ++static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_layoutrecallargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ ++ args->cbl_addr = svc_addr(rqstp); ++ p = read_buf(xdr, 4 * sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ ++ args->cbl_layout_type = ntohl(*p++); ++ args->cbl_seg.iomode = ntohl(*p++); ++ args->cbl_layoutchanged = ntohl(*p++); ++ args->cbl_recall_type = ntohl(*p++); ++ ++ if (likely(args->cbl_recall_type == RETURN_FILE)) { ++ status = decode_fh(xdr, &args->cbl_fh); ++ if (unlikely(status != 0)) ++ goto out; ++ ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_seg.offset); ++ p = xdr_decode_hyper(p, &args->cbl_seg.length); ++ status = decode_stateid(xdr, &args->cbl_stateid); ++ if (unlikely(status != 0)) ++ goto out; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_fsid.major); ++ p = xdr_decode_hyper(p, &args->cbl_fsid.minor); ++ } ++ dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d " ++ "fsid %llx-%llx fhsize %d\n", __func__, ++ args->cbl_layout_type, args->cbl_seg.iomode, ++ args->cbl_layoutchanged, args->cbl_recall_type, ++ args->cbl_fsid.major, args->cbl_fsid.minor, ++ args->cbl_fh.size); ++out: ++ dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); ++ return status; ++} ++ ++static ++__be32 decode_devicenotify_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_devicenotifyargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ u32 tmp; ++ int n, i; ++ args->ndevs = 0; ++ ++ args->addr = svc_addr(rqstp); ++ ++ /* Num of device notifications */ ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) 
{ ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ n = ntohl(*p++); ++ if (n <= 0) ++ goto out; ++ ++ /* XXX: need to possibly return error in this case */ ++ if (n > NFS4_DEV_NOTIFY_MAXENTRIES) { ++ dprintk("%s: Processing (%d) notifications out of (%d)\n", ++ __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n); ++ n = NFS4_DEV_NOTIFY_MAXENTRIES; ++ } ++ ++ /* Decode each dev notification */ ++ for (i = 0; i < n; i++) { ++ struct cb_devicenotifyitem *dev = &args->devs[i]; ++ ++ p = read_buf(xdr, (4 * sizeof(uint32_t)) ++ + NFS4_PNFS_DEVICEID4_SIZE); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* bitmap size */ ++ if (tmp != 1) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_notify_type = ntohl(*p++); ++ if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* opaque size */ ++ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 8)) || ++ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 4))) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_layout_type = ntohl(*p++); ++ memcpy(dev->cbd_dev_id.data, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ /* only a CHANGE item carries the trailing "immediate" word (it is the extra 4 bytes in the opaque-size check above), so key on the notify type, not the layout type */ ++ if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_DELAY); ++ goto out; ++ } ++ dev->cbd_immediate = ntohl(*p++); ++ } else { ++ dev->cbd_immediate = 0; ++ } ++ ++ args->ndevs++; ++ ++ dprintk("%s: type %d layout 0x%x immediate %d\n", ++ __func__, dev->cbd_notify_type, dev->cbd_layout_type, ++ dev->cbd_immediate); ++ } ++out: ++ dprintk("%s: status %d ndevs %d\n", ++ __func__, ntohl(status), args->ndevs); ++ return status; ++} ++ + static __be32 decode_sessionid(struct xdr_stream
*xdr, + struct nfs4_sessionid *sid) + { +@@ -574,11 +718,11 @@ preprocess_nfs41_op(int nop, unsigned in + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: ++ case OP_CB_LAYOUTRECALL: ++ case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + +- case OP_CB_LAYOUTRECALL: +- case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: +@@ -739,6 +883,18 @@ static struct callback_op callback_ops[] + .res_maxsize = CB_OP_RECALL_RES_MAXSZ, + }, + #if defined(CONFIG_NFS_V4_1) ++ [OP_CB_LAYOUTRECALL] = { ++ .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, ++ .decode_args = ++ (callback_decode_arg_t)decode_layoutrecall_args, ++ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, ++ }, ++ [OP_CB_NOTIFY_DEVICEID] = { ++ .process_op = (callback_process_op_t)nfs4_callback_devicenotify, ++ .decode_args = ++ (callback_decode_arg_t)decode_devicenotify_args, ++ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, ++ }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, +diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-31 20:41:19.144140225 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-31 20:42:05.511222861 -0400 +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + + #include + +@@ -48,6 +49,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_CLIENT + +@@ -150,11 +152,14 @@ static struct nfs_client *nfs_alloc_clie + clp->cl_boot_time = CURRENT_TIME; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; ++ clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + #endif + cred = rpc_lookup_machine_cred(); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; +- ++#if 
defined(CONFIG_NFS_V4_1) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++#endif + nfs_fscache_get_client_cookie(clp); + + return clp; +@@ -178,7 +183,7 @@ static void nfs4_clear_client_minor_vers + clp->cl_session = NULL; + } + +- clp->cl_call_sync = _nfs4_call_sync; ++ clp->cl_mvops = nfs_v4_minor_ops[0]; + #endif /* CONFIG_NFS_V4_1 */ + } + +@@ -188,7 +193,7 @@ static void nfs4_clear_client_minor_vers + static void nfs4_destroy_callback(struct nfs_client *clp) + { + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) +- nfs_callback_down(clp->cl_minorversion); ++ nfs_callback_down(clp->cl_mvops->minor_version); + } + + static void nfs4_shutdown_client(struct nfs_client *clp) +@@ -251,6 +256,7 @@ void nfs_put_client(struct nfs_client *c + nfs_free_client(clp); + } + } ++EXPORT_SYMBOL(nfs_put_client); + + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + /* +@@ -343,7 +349,7 @@ static int nfs_sockaddr_match_ipaddr(con + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. 
+ */ +-static int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) + { + if (sa1->sa_family != sa2->sa_family) +@@ -357,6 +363,7 @@ static int nfs_sockaddr_cmp(const struct + } + return 0; + } ++EXPORT_SYMBOL(nfs_sockaddr_cmp); + + /* + * Find a client by IP address and protocol version +@@ -548,6 +555,7 @@ int nfs4_check_client_ready(struct nfs_c + return -EPROTONOSUPPORT; + return 0; + } ++EXPORT_SYMBOL(nfs4_check_client_ready); + + /* + * Initialise the timeout values for a connection +@@ -865,9 +873,34 @@ error: + } + + /* ++ * Initialize the pNFS layout driver and setup pNFS related parameters ++ */ ++static void nfs4_init_pnfs(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ struct nfs_client *clp = server->nfs_client; ++ ++ if (nfs4_has_session(clp) && ++ (clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) { ++ server->pnfs_blksize = fsinfo->blksize; ++ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); ++ pnfs_set_ds_iosize(server); ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++static void nfs4_uninit_pnfs(struct nfs_server *server) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ if (server->nfs_client && nfs4_has_session(server->nfs_client)) ++ unmount_pnfs_layoutdriver(server); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++/* + * Load up the server record from information gained in an fsinfo record + */ +-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) ++static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) + { + unsigned long max_rpc_payload; + +@@ -897,6 +930,8 @@ static void nfs_server_set_fsinfo(struct + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; ++ nfs4_init_pnfs(server, mntfh, fsinfo); ++ + server->wtmult = 
nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); +@@ -938,7 +973,7 @@ static int nfs_probe_fsinfo(struct nfs_s + if (error < 0) + goto out_error; + +- nfs_server_set_fsinfo(server, &fsinfo); ++ nfs_server_set_fsinfo(server, mntfh, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { +@@ -1016,6 +1051,7 @@ void nfs_free_server(struct nfs_server * + { + dprintk("--> nfs_free_server()\n"); + ++ nfs4_uninit_pnfs(server); + spin_lock(&nfs_client_lock); + list_del(&server->client_link); + list_del(&server->master_link); +@@ -1126,7 +1162,7 @@ static int nfs4_init_callback(struct nfs + return error; + } + +- error = nfs_callback_up(clp->cl_minorversion, ++ error = nfs_callback_up(clp->cl_mvops->minor_version, + clp->cl_rpcclient->cl_xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", +@@ -1143,10 +1179,8 @@ static int nfs4_init_callback(struct nfs + */ + static int nfs4_init_client_minor_version(struct nfs_client *clp) + { +- clp->cl_call_sync = _nfs4_call_sync; +- + #if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion) { ++ if (clp->cl_mvops->minor_version) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. +@@ -1158,7 +1192,13 @@ static int nfs4_init_client_minor_versio + return -ENOMEM; + + clp->cl_session = session; +- clp->cl_call_sync = _nfs4_call_sync_session; ++ /* ++ * The create session reply races with the server back ++ * channel probe. 
Mark the client NFS_CS_SESSION_INITING ++ * so that the client back channel can find the ++ * nfs_client struct ++ */ ++ clp->cl_cons_state = NFS_CS_SESSION_INITING; + } + #endif /* CONFIG_NFS_V4_1 */ + +@@ -1216,7 +1256,7 @@ error: + /* + * Set up an NFS4 client + */ +-static int nfs4_set_client(struct nfs_server *server, ++int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, +@@ -1259,6 +1299,7 @@ error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; + } ++EXPORT_SYMBOL(nfs4_set_client); + + + /* +@@ -1448,7 +1489,7 @@ struct nfs_server *nfs4_create_referral_ + data->authflavor, + parent_server->client->cl_xprt->prot, + parent_server->client->cl_timeout, +- parent_client->cl_minorversion); ++ parent_client->cl_mvops->minor_version); + if (error < 0) + goto error; + +diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c +--- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-31 20:42:05.550110844 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-31 20:42:05.550110844 -0400 +@@ -0,0 +1,292 @@ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); ++static void bl_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops bl_upcall_ops = { ++ .upcall = bl_pipe_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = bl_pipe_destroy_msg, ++}; ++ ++bl_comm_t *bl_comm_global; ++ ++int ++nfsd_bl_start(void) ++{ ++ bl_comm_t *bl_comm = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ 
dprintk("%s: starting pipe\n", __func__); ++ if (bl_comm_global) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL); ++ if (!bl_comm) { ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ /* FIXME: rename to "spnfs_block" */ ++ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm, ++ &bl_upcall_ops, 0); ++ if (IS_ERR(bl_comm->pipe_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ mutex_init(&bl_comm->lock); ++ mutex_init(&bl_comm->pipe_lock); ++ init_waitqueue_head(&bl_comm->pipe_wq); ++ ++ bl_comm_global = bl_comm; ++ return 0; ++err: ++ rpc_put_mount(); ++ kfree(bl_comm); ++ return rc; ++} ++ ++void ++nfsd_bl_stop(void) ++{ ++ bl_comm_t *c = bl_comm_global; ++ ++ dprintk("%s: stopping pipe\n", __func__); ++ if (!c) ++ return; ++ rpc_unlink(c->pipe_dentry); ++ rpc_put_mount(); ++ bl_comm_global = NULL; ++ kfree(c); ++} ++ ++static ssize_t ++bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst, ++ size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied, ++ left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ ++ return mlen; ++} ++ ++static ssize_t ++bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ bl_comm_t *bc = (bl_comm_t *)rpci->private; ++ bl_comm_msg_t *im = &bc->msg; ++ int ret; ++ bl_comm_res_t *res; ++ ++ ++ if (mlen == 0) { ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ im->msg_res = NULL; ++ wake_up(&bc->pipe_wq); ++ return -EFAULT; ++ } ++ ++ if ((res = kmalloc(mlen, 
GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(res, src, mlen)) { ++ kfree(res); ++ return -EFAULT; ++ } ++ ++ mutex_lock(&bc->pipe_lock); ++ ++ ret = mlen; ++ im->msg_status = res->res_status; ++ im->msg_res = res; ++ ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++ return ret; ++} ++ ++static void ++bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ bl_comm_msg_t *im = msg->data; ++ bl_comm_t *bc = container_of(im, struct bl_comm, msg); ++ ++ if (msg->errno >= 0) ++ return; ++ ++ mutex_lock(&bc->pipe_lock); ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++} ++ ++int ++bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res) ++{ ++ struct rpc_pipe_msg msg; ++ DECLARE_WAITQUEUE(wq, current); ++ int rval = 1; ++ bl_comm_msg_t *m = &bc->msg; ++ ++ if (bc == NULL) { ++ dprintk("%s: No pNFS block daemon available\n", __func__); ++ return 1; ++ } ++ ++ mutex_lock(&bc->lock); ++ mutex_lock(&bc->pipe_lock); ++ ++ memcpy(m, upmsg, sizeof (*m)); ++ ++ memset(&msg, 0, sizeof (msg)); ++ msg.data = m; ++ msg.len = sizeof (*m); ++ ++ add_wait_queue(&bc->pipe_wq, &wq); ++ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&bc->pipe_lock); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ mutex_lock(&bc->pipe_lock); ++ ++ if (m->msg_status == PNFS_BLOCK_SUCCESS) { ++ *res = m->msg_res; ++ rval = 0; ++ } else ++ rval = 1; ++ ++out: ++ mutex_unlock(&bc->pipe_lock); ++ mutex_unlock(&bc->lock); ++ return rval; ++} ++ ++static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len, ++ loff_t *offset) ++{ ++ int cmd, ++ rc; ++ bl_comm_t *bc = bl_comm_global; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; /* bl_upcall() stores to it only on success, yet the STOP case kfree()s it unconditionally */ ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int))) ++ return
-EFAULT; ++ switch (cmd) { ++ case PNFS_BLOCK_CTL_STOP: ++ msg.msg_type = PNFS_UPCALL_MSG_STOP; ++ (void) bl_upcall(bc, &msg, &res); ++ kfree(res); ++ nfsd_bl_stop(); ++ break; ++ ++ case PNFS_BLOCK_CTL_START: ++ rc = nfsd_bl_start(); ++ if (rc != 0) ++ return rc; ++ break; ++ ++ case PNFS_BLOCK_CTL_VERS: ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bc, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ kfree(res); ++ break; ++ ++ default: ++ dprintk("%s: unknown ctl command %d\n", __func__, cmd); ++ break; ++ } ++ return len; ++} ++ ++static struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++ ++/* ++ * bl_init_proc -- set up proc interfaces ++ * ++ * Creating a pnfs_block directory isn't really required at this point ++ * since we've only got a single node in that directory. If the need for ++ * more nodes doesn't present itself shortly this code should revert ++ * to a single top level node. McNeal 11-Aug-2008. ++ */ ++int ++bl_init_proc(void) ++{ ++ struct proc_dir_entry *e; ++ ++ e = proc_mkdir("fs/pnfs_block", NULL); ++ if (!e) ++ return -ENOMEM; ++ ++ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL); ++ if (!e) ++ return -ENOMEM; ++ e->proc_fops = &ctl_ops; ++ ++ return 0; ++} ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-31 20:42:05.551222888 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-31 20:42:05.551222888 -0400 +@@ -0,0 +1,1672 @@ ++/* ++ * bl_ops.c ++ * spNFS ++ * ++ * Created by Rick McNeal on 4/1/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. ++ * ++ */ ++ ++/* ++ * Block layout operations. ++ * ++ * These functions, with the exception of pnfs_block_enabled, are assigned to ++ * the super block s_export_op structure. 
++ */ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) ++ ++#define BL_LAYOUT_HASH_BITS 4 ++#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS) ++#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1) ++#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256) ++ ++#define bl_layout_hashval(id) \ ++ ((id) & BL_LAYOUT_HASH_MASK) ++ ++#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len) ++#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len) ++#define _2SECTS(v) ((v) >> 9) ++ ++#ifndef READ32 ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++(x) = (u64)ntohl(*p++) << 32; \ ++(x) |= ntohl(*p++); \ ++} while (0) ++#endif ++ ++ ++typedef enum {True, False} boolean_t; ++/* ---- block layoutget and commit structure ---- */ ++typedef struct bl_layout_rec { ++ struct list_head blr_hash, ++ blr_layouts; ++ dev_t blr_rdev; ++ struct inode *blr_inode; ++ int blr_recalled; // debug ++ u64 blr_orig_size, ++ blr_commit_size, ++ blr_ext_size; ++ spinlock_t blr_lock; // Protects blr_layouts ++} bl_layout_rec_t; ++ ++static struct list_head layout_hash; ++static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE]; ++static spinlock_t layout_hashtbl_lock; ++ ++/* ---- prototypes ---- */ ++static boolean_t device_slice(dev_t devid); ++static boolean_t device_dm(dev_t devid); ++static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **); ++static bl_layout_rec_t *layout_inode_find(struct inode *i); ++static void layout_inode_del(struct inode *i); ++static char *map_state2name(enum pnfs_block_extent_state4 s); ++static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type); ++static void bld_free(pnfs_blocklayout_devinfo_t *bld); ++static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head 
*volumes, ++ dev_t devid, int local_index); ++static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes, ++ dev_t devid, int my_loc, int idx); ++static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg); ++struct list_head *layout_cache_iter(bl_layout_rec_t *r, ++ struct list_head *bl_possible, struct nfsd4_layout_seg *seg); ++static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h); ++static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h); ++static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg); ++static void print_bll(pnfs_blocklayout_layout_t *b, char *); ++static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r, ++ struct list_head *h, struct nfsd4_layout_seg *seg); ++static inline void bll_collapse(bl_layout_rec_t *r, ++ pnfs_blocklayout_layout_t *c); ++static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len, ++ enum bl_cache_state state, struct list_head *h); ++static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b, ++ enum bl_cache_state c, struct list_head *h); ++static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s); ++static void extents_setup(struct fiemap_extent_info *fei); ++static void extents_count(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_process(struct fiemap_extent_info *fei, ++ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev, ++ pnfs_blocklayout_layout_t *b); ++static void extents_cleanup(struct fiemap_extent_info *fei); ++ ++void ++nfsd_bl_init(void) ++{ ++ int i; ++ dprintk("%s loaded\n", __func__); ++ ++ spin_lock_init(&layout_hashtbl_lock); ++ INIT_LIST_HEAD(&layout_hash); ++ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++) ++ 
INIT_LIST_HEAD(&layout_hashtbl[i]); ++ bl_init_proc(); ++} ++ ++/* ++ * pnfs_block_enabled -- check to see if this file system should be exported as ++ * block pnfs; returns 1 when it should, 0 otherwise ++ */ ++int ++pnfs_block_enabled(struct inode *inode, int ex_flags) ++{ ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ static int bl_comm_once = 0; ++ ++ dprintk("--> %s\n", __func__); ++ /* ++ * FIXME: Figure out method to determine if this file system should ++ * be exported. The following areas need to be checked. ++ * (1) Validate that this file system was exported as a pNFS ++ * block-layout ++ * (2) Has there been successful communication with the ++ * volume daemon? ++ */ ++ /* Check #1 */ ++#ifdef notyet ++ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) { ++ dprintk("%s: pnfs_block not set in export\n", __func__); ++ return 0; ++ } ++#endif ++ ++ /* Check #2: version handshake with the daemon, skipped once it has succeeded */ ++ if (!bl_comm_once) { ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ if (msg.u.msg_vers != res->u.vers) { ++ dprintk("%s: vers mismatch, kernel != daemon\n", ++ __func__); ++ kfree(res); ++ return 0; ++ } ++ } ++ bl_comm_once = 1; ++ ++ kfree(res); ++ ++ dprintk("<-- %s okay\n", __func__); ++ return 1; ++} ++ ++int ++bl_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_BLOCK_VOLUME; ++} ++ ++int ++bl_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_devid = sb->s_dev; ++ res->gd_verf = 1; ++ res->gd_cookie = 1; ++ return 0; ++} ++ ++static int ++bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld_slice_p, ++ *bld_simple_p, ++ *bld; ++ int status = -EIO, ++ location = 0; ++ struct list_head volumes; ++ ++ dprintk("--> %s\n", __func__); ++
INIT_LIST_HEAD(&volumes); ++ ++ bld_simple_p = bld_simple(&volumes, devid->devid, ++ location++); ++ if (!bld_simple_p) ++ goto out; ++ bld_slice_p = bld_slice(&volumes, devid->devid, location++, ++ bld_simple_p->bld_index_loc); ++ ++ if (!bld_slice_p) ++ goto out; ++ ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE) ++ kfree(bld->u.simple.bld_sig); ++ bld_free(bld); ++ } ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++static int ++bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ int status = -EIO, // default to error ++ i, ++ location = 0; ++ struct list_head volumes; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ INIT_LIST_HEAD(&volumes); ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMGET; ++ msg.u.msg_dev = devid->devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: upcall for DMGET failed\n", __func__); ++ goto out; ++ } ++ ++ /* ++ * Don't use bld_alloc() here. If used this will be the first volume ++ * type added to the list whereas the protocol requires it to be the ++ * last. 
++ */ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ goto out; ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE; ++ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes; ++ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL; ++ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__, ++ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL); ++ ++ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes * ++ sizeof (int), GFP_KERNEL); ++ if (!bld->u.stripe.bld_stripe_indexs) ++ goto out; ++ ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) { ++ dev_t dev; ++ pnfs_blocklayout_devinfo_t *bldp; ++ ++ dev = MKDEV(res->u.stripe.devs[i].major, ++ res->u.stripe.devs[i].minor); ++ if (dev == 0) ++ goto out; ++ ++ bldp = bld_simple(&volumes, dev, location++); ++ if (!bldp) { ++ dprintk("%s: bld_simple failed\n", __func__); ++ goto out; ++ } ++ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc); ++ ++ if (!bldp) { ++ dprintk("%s: bld_slice failed\n", __func__); ++ goto out; ++ } ++ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc; ++ ++ } ++ list_add_tail(&bld->bld_list, &volumes); ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SLICE: ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ // No memory to release for these ++ break; ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ kfree(bld->u.simple.bld_sig); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ kfree(bld->u.stripe.bld_stripe_indexs); ++ break; ++ } ++ bld_free(bld); ++ } ++ kfree(res); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++/* ++ * bl_getdeviceinfo -- determine device tree for requested devid ++ */ ++int ++bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct 
nfsd4_pnfs_deviceid *devid) ++{ ++ if (device_slice(devid->devid) == True) ++ return bl_getdeviceinfo_slice(sb, xdr, devid); ++ else if (device_dm(devid->devid) == True) ++ return bl_getdeviceinfo_dm(sb, xdr, devid); ++ return -EINVAL; ++} ++ ++enum nfsstat4 ++bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ pnfs_blocklayout_layout_t *b; ++ bl_layout_rec_t *r; ++ struct list_head bl_possible, ++ *bl_candidates = NULL; ++ boolean_t del_on_error = False; ++ int adj; ++ enum nfsstat4 nfserr = NFS4_OK; ++ ++ dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n", ++ __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset), ++ _2SECTS(res->lg_seg.length), res->lg_seg.iomode); ++ ++ if (res->lg_seg.length == 0) { ++ printk("%s: request length of 0, error condition\n", __func__); ++ return NFS4ERR_BADLAYOUT; ++ } ++ ++ /* ++ * Adjust the length as required per spec. ++ * - First case is were the length is set to (u64)-1. Cheap means to ++ * define the end of the file. ++ * - Second case is were the I/O mode is read-only, but the request is ++ * past the end of the file so the request needs to be trimed. ++ */ ++ if ((res->lg_seg.length == NFS4_MAX_UINT64) || ++ (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) && ++ (res->lg_seg.iomode == IOMODE_READ))) ++ res->lg_seg.length = i->i_size - res->lg_seg.offset; ++ ++ adj = (res->lg_seg.offset & 511) ? 
res->lg_seg.offset & 511 : 0; ++ res->lg_seg.offset -= adj; ++ res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511; ++ ++ if (res->lg_seg.iomode != IOMODE_READ) ++ if (i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE, ++ res->lg_seg.offset, res->lg_seg.length)) ++ return NFS4ERR_IO; ++ ++ INIT_LIST_HEAD(&bl_possible); ++ ++ if ((r = layout_inode_find(i)) == NULL) { ++ if (layout_inode_add(i, &r) == False) { ++ printk("%s: layout_inode_add failed\n", __func__); ++ return NFS4ERR_IO; ++ } ++ del_on_error = True; ++ } ++ BUG_ON(!r); ++ ++ spin_lock(&r->blr_lock); ++ ++ if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) { ++ /* ++ * This will send LAYOUTTRYAGAIN error to the client. ++ */ ++ dprintk("%s: layout_cache_fill_from() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res->lg_return_on_close = 1; ++ res->lg_seg.length = 0; ++ ++ bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg); ++ if (!bl_candidates) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ layout_cache_merge(r, bl_candidates); ++ if (layout_cache_update(r, bl_candidates)) { ++ /* ---- Failed to allocate memory. 
---- */ ++ dprintk("%s: layout_cache_update() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ nfserr = blocklayout_encode_layout(xdr, bl_candidates); ++ if (nfserr) ++ dprintk("%s: layoutget xdr routine failed\n", __func__); ++ ++layoutget_cleanup: ++ if (bl_candidates) { ++ while (!list_empty(bl_candidates)) { ++ b = list_entry(bl_candidates->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ } ++ ++ spin_unlock(&r->blr_lock); ++ if (unlikely(nfserr)) { ++ if (del_on_error == True) ++ layout_inode_del(i); ++ res->lg_seg.length = 0; ++ res->lg_seg.offset = 0; ++ } ++ ++ dprintk("<-- %s (rval %u)\n", __func__, nfserr); ++ return nfserr; ++} ++ ++/* ++ * bl_layoutcommit -- commit changes, especially size, to file systemj ++ * ++ * Currently this routine isn't called and everything is handled within ++ * nfsd4_layoutcommit(). By not calling this routine the server doesn't ++ * handle a partial return, a set of extents, of the layout. The extents ++ * are decoded here, but nothing is done with them. If this routine is ++ * be called the interface must change to pass the 'dentry' pointer such ++ * that notify_change() can be called. 
++ */ ++int ++bl_layoutcommit(struct inode *i, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ bl_layout_rec_t *r; ++ int status = 0; ++ u64 lw_plus; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ r = layout_inode_find(i); ++ if (r) { ++ lw_plus = args->lc_last_wr + 1; ++ if (args->lc_newoffset) { ++ dprintk(" lc_last_wr %Lu\n", lw_plus); ++ if (r->blr_orig_size < lw_plus) { ++ r->blr_orig_size = lw_plus; ++ res->lc_size_chg = 1; ++ res->lc_newsize = lw_plus; ++ } ++ } ++ ++ if (args->lc_up_len) { ++ int extents, ++ i; ++ struct pnfs_blocklayout_layout *b; ++ __be32 *p = args->lc_up_layout; ++ ++ /* ++ * Client is returning a set of extents which ++ * should/could be used to update the file system. ++ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08 ++ */ ++ READ32(extents); ++ dprintk(" Client returning %d extents: data size %d\n", ++ extents, args->lc_up_len); ++ b = kmalloc(sizeof (struct pnfs_blocklayout_layout) * ++ extents, GFP_KERNEL); ++ if (b) { ++ for (i = 0; i < extents; i++) { ++ READ64(b[i].bll_vol_id.sbid); ++ READ64(b[i].bll_vol_id.devid); ++ READ64(b[i].bll_foff); ++ READ64(b[i].bll_len); ++ READ64(b[i].bll_soff); ++ READ32(b[i].bll_es); ++ dprintk(" %d: foff %Lu, len %Lu, soff %Lu " ++ "state %s\n", ++ i, _2SECTS(b[i].bll_foff), ++ _2SECTS(b[i].bll_len), ++ _2SECTS(b[i].bll_soff), ++ map_state2name(b[i].bll_es)); ++ } ++ kfree(b); ++ } else { ++ status = -ENOMEM; ++ } ++ } ++ } else ++ dprintk("%s: Unexpected commit to inode %p\n", __func__, i); ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutreturn(struct inode *i, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ int status = 0; ++ bl_layout_rec_t *r; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ ++ r = layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); ++ layout_cache_del(r, &args->lr_seg); ++ 
spin_unlock(&r->blr_lock); ++ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n", ++ r->blr_ext_size, i->i_size, r->blr_orig_size); ++ } ++ ++ layout_inode_del(i); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ bl_layout_rec_t *r; ++ pnfs_blocklayout_layout_t *b; ++ u64 adj; ++ ++ dprintk("--> %s\n", __func__); ++ BUG_ON(!len); ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n", ++ inode->i_sb->s_dev, inode->i_ino, ++ _2SECTS(offset), _2SECTS(len)); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* ++ * XXX figure out how to get a sb since there's no ++ * inode ptr ++ */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++restart: ++ r = layout_inode_find(inode); ++ if (r && len && !r->blr_recalled) { ++ spin_lock(&r->blr_lock); ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (!r->blr_recalled && !b->bll_recalled && ++ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) { ++ b->bll_recalled = 1; ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = 0; ++ lr.cbl_seg.length = NFS4_MAX_UINT64; ++ r->blr_recalled = 1; ++ dprintk(" FULL LAYOUTRECALL\n"); ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ ++ /* ++ * Currently there are only two cases where the ++ * layout is being returned. ++ * (1) Someone is issuing a NFS_WRITE operation ++ * to this layout. ++ * (2) The file has been truncated which means ++ * the layout is immediately made invalid. ++ * In both cases the client must write any ++ * uncommitted modifications to the server via ++ * NFS_WRITE. 
++ */ ++ lr.cbl_layoutchanged = 1; ++ ++ /* ++ * Need to drop the lock because we'll get a ++ * layoutreturn which will block waiting for ++ * the lock. The request will come in on the ++ * same thread which will cause a deadlock. ++ */ ++ spin_unlock(&r->blr_lock); ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ adj = MIN(b->bll_len - (offset - b->bll_foff), ++ len); ++ offset += adj; ++ len -= adj; ++ if (!len) { ++ spin_lock(&r->blr_lock); ++ break; ++ } ++ /* ++ * Since layoutreturn will have been called we ++ * can't assume blr_layouts is still valid, ++ * so restart. ++ */ ++ goto restart; ++ } ++ } ++ spin_unlock(&r->blr_lock); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++/* ++ * []------------------------------------------------------------------[] ++ * | Support functions from here on down. | ++ * []------------------------------------------------------------------[] ++ */ ++ ++/* ++ * bld_simple -- given a dev_t build a simple volume structure ++ * ++ * Simple volume contains the device signature and offset to that data in ++ * the storage volume. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_simple(struct list_head *volumes, dev_t devid, int local_index) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSIG; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to get signature information\n", __func__); ++ goto error; ++ } ++ ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE); ++ if (!bld) ++ return NULL; ++ ++ bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset; ++ bld->u.simple.bld_sig_len = res->u.sig.len; ++ bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL); ++ if (!bld->u.simple.bld_sig) ++ goto error; ++ ++ memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len); ++ kfree(res); ++ return bld; ++ ++error: ++ if (bld) ++ bld_free(bld); ++ if (res) ++ kfree(res); ++ dprintk("%s: error in bld_simple\n", __func__); ++ return NULL; ++} ++ ++/* ++ * bld_slice -- given a dev_t build a slice volume structure ++ * ++ * A slice volume contains the length of the slice/partition and its offset ++ * from the beginning of the storage volume. There's also a reference to ++ * the "simple" volume which contains this slice. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE); ++ if (!bld) ++ return NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Upcall to get slice info failed\n"); ++ bld_free(bld); ++ return NULL; ++ } ++ ++ bld->bld_devid.devid = devid; ++ bld->bld_index_loc = my_loc; ++ bld->u.slice.bld_start = res->u.slice.start * 512LL; ++ bld->u.slice.bld_len = res->u.slice.length * 512LL; ++ bld->u.slice.bld_index = simple_loc; ++ ++ dprintk("%s: start %Lu, len %Lu\n", __func__, ++ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL); ++ ++ kfree(res); ++ dprintk("<-- %s (rval %p)\n", __func__, bld); ++ return bld; ++} ++ ++static int ++layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!list_empty(&r->blr_layouts)) ++ if (layout_cache_fill_from_list(r, h, seg) == False) ++ return -EIO; ++ ++ /* ++ * This deals with two conditions. ++ * (1) When blr_layouts is empty we need to create the first entry ++ * (2) When the range requested falls past the end of any current ++ * layout the residual must be taken care of. 
++ */ ++ if (seg->length) { ++ n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h); ++ if (!n) ++ return -ENOMEM; ++ dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len)); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++struct list_head * ++layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n = NULL; ++ struct list_head *bl_candidates = NULL; ++ struct fiemap_extent_info fei; ++ struct inode *i; ++ dev_t dev; ++ ++ dev = r->blr_rdev; ++ i = r->blr_inode; ++ ++ dprintk("--> %s\n", __func__); ++ bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL); ++ if (!bl_candidates) ++ return NULL; ++ INIT_LIST_HEAD(bl_candidates); ++ extents_setup(&fei); ++ ++ list_for_each_entry(b, bl_possible, bll_list) { ++ if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ extents_count(&fei, i, b->bll_foff, b->bll_len); ++ if (fei.fi_extents_mapped) { ++ ++ /* ++ * Common case here. Got a range which has ++ * extents. Now get those extents and process ++ * them into pNFS extents. ++ */ ++ if (extents_get(&fei, i, b->bll_foff, ++ b->bll_len) == False) ++ goto cleanup; ++ if (extents_process(&fei, bl_candidates, ++ seg, dev, b) == False) ++ goto cleanup; ++ extents_cleanup(&fei); ++ ++ } else if (seg->iomode == IOMODE_READ) { ++ ++ /* ++ * Found a hole in a file while reading. No ++ * problem, just create a pNFS extent for the ++ * range and let the client know there's no ++ * backing store. ++ */ ++ n = bll_alloc(b->bll_foff, b->bll_len, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += b->bll_len; ++ } else { ++ ++ /* ++ * There's a problem here. Since the iomode ++ * is read/write fallocate should have allocated ++ * any necessary storage for the given range. 
++ */ ++ dprintk(" Extent count for RW is 0\n"); ++ goto cleanup; ++ } ++ ++ } else { ++ n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates); ++ seg->length += n->bll_len; ++ } ++ ++ if (r->blr_ext_size < (b->bll_foff + b->bll_len)) ++ r->blr_ext_size = b->bll_foff + b->bll_len; ++ } ++ ++ while (!list_empty(bl_possible)) { ++ b = list_entry(bl_possible->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ ++ b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout, ++ bll_list); ++ seg->offset = b->bll_foff; ++ dprintk("<-- %s okay\n", __func__); ++ return bl_candidates; ++ ++cleanup: ++ extents_cleanup(&fei); ++ if (bl_candidates) ++ kfree(bl_candidates); ++ dprintk("<-- %s, error occurred\n", __func__); ++ return NULL; ++} ++ ++/* ++ * layout_cache_merge -- collapse layouts which make up a contiguous range. ++ */ ++static void ++layout_cache_merge(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *p; ++ ++ dprintk("--> %s\n", __func__); ++restart: ++ p = NULL; ++ list_for_each_entry(b, h, bll_list) { ++ if (p && (BLL_S_END(p) == b->bll_soff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ /* ++ * We've got a condidate. 
++ */ ++#ifdef too_verbose ++ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), ++ _2SECTS(p->bll_foff), _2SECTS(p->bll_len), ++ _2SECTS(b->bll_soff)); ++#endif ++ ++ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE) ++ p->bll_cache_state = BLOCK_LAYOUT_UPDATE; ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else if (p && (BLL_F_END(p) == b->bll_foff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es == PNFS_BLOCK_NONE_DATA)) { ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else ++ p = b; ++ } ++ dprintk("<-- %s\n", __func__); ++} ++ ++static int ++layout_cache_update(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *c, ++ *n; ++ boolean_t status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ if (list_empty(&r->blr_layouts)) { ++ /* ---- Just add entries and return ---- */ ++ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev, ++ r->blr_inode->i_ino); ++ list_for_each_entry(b, h, bll_list) { ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, ++ &r->blr_layouts); ++ if (!c) { ++ status = -ENOMEM; ++ break; ++ } ++ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), c->bll_es); ++ } ++ return status; ++ } ++ ++ list_for_each_entry(b, h, bll_list) { ++ BUG_ON(!b->bll_vol_id.devid); ++ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) { ++ boolean_t found = False; ++ list_for_each_entry(c, &r->blr_layouts, bll_list) { ++ if ((b->bll_soff >= c->bll_soff) && ++ (b->bll_soff < BLL_S_END(c)) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ u64 u; ++ ++ if ((b->bll_foff < c->bll_foff) || ++ (b->bll_foff > BLL_F_END(c))) ++ BUG(); ++ ++ u = BLL_S_END(b) - BLL_S_END(c); ++ /* ++ * The updated cache entry has to be ++ * different than the current. ++ * Otherwise the cache state for 'b' ++ * should be BLOCK_LAYOUT_CACHE. 
++ */ ++ BUG_ON(BLL_S_END(b) < BLL_S_END(c)); ++ ++ dprintk(" " ++ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n", ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), ++ _2SECTS(c->bll_len + u)); ++ c->bll_len += u; ++ bll_collapse(r, c); ++ found = True; ++ break; ++ } ++ } ++ ++ if (found == False) { ++ dprintk(" ERROR Expected to find" ++ " %Lu(f):%Lu(l):%Lu(s), but didn't\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_for_each_entry(c, &r->blr_layouts, bll_list) ++ print_bll(c, "Cached"); ++ BUG(); ++ } ++ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ c = list_first_entry(&r->blr_layouts, ++ struct pnfs_blocklayout_layout, bll_list); ++ if (b->bll_foff < c->bll_foff) { ++ /* ++ * Special case where new entry is before ++ * first cached entry. ++ */ ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL); ++ list_add(&c->bll_list, &r->blr_layouts); ++ dprintk(" new entry at head of list at %Lu, " ++ "len %Lu\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len)); ++ } else { ++ list_for_each_entry(c, &r->blr_layouts, ++ bll_list) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, ++ bll_list); ++ /* ++ * This is ugly, but can't think of ++ * another way to examine this case. ++ * Consider the following. Need to ++ * add an entry which starts at 40 ++ * and the cache has the following ++ * entries: ++ * Start Length ++ * 10 5 ++ * 30 5 ++ * 50 5 ++ * So, need to look and see if the new ++ * entry starts after the current ++ * cache, but before the next one. ++ * There's a catch in that the next ++ * entry might not be valid as it's ++ * really just a pointer to the list ++ * head. 
++ */ ++ if (((b->bll_foff >= ++ BLL_F_END(c)) && ++ (c->bll_list.next == &r->blr_layouts)) || ++ ((b->bll_foff >= ++ BLL_F_END(c)) && ++ (b->bll_foff < n->bll_foff))) { ++ ++ n = bll_alloc_dup(b, ++ BLOCK_LAYOUT_CACHE, NULL); ++ dprintk(" adding new %Lu:%Lu" ++ " after %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len), ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len)); ++ list_add(&n->bll_list, ++ &c->bll_list); ++ break; ++ } ++ } ++ } ++ } ++ } ++ dprintk("<-- %s\n", __func__); ++ return status; ++} ++ ++static void ++layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in) ++{ ++ struct pnfs_blocklayout_layout *b, ++ *n; ++ u64 len; ++ struct nfsd4_layout_seg seg = *seg_in; ++ ++ dprintk("--> %s\n", __func__); ++ if (seg.length == NFS4_MAX_UINT64) { ++ r->blr_recalled = 0; ++ dprintk(" Fast return of all layouts\n"); ++ while (!list_empty(&r->blr_layouts)) { ++ b = list_entry(r->blr_layouts.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ dprintk(" foff %Lu, len %Lu, soff %Lu\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ dprintk("<-- %s\n", __func__); ++ return; ++ } ++ ++restart: ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg.offset == b->bll_foff) { ++ /* ++ * This handle the following three cases: ++ * (1) return layout matches entire cache layout ++ * (2) return layout matches beginning portion of cache ++ * (3) return layout matches entire cache layout and ++ * into next entry. Varies from #1 in end case. 
++ */ ++ dprintk(" match on offsets, %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length)); ++ len = MIN(seg.length, b->bll_len); ++ b->bll_foff += len; ++ b->bll_soff += len; ++ b->bll_len -= len; ++ seg.length -= len; ++ seg.offset += len; ++ if (!b->bll_len) { ++ list_del(&b->bll_list); ++ kfree(b); ++ dprintk(" removing cache line\n"); ++ if (!seg.length) { ++ dprintk(" also finished\n"); ++ goto complete; ++ } ++ /* ++ * Since 'b' was freed we can't continue at the ++ * next entry which is referenced as ++ * b->bll_list.next by the list_for_each_entry ++ * macro. Need to restart the loop. ++ * TODO: Think about creating a dummy 'b' which ++ * would keep list_for_each_entry() happy. ++ */ ++ goto restart; ++ } ++ if (!seg.length) { ++ dprintk(" finished, but cache line not" ++ "empty\n"); ++ goto complete; ++ } ++ } else if ((seg.offset >= b->bll_foff) && ++ (seg.offset < BLL_F_END(b))) { ++ /* ++ * layout being returned is within this cache line. ++ */ ++ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ BUG_ON(!seg.length); ++ if ((seg.offset + seg.length) >= BLL_F_END(b)) { ++ /* ++ * Layout returned starts in the middle of ++ * cache entry and just need to trim back ++ * cache to shorter length. ++ */ ++ dprintk(" trim back cache line\n"); ++ len = seg.offset - b->bll_foff; ++ seg.offset += b->bll_len - len; ++ seg.length -= b->bll_len - len; ++ b->bll_len = len; ++ if (!seg.length) ++ return; ++ } else { ++ /* ++ * Need to split current cache layout because ++ * chunk is being removed from the middle. 
++ */ ++ dprintk(" split cache line\n"); ++ len = seg.offset + seg.length; ++ n = bll_alloc(len, ++ (b->bll_foff + b->bll_len) - len, ++ BLOCK_LAYOUT_CACHE, NULL); ++ n->bll_soff = b->bll_soff + len; ++ list_add(&n->bll_list, &b->bll_list); ++ b->bll_len = seg.offset - b->bll_foff; ++ return; ++ } ++ } ++ } ++complete: ++ if (list_empty(&r->blr_layouts)) ++ r->blr_recalled = 0; ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++ * layout_cache_fill_from_list -- fills from cache list ++ * ++ * NOTE: This routine was only seperated out from layout_cache_file_from() ++ * to reduce the indentation level which makes the code easier to read. ++ */ ++static inline boolean_t ++layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n; ++ enum pnfs_block_extent_state4 s; ++ ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg->offset < b->bll_foff) { ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, b->bll_foff - seg->offset), ++ BLOCK_LAYOUT_NEW, NULL); ++ if (!n) ++ return False; ++ ++ list_add(&n->bll_list, h->prev); ++ dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ ++ if ((seg->offset >= b->bll_foff) && ++ (seg->offset < BLL_F_END(b))) { ++ if (layout_conflict(b, seg->iomode, &s) == False) { ++ dprintk(" CONFLICT FOUND: " ++ "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), b->bll_es, ++ seg->iomode); ++ return False; ++ } ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, BLL_F_END(b) - seg->offset), ++ BLOCK_LAYOUT_CACHE, h); ++ dprintk(" CACHE hit: Found %Lu(f):%Lu(l): " ++ "in %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ 
_2SECTS(b->bll_soff), b->bll_es); ++ if (!n) ++ return False; ++ ++ n->bll_soff = b->bll_soff + seg->offset - b->bll_foff; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ n->bll_es = s; ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ } ++ return True; ++} ++ ++static u64 ++bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length, ++ dev_t dev) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates); ++ if (!n) ++ return 0; ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ ++ return n->bll_len; ++} ++ ++static void ++extents_setup(struct fiemap_extent_info *fei) ++{ ++ fei->fi_extents_start = NULL; ++} ++ ++/* ++ * extents_count -- Determine the number of extents for a given range. ++ * ++ * No need to call set_fs() here because the function ++ * doesn't use copy_to_user() if it's only counting ++ * the number of extents needed. ++ */ ++static void ++extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len)); ++ fei->fi_flags = FIEMAP_FLAG_SYNC; ++ fei->fi_extents_max = 0; ++ fei->fi_extents_start = NULL; ++ fei->fi_extents_mapped = 0; ++ i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1); ++} ++ ++/* ++ * extents_get -- Get list of extents for range ++ * ++ * extents_count() must have been called before this routine such that ++ * fi_extents_mapped is known. ++ */ ++static boolean_t ++extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ int m_space, ++ rval; ++ struct fiemap_extent *fe; ++ mm_segment_t old_fs = get_fs(); ++ ++ /* ++ * Now malloc the correct amount of space ++ * needed. It's possible for the file to have changed ++ * between calls which would require more space for ++ * the extents. 
If that occurs the last extent will ++ * not have FIEMAP_EXTENT_LAST set and the error will ++ * be caught in extents_process(). ++ */ ++ m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent); ++ fe = kmalloc(m_space, GFP_KERNEL); ++ if (!fe) ++ return False; ++ memset(fe, 0, m_space); ++ ++ fei->fi_extents_max = fei->fi_extents_mapped; ++ fei->fi_extents_mapped = 0; ++ fei->fi_extents_start = fe; ++ ++ set_fs(KERNEL_DS); ++ rval = i->i_op->fiemap(i, fei, foff, len + ++ (1 << i->i_sb->s_blocksize_bits) - 1); ++ set_fs(old_fs); ++ ++ if (rval || !fei->fi_extents_mapped) { ++ dprintk(" No extents. Wanted %d, got %d\n", ++ fei->fi_extents_max, fei->fi_extents_mapped); ++ kfree(fe); ++ fei->fi_extents_start = NULL; ++ return False; ++ } else ++ return True; ++} ++ ++/* ++ * extents_process -- runs through the extent returned from the file system and ++ * creates block layout entries. ++ */ ++static boolean_t ++extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates, ++ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b) ++{ ++ struct fiemap_extent *fep, ++ *fep_last = NULL; ++ int i; ++ pnfs_blocklayout_layout_t *n; ++ u64 last_end, ++ rval; ++ ++ dprintk("--> %s\n", __func__); ++ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped; ++ i++, fep++) { ++ ++ BUG_ON(!fep->fe_physical); ++ /* ++ * Deal with corner cases of hoel-y files. ++ */ ++ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) != ++ fep->fe_logical)) { ++ ++ /* ++ * If the last extent doesn't end logically ++ * at the beginning of the current we've got ++ * hole and need to create a pNFS extent. 
++ */ ++ dprintk(" Got a hole at %Ld:%Ld \n", ++ _2SECTS(fep_last->fe_logical), ++ _2SECTS(fep_last->fe_length)); ++ last_end = fep_last->fe_logical + fep_last->fe_length; ++ rval = bll_alloc_holey(bl_candidates, last_end, ++ fep->fe_logical - last_end, dev); ++ if (!rval) ++ return False; ++ seg->length += rval; ++ } ++ ++ n = bll_alloc(fep->fe_logical, fep->fe_length, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ if (unlikely(n == NULL)) { ++ dprintk("%s: bll_alloc failed\n", __func__); ++ return False; ++ } ++ ++ n->bll_soff = fep->fe_physical; ++ n->bll_es = seg->iomode == IOMODE_READ ? ++ PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += fep->fe_length; ++ print_bll(n, "New extent"); ++ fep_last = fep; ++ } ++ dprintk("<-- %s (i=%d)\n", __func__, i); ++ ++ return True; ++} ++ ++static void ++extents_cleanup(struct fiemap_extent_info *fei) ++{ ++ if (fei->fi_extents_start) { ++ kfree(fei->fi_extents_start); ++ fei->fi_extents_start = NULL; ++ } ++} ++ ++/* ++ * device_slice -- check to see if device is a slice or DM ++ */ ++static boolean_t ++device_slice(dev_t devid) ++{ ++ struct block_device *bd = open_by_devnum(devid, FMODE_READ); ++ boolean_t rval = False; ++ ++ if (bd) { ++ if (bd->bd_disk->minors > 1) ++ rval = True; ++ blkdev_put(bd, FMODE_READ); ++ } ++ return rval; ++} ++ ++/* ++ * device_dm -- check to see if device is a Device Mapper volume. 
++ * ++ * Returns 1 for DM or 0 if not ++ */ ++static boolean_t ++device_dm(dev_t devid) ++{ ++ boolean_t rval = False; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMCHK; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Failed upcall to check on DM status\n"); ++ } else if (res->u.dm_vol) { ++ rval = True; ++ dprintk("Device is DM volume\n"); ++ } else ++ dprintk("Device is not DM volume\n"); ++ kfree(res); ++ ++ return rval; ++} ++ ++static boolean_t ++layout_inode_add(struct inode *i, bl_layout_rec_t **p) ++{ ++ bl_layout_rec_t *r = NULL; ++ ++ if (!i->i_op->fiemap || !i->i_op->fallocate) { ++ printk("pNFS: file system doesn't support required fiemap or" ++ "fallocate methods\n"); ++ return False; ++ } ++ ++ r = kmalloc(sizeof (*r), GFP_KERNEL); ++ if (!r) ++ goto error; ++ ++ r->blr_rdev = i->i_sb->s_dev; ++ r->blr_inode = i; ++ r->blr_orig_size = i->i_size; ++ r->blr_ext_size = 0; ++ r->blr_recalled = 0; ++ INIT_LIST_HEAD(&r->blr_layouts); ++ spin_lock_init(&r->blr_lock); ++ spin_lock(&layout_hashtbl_lock); ++ list_add_tail(&r->blr_hash, &layout_hash); ++ spin_unlock(&layout_hashtbl_lock); ++ *p = r; ++ return True; ++ ++error: ++ if (r) ++ kfree(r); ++ return False; ++} ++ ++static bl_layout_rec_t * ++__layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ if (!list_empty(&layout_hash)) { ++ list_for_each_entry(r, &layout_hash, blr_hash) { ++ if ((r->blr_inode->i_ino == i->i_ino) && ++ (r->blr_rdev == i->i_sb->s_dev)) { ++ return r; ++ } ++ } ++ } ++ return NULL; ++} ++ ++static bl_layout_rec_t * ++layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ spin_unlock(&layout_hashtbl_lock); ++ ++ return r; ++} ++ ++static void ++layout_inode_del(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); 
++ if (list_empty(&r->blr_layouts)) { ++ list_del(&r->blr_hash); ++ spin_unlock(&r->blr_lock); ++ kfree(r); ++ } else { ++ spin_unlock(&r->blr_lock); ++ } ++ } else { ++ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n", ++ __func__, i->i_sb->s_dev, i->i_ino); ++ } ++ spin_unlock(&layout_hashtbl_lock); ++} ++ ++/* ++ * map_state2name -- converts state in ascii string. ++ * ++ * Used for debug messages only. ++ */ ++static char * ++map_state2name(enum pnfs_block_extent_state4 s) ++{ ++ switch (s) { ++ case PNFS_BLOCK_READWRITE_DATA: return " RW"; ++ case PNFS_BLOCK_READ_DATA: return " RO"; ++ case PNFS_BLOCK_INVALID_DATA: return "INVALID"; ++ case PNFS_BLOCK_NONE_DATA: return " NONE"; ++ default: ++ BUG(); ++ } ++} ++ ++static pnfs_blocklayout_devinfo_t * ++bld_alloc(struct list_head *volumes, int type) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ return NULL; ++ ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = type; ++ list_add_tail(&bld->bld_list, volumes); ++ ++ return bld; ++} ++ ++static void ++bld_free(pnfs_blocklayout_devinfo_t *bld) ++{ ++ list_del(&bld->bld_list); ++ kfree(bld); ++} ++ ++static void ++print_bll(pnfs_blocklayout_layout_t *b, char *text) ++{ ++ dprintk(" BLL: %s\n", text); ++ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len), ++ map_state2name(b->bll_es)); ++} ++ ++static inline void ++bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c) ++{ ++ pnfs_blocklayout_layout_t *n; ++ int dbg_count = 0; ++ u64 endpoint; ++ ++ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA); ++ while (c->bll_list.next != &r->blr_layouts) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ endpoint = BLL_S_END(c); ++ if ((n->bll_soff >= c->bll_soff) && ++ (n->bll_soff < endpoint)) { ++ if (endpoint < BLL_S_END(n)) { ++ /* ++ * The following is possible. 
++ * ++ * ++ * Existing: +---+ +---+ ++ * New: +-----------------------+ ++ * The client request merge entries together ++ * but didn't require picking up all of the ++ * last entry. So, we still need to delete ++ * the last entry and add the remaining space ++ * to the new entry. ++ */ ++ c->bll_len += BLL_S_END(n) - endpoint; ++ } ++ dbg_count++; ++ list_del(&n->bll_list); ++ kfree(n); ++ } else { ++ break; ++ } ++ } ++ /* ---- Debug only, remove before integration ---- */ ++ if (dbg_count) ++ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n", ++ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c))); ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = kmalloc(sizeof (*n), GFP_KERNEL); ++ if (n) { ++ memset(n, 0, sizeof (*n)); ++ n->bll_foff = offset; ++ n->bll_len = len; ++ n->bll_cache_state = state; ++ if (h) ++ list_add_tail(&n->bll_list, h); ++ } ++ return n; ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c, ++ struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = bll_alloc(b->bll_foff, b->bll_len, c, h); ++ if (n) { ++ n->bll_es = b->bll_es; ++ n->bll_soff = b->bll_soff; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ } ++ return n; ++} ++ ++static inline boolean_t ++layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s) ++{ ++ /* ---- Normal case ---- */ ++ *s = b->bll_es; ++ ++ switch (b->bll_es) { ++ case PNFS_BLOCK_READWRITE_DATA: ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_READ_DATA; ++ /* ---- Any use is permitted. ---- */ ++ break; ++ case PNFS_BLOCK_READ_DATA: ++ /* ---- Committed as read only data. 
---- */ ++ if (iomode == IOMODE_RW) ++ return False; ++ break; ++ case PNFS_BLOCK_INVALID_DATA: ++ /* ---- Blocks have been allocated, but not initialized ---- */ ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_NONE_DATA; ++ break; ++ case PNFS_BLOCK_NONE_DATA: ++ /* ---- Hole-y file. No backing store avail. ---- */ ++ if (iomode != IOMODE_READ) ++ return False; ++ break; ++ default: ++ BUG(); ++ } ++ return True; ++} ++ ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-31 20:41:19.144140225 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-31 20:42:05.512106042 -0400 +@@ -104,7 +104,8 @@ again: + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; +- if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) ++ if (memcmp(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)) != 0) + continue; + get_nfs_open_context(ctx); + spin_unlock(&inode->i_lock); +@@ -133,8 +134,8 @@ void nfs_inode_reclaim_delegation(struct + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; +@@ -187,8 +188,9 @@ static struct nfs_delegation *nfs_detach + if (delegation == NULL) + goto nomatch; + spin_lock(&delegation->lock); +- if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (stateid != NULL && memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data)) != 0) + goto nomatch_unlock; + list_del_rcu(&delegation->super_list); + 
delegation->inode = NULL; +@@ -216,8 +218,8 @@ int nfs_inode_set_delegation(struct inod + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + delegation->change_attr = nfsi->change_attr; +@@ -471,9 +473,7 @@ void nfs_expire_unreferenced_delegations + /* + * Asynchronous delegation recall! + */ +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)) ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) + { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; +@@ -481,7 +481,7 @@ int nfs_async_inode_return_delegation(st + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + +- if (!validate_stateid(delegation, stateid)) { ++ if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { + rcu_read_unlock(); + return -ENOENT; + } +@@ -562,7 +562,8 @@ int nfs4_copy_delegation_stateid(nfs4_st + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) { +- memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); ++ memcpy(dst->u.data, delegation->stateid.u.data, ++ sizeof(dst->u.data)); + ret = 1; + } + rcu_read_unlock(); +diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h +--- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-31 20:42:05.513114811 -0400 +@@ -34,9 +34,7 @@ enum { + int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct 
nfs_openres *res); + void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); + int nfs_inode_return_delegation(struct inode *inode); +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)); ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); + void nfs_inode_return_delegation_noreclaim(struct inode *inode); + + struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); +diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-31 20:41:19.196140434 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-31 20:42:05.553222784 -0400 +@@ -17,11 +17,19 @@ + #include + #include + ++#include ++#if defined(CONFIG_SPNFS) ++#include ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif ++#endif + #include + #include + + #include "nfsd.h" + #include "nfsfh.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_EXPORT + +@@ -352,6 +360,40 @@ static int svc_export_upcall(struct cach + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); + } + ++#if defined(CONFIG_PNFSD) ++static struct pnfsd_cb_operations pnfsd_cb_op = { ++ .cb_layout_recall = nfsd_layout_recall_cb, ++ .cb_device_notify = nfsd_device_notify_cb, ++ ++ .cb_get_state = nfs4_pnfs_cb_get_state, ++ .cb_change_state = nfs4_pnfs_cb_change_state, ++}; ++ ++#if defined(CONFIG_SPNFS) ++static struct pnfs_export_operations spnfs_export_ops = { ++ .layout_type = spnfs_layout_type, ++ .get_device_info = spnfs_getdeviceinfo, ++ .get_device_iter = spnfs_getdeviceiter, ++ .layout_get = spnfs_layoutget, ++ .layout_return = spnfs_layoutreturn, ++}; ++ ++static struct pnfs_export_operations spnfs_ds_export_ops = { ++ .get_state = spnfs_get_state, ++}; ++ ++#if 
defined(CONFIG_SPNFS_BLOCK) ++static struct pnfs_export_operations bl_export_ops = { ++ .layout_type = bl_layout_type, ++ .get_device_info = bl_getdeviceinfo, ++ .get_device_iter = bl_getdeviceiter, ++ .layout_get = bl_layoutget, ++ .layout_return = bl_layoutreturn, ++}; ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_SPNFS */ ++#endif /* CONFIG_PNFSD */ ++ + static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); + static struct svc_export *svc_export_lookup(struct svc_export *); +@@ -395,6 +437,47 @@ static int check_export(struct inode *in + return -EINVAL; + } + ++#if !defined(CONFIG_SPNFS) ++ if (inode->i_sb->s_pnfs_op && ++ (!inode->i_sb->s_pnfs_op->layout_type || ++ !inode->i_sb->s_pnfs_op->get_device_info || ++ !inode->i_sb->s_pnfs_op->layout_get)) { ++ dprintk("exp_export: export of invalid fs pnfs export ops.\n"); ++ return -EINVAL; ++ } ++#endif /* CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ if (!inode->i_sb->s_pnfs_op) ++ pnfsd_lexp_init(inode); ++ return 0; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, *flags)) { ++ dprintk("set pnfs block export structure... \n"); ++ inode->i_sb->s_pnfs_op = &bl_export_ops; ++ } else ++#endif /* CONFIG_SPNFS_BLOCK */ ++ /* ++ * spnfs_enabled() indicates we're an MDS. ++ * XXX Better to check an export time option as well. ++ */ ++ if (spnfs_enabled()) { ++ dprintk("set spnfs export structure...\n"); ++ inode->i_sb->s_pnfs_op = &spnfs_export_ops; ++ } else { ++ dprintk("%s spnfs not in use\n", __func__); ++ ++ /* ++ * get_state is needed if we're a DS using spnfs. ++ * XXX Better to check an export time option instead. 
++ */ ++ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops; ++ } ++#endif /* CONFIG_SPNFS */ ++ + return 0; + + } +@@ -586,6 +669,8 @@ static int svc_export_parse(struct cache + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } ++ } else if (strcmp(buf, "pnfs") == 0) { ++ exp.ex_pnfs = 1; + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else +@@ -660,6 +745,8 @@ static int svc_export_show(struct seq_fi + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } ++ if (exp->ex_pnfs) ++ seq_puts(m, ",pnfs"); + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); +@@ -687,6 +774,7 @@ static void svc_export_init(struct cache + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; ++ new->ex_pnfs = 0; + } + + static void export_update(struct cache_head *cnew, struct cache_head *citem) +@@ -699,6 +787,7 @@ static void export_update(struct cache_h + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; ++ new->ex_pnfs = item->ex_pnfs; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_pathname = item->ex_pathname; +@@ -1635,8 +1724,17 @@ nfsd_export_init(void) + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); +- if (rv) ++ if (rv) { + cache_unregister(&svc_export_cache); ++ goto out; ++ } ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = THIS_MODULE; ++ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ ++out: + return rv; + + } +@@ -1664,6 +1762,12 @@ nfsd_export_shutdown(void) + + exp_writelock(); + ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = NULL; ++ pnfsd_cb_ctl.cb_op = NULL; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); + svcauth_unix_purge(); +diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig 
linux-2.6.34.noarch/fs/nfs/direct.c +--- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-31 20:42:05.514196343 -0400 +@@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea + .rpc_release = nfs_direct_read_release, + }; + ++static long nfs_direct_read_execute(struct nfs_read_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ nfs_fattr_init(&data->fattr); ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ NFS_PROTO(inode)->read_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct read call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. 
If nfs_readdata_alloc() or get_user_pages() fails, +@@ -283,7 +315,6 @@ static ssize_t nfs_direct_read_schedule_ + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -343,26 +374,9 @@ static ssize_t nfs_direct_read_schedule_ + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; +- nfs_fattr_init(&data->fattr); +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct read call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_read_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +@@ -448,12 +462,15 @@ static void nfs_direct_free_writedata(st + } + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg); ++ + static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) + { + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; +@@ -487,25 +504,7 @@ static void nfs_direct_write_reschedule( + * Reuse data->task; data->args should not have changed + * since the original request was sent. 
+ */ +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- /* +- * We're called via an RPC callback, so BKL is already held. +- */ +- task = rpc_run_task(&task_setup_data); +- if (!IS_ERR(task)) +- rpc_put_task(task); +- +- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- data->args.count, +- (unsigned long long)data->args.offset); ++ nfs_direct_write_execute(data, &task_setup_data, &msg); + } + + if (put_dreq(dreq)) +@@ -548,10 +547,31 @@ static const struct rpc_call_ops nfs_com + .rpc_release = nfs_direct_commit_release, + }; + ++static long nfs_direct_commit_execute(struct nfs_direct_req *dreq, ++ struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct rpc_task *task; ++ ++ NFS_PROTO(data->inode)->commit_setup(data, msg); ++ ++ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ ++ dreq->commit_data = NULL; ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ return 0; ++} ++ + static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) + { + struct nfs_write_data *data = dreq->commit_data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +@@ -579,16 +599,7 @@ static void nfs_direct_commit_schedule(s + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- NFS_PROTO(data->inode)->commit_setup(data, &msg); +- +- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ +- dreq->commit_data = NULL; +- +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +- +- task = rpc_run_task(&task_setup_data); +- if 
(!IS_ERR(task)) +- rpc_put_task(task); ++ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg); + } + + static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +@@ -690,6 +701,36 @@ static const struct rpc_call_ops nfs_wri + .rpc_release = nfs_direct_write_release, + }; + ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ NFS_PROTO(inode)->write_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. 
If nfs_writedata_alloc() or get_user_pages() fails, +@@ -705,7 +746,6 @@ static ssize_t nfs_direct_write_schedule + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -771,24 +811,8 @@ static ssize_t nfs_direct_write_schedule + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct write call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_write_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig +--- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-31 20:42:05.549222922 -0400 +@@ -79,3 +79,52 @@ config NFSD_V4 + available from http://linux-nfs.org/. + + If unsure, say N. ++ ++config PNFSD ++ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)" ++ depends on NFSD_V4 && EXPERIMENTAL ++ select EXPORTFS_FILE_LAYOUT ++ help ++ This option enables support for the parallel NFS features of the ++ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) ++ in the kernel's NFS server. ++ ++ Unless you're an NFS developer, say N. 
++ ++config PNFSD_LOCAL_EXPORT ++ bool "Enable pNFS support for exporting local filesystems for debugging purposes" ++ depends on PNFSD ++ help ++ Say Y here if you want your pNFS server to export local file systems ++ over the files layout type. With this option the MDS (metadata ++ server) also functions as a single DS (data server). This is mostly ++ useful for development and debugging purposes. ++ ++ If unsure, say N. ++ ++config SPNFS ++ bool "Provide spNFS server support (EXPERIMENTAL)" ++ depends on PNFSD ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS server support. ++ ++ If unsure, say N. ++ ++config SPNFS_LAYOUTSEGMENTS ++ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)" ++ depends on SPNFS ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS to be able to return layout segments. ++ ++ If unsure, say N. ++ ++config SPNFS_BLOCK ++ bool "Provide Block Layout server support (EXPERIMENTAL)" ++ depends on SPNFS ++ select EXPORTFS_BLOCK_LAYOUT ++ help ++ Say Y here if you want spNFS block layout support. ++ ++ If unsure, say N.
+diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile +--- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-31 20:42:05.549222922 -0400 +@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs + nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o + nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o ++nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o ++nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o ++nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o ++nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-31 20:41:19.197150385 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-31 20:42:05.554114789 -0400 +@@ -40,7 +40,6 @@ + + #define NFSPROC4_CB_NULL 0 + #define NFSPROC4_CB_COMPOUND 1 +-#define NFS4_STATEID_SIZE 16 + + /* Index of predefined Linux callback client operations */ + +@@ -48,11 +47,17 @@ enum { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, ++#if defined(CONFIG_PNFSD) ++ NFSPROC4_CLNT_CB_LAYOUT, ++ NFSPROC4_CLNT_CB_DEVICE, ++#endif + }; + + enum nfs_cb_opnum4 { + OP_CB_RECALL = 4, ++ OP_CB_LAYOUT = 5, + OP_CB_SEQUENCE = 11, ++ OP_CB_DEVICE = 14, + }; + + #define NFS4_MAXTAGLEN 20 +@@ -78,6 +83,19 @@ enum nfs_cb_opnum4 { + #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) ++#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 3 + \ ++ enc_nfs4_fh_sz + 4) ++#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) ++#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 6) ++#define NFS4_dec_cb_device_sz 
(cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) + + /* + * Generic encode routines from fs/nfs/nfs4xdr.c +@@ -94,6 +112,10 @@ xdr_writemem(__be32 *p, const void *ptr, + } + + #define WRITE32(n) *p++ = htonl(n) ++#define WRITE64(n) do { \ ++ *p++ = htonl((u32)((n) >> 32)); \ ++ *p++ = htonl((u32)(n)); \ ++} while (0) + #define WRITEMEM(ptr,nbytes) do { \ + p = xdr_writemem(p, ptr, nbytes); \ + } while (0) +@@ -204,6 +226,16 @@ nfs_cb_stat_to_errno(int stat) + */ + + static void ++encode_stateid(struct xdr_stream *xdr, stateid_t *sid) ++{ ++ __be32 *p; ++ ++ RESERVE_SPACE(sizeof(stateid_t)); ++ WRITE32(sid->si_generation); ++ WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); ++} ++ ++static void + encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) + { + __be32 * p; +@@ -228,10 +260,10 @@ encode_cb_recall(struct xdr_stream *xdr, + __be32 *p; + int len = dp->dl_fh.fh_size; + +- RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); ++ RESERVE_SPACE(4); + WRITE32(OP_CB_RECALL); +- WRITE32(dp->dl_stateid.si_generation); +- WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); ++ encode_stateid(xdr, &dp->dl_stateid); ++ RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); + WRITE32(0); /* truncate optimization not implemented */ + WRITE32(len); + WRITEMEM(&dp->dl_fh.fh_base, len); +@@ -259,6 +291,111 @@ encode_cb_sequence(struct xdr_stream *xd + hdr->nops++; + } + ++#if defined(CONFIG_PNFSD) ++ ++#include "pnfsd.h" ++ ++static void ++encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(20); ++ WRITE32(OP_CB_LAYOUT); ++ WRITE32(clr->cb.cbl_seg.layout_type); ++ WRITE32(clr->cb.cbl_seg.iomode); ++ WRITE32(clr->cb.cbl_layoutchanged); ++ WRITE32(clr->cb.cbl_recall_type); ++ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) { ++ struct nfs4_fsid fsid = clr->cb.cbl_fsid; ++ ++ RESERVE_SPACE(16); ++ 
WRITE64(fsid.major); ++ WRITE64(fsid.minor); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "fsid 0x%llx-0x%llx\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, fsid.major, fsid.minor); ++ } else if (clr->cb.cbl_recall_type == RETURN_FILE) { ++ int len = clr->clr_file->fi_fhlen; ++ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid; ++ ++ RESERVE_SPACE(20 + len); ++ WRITE32(len); ++ WRITEMEM(clr->clr_file->fi_fhval, len); ++ WRITE64(clr->cb.cbl_seg.offset); ++ WRITE64(clr->cb.cbl_seg.length); ++ encode_stateid(xdr, cbl_sid); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "offset %lld length %lld stateid " STATEID_FMT "\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, ++ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length, ++ STATEID_VAL(cbl_sid)); ++ } else { ++ dprintk("%s: type %x iomode %d changed %d recall_type %d\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type); ++ } ++ hdr->nops++; ++} ++ ++static void ++encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ int i; ++ int len = nd->nd_list->cbd_len; ++ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list; ++ ++ dprintk("NFSD %s: --> num %d\n", __func__, len); ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(8); ++ WRITE32(OP_CB_DEVICE); ++ ++ /* notify4 cnda_changes<>; */ ++ WRITE32(len); ++ for (i = 0; i < len; i++) { ++ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n", ++ __func__, cbd[i].cbd_notify_type, ++ cbd[i].cbd_layout_type, ++ cbd[i].cbd_devid.sbid, ++ cbd[i].cbd_devid.devid, ++ cbd[i].cbd_immediate, i); ++ ++ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE); ++ 
RESERVE_SPACE(32); ++ /* bitmap4 notify_mask; */ ++ WRITE32(1); ++ WRITE32(cbd[i].cbd_notify_type); ++ /* opaque notify_vals<>; */ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) ++ WRITE32(24); ++ else ++ WRITE32(20); ++ WRITE32(cbd[i].cbd_layout_type); ++ WRITE64(cbd[i].cbd_devid.sbid); ++ WRITE64(cbd[i].cbd_devid.devid); ++ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { ++ RESERVE_SPACE(4); ++ WRITE32(cbd[i].cbd_immediate); ++ } ++ } ++ hdr->nops++; ++} ++#endif /* CONFIG_PNFSD */ ++ + static int + nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) + { +@@ -288,6 +425,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst * + return 0; + } + ++#if defined(CONFIG_PNFSD) ++static int ++nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_layoutrecall *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_layout(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++ ++static int ++nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_notify_device *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_device(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++#endif /* CONFIG_PNFSD */ + + static int + decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ +@@ -403,6 +579,48 @@ out: + return status; + } + ++#if defined(CONFIG_PNFSD) ++static int 
++nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT); ++out: ++ return status; ++} ++ ++static int ++nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE); ++out: ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * RPC procedure tables + */ +@@ -420,6 +638,10 @@ out: + static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), + PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), ++#if defined(CONFIG_PNFSD) ++ PROC(CB_LAYOUT, COMPOUND, enc_cb_layout, dec_cb_layout), ++ PROC(CB_DEVICE, COMPOUND, enc_cb_device, dec_cb_device), ++#endif + }; + + static struct rpc_version nfs_cb_version4 = { +@@ -606,10 +828,9 @@ out: + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. 
+ */ +-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_prepare_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) + { +- struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; + struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; + u32 minorversion = clp->cl_cb_conn.cb_minorversion; + int status = 0; +@@ -629,11 +850,15 @@ static void nfsd4_cb_prepare(struct rpc_ + rpc_call_start(task); + } + +-static void nfsd4_cb_done(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata) + { + struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; ++ nfsd4_cb_prepare_sequence(task, dp->dl_client); ++} + ++static void nfsd4_cb_done_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) ++{ + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_cb_conn.cb_minorversion); + +@@ -657,7 +882,7 @@ static void nfsd4_cb_recall_done(struct + struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + +- nfsd4_cb_done(task, calldata); ++ nfsd4_cb_done_sequence(task, clp); + + if (current_rpc_client == NULL) { + /* We're shutting down; give up. 
*/ +@@ -688,7 +913,7 @@ static void nfsd4_cb_recall_done(struct + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); + task->tk_status = 0; +- rpc_restart_call(task); ++ rpc_restart_call_prepare(task); + return; + } else { + atomic_set(&clp->cl_cb_set, 0); +@@ -704,7 +929,7 @@ static void nfsd4_cb_recall_release(void + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +- .rpc_call_prepare = nfsd4_cb_prepare, ++ .rpc_call_prepare = nfsd4_cb_recall_prepare, + .rpc_call_done = nfsd4_cb_recall_done, + .rpc_release = nfsd4_cb_recall_release, + }; +@@ -781,3 +1006,173 @@ void nfsd4_cb_recall(struct nfs4_delegat + { + queue_work(callback_wq, &dp->dl_recall.cb_work); + } ++ ++#if defined(CONFIG_PNFSD) ++static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ nfsd4_cb_prepare_sequence(task, clr->clr_client); ++} ++ ++static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ struct nfs4_client *clp = clr->clr_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ if (!task->tk_status) ++ return; ++ ++ printk("%s: clp %p cb_client %p fp %p failed with status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ clr->clr_file, ++ task->tk_status); ++ ++ switch (task->tk_status) { ++ case -EIO: ++ /* Network partition? 
*/ ++ atomic_set(&clp->cl_cb_set, 0); ++ warn_no_callback_path(clp, task->tk_status); ++ /* FIXME: ++ * The pnfs standard states that we need to only expire ++ * the client after at least "lease time", e.g. lease-time * 2 ++ * when failing to communicate a recall ++ */ ++ break; ++ case -NFS4ERR_DELAY: ++ /* Poll the client until it's done with the layout */ ++ rpc_delay(task, HZ/100); /* 10 milliseconds */ ++ task->tk_status = 0; ++ rpc_restart_call_prepare(task); ++ break; ++ case -NFS4ERR_NOMATCHING_LAYOUT: ++ task->tk_status = 0; ++ nomatching_layout(clr); ++ } ++} ++ ++static void nfsd4_cb_layout_release(void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ kfree(clr->clr_args); ++ clr->clr_args = NULL; ++ put_layoutrecall(clr); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_layout_ops = { ++ .rpc_call_prepare = nfsd4_cb_layout_prepare, ++ .rpc_call_done = nfsd4_cb_layout_done, ++ .rpc_release = nfsd4_cb_layout_release, ++}; ++ ++/* ++ * Called with state lock. ++ */ ++int ++nfsd4_cb_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfs4_client *clp = clr->clr_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT], ++ .rpc_cred = callback_cred ++ }; ++ int status; ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ clr->clr_args = args; ++ args->args_op = clr; ++ msg.rpc_argp = args; ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_layout_ops, clr); ++out: ++ if (status) { ++ kfree(args); ++ put_layoutrecall(clr); ++ } ++ dprintk("NFSD: nfsd4_cb_layout: status %d\n", status); ++ return status; ++} ++ ++static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ nfsd4_cb_prepare_sequence(task, cbnd->nd_client); ++} ++ ++static void nfsd4_cb_device_done(struct rpc_task *task, void 
*calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ struct nfs4_client *clp = cbnd->nd_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ dprintk("%s: clp %p cb_client %p: status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ task->tk_status); ++ ++ if (task->tk_status == -EIO) { ++ /* Network partition? */ ++ atomic_set(&clp->cl_cb_set, 0); ++ warn_no_callback_path(clp, task->tk_status); ++ } ++} ++ ++static void nfsd4_cb_device_release(void *calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ kfree(cbnd->nd_args); ++ cbnd->nd_args = NULL; ++ kfree(cbnd); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_device_ops = { ++ .rpc_call_prepare = nfsd4_cb_device_prepare, ++ .rpc_call_done = nfsd4_cb_device_done, ++ .rpc_release = nfsd4_cb_device_release, ++}; ++ ++/* ++ * Called with state lock. ++ */ ++int ++nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd) ++{ ++ struct nfs4_client *clp = cbnd->nd_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE], ++ .rpc_cred = callback_cred ++ }; ++ int status = -EIO; ++ ++ dprintk("%s: clp %p\n", __func__, clp); ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ args->args_op = cbnd; ++ msg.rpc_argp = args; ++ cbnd->nd_args = args; /* so nfsd4_cb_device_release() frees it; NOTE(review): confirm release is not run when rpc_call_async() fails */ ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_device_ops, cbnd); ++out: ++ if (status) ++ kfree(args); ++ dprintk("%s: status %d\n", __func__, status); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-31 20:42:05.556172071 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-31 20:42:05.556172071 -0400 +@@ -0,0 +1,1679 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network 
Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ *****************************************************************************/ ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Globals */ ++static u32 current_layoutid = 1; ++ ++/* ++ * Currently used for manipulating the layout state. 
++ */ ++static DEFINE_SPINLOCK(layout_lock); ++ ++#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP) ++# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock)) ++#else ++# define BUG_ON_UNLOCKED_LAYOUT() ++#endif ++ ++/* ++ * Layout state - NFSv4.1 pNFS ++ */ ++static struct kmem_cache *pnfs_layout_slab; ++static struct kmem_cache *pnfs_layoutrecall_slab; ++ ++/* hash table for nfsd4_pnfs_deviceid.sbid */ ++#define SBID_HASH_BITS 8 ++#define SBID_HASH_SIZE (1 << SBID_HASH_BITS) ++#define SBID_HASH_MASK (SBID_HASH_SIZE - 1) ++ ++struct sbid_tracker { ++ u64 id; ++ struct super_block *sb; ++ struct list_head hash; ++}; ++ ++static u64 current_sbid; ++static struct list_head sbid_hashtbl[SBID_HASH_SIZE]; ++ ++static inline unsigned long ++sbid_hashval(struct super_block *sb) ++{ ++ return hash_ptr(sb, SBID_HASH_BITS); ++} ++ ++static inline struct sbid_tracker * ++alloc_sbid(void) ++{ ++ return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL); ++} ++ ++static void ++destroy_sbid(struct sbid_tracker *sbid) ++{ ++ spin_lock(&layout_lock); ++ list_del(&sbid->hash); ++ spin_unlock(&layout_lock); ++ kfree(sbid); ++} ++ ++void ++nfsd4_free_pnfs_slabs(void) ++{ ++ int i; ++ struct sbid_tracker *sbid; ++ ++ nfsd4_free_slab(&pnfs_layout_slab); ++ nfsd4_free_slab(&pnfs_layoutrecall_slab); ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ while (!list_empty(&sbid_hashtbl[i])) { ++ sbid = list_first_entry(&sbid_hashtbl[i], ++ struct sbid_tracker, ++ hash); ++ destroy_sbid(sbid); ++ } ++ } ++} ++ ++int ++nfsd4_init_pnfs_slabs(void) ++{ ++ int i; ++ ++ pnfs_layout_slab = kmem_cache_create("pnfs_layouts", ++ sizeof(struct nfs4_layout), 0, 0, NULL); ++ if (pnfs_layout_slab == NULL) ++ return -ENOMEM; ++ pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls", ++ sizeof(struct nfs4_layoutrecall), 0, 0, NULL); ++ if (pnfs_layoutrecall_slab == NULL) ++ return -ENOMEM; ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ INIT_LIST_HEAD(&sbid_hashtbl[i]); ++ } ++ ++ 
return 0; ++} ++ ++/* XXX: Need to implement the notify types and track which ++ * clients have which devices. */ ++void pnfs_set_device_notify(clientid_t *clid, unsigned int types) ++{ ++ struct nfs4_client *clp; ++ dprintk("%s: -->\n", __func__); ++ ++ nfs4_lock_state(); ++ /* Indicate that client has a device so we can only notify ++ * the correct clients */ ++ clp = find_confirmed_client(clid); ++ if (clp) { ++ atomic_inc(&clp->cl_deviceref); ++ dprintk("%s: Incr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++ } ++ nfs4_unlock_state(); ++} ++ ++/* Clear notifications for this client ++ * XXX: Do we need to loop through a clean up all ++ * krefs when nfsd cleans up the client? */ ++void pnfs_clear_device_notify(struct nfs4_client *clp) ++{ ++ atomic_dec(&clp->cl_deviceref); ++ dprintk("%s: Decr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++} ++ ++static struct nfs4_layout_state * ++alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid) ++{ ++ struct nfs4_layout_state *new; ++ ++ /* FIXME: use a kmem_cache */ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return new; ++ get_nfs4_file(fp); ++ INIT_LIST_HEAD(&new->ls_perfile); ++ INIT_LIST_HEAD(&new->ls_layouts); ++ kref_init(&new->ls_ref); ++ new->ls_client = clp; ++ new->ls_file = fp; ++ new->ls_stateid.si_boot = stateid->si_boot; ++ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */ ++ new->ls_stateid.si_generation = 1; ++ spin_lock(&layout_lock); ++ new->ls_stateid.si_fileid = current_layoutid++; ++ list_add(&new->ls_perfile, &fp->fi_layout_states); ++ spin_unlock(&layout_lock); ++ return new; ++} ++ ++static inline void ++get_layout_state(struct nfs4_layout_state *ls) ++{ ++ kref_get(&ls->ls_ref); ++} ++ ++static void ++destroy_layout_state_common(struct nfs4_layout_state *ls) ++{ ++ struct nfs4_file *fp = ls->ls_file; ++ ++ dprintk("pNFS %s: ls %p fp %p clp 
%p\n", __func__, ls, fp, ++ ls->ls_client); ++ BUG_ON(!list_empty(&ls->ls_layouts)); ++ kfree(ls); ++ put_nfs4_file(fp); ++} ++ ++static void ++destroy_layout_state(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ spin_lock(&layout_lock); ++ list_del(&ls->ls_perfile); ++ spin_unlock(&layout_lock); ++ destroy_layout_state_common(ls); ++} ++ ++static void ++destroy_layout_state_locked(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ list_del(&ls->ls_perfile); ++ destroy_layout_state_common(ls); ++} ++ ++static inline void ++put_layout_state(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state); ++} ++ ++static inline void ++put_layout_state_locked(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state_locked); ++} ++ ++/* ++ * Search the fp->fi_layout_state list for a layout state with the clientid. ++ * If not found, then this is a 'first open/delegation/lock stateid' from ++ * the client for this file. ++ * Called under the layout_lock. 
++ */ ++static struct nfs4_layout_state * ++find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp) ++{ ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) { ++ if (ls->ls_client == clp) { ++ dprintk("pNFS %s: before GET ls %p ls_ref %d\n", ++ __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ get_layout_state(ls); ++ return ls; ++ } ++ } ++ return NULL; ++} ++ ++static __be32 ++verify_stateid(struct nfs4_file *fp, stateid_t *stateid) ++{ ++ struct nfs4_stateid *local = NULL; ++ struct nfs4_delegation *temp = NULL; ++ ++ /* check if open or lock stateid */ ++ local = find_stateid(stateid, RD_STATE); ++ if (local) ++ return 0; ++ temp = find_delegation_stateid(fp->fi_inode, stateid); ++ if (temp) ++ return 0; ++ return nfserr_bad_stateid; ++} ++ ++/* ++ * nfs4_process_layout_stateid() ++ * ++ * We have looked up the nfs4_file corresponding to the current_fh, and ++ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op() ++ * that make sense with a layout stateid. ++ * ++ * Called with the state_lock held ++ * Returns zero and stateid is updated, or error. ++ * ++ * Note: the struct nfs4_layout_state pointer is only set by layoutget. ++ */ ++static __be32 ++nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid, struct nfs4_layout_state **lsp) ++{ ++ struct nfs4_layout_state *ls = NULL; ++ __be32 status = 0; ++ ++ dprintk("--> %s clp %p fp %p \n", __func__, clp, fp); ++ ++ dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ goto out; ++ ++ /* Is this the first use of this layout ? */ ++ spin_lock(&layout_lock); ++ ls = find_get_layout_state(clp, fp); ++ spin_unlock(&layout_lock); ++ if (!ls) { ++ /* Only alloc layout state on layoutget (which sets lsp). 
*/ ++ if (!lsp) { ++ dprintk("%s ERROR: Not layoutget & no layout stateid\n", ++ __func__); ++ status = nfserr_bad_stateid; ++ goto out; ++ } ++ dprintk("%s Initial stateid for layout: file %p client %p\n", ++ __func__, fp, clp); ++ ++ /* verify input stateid */ ++ status = verify_stateid(fp, stateid); ++ if (status) { ++ dprintk("%s ERROR: invalid open/deleg/lock stateid\n", ++ __func__); ++ goto out; ++ } ++ ls = alloc_init_layout_state(clp, fp, stateid); ++ if (!ls) { ++ dprintk("%s pNFS ERROR: no memory for layout state\n", ++ __func__); ++ status = nfserr_resource; ++ goto out; ++ } ++ } else { ++ dprintk("%s Not initial stateid. Layout state %p file %p\n", ++ __func__, ls, fp); ++ ++ /* BAD STATEID */ ++ status = nfserr_bad_stateid; ++ if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque, ++ sizeof(stateid_opaque_t)) != 0) { ++ ++ /* if a LAYOUTGET operation and stateid is a valid ++ * open/deleg/lock stateid, accept it as a parallel ++ * initial layout stateid ++ */ ++ if (lsp && ((verify_stateid(fp, stateid)) == 0)) { ++ dprintk("%s parallel initial layout state\n", ++ __func__); ++ goto update; ++ } ++ ++ dprintk("%s ERROR bad opaque in stateid 1\n", __func__); ++ goto out_put; ++ } ++ ++ /* stateid is a valid layout stateid for this file. 
*/ ++ if (stateid->si_generation > ls->ls_stateid.si_generation) { ++ dprintk("%s bad stateid 1\n", __func__); ++ goto out_put; ++ } ++update: ++ update_stateid(&ls->ls_stateid); ++ dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", ++ __func__, ls->ls_stateid.si_generation, ls); ++ } ++ status = 0; ++ /* Set the stateid to be encoded */ ++ memcpy(stateid, &ls->ls_stateid, sizeof(stateid_t)); ++ ++ /* Return the layout state if requested */ ++ if (lsp) { ++ get_layout_state(ls); ++ *lsp = ls; ++ } ++ dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(&ls->ls_stateid)); ++out_put: ++ dprintk("%s PUT LO STATE:\n", __func__); ++ put_layout_state(ls); ++out: ++ dprintk("<-- %s status %d\n", __func__, htonl(status)); ++ ++ return status; ++} ++ ++static inline struct nfs4_layout * ++alloc_layout(void) ++{ ++ return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL); ++} ++ ++static inline void ++free_layout(struct nfs4_layout *lp) ++{ ++ kmem_cache_free(pnfs_layout_slab, lp); ++} ++ ++static void ++init_layout(struct nfs4_layout_state *ls, ++ struct nfs4_layout *lp, ++ struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__, ++ ls, lp, clp, fp, fp->fi_inode); ++ ++ get_nfs4_file(fp); ++ lp->lo_client = clp; ++ lp->lo_file = fp; ++ get_layout_state(ls); ++ lp->lo_state = ls; ++ memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg)); ++ spin_lock(&layout_lock); ++ list_add_tail(&lp->lo_perstate, &ls->ls_layouts); ++ list_add_tail(&lp->lo_perclnt, &clp->cl_layouts); ++ list_add_tail(&lp->lo_perfile, &fp->fi_layouts); ++ spin_unlock(&layout_lock); ++ dprintk("pNFS %s end\n", __func__); ++} ++ ++static void ++dequeue_layout(struct nfs4_layout *lp) ++{ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del(&lp->lo_perclnt); ++ list_del(&lp->lo_perfile); ++ list_del(&lp->lo_perstate); ++} ++ ++static void ++destroy_layout(struct nfs4_layout *lp) ++{ 
++ struct nfs4_client *clp; ++ struct nfs4_file *fp; ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ clp = lp->lo_client; ++ fp = lp->lo_file; ++ ls = lp->lo_state; ++ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n", ++ __func__, lp, clp, fp, fp->fi_inode, ++ list_empty(&ls->ls_layouts)); ++ ++ kmem_cache_free(pnfs_layout_slab, lp); ++ /* release references taken by init_layout */ ++ put_layout_state_locked(ls); ++ put_nfs4_file(fp); ++} ++ ++void fs_layout_return(struct super_block *sb, struct inode *ino, ++ struct nfsd4_pnfs_layoutreturn *lrp, int flags, ++ void *recall_cookie) ++{ ++ int ret; ++ ++ if (unlikely(!sb->s_pnfs_op->layout_return)) ++ return; ++ ++ lrp->lr_flags = flags; ++ lrp->args.lr_cookie = recall_cookie; ++ ++ if (!ino) /* FSID or ALL */ ++ ino = sb->s_root->d_inode; ++ ++ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args); ++ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx " ++ "cookie = %p flags 0x%x status=%d\n", ++ __func__, ino->i_ino, lrp->args.lr_seg.iomode, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, ++ recall_cookie, flags, ret); ++} ++ ++static u64 ++alloc_init_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ struct sbid_tracker *new = alloc_sbid(); ++ unsigned long hash_idx = sbid_hashval(sb); ++ u64 id = 0; ++ ++ if (likely(new)) { ++ spin_lock(&layout_lock); ++ id = ++current_sbid; ++ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK); ++ id = new->id; ++ BUG_ON(id == 0); ++ new->sb = sb; ++ ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) ++ if (sbid->sb == sb) { ++ kfree(new); ++ id = sbid->id; ++ spin_unlock(&layout_lock); ++ return id; ++ } ++ list_add(&new->hash, &sbid_hashtbl[hash_idx]); ++ spin_unlock(&layout_lock); ++ } ++ return id; ++} ++ ++struct super_block * ++find_sbid_id(u64 id) ++{ ++ struct sbid_tracker *sbid; ++ struct super_block *sb = NULL; ++ unsigned long hash_idx = id & SBID_HASH_MASK; ++ int pos = 0; ++ ++ 
spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->id != id) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ sb = sbid->sb; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return sb; ++} ++ ++u64 ++find_create_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ unsigned long hash_idx = sbid_hashval(sb); ++ int pos = 0; ++ u64 id = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->sb != sb) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ id = sbid->id; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ if (!id) ++ id = alloc_init_sbid(sb); ++ ++ return id; ++} ++ ++/* ++ * Create a layoutrecall structure ++ * An optional layoutrecall can be cloned (except for the layoutrecall lists) ++ */ ++static struct nfs4_layoutrecall * ++alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_client *clp, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ dprintk("NFSD %s\n", __func__); ++ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL); ++ if (clr == NULL) ++ return clr; ++ ++ dprintk("NFSD %s -->\n", __func__); ++ ++ memset(clr, 0, sizeof(*clr)); ++ if (lrfile) ++ get_nfs4_file(lrfile); ++ clr->clr_client = clp; ++ clr->clr_file = lrfile; ++ clr->cb = *cbl; ++ ++ kref_init(&clr->clr_ref); ++ INIT_LIST_HEAD(&clr->clr_perclnt); ++ ++ dprintk("NFSD %s return %p\n", __func__, clr); ++ return clr; ++} ++ ++static void ++get_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ kref_get(&clr->clr_ref); ++} ++ ++static void ++destroy_layoutrecall(struct kref *kref) ++{ ++ struct nfs4_layoutrecall *clr = ++ container_of(kref, struct nfs4_layoutrecall, clr_ref); ++ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr, ++ 
clr->clr_file, clr->clr_client); ++ BUG_ON(!list_empty(&clr->clr_perclnt)); ++ if (clr->clr_file) ++ put_nfs4_file(clr->clr_file); ++ kmem_cache_free(pnfs_layoutrecall_slab, clr); ++} ++ ++int ++put_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ return kref_put(&clr->clr_ref, destroy_layoutrecall); ++} ++ ++void * ++layoutrecall_done(struct nfs4_layoutrecall *clr) ++{ ++ void *recall_cookie = clr->cb.cbl_cookie; ++ struct nfs4_layoutrecall *parent = clr->parent; ++ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del_init(&clr->clr_perclnt); ++ put_layoutrecall(clr); ++ ++ if (parent && !put_layoutrecall(parent)) ++ recall_cookie = NULL; ++ ++ return recall_cookie; ++} ++ ++/* ++ * get_state() and cb_get_state() are ++ */ ++void ++release_pnfs_ds_dev_list(struct nfs4_stateid *stp) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ while (!list_empty(&stp->st_pnfs_ds_id)) { ++ ddp = list_entry(stp->st_pnfs_ds_id.next, ++ struct pnfs_ds_dev_entry, dd_dev_entry); ++ list_del(&ddp->dd_dev_entry); ++ kfree(ddp); ++ } ++} ++ ++static int ++nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL); ++ if (!ddp) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&ddp->dd_dev_entry); ++ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id); ++ ddp->dd_dsid = dsid; ++ return 0; ++} ++ ++/* ++ * are two octet ranges overlapping? 
++ * start1 last1 ++ * |-----------------| ++ * start2 last2 ++ * |----------------| ++ */ ++static inline int ++lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 last1 = last_byte_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 last2 = last_byte_offset(start2, l2->length); ++ int ret; ++ ++ /* if last1 == start2 there's a single byte overlap */ ++ ret = (last2 >= start1) && (last1 >= start2); ++ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__, ++ l1->offset, l1->length, l2->offset, l2->length, ret); ++ return ret; ++} ++ ++static inline int ++same_fsid_major(struct nfs4_fsid *fsid, u64 major) ++{ ++ return fsid->major == major; ++} ++ ++static inline int ++same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh) ++{ ++ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid); ++} ++ ++/* ++ * find a layout recall conflicting with the specified layoutget ++ */ ++static int ++is_layout_recalled(struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != seg->layout_type) ++ continue; ++ if (clr->cb.cbl_recall_type == RETURN_ALL) ++ goto found; ++ if (clr->cb.cbl_recall_type == RETURN_FSID) { ++ if (same_fsid(&clr->cb.cbl_fsid, current_fh)) ++ goto found; ++ else ++ continue; ++ } ++ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE); ++ if (clr->cb.cbl_seg.clientid == seg->clientid && ++ lo_seg_overlapping(&clr->cb.cbl_seg, seg)) ++ goto found; ++ } ++ spin_unlock(&layout_lock); ++ return 0; ++found: ++ spin_unlock(&layout_lock); ++ return 1; ++} ++ ++/* ++ * are two octet ranges overlapping or adjacent? 
++ */ ++static inline int ++lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ /* if end1 == start2 the ranges are adjacent */ ++ return (end2 >= start1) && (end1 >= start2); ++} ++ ++static void ++extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lg_start = lg->offset; ++ u64 lg_end = end_offset(lg_start, lg->length); ++ ++ /* lo already covers lg? */ ++ if (lo_start <= lg_start && lg_end <= lo_end) ++ return; ++ ++ /* extend start offset */ ++ if (lo_start > lg_start) ++ lo_start = lg_start; ++ ++ /* extend end offset */ ++ if (lo_end < lg_end) ++ lo_end = lg_end; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? ++ lo_end : lo_end - lo_start; ++} ++ ++static struct nfs4_layout * ++merge_layout(struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layout *lp, *merged = NULL; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (lp, &fp->fi_layouts, lo_perfile) ++ if (lp->lo_seg.layout_type == seg->layout_type && ++ lp->lo_seg.clientid == seg->clientid && ++ lp->lo_seg.iomode == seg->iomode && ++ lo_seg_mergeable(&lp->lo_seg, seg)) { ++ extend_layout(&lp->lo_seg, seg); ++ merged = lp; /* the cursor is not NULL when the loop runs out, so track the hit separately */ ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return merged; ++} ++ ++__be32 ++nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp, ++ struct exp_xdr_stream *xdr) ++{ ++ u32 status; ++ __be32 nfserr; ++ struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode; ++ struct super_block *sb = ino->i_sb; ++ int can_merge; ++ struct nfs4_file *fp; ++ struct nfs4_client *clp; ++ struct nfs4_layout *lp = NULL; ++ struct nfs4_layout_state *ls = NULL; ++ struct nfsd4_pnfs_layoutget_arg args = { ++ .lg_minlength = lgp->lg_minlength, ++ .lg_fh = 
&lgp->lg_fhp->fh_handle, ++ }; ++ struct nfsd4_pnfs_layoutget_res res = { ++ .lg_seg = lgp->lg_seg, ++ }; ++ ++ dprintk("NFSD: %s Begin\n", __func__); ++ ++ args.lg_sbid = find_create_sbid(sb); ++ if (!args.lg_sbid) { ++ nfserr = nfserr_layouttrylater; ++ goto out; ++ } ++ ++ can_merge = sb->s_pnfs_op->can_merge_layouts != NULL && ++ sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type); ++ ++ nfs4_lock_state(); ++ fp = find_alloc_file(ino, lgp->lg_fhp); ++ clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid); ++ dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp); ++ if (!fp || !clp) { ++ nfserr = nfserr_inval; ++ goto out_unlock; ++ } ++ ++ /* Check decoded layout stateid */ ++ nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls); ++ if (nfserr) ++ goto out_unlock; ++ ++ if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) { ++ nfserr = nfserr_recallconflict; ++ goto out; ++ } ++ ++ /* pre-alloc layout in case we can't merge after we call ++ * the file system ++ */ ++ lp = alloc_layout(); ++ if (!lp) { ++ nfserr = nfserr_layouttrylater; ++ goto out_unlock; ++ } ++ ++ dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd " ++ "iomode %u offset %llu length %llu\n", ++ __func__, lgp->lg_seg.layout_type, ++ exp_xdr_qbytes(xdr->end - xdr->p), ++ lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length); ++ ++ /* FIXME: need to eliminate the use of the state lock */ ++ nfs4_unlock_state(); ++ status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res); ++ nfs4_lock_state(); ++ ++ dprintk("pNFS %s: post-export status %u " ++ "iomode %u offset %llu length %llu\n", ++ __func__, status, res.lg_seg.iomode, ++ res.lg_seg.offset, res.lg_seg.length); ++ ++ /* ++ * The allowable error codes for the layout_get pNFS export ++ * operations vector function (from the file system) can be ++ * expanded as needed to include other errors defined for ++ * the RFC 5561 LAYOUTGET operation. 
++ */ ++ switch (status) { ++ case 0: ++ nfserr = NFS4_OK; ++ break; ++ case NFS4ERR_ACCESS: ++ case NFS4ERR_BADIOMODE: ++ /* No support for LAYOUTIOMODE4_RW layouts */ ++ case NFS4ERR_BADLAYOUT: ++ /* No layout matching loga_minlength rules */ ++ case NFS4ERR_INVAL: ++ case NFS4ERR_IO: ++ case NFS4ERR_LAYOUTTRYLATER: ++ case NFS4ERR_LAYOUTUNAVAILABLE: ++ case NFS4ERR_LOCKED: ++ case NFS4ERR_NOSPC: ++ case NFS4ERR_RECALLCONFLICT: ++ case NFS4ERR_SERVERFAULT: ++ case NFS4ERR_TOOSMALL: ++ /* Requested layout too big for loga_maxcount */ ++ case NFS4ERR_WRONG_TYPE: ++ /* Not a regular file */ ++ nfserr = cpu_to_be32(status); ++ goto out_freelayout; ++ default: ++ BUG(); ++ nfserr = nfserr_serverfault; ++ } ++ ++ lgp->lg_seg = res.lg_seg; ++ lgp->lg_roc = res.lg_return_on_close; ++ ++ /* SUCCESS! ++ * Can the new layout be merged into an existing one? ++ * If so, free unused layout struct ++ */ ++ if (can_merge && merge_layout(fp, clp, &res.lg_seg)) ++ goto out_freelayout; ++ ++ /* Can't merge, so let's initialize this new layout */ ++ init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg); ++out_unlock: ++ if (ls) ++ put_layout_state(ls); ++ if (fp) ++ put_nfs4_file(fp); ++ nfs4_unlock_state(); ++out: ++ dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp, ++ be32_to_cpu(nfserr)); ++ return nfserr; ++out_freelayout: ++ free_layout(lp); ++ goto out_unlock; ++} ++ ++static void ++trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lr_start = lr->offset; ++ u64 lr_end = end_offset(lr_start, lr->length); ++ ++ dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__, ++ lo->offset, lo->length, lr->offset, lr->length); ++ ++ /* lr fully covers lo? */ ++ if (lr_start <= lo_start && lo_end <= lr_end) { ++ lo->length = 0; ++ goto out; ++ } ++ ++ /* ++ * split not supported yet. retain layout segment. 
++ * remains must be returned by the client ++ * on the final layout return. ++ */ ++ if (lo_start < lr_start && lr_end < lo_end) { ++ dprintk("%s: split not supported\n", __func__); ++ goto out; ++ } ++ ++ if (lo_start < lr_start) ++ lo_end = lr_start - 1; ++ else /* lr_end < lo_end */ ++ lo_start = lr_end + 1; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start; ++out: ++ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length); ++} ++ ++static int ++pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ dprintk("%s: clp %p fp %p\n", __func__, clp, fp); ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) { ++ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n", ++ __func__, lp, ++ lp->lo_client, clp, ++ lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type, ++ lp->lo_seg.iomode, lrp->args.lr_seg.iomode); ++ if (lp->lo_client != clp || ++ lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type || ++ (lp->lo_seg.iomode != lrp->args.lr_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) || ++ !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg)) ++ continue; ++ layouts_found++; ++ trim_layout(&lp->lo_seg, &lrp->args.lr_seg); ++ if (!lp->lo_seg.length) { ++ lrp->lrs_present = 0; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++pnfs_return_client_layouts(struct nfs4_client *clp, ++ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) { ++ if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type || ++ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode && ++ lrp->args.lr_seg.iomode != 
IOMODE_ANY)) ++ continue; ++ ++ if (lrp->args.lr_return_type == RETURN_FSID && ++ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid)) ++ continue; ++ ++ layouts_found++; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++recall_return_perfect_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode || ++ clr->cb.cbl_recall_type != lrp->args.lr_return_type) ++ return 0; ++ ++ return (clr->cb.cbl_recall_type == RETURN_FILE && ++ clr->clr_file == fp && ++ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset && ++ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) || ++ ++ (clr->cb.cbl_recall_type == RETURN_FSID && ++ same_fsid(&clr->cb.cbl_fsid, current_fh)) || ++ ++ clr->cb.cbl_recall_type == RETURN_ALL; ++} ++ ++static int ++recall_return_partial_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ /* iomode matching? */ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode && ++ clr->cb.cbl_seg.iomode != IOMODE_ANY && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) ++ return 0; ++ ++ if (clr->cb.cbl_recall_type == RETURN_ALL || ++ lrp->args.lr_return_type == RETURN_ALL) ++ return 1; ++ ++ /* fsid matches? */ ++ if (clr->cb.cbl_recall_type == RETURN_FSID || ++ lrp->args.lr_return_type == RETURN_FSID) ++ return same_fsid(&clr->cb.cbl_fsid, current_fh); ++ ++ /* file matches, range overlapping? 
*/ ++ return clr->clr_file == fp && ++ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg); ++} ++ ++int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status = 0; ++ int layouts_found = 0; ++ struct inode *ino = current_fh->fh_dentry->d_inode; ++ struct nfs4_file *fp = NULL; ++ struct nfs4_client *clp; ++ struct nfs4_layoutrecall *clr, *nextclr; ++ u64 ex_fsid = current_fh->fh_export->ex_fsid; ++ void *recall_cookie = NULL; ++ ++ dprintk("NFSD: %s\n", __func__); ++ ++ nfs4_lock_state(); ++ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid); ++ if (!clp) ++ goto out; ++ ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ fp = find_file(ino); ++ if (!fp) { ++ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for " ++ "ino %p:%lu\n", ++ __func__, ino, ino ? ino->i_ino : 0L); ++ goto out; ++ } ++ ++ /* Check the stateid */ ++ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino); ++ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, ++ NULL); ++ if (status) ++ goto out_put_file; ++ ++ /* update layouts */ ++ layouts_found = pnfs_return_file_layouts(clp, fp, lrp); ++ /* optimize for the all-empty case */ ++ if (list_empty(&fp->fi_layouts)) ++ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ } else { ++ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid); ++ } ++ ++ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d " ++ "return_type %d fsid 0x%llx offset %llu length %llu: " ++ "layouts_found %d\n", ++ __func__, clp, fp, lrp->args.lr_seg.layout_type, ++ lrp->args.lr_seg.iomode, lrp->args.lr_return_type, ++ ex_fsid, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found); ++ ++ /* update layoutrecalls ++ * note: for RETURN_{FSID,ALL}, fp may be NULL ++ */ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls, ++ clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != 
lrp->args.lr_seg.layout_type) ++ continue; ++ ++ if (recall_return_perfect_match(clr, lrp, fp, current_fh)) ++ recall_cookie = layoutrecall_done(clr); ++ else if (layouts_found && ++ recall_return_partial_match(clr, lrp, fp, current_fh)) ++ clr->clr_time = CURRENT_TIME; ++ } ++ spin_unlock(&layout_lock); ++ ++out_put_file: ++ if (fp) ++ put_nfs4_file(fp); ++out: ++ nfs4_unlock_state(); ++ ++ /* call exported filesystem layout_return (ignore return-code) */ ++ fs_layout_return(sb, ino, lrp, 0, recall_cookie); ++ ++ dprintk("pNFS %s: exit status %d \n", __func__, status); ++ return status; ++} ++ ++/* ++ * PNFS Metadata server export operations callback for get_state ++ * ++ * called by the cluster fs when it receives a get_state() from a data ++ * server. ++ * returns status, or pnfs_get_state* with pnfs_get_state->status set. ++ * ++ */ ++int ++nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg) ++{ ++ struct nfs4_stateid *stp; ++ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */ ++ int status = -EINVAL; ++ struct inode *ino; ++ struct nfs4_delegation *dl; ++ stateid_t *stid = (stateid_t *)&arg->stid; ++ ++ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__, ++ STATEID_VAL(stid), arg->ino); ++ ++ nfs4_lock_state(); ++ stp = find_stateid(stid, flags); ++ if (!stp) { ++ ino = iget_locked(sb, arg->ino); ++ if (!ino) ++ goto out; ++ ++ if (ino->i_state & I_NEW) { ++ iget_failed(ino); ++ goto out; ++ } ++ ++ dl = find_delegation_stateid(ino, stid); ++ if (dl) ++ status = 0; ++ ++ iput(ino); ++ } else { ++ /* XXX ANDROS: marc removed nfs4_check_fh - how come? 
*/ ++ ++ /* arg->devid is the Data server id, set by the cluster fs */ ++ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid); ++ if (status) ++ goto out; ++ ++ arg->access = stp->st_access_bmap; ++ *(clientid_t *)&arg->clid = ++ stp->st_stateowner->so_client->cl_clientid; ++ } ++out: ++ nfs4_unlock_state(); ++ return status; ++} ++ ++static int ++cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile, ++ stateid_t *lsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ struct nfs4_layout_state *ls; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) { ++ if (lp->lo_file != lrfile) ++ continue; ++ ++ ls = find_get_layout_state(clp, lrfile); ++ if (!ls) { ++ /* This shouldn't happen as the file should have a ++ * layout stateid if it has a layout. ++ */ ++ printk(KERN_ERR "%s: file %p has no layout stateid\n", ++ __func__, lrfile); ++ WARN_ON(1); ++ break; ++ } ++ update_stateid(&ls->ls_stateid); ++ memcpy(lsid, &ls->ls_stateid, sizeof(stateid_t)); ++ put_layout_state_locked(ls); ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ return found; ++} ++ ++static int ++cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ ++ /* note: minor version unused */ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) ++ if (lp->lo_file->fi_fsid.major == fsid->major) { ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return found; ++} ++ ++static int ++cl_has_any_layout(struct nfs4_client *clp) ++{ ++ return !list_empty(&clp->cl_layouts); ++} ++ ++static int ++cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile, stateid_t *lsid) ++{ ++ switch (cbl->cbl_recall_type) { ++ case RETURN_FILE: ++ return cl_has_file_layout(clp, lrfile, lsid); ++ case RETURN_FSID: ++ return cl_has_fsid_layout(clp, &cbl->cbl_fsid); ++ default: ++ return cl_has_any_layout(clp); ++ } ++} 
++ ++/* ++ * Called without the layout_lock. ++ */ ++void ++nomatching_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfsd4_pnfs_layoutreturn lr = { ++ .args.lr_return_type = clr->cb.cbl_recall_type, ++ .args.lr_seg = clr->cb.cbl_seg, ++ }; ++ struct inode *inode; ++ void *recall_cookie; ++ ++ if (clr->clr_file) { ++ inode = igrab(clr->clr_file->fi_inode); ++ if (WARN_ON(!inode)) ++ return; ++ } else { ++ inode = NULL; ++ } ++ ++ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__, ++ clr->clr_client, clr->clr_file); ++ ++ if (clr->cb.cbl_recall_type == RETURN_FILE) ++ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr); ++ else ++ pnfs_return_client_layouts(clr->clr_client, &lr, ++ clr->cb.cbl_fsid.major); ++ ++ spin_lock(&layout_lock); ++ recall_cookie = layoutrecall_done(clr); ++ spin_unlock(&layout_lock); ++ ++ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN, ++ recall_cookie); ++ iput(inode); ++} ++ ++void pnfs_expire_client(struct nfs4_client *clp) ++{ ++ for (;;) { ++ struct nfs4_layoutrecall *lrp = NULL; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layoutrecalls)) { ++ lrp = list_entry(clp->cl_layoutrecalls.next, ++ struct nfs4_layoutrecall, clr_perclnt); ++ get_layoutrecall(lrp); ++ } ++ spin_unlock(&layout_lock); ++ if (!lrp) ++ break; ++ ++ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file); ++ BUG_ON(lrp->clr_client != clp); ++ nomatching_layout(lrp); ++ put_layoutrecall(lrp); ++ } ++ ++ for (;;) { ++ struct nfs4_layout *lp = NULL; ++ struct inode *inode = NULL; ++ struct nfsd4_pnfs_layoutreturn lr; ++ bool empty = false; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layouts)) { ++ lp = list_entry(clp->cl_layouts.next, ++ struct nfs4_layout, lo_perclnt); ++ inode = igrab(lp->lo_file->fi_inode); ++ memset(&lr, 0, sizeof(lr)); ++ lr.args.lr_return_type = RETURN_FILE; ++ lr.args.lr_seg = lp->lo_seg; ++ empty = list_empty(&lp->lo_file->fi_layouts); ++ BUG_ON(lp->lo_client != clp); ++ 
dequeue_layout(lp); ++ destroy_layout(lp); /* do not access lp after this */ ++ } ++ spin_unlock(&layout_lock); ++ if (!lp) ++ break; ++ ++ if (WARN_ON(!inode)) ++ break; ++ ++ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino, ++ lp, clp); ++ ++ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE, ++ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL); ++ iput(inode); ++ } ++} ++ ++struct create_recall_list_arg { ++ struct nfsd4_pnfs_cb_layout *cbl; ++ struct nfs4_file *lrfile; ++ struct list_head *todolist; ++ unsigned todo_count; ++}; ++ ++/* ++ * look for matching layout for the given client ++ * and add a pending layout recall to the todo list ++ * if found any. ++ * returns: ++ * 0 if layouts found or negative error. ++ */ ++static int ++lo_recall_per_client(struct nfs4_client *clp, void *p) ++{ ++ stateid_t lsid; ++ struct nfs4_layoutrecall *pending; ++ struct create_recall_list_arg *arg = p; ++ ++ memset(&lsid, 0, sizeof(lsid)); ++ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid)) ++ return 0; ++ ++ /* Matching put done by layoutreturn */ ++ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile); ++ /* out of memory, drain todo queue */ ++ if (!pending) ++ return -ENOMEM; ++ ++ *(stateid_t *)&pending->cb.cbl_sid = lsid; ++ list_add(&pending->clr_perclnt, arg->todolist); ++ arg->todo_count++; ++ return 0; ++} ++ ++/* Create a layoutrecall structure for each client based on the ++ * original structure. 
*/ ++int ++create_layout_recall_list(struct list_head *todolist, unsigned *todo_len, ++ struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_client *clp; ++ struct create_recall_list_arg arg = { ++ .cbl = cbl, ++ .lrfile = lrfile, ++ .todolist = todolist, ++ }; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* If client given by fs, just do single client */ ++ if (cbl->cbl_seg.clientid) { ++ clp = find_confirmed_client( ++ (clientid_t *)&cbl->cbl_seg.clientid); ++ if (!clp) { ++ status = -ENOENT; ++ dprintk("%s: clientid %llx not found\n", __func__, ++ (unsigned long long)cbl->cbl_seg.clientid); ++ goto out; ++ } ++ ++ status = lo_recall_per_client(clp, &arg); ++ } else { ++ /* Check all clients for layout matches */ ++ status = filter_confirmed_clients(lo_recall_per_client, &arg); ++ } ++ ++out: ++ *todo_len = arg.todo_count; ++ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status); ++ return status; ++} ++ ++/* ++ * Recall layouts asynchronously ++ * Called with state lock. ++ */ ++static int ++spawn_layout_recall(struct super_block *sb, struct list_head *todolist, ++ unsigned todo_len) ++{ ++ struct nfs4_layoutrecall *pending; ++ struct nfs4_layoutrecall *parent = NULL; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ if (todo_len > 1) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ ++ parent = alloc_init_layoutrecall(&pending->cb, NULL, ++ pending->clr_file); ++ if (unlikely(!parent)) { ++ /* We want forward progress. If parent cannot be ++ * allocated take the first one as parent but don't ++ * execute it. Caller must check for -EAGAIN, if so ++ * When the partial recalls return, ++ * nfsd_layout_recall_cb should be called again. 
++ */ ++ list_del_init(&pending->clr_perclnt); ++ if (todo_len > 2) { ++ parent = pending; ++ } else { ++ parent = NULL; ++ put_layoutrecall(pending); ++ } ++ --todo_len; ++ status = -ENOMEM; ++ } ++ } ++ ++ while (!list_empty(todolist)) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ list_del_init(&pending->clr_perclnt); ++ dprintk("%s: clp %p cb_client %p fp %p\n", __func__, ++ pending->clr_client, ++ pending->clr_client->cl_cb_client, ++ pending->clr_file); ++ if (unlikely(!pending->clr_client->cl_cb_client)) { ++ printk(KERN_INFO ++ "%s: clientid %08x/%08x has no callback path\n", ++ __func__, ++ pending->clr_client->cl_clientid.cl_boot, ++ pending->clr_client->cl_clientid.cl_id); ++ put_layoutrecall(pending); ++ continue; ++ } ++ ++ pending->clr_time = CURRENT_TIME; ++ pending->clr_sb = sb; ++ if (parent) { ++ /* If we created a parent its initial ref count is 1. ++ * We will need to de-ref it eventually. So we just ++ * don't increment on behalf of the last one. 
++ */ ++ if (todo_len != 1) ++ get_layoutrecall(parent); ++ } ++ pending->parent = parent; ++ get_layoutrecall(pending); ++ /* Add to list so corresponding layoutreturn can find req */ ++ list_add(&pending->clr_perclnt, ++ &pending->clr_client->cl_layoutrecalls); ++ ++ nfsd4_cb_layout(pending); ++ --todo_len; ++ } ++ ++ return status; ++} ++ ++/* ++ * Spawn a thread to perform a recall layout ++ * ++ */ ++int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode, ++ struct nfsd4_pnfs_cb_layout *cbl) ++{ ++ int status; ++ struct nfs4_file *lrfile = NULL; ++ struct list_head todolist; ++ unsigned todo_len = 0; ++ ++ dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl); ++ BUG_ON(!cbl); ++ BUG_ON(cbl->cbl_recall_type != RETURN_FILE && ++ cbl->cbl_recall_type != RETURN_FSID && ++ cbl->cbl_recall_type != RETURN_ALL); ++ BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode); ++ BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ && ++ cbl->cbl_seg.iomode != IOMODE_RW && ++ cbl->cbl_seg.iomode != IOMODE_ANY); ++ ++ if (nfsd_serv == NULL) { ++ dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n"); ++ return -ENOENT; ++ } ++ ++ nfs4_lock_state(); ++ status = -ENOENT; ++ if (inode) { ++ lrfile = find_file(inode); ++ if (!lrfile) { ++ dprintk("NFSD nfsd_layout_recall_cb: " ++ "nfs4_file not found\n"); ++ goto err; ++ } ++ if (cbl->cbl_recall_type == RETURN_FSID) ++ cbl->cbl_fsid = lrfile->fi_fsid; ++ } ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ /* If no cookie provided by FS, return a default one */ ++ if (!cbl->cbl_cookie) ++ cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ ++ status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile); ++ if (list_empty(&todolist)) { ++ status = -ENOENT; ++ } else { ++ /* process todolist even if create_layout_recall_list ++ * returned an error */ ++ int status2 = spawn_layout_recall(sb, &todolist, todo_len); ++ if (status2) ++ status = status2; ++ } ++ ++err: ++ nfs4_unlock_state(); ++ if (lrfile) ++ 
put_nfs4_file(lrfile); ++ return (todo_len && status) ? -EAGAIN : status; ++} ++ ++struct create_device_notify_list_arg { ++ struct list_head *todolist; ++ struct nfsd4_pnfs_cb_dev_list *ndl; ++}; ++ ++static int ++create_device_notify_per_cl(struct nfs4_client *clp, void *p) ++{ ++ struct nfs4_notify_device *cbnd; ++ struct create_device_notify_list_arg *arg = p; ++ ++ if (atomic_read(&clp->cl_deviceref) <= 0) ++ return 0; ++ ++ cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL); ++ if (!cbnd) ++ return -ENOMEM; ++ ++ cbnd->nd_list = arg->ndl; ++ cbnd->nd_client = clp; ++ list_add(&cbnd->nd_perclnt, arg->todolist); ++ return 0; ++} ++ ++/* Create a list of clients to send device notifications. */ ++int ++create_device_notify_list(struct list_head *todolist, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ int status; ++ struct create_device_notify_list_arg arg = { ++ .todolist = todolist, ++ .ndl = ndl, ++ }; ++ ++ nfs4_lock_state(); ++ status = filter_confirmed_clients(create_device_notify_per_cl, &arg); ++ nfs4_unlock_state(); ++ ++ return status; ++} ++ ++/* ++ * For each client that a device, send a device notification. ++ * XXX: Need to track which clients have which devices. 
++ */ ++int nfsd_device_notify_cb(struct super_block *sb, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ struct nfs4_notify_device *cbnd; ++ unsigned int notify_num = 0; ++ int status2, status = 0; ++ struct list_head todolist; ++ ++ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list); ++ ++ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len); ++ ++ if (nfsd_serv == NULL) ++ return -ENOENT; ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ status = create_device_notify_list(&todolist, ndl); ++ ++ while (!list_empty(&todolist)) { ++ cbnd = list_entry(todolist.next, struct nfs4_notify_device, ++ nd_perclnt); ++ list_del_init(&cbnd->nd_perclnt); ++ status2 = nfsd4_cb_notify_device(cbnd); ++ pnfs_clear_device_notify(cbnd->nd_client); ++ if (status2) { ++ kfree(cbnd); ++ status = status2; ++ } ++ notify_num++; ++ } ++ ++ dprintk("NFSD %s: status %d clients %u\n", ++ __func__, status, notify_num); ++ return status; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-31 20:42:05.557222774 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-31 20:42:05.557222774 -0400 +@@ -0,0 +1,461 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsfh.h" ++#include "nfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Just use a linked list. Do not expect more than 32 dlm_device_entries ++ * the first implementation will just use one device per cluster file system ++ */ ++ ++static LIST_HEAD(dlm_device_list); ++static DEFINE_SPINLOCK(dlm_device_list_lock); ++ ++struct dlm_device_entry { ++ struct list_head dlm_dev_list; ++ char disk_name[DISK_NAME_LEN]; ++ int num_ds; ++ char ds_list[NFSD_DLM_DS_LIST_MAX]; ++}; ++ ++static struct dlm_device_entry * ++_nfsd4_find_pnfs_dlm_device(char *disk_name) ++{ ++ struct dlm_device_entry *dlm_pdev; ++ ++ dprintk("--> %s disk name %s\n", __func__, disk_name); ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) { ++ dprintk("%s Look for dlm_pdev %s\n", __func__, ++ dlm_pdev->disk_name); ++ if (!memcmp(dlm_pdev->disk_name, disk_name, strlen(disk_name))) { ++ spin_unlock(&dlm_device_list_lock); ++ return dlm_pdev; ++ } ++ } ++ spin_unlock(&dlm_device_list_lock); ++ return NULL; ++} ++ ++static struct dlm_device_entry * ++nfsd4_find_pnfs_dlm_device(struct super_block *sb) { ++ char dname[BDEVNAME_SIZE]; ++ ++ bdevname(sb->s_bdev, dname); ++ return _nfsd4_find_pnfs_dlm_device(dname); ++} ++ ++ssize_t 
++nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen) ++{ ++ char *pos = buf; ++ ssize_t size = 0; ++ struct dlm_device_entry *dlm_pdev; ++ int ret = -EINVAL; ++ ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) ++ { ++ int advanced; ++ advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list); ++ if (advanced >= buflen - size) ++ goto out; ++ size += advanced; ++ pos += advanced; ++ } ++ ret = size; ++ ++out: ++ spin_unlock(&dlm_device_list_lock); ++ return ret; ++} ++ ++bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds) ++{ ++ char *start = ds_list; ++ ++ *num_ds = 0; ++ ++ while (*start) { ++ struct sockaddr_storage tempAddr; ++ int ipLen = strcspn(start, ","); ++ ++ if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr))) ++ return false; ++ (*num_ds)++; ++ start += ipLen + 1; ++ } ++ return true; ++} ++ ++/* ++ * pnfs_dlm_device string format: ++ * block-device-path:, ++ * ++ * Examples ++ * /dev/sda:192.168.1.96,192.168.1.97' creates a data server list with ++ * two data servers for the dlm cluster file system mounted on /dev/sda. ++ * ++ * /dev/sda:192.168.1.96,192.168.1.100' ++ * replaces the data server list for /dev/sda ++ * ++ * Only the deviceid == 1 is supported. Can add device id to ++ * pnfs_dlm_device string when needed. ++ * ++ * Only the round robin each data server once stripe index is supported. ++ */ ++int ++nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len) ++ ++{ ++ struct dlm_device_entry *new, *found; ++ char *bufp = pnfs_dlm_device; ++ char *endp = bufp + strlen(bufp); ++ int err = -ENOMEM; ++ ++ dprintk("--> %s len %d\n", __func__, len); ++ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return err; ++ ++ err = -EINVAL; ++ /* disk_name */ ++ /* FIXME: need to check for valid disk_name. search superblocks? ++ * check for slash dev slash ? 
++ */ ++ len = strcspn(bufp, ":"); ++ if (len > DISK_NAME_LEN) ++ goto out_free; ++ memcpy(new->disk_name, bufp, len); ++ ++ err = -EINVAL; ++ bufp += len + 1; ++ if (bufp >= endp) ++ goto out_free; ++ ++ /* data server list */ ++ /* FIXME: need to check for comma separated valid ip format */ ++ len = strcspn(bufp, ":"); ++ if (len > NFSD_DLM_DS_LIST_MAX) ++ goto out_free; ++ memcpy(new->ds_list, bufp, len); ++ ++ ++ /* validate the ips */ ++ if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds))) ++ goto out_free; ++ ++ dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__, ++ new->disk_name, new->num_ds, new->ds_list); ++ ++ found = _nfsd4_find_pnfs_dlm_device(new->disk_name); ++ if (found) { ++ /* FIXME: should compare found->ds_list with new->ds_list ++ * and if it is different, kick off a CB_NOTIFY change ++ * deviceid. ++ */ ++ dprintk("%s pnfs_dlm_device %s:%s already in cache " ++ " replace ds_list with new ds_list %s\n", __func__, ++ found->disk_name, found->ds_list, new->ds_list); ++ memset(found->ds_list, 0, DISK_NAME_LEN); ++ memcpy(found->ds_list, new->ds_list, strlen(new->ds_list)); ++ found->num_ds = new->num_ds; ++ kfree(new); ++ } else { ++ dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__, ++ new->disk_name, new->ds_list); ++ spin_lock(&dlm_device_list_lock); ++ list_add(&new->dlm_dev_list, &dlm_device_list); ++ spin_unlock(&dlm_device_list_lock); ++ } ++ dprintk("<-- %s Success\n", __func__); ++ return 0; ++ ++out_free: ++ kfree(new); ++ dprintk("<-- %s returns %d\n", __func__, err); ++ return err; ++} ++ ++void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ struct dlm_device_entry *dlm_pdev, *next; ++ ++ dprintk("--> %s\n", __func__); ++ ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list, ++ dlm_dev_list) { ++ list_del(&dlm_pdev->dlm_dev_list); ++ kfree(dlm_pdev); ++ } ++ spin_unlock(&dlm_device_list_lock); ++} ++ ++static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb, ++ u32 
layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ if (layout_type != LAYOUT_NFSV4_1_FILES) { ++ printk(KERN_ERR "%s: ERROR: layout type isn't 'file' " ++ "(type: %x)\n", __func__, layout_type); ++ return -ENOTSUPP; ++ } ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ return 0; ++} ++ ++static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err, len, i = 0; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_devaddr *daddr; ++ struct dlm_device_entry *dlm_pdev; ++ char *bufp; ++ ++ err = -ENOTSUPP; ++ if (layout_type != LAYOUT_NFSV4_1_FILES) { ++ dprintk("%s: ERROR: layout type isn't 'file' " ++ "(type: %x)\n", __func__, layout_type); ++ return err; ++ } ++ ++ /* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO ++ * with a gdia_device_id != 1 is invalid. 
++ */ ++ err = -EINVAL; ++ if (devid->devid != 1) { ++ dprintk("%s: WARNING: didn't receive a deviceid of " ++ "1 (got: 0x%llx)\n", __func__, devid->devid); ++ return err; ++ } ++ ++ /* ++ * If the DS list has not been established, return -EINVAL ++ */ ++ dlm_pdev = nfsd4_find_pnfs_dlm_device(sb); ++ if (!dlm_pdev) { ++ dprintk("%s: DEBUG: disk %s Not Found\n", __func__, ++ sb->s_bdev->bd_disk->disk_name); ++ return err; ++ } ++ ++ dprintk("%s: Found disk %s with DS list |%s|\n", ++ __func__, dlm_pdev->disk_name, dlm_pdev->ds_list); ++ ++ memset(&fdev, '\0', sizeof(fdev)); ++ fdev.fl_device_length = dlm_pdev->num_ds; ++ ++ err = -ENOMEM; ++ len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length; ++ fdev.fl_device_list = kzalloc(len, GFP_KERNEL); ++ if (!fdev.fl_device_list) { ++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list " ++ "buffer for %d DSes.\n", __func__, i); ++ fdev.fl_device_length = 0; ++ goto out; ++ } ++ ++ /* Set a simple stripe indicie */ ++ fdev.fl_stripeindices_length = fdev.fl_device_length; ++ fdev.fl_stripeindices_list = kzalloc(sizeof(u32) * ++ fdev.fl_stripeindices_length, GFP_KERNEL); ++ ++ if (!fdev.fl_stripeindices_list) { ++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices " ++ "list buffer for %d DSes.\n", __func__, i); ++ goto out; ++ } ++ for (i = 0; i < fdev.fl_stripeindices_length; i++) ++ fdev.fl_stripeindices_list[i] = i; ++ ++ /* Transfer the data server list with a single multipath entry */ ++ bufp = dlm_pdev->ds_list; ++ for (i = 0; i < fdev.fl_device_length; i++) { ++ daddr = kmalloc(sizeof(*daddr), GFP_KERNEL); ++ if (!daddr) { ++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device " ++ "addr buffer.\n", __func__); ++ goto out; ++ } ++ ++ daddr->r_netid.data = "tcp"; ++ daddr->r_netid.len = 3; ++ ++ len = strcspn(bufp, ","); ++ daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL); ++ memcpy(daddr->r_addr.data, bufp, len); ++ /* ++ * append the port number. 
interpreted as two more bytes ++ * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049. ++ */ ++ memcpy(daddr->r_addr.data + len, ".8.1", 4); ++ daddr->r_addr.len = len + 4; ++ ++ fdev.fl_device_list[i].fl_multipath_length = 1; ++ fdev.fl_device_list[i].fl_multipath_list = daddr; ++ ++ dprintk("%s: encoding DS |%s|\n", __func__, bufp); ++ ++ bufp += len + 1; ++ } ++ ++ /* have nfsd encode the device info */ ++ err = filelayout_encode_devinfo(xdr, &fdev); ++out: ++ for (i = 0; i < fdev.fl_device_length; i++) ++ kfree(fdev.fl_device_list[i].fl_multipath_list); ++ kfree(fdev.fl_device_list); ++ kfree(fdev.fl_stripeindices_list); ++ dprintk("<-- %s returns %d\n", __func__, err); ++ return err; ++} ++ ++static int get_stripe_unit(int blocksize) ++{ ++ if (blocksize >= NFSSVC_MAXBLKSIZE) ++ return blocksize; ++ return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize); ++} ++ ++/* ++ * Look up inode block device in pnfs_dlm_device list. ++ * Hash on the inode->i_ino and number of data servers. 
++ */ ++static int dlm_ino_hash(struct inode *ino) ++{ ++ struct dlm_device_entry *de; ++ u32 hash_mask = 0; ++ ++ /* If can't find the inode block device in the pnfs_dlm_deivce list ++ * then don't hand out a layout ++ */ ++ de = nfsd4_find_pnfs_dlm_device(ino->i_sb); ++ if (!de) ++ return -1; ++ hash_mask = de->num_ds - 1; ++ return ino->i_ino & hash_mask; ++} ++ ++static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct pnfs_filelayout_layout *layout = NULL; ++ struct knfsd_fh *fhp = NULL; ++ int index; ++ enum nfsstat4 rc = NFS4_OK; ++ ++ dprintk("%s: LAYOUT_GET\n", __func__); ++ ++ /* DLM exported file systems only support layouts for READ */ ++ if (res->lg_seg.iomode == IOMODE_RW) ++ return NFS4ERR_BADIOMODE; ++ ++ index = dlm_ino_hash(inode); ++ dprintk("%s first stripe index %d i_ino %lu\n", __func__, index, ++ inode->i_ino); ++ if (index < 0) ++ return NFS4ERR_LAYOUTUNAVAILABLE; ++ ++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ /* Always give out whole file layouts */ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ /* Always give out READ ONLY layouts */ ++ res->lg_seg.iomode = IOMODE_READ; ++ ++ layout = kzalloc(sizeof(*layout), GFP_KERNEL); ++ if (layout == NULL) { ++ rc = NFS4ERR_LAYOUTTRYLATER; ++ goto error; ++ } ++ ++ /* Set file layout response args */ ++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; ++ layout->lg_stripe_type = STRIPE_SPARSE; ++ layout->lg_commit_through_mds = false; ++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); ++ layout->lg_fh_length = 1; ++ layout->device_id.sbid = args->lg_sbid; ++ layout->device_id.devid = 1; /*FSFTEMP*/ ++ layout->lg_first_stripe_index = index; /*FSFTEMP*/ ++ layout->lg_pattern_offset = 0; ++ ++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); ++ if (fhp == NULL) { ++ rc = NFS4ERR_LAYOUTTRYLATER; ++ goto error; ++ } ++ ++ 
memcpy(fhp, args->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc = filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++nfsd4_pnfs_dlm_layouttype(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++const struct pnfs_export_operations pnfs_dlm_export_ops = { ++ .layout_type = nfsd4_pnfs_dlm_layouttype, ++ .get_device_info = nfsd4_pnfs_dlm_getdevinfo, ++ .get_device_iter = nfsd4_pnfs_dlm_getdeviter, ++ .layout_get = nfsd4_pnfs_dlm_layoutget, ++}; ++EXPORT_SYMBOL(pnfs_dlm_export_ops); +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-31 20:42:05.558141620 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-31 20:42:05.558141620 -0400 +@@ -0,0 +1,620 @@ ++/* ++* linux/fs/nfsd/nfs4pnfsds.c ++* ++* Copyright (c) 2005 The Regents of the University of Michigan. ++* All rights reserved. ++* ++* Andy Adamson ++* ++* Redistribution and use in source and binary forms, with or without ++* modification, are permitted provided that the following conditions ++* are met: ++* ++* 1. Redistributions of source code must retain the above copyright ++* notice, this list of conditions and the following disclaimer. ++* 2. Redistributions in binary form must reproduce the above copyright ++* notice, this list of conditions and the following disclaimer in the ++* documentation and/or other materials provided with the distribution. ++* 3. Neither the name of the University nor the names of its ++* contributors may be used to endorse or promote products derived ++* from this software without specific prior written permission. 
++* ++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++* ++*/ ++#if defined(CONFIG_PNFSD) ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsd.h" ++#include "pnfsd.h" ++#include "state.h" ++ ++/* ++ ******************* ++ * PNFS ++ ******************* ++ */ ++/* ++ * Hash tables for pNFS Data Server state ++ * ++ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using ++ * this data server (DS). ++ * ++ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained ++ * from any MDS. ++ * ++ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained ++ * from any MDS. 
++ * ++ */ ++/* Hash tables for clientid state */ ++#define CLIENT_HASH_BITS 4 ++#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) ++#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) ++ ++#define clientid_hashval(id) \ ++ ((id) & CLIENT_HASH_MASK) ++ ++/* hash table for pnfs_ds_stateid */ ++#define STATEID_HASH_BITS 10 ++#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) ++#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) ++ ++#define stateid_hashval(owner_id, file_id) \ ++ (((owner_id) + (file_id)) & STATEID_HASH_MASK) ++ ++static struct list_head mds_id_tbl; ++static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE]; ++static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE]; ++ ++static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp); ++static inline void put_ds_mdsid(struct pnfs_mds_id *mdp); ++ ++/* Mutex for data server state. Needs to be separate from ++ * mds state mutex since a node can be both mds and ds */ ++static DEFINE_MUTEX(ds_mutex); ++static struct thread_info *ds_mutex_owner; ++ ++static void ++ds_lock_state(void) ++{ ++ mutex_lock(&ds_mutex); ++ ds_mutex_owner = current_thread_info(); ++} ++ ++static void ++ds_unlock_state(void) ++{ ++ BUG_ON(ds_mutex_owner != current_thread_info()); ++ ds_mutex_owner = NULL; ++ mutex_unlock(&ds_mutex); ++} ++ ++static int ++cmp_clid(const clientid_t *cl1, const clientid_t *cl2) ++{ ++ return (cl1->cl_boot == cl2->cl_boot) && ++ (cl1->cl_id == cl2->cl_id); ++} ++ ++void ++nfs4_pnfs_state_init(void) ++{ ++ int i; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&mds_clid_hashtbl[i]); ++ ++ for (i = 0; i < STATEID_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ds_stid_hashtbl[i]); ++ ++ INIT_LIST_HEAD(&mds_id_tbl); ++} ++ ++static struct pnfs_mds_id * ++find_pnfs_mds_id(u32 mdsid) ++{ ++ struct pnfs_mds_id *local = NULL; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ list_for_each_entry(local, &mds_id_tbl, di_hash) { ++ if (local->di_mdsid == mdsid) ++ return local; ++ } ++ return NULL; ++} ++ ++static 
struct pnfs_ds_clientid * ++find_pnfs_ds_clientid(const clientid_t *clid) ++{ ++ struct pnfs_ds_clientid *local = NULL; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = clientid_hashval(clid->cl_id); ++ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) { ++ if (cmp_clid(&local->dc_mdsclid, clid)) ++ return local; ++ } ++ return NULL; ++} ++ ++static struct pnfs_ds_stateid * ++find_pnfs_ds_stateid(stateid_t *stid) ++{ ++ struct pnfs_ds_stateid *local = NULL; ++ u32 st_id = stid->si_stateownerid; ++ u32 f_id = stid->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash) ++ if ((local->ds_stid.si_stateownerid == st_id) && ++ (local->ds_stid.si_fileid == f_id) && ++ (local->ds_stid.si_boot == stid->si_boot)) { ++ stateid_t *sid = &local->ds_stid; ++ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, local, local->ds_flags, ++ STATEID_VAL(sid)); ++ return local; ++ } ++ return NULL; ++} ++ ++static void ++release_ds_mdsid(struct kref *kref) ++{ ++ struct pnfs_mds_id *mdp = ++ container_of(kref, struct pnfs_mds_id, di_ref); ++ dprintk("pNFSD: %s\n", __func__); ++ ++ list_del(&mdp->di_hash); ++ list_del(&mdp->di_mdsclid); ++ kfree(mdp); ++} ++ ++static void ++release_ds_clientid(struct kref *kref) ++{ ++ struct pnfs_ds_clientid *dcp = ++ container_of(kref, struct pnfs_ds_clientid, dc_ref); ++ struct pnfs_mds_id *mdp; ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(dcp->dc_mdsid); ++ if (mdp) ++ put_ds_mdsid(mdp); ++ ++ list_del(&dcp->dc_hash); ++ list_del(&dcp->dc_stateid); ++ list_del(&dcp->dc_permdsid); ++ kfree(dcp); ++} ++ ++static void ++release_ds_stateid(struct kref *kref) ++{ ++ struct pnfs_ds_stateid *dsp = ++ container_of(kref, struct pnfs_ds_stateid, ds_ref); ++ struct pnfs_ds_clientid *dcp; ++ dprintk("pNFS %s: dsp %p\n", __func__, dsp); ++ ++ dcp 
= find_pnfs_ds_clientid(&dsp->ds_mdsclid); ++ if (dcp) ++ put_ds_clientid(dcp); ++ ++ list_del(&dsp->ds_hash); ++ list_del(&dsp->ds_perclid); ++ kfree(dsp); ++} ++ ++static inline void ++put_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_put(&dcp->dc_ref, release_ds_clientid); ++} ++ ++static inline void ++get_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_get(&dcp->dc_ref); ++} ++ ++static inline void ++put_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_put(&mdp->di_ref, release_ds_mdsid); ++} ++ ++static inline void ++get_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_get(&mdp->di_ref); ++} ++ ++static inline void ++put_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_put(&dsp->ds_ref, release_ds_stateid); ++} ++ ++static inline void ++get_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_get(&dsp->ds_ref); ++} ++ ++void ++nfs4_pnfs_state_shutdown(void) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int i; ++ ++ dprintk("pNFSD %s: -->\n", __func__); ++ ++ ds_lock_state(); ++ for (i = 0; i < STATEID_HASH_SIZE; i++) { ++ while (!list_empty(&ds_stid_hashtbl[i])) { ++ dsp = list_entry(ds_stid_hashtbl[i].next, ++ struct pnfs_ds_stateid, ds_hash); ++ put_ds_stateid(dsp); ++ } ++ } ++ ds_unlock_state(); ++} ++ ++static struct pnfs_mds_id * ++alloc_init_mds_id(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL); ++ 
if (!mdp) ++ return NULL; ++ INIT_LIST_HEAD(&mdp->di_hash); ++ INIT_LIST_HEAD(&mdp->di_mdsclid); ++ list_add(&mdp->di_hash, &mds_id_tbl); ++ mdp->di_mdsid = gsp->dsid; ++ mdp->di_mdsboot = 0; ++ kref_init(&mdp->di_ref); ++ return mdp; ++} ++ ++static struct pnfs_ds_clientid * ++alloc_init_ds_clientid(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ struct pnfs_ds_clientid *dcp; ++ clientid_t *clid = (clientid_t *)&gsp->clid; ++ unsigned int hashval = clientid_hashval(clid->cl_id); ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(gsp->dsid); ++ if (!mdp) { ++ mdp = alloc_init_mds_id(gsp); ++ if (!mdp) ++ return NULL; ++ } else { ++ get_ds_mdsid(mdp); ++ } ++ ++ dcp = kmalloc(sizeof(*dcp), GFP_KERNEL); ++ if (!dcp) ++ return NULL; ++ ++ INIT_LIST_HEAD(&dcp->dc_hash); ++ INIT_LIST_HEAD(&dcp->dc_stateid); ++ INIT_LIST_HEAD(&dcp->dc_permdsid); ++ list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]); ++ list_add(&dcp->dc_permdsid, &mdp->di_mdsclid); ++ dcp->dc_mdsclid = *clid; ++ kref_init(&dcp->dc_ref); ++ dcp->dc_mdsid = gsp->dsid; ++ return dcp; ++} ++ ++static struct pnfs_ds_stateid * ++alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp) ++{ ++ struct pnfs_ds_stateid *dsp; ++ u32 st_id = stidp->si_stateownerid; ++ u32 f_id = stidp->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ dsp = kmalloc(sizeof(*dsp), GFP_KERNEL); ++ if (!dsp) ++ return dsp; ++ ++ INIT_LIST_HEAD(&dsp->ds_hash); ++ INIT_LIST_HEAD(&dsp->ds_perclid); ++ memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t)); ++ fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle); ++ dsp->ds_access = 0; ++ dsp->ds_status = 0; ++ dsp->ds_flags = 0L; ++ kref_init(&dsp->ds_ref); ++ set_bit(DS_STATEID_NEW, &dsp->ds_flags); ++ clear_bit(DS_STATEID_VALID, &dsp->ds_flags); ++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ init_waitqueue_head(&dsp->ds_waitq); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]); ++ 
dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return dsp; ++} ++ ++static int ++update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh, ++ struct pnfs_get_state *gsp) ++{ ++ struct pnfs_ds_clientid *dcp; ++ int new = 0; ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid); ++ if (!dcp) { ++ dcp = alloc_init_ds_clientid(gsp); ++ if (!dcp) ++ return 1; ++ new = 1; ++ } ++ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) { ++ list_add(&dsp->ds_perclid, &dcp->dc_stateid); ++ if (!new) ++ get_ds_clientid(dcp); ++ } ++ ++ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t)); ++ dsp->ds_access = gsp->access; ++ dsp->ds_status = 0; ++ dsp->ds_verifier[0] = gsp->verifier[0]; ++ dsp->ds_verifier[1] = gsp->verifier[1]; ++ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t)); ++ set_bit(DS_STATEID_VALID, &dsp->ds_flags); ++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ clear_bit(DS_STATEID_NEW, &dsp->ds_flags); ++ return 0; ++} ++ ++int ++nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs) ++{ ++ stateid_t *stid = (stateid_t *)&gs->stid; ++ struct pnfs_ds_stateid *dsp; ++ ++ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stid)); ++ ++ ds_lock_state(); ++ dsp = find_pnfs_ds_stateid(stid); ++ if (dsp) ++ put_ds_stateid(dsp); ++ ds_unlock_state(); ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ if (dsp) ++ return 0; ++ return -ENOENT; ++} ++ ++/* Retrieves and validates stateid. ++ * If stateid exists and its fields match, return it. ++ * If stateid exists but either the generation or ++ * ownerids don't match, check with mds to see if it is valid. ++ * If the stateid doesn't exist, the first thread creates a ++ * invalid *marker* stateid, then checks to see if the ++ * stateid exists on the mds. If so, it validates the *marker* ++ * stateid and updates its fields. Subsequent threads that ++ * find the *marker* stateid wait until it is valid or an error ++ * occurs. 
++ * Called with ds_state_lock. ++ */ ++static struct pnfs_ds_stateid * ++nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp) ++{ ++ struct inode *ino = cfh->fh_dentry->d_inode; ++ struct super_block *sb; ++ struct pnfs_ds_stateid *dsp = NULL; ++ struct pnfs_get_state gs = { ++ .access = 0, ++ }; ++ int status = 0, waiter = 0; ++ ++ dprintk("pNFSD: %s -->\n", __func__); ++ ++ dsp = find_pnfs_ds_stateid(stidp); ++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) && ++ (stidp->si_generation == dsp->ds_stid.si_generation)) ++ goto out_noput; ++ ++ sb = ino->i_sb; ++ if (!sb || !sb->s_pnfs_op->get_state) ++ goto out_noput; ++ ++ /* Uninitialize current state if it exists yet it doesn't match. ++ * If it is already invalid, another thread is checking state */ ++ if (dsp) { ++ if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags)) ++ waiter = 1; ++ } else { ++ dsp = alloc_init_ds_stateid(cfh, stidp); ++ if (!dsp) ++ goto out_noput; ++ } ++ ++ dprintk("pNFSD: %s Starting loop\n", __func__); ++ get_ds_stateid(dsp); ++ while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { ++ ds_unlock_state(); ++ ++ /* Another thread is checking the state */ ++ if (waiter) { ++ dprintk("pNFSD: %s waiting\n", __func__); ++ wait_event_interruptible_timeout(dsp->ds_waitq, ++ (test_bit(DS_STATEID_VALID, &dsp->ds_flags) || ++ test_bit(DS_STATEID_ERROR, &dsp->ds_flags)), ++ msecs_to_jiffies(1024)); ++ dprintk("pNFSD: %s awake\n", __func__); ++ ds_lock_state(); ++ if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) ++ goto out; ++ ++ continue; ++ } ++ ++ /* Validate stateid on mds */ ++ dprintk("pNFSD: %s Checking state on MDS\n", __func__); ++ memcpy(&gs.stid, stidp, sizeof(stateid_t)); ++ status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs); ++ dprintk("pNFSD: %s from MDS status %d\n", __func__, status); ++ ds_lock_state(); ++ /* if !status and stateid is valid, update id and mark valid */ ++ if (status || update_ds_stateid(dsp, cfh, &gs)) { ++ set_bit(DS_STATEID_ERROR, 
&dsp->ds_flags); ++ /* remove invalid stateid from list */ ++ put_ds_stateid(dsp); ++ wake_up(&dsp->ds_waitq); ++ goto out; ++ } ++ ++ wake_up(&dsp->ds_waitq); ++ } ++out: ++ if (dsp) ++ put_ds_stateid(dsp); ++out_noput: ++ if (dsp) ++ dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid)); ++ /* If error, return null */ ++ if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) ++ dsp = NULL; ++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return dsp; ++} ++ ++int ++nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int status = 0; ++ ++ dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ /* Must release state lock while verifying stateid on mds */ ++ nfs4_unlock_state(); ++ ds_lock_state(); ++ dsp = nfsv4_ds_get_state(cfh, stateid); ++ if (dsp) { ++ get_ds_stateid(dsp); ++ dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__, ++ STATEID_VAL(&dsp->ds_stid)); ++ ++ dprintk("NFSD: %s: dsp %p fh_size %u:%u " ++ "fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] " ++ "gen %x:%x\n", ++ __func__, dsp, ++ cfh->fh_handle.fh_size, dsp->ds_fh.fh_size, ++ ((unsigned *)&cfh->fh_handle.fh_base)[0], ++ ((unsigned *)&cfh->fh_handle.fh_base)[1], ++ ((unsigned *)&cfh->fh_handle.fh_base)[2], ++ ((unsigned *)&cfh->fh_handle.fh_base)[3], ++ ((unsigned *)&dsp->ds_fh.fh_base)[0], ++ ((unsigned *)&dsp->ds_fh.fh_base)[1], ++ ((unsigned *)&dsp->ds_fh.fh_base)[2], ++ ((unsigned *)&dsp->ds_fh.fh_base)[3], ++ stateid->si_generation, dsp->ds_stid.si_generation); ++ } ++ ++ if (!dsp || ++ (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) || ++ (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base, ++ dsp->ds_fh.fh_size) != 0) || ++ (stateid->si_generation > dsp->ds_stid.si_generation)) ++ status = nfserr_bad_stateid; ++ else if (stateid->si_generation < dsp->ds_stid.si_generation) ++ status = nfserr_old_stateid; ++ ++ if (dsp) ++ 
put_ds_stateid(dsp); ++ ds_unlock_state(); ++ nfs4_lock_state(); ++ dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status)); ++ return status; ++} ++ ++void ++nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p) ++{ ++ struct pnfs_ds_stateid *dsp = NULL; ++ ++ dprintk("pNFSD: %s --> stid %p\n", __func__, stateid); ++ ++ ds_lock_state(); ++ if (stateid != NULL) { ++ dsp = find_pnfs_ds_stateid(stateid); ++ if (dsp) ++ get_ds_stateid(dsp); ++ } ++ ++ /* XXX: Should we fetch the stateid or wait if some other ++ * thread is currently retrieving the stateid ? */ ++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { ++ *p++ = dsp->ds_verifier[0]; ++ *p++ = dsp->ds_verifier[1]; ++ put_ds_stateid(dsp); ++ } else { ++ /* must be on MDS */ ++ ds_unlock_state(); ++ sb->s_pnfs_op->get_verifier(sb, p); ++ ds_lock_state(); ++ p += 2; ++ } ++ ds_unlock_state(); ++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-31 20:41:19.198160463 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-31 20:42:05.559129617 -0400 +@@ -34,10 +34,14 @@ + */ + #include + #include ++#include ++#include ++#include + + #include "cache.h" + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_PROC + +@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc + * set, (2) sets open->op_stateid, (3) sets open->op_delegation. 
+ */ + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); ++#if defined(CONFIG_SPNFS) ++ if (!status && spnfs_enabled()) { ++ struct inode *inode = cstate->current_fh.fh_dentry->d_inode; ++ ++ status = spnfs_open(inode, open); ++ if (status) { ++ dprintk( ++ "nfsd: pNFS could not be enabled for inode: %lu\n", ++ inode->i_ino); ++ /* ++ * XXX When there's a failure then need to indicate to ++ * future ops that no pNFS is available. Should I save ++ * the status in the inode? It's kind of a big hammer. ++ * But there may be no stripes available? ++ */ ++ } ++ } ++#endif /* CONFIG_SPNFS */ + out: + if (open->op_stateowner) { + nfs4_get_stateowner(open->op_stateowner); +@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str + &access->ac_supported); + } + ++static void ++nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf) ++{ ++ u32 *p = (u32 *)verf->data; ++ ++#if defined(CONFIG_PNFSD) ++ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) { ++ nfs4_ds_get_verifier(NULL, sb, p); ++ return; ++ } ++#endif /* CONFIG_PNFSD */ ++ ++ *p++ = nfssvc_boot.tv_sec; ++ *p++ = nfssvc_boot.tv_usec; ++} ++ + static __be32 + nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) + { + __be32 status; + +- u32 *p = (u32 *)commit->co_verf.data; +- *p++ = nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; +- ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &commit->co_verf); + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); + if (status == nfserr_symlink) +@@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru + { + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; +- u32 *p; + __be32 status = nfs_ok; + unsigned long cnt; + +@@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; +- p = (u32 *)write->wr_verifier.data; +- *p++ = 
nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; + ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &write->wr_verifier); ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) { ++ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode, ++ RETURN_FILE, write->wr_offset, write->wr_buflen); ++ if (!status) { ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++ } ++ } else ++#endif ++ ++ if (spnfs_enabled()) { ++ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode, ++ write->wr_offset, write->wr_buflen, write->wr_vlen, ++ rqstp); ++ if (status == nfs_ok) { ++ /* DMXXX: HACK to get filesize set */ ++ /* write one byte at offset+length-1 */ ++ struct kvec k[1]; ++ char zero = 0; ++ unsigned long cnt = 1; ++ ++ k[0].iov_base = (void *)&zero; ++ k[0].iov_len = 1; ++ nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset+write->wr_buflen-1, k, 1, ++ &cnt, &write->wr_how_written); ++ } ++ } else /* we're not an MDS */ ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++#else + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, &write->wr_how_written); ++#endif /* CONFIG_SPNFS */ ++ + if (filp) + fput(filp); + +@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str + return status == nfserr_same ? nfs_ok : status; + } + ++#if defined(CONFIG_PNFSD) ++ ++static __be32 ++nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp, ++ unsigned int layout_type) ++{ ++ int status, type; ++ ++ /* check to see if pNFS is supported. 
*/ ++ status = nfserr_layoutunavailable; ++ if (exp && exp->ex_pnfs == 0) { ++ dprintk("%s: Underlying file system " ++ "is not exported over pNFS\n", __func__); ++ goto out; ++ } ++ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) { ++ dprintk("%s: Underlying file system " ++ "does not support pNFS\n", __func__); ++ goto out; ++ } ++ ++ type = sb->s_pnfs_op->layout_type(sb); ++ ++ /* check to see if requested layout type is supported. */ ++ status = nfserr_unknown_layouttype; ++ if (!type) ++ dprintk("BUG: %s: layout_type 0 is reserved and must not be " ++ "used by filesystem\n", __func__); ++ else if (type != layout_type) ++ dprintk("%s: requested layout type %d " ++ "does not match supported type %d\n", ++ __func__, layout_type, type); ++ else ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevlist(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevlist *gdlp) ++{ ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ int status; ++ ++ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n", ++ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices, ++ gdlp->gd_cookie, gdlp->gd_verf); ++ ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* We must be able to encode at list one device */ ++ if (!gdlp->gd_maxdevices) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ gdlp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Do nothing if underlying file system does not support ++ * getdevicelist */ ++ if (!sb->s_pnfs_op->get_device_iter) { ++ status = nfserr_notsupp; ++ goto out; ++ } ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdlp->gd_fhp = &cstate->current_fh; 
++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevinfo(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevinfo *gdp) ++{ ++ struct super_block *sb; ++ int status; ++ clientid_t clid; ++ ++ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n", ++ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid, ++ gdp->gd_devid.devid, gdp->gd_maxcount); ++ ++ status = nfserr_inval; ++ sb = find_sbid_id(gdp->gd_devid.sbid); ++ dprintk("%s: sb %p\n", __func__, sb); ++ if (!sb) { ++ status = nfserr_noent; ++ goto out; ++ } ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdp->gd_sb = sb; ++ ++ /* Update notifications */ ++ copy_clientid(&clid, cstate->session); ++ pnfs_set_device_notify(&clid, gdp->gd_notify_types); ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutget(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lgp->lg_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_badiomode; ++ if (lgp->lg_seg.iomode != IOMODE_READ && ++ lgp->lg_seg.iomode != IOMODE_RW) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lgp->lg_seg.iomode); ++ goto out; ++ } ++ ++ /* Set up arguments so layout can be retrieved at encode time */ ++ lgp->lg_fhp = current_fh; ++ copy_clientid((clientid_t 
*)&lgp->lg_seg.clientid, cstate->session); ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutcommit(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ int status; ++ struct inode *ino = NULL; ++ struct iattr ia; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ dprintk("NFSD: nfsd4_layoutcommit \n"); ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ ino = current_fh->fh_dentry->d_inode; ++ if (!ino) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = ino->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lcp->args.lc_seg.layout_type); ++ if (status) ++ goto out; ++ ++ /* This will only extend the file length. Do a quick ++ * check to see if there is any point in waiting for the update ++ * locks. ++ * TODO: Is this correct for all back ends? 
++ */ ++ dprintk("%s:new offset: %d new size: %llu old size: %lld\n", ++ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1, ++ ino->i_size); ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session); ++ lcp->res.lc_size_chg = 0; ++ if (sb->s_pnfs_op->layout_commit) { ++ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res); ++ dprintk("%s:layout_commit result %d\n", __func__, status); ++ } else { ++ fh_lock(current_fh); ++ if ((lcp->args.lc_newoffset == 0) || ++ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) { ++ status = 0; ++ lcp->res.lc_size_chg = 0; ++ fh_unlock(current_fh); ++ goto out; ++ } ++ ++ /* Try our best to update the file size */ ++ dprintk("%s: Modifying file size\n", __func__); ++ ia.ia_valid = ATTR_SIZE; ++ ia.ia_size = lcp->args.lc_last_wr + 1; ++ status = notify_change(current_fh->fh_dentry, &ia); ++ fh_unlock(current_fh); ++ dprintk("%s:notify_change result %d\n", __func__, status); ++ } ++ ++ if (!status && lcp->res.lc_size_chg && ++ EX_ISSYNC(current_fh->fh_export)) { ++ dprintk("%s: Synchronously writing inode size %llu\n", ++ __func__, ino->i_size); ++ write_inode_now(ino, 1); ++ lcp->res.lc_newsize = i_size_read(ino); ++ } ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutreturn(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lrp->args.lr_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_return_type != 
RETURN_FILE && ++ lrp->args.lr_return_type != RETURN_FSID && ++ lrp->args.lr_return_type != RETURN_ALL) { ++ dprintk("pNFS %s: invalid return_type %d\n", __func__, ++ lrp->args.lr_return_type); ++ goto out; ++ } ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_seg.iomode != IOMODE_READ && ++ lrp->args.lr_seg.iomode != IOMODE_RW && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lrp->args.lr_seg.iomode); ++ goto out; ++ } ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session); ++ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE); ++ status = nfs4_pnfs_return_layout(sb, current_fh, lrp); ++out: ++ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n", ++ __func__, status, lrp->args.lr_return_type, lrp->lrs_present); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * NULL call. + */ +@@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[ + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_RECLAIM_COMPLETE", + }, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICELIST] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevlist, ++ .op_name = "OP_GETDEVICELIST", ++ }, ++ [OP_GETDEVICEINFO] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevinfo, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_GETDEVICEINFO", ++ }, ++ [OP_LAYOUTGET] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutget, ++ .op_name = "OP_LAYOUTGET", ++ }, ++ [OP_LAYOUTCOMMIT] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutcommit, ++ .op_name = "OP_LAYOUTCOMMIT", ++ }, ++ [OP_LAYOUTRETURN] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutreturn, ++ .op_name = "OP_LAYOUTRETURN", ++ }, ++#endif /* CONFIG_PNFSD */ + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-31 20:41:19.200150153 -0400 ++++ 
linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-31 20:42:05.561202607 -0400 +@@ -42,6 +42,8 @@ + #include "xdr4.h" + #include "vfs.h" + ++#include "pnfsd.h" ++ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +@@ -60,8 +62,6 @@ static u64 current_sessionid = 1; + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + + /* forward declarations */ +-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); + static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; + static void nfs4_set_recdir(char *recdir); + +@@ -69,6 +69,7 @@ static void nfs4_set_recdir(char *recdir + + /* Currently used for almost all code touching nfsv4 state: */ + static DEFINE_MUTEX(client_mutex); ++struct task_struct *client_mutex_owner; + + /* + * Currently used for the del_recall_lru and file hash table. In an +@@ -86,11 +87,21 @@ void + nfs4_lock_state(void) + { + mutex_lock(&client_mutex); ++ client_mutex_owner = current; ++} ++ ++#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current) ++ ++void ++nfs4_bug_on_unlocked_state(void) ++{ ++ BUG_ON(client_mutex_owner != current); + } + + void + nfs4_unlock_state(void) + { ++ client_mutex_owner = NULL; + mutex_unlock(&client_mutex); + } + +@@ -109,7 +120,7 @@ opaque_hashval(const void *ptr, int nbyt + + static struct list_head del_recall_lru; + +-static inline void ++inline void + put_nfs4_file(struct nfs4_file *fi) + { + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { +@@ -120,7 +131,7 @@ put_nfs4_file(struct nfs4_file *fi) + } + } + +-static inline void ++inline void + get_nfs4_file(struct nfs4_file *fi) + { + atomic_inc(&fi->fi_ref); +@@ -230,7 +241,10 @@ nfs4_close_delegation(struct nfs4_delega + * but we want to remove the lease in any case. 
*/ + if (dp->dl_flock) + vfs_setlease(filp, F_UNLCK, &dp->dl_flock); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(filp); ++ nfs4_lock_state(); + } + + /* Called under the state lock. */ +@@ -266,8 +280,8 @@ static DEFINE_SPINLOCK(client_lock); + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * +- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed +- * setclientid_confirmed info. ++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold ++ * confirmed setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * setclientid info. +@@ -292,6 +306,7 @@ static void unhash_generic_stateid(struc + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); ++ release_pnfs_ds_dev_list(stp); + } + + static void free_generic_stateid(struct nfs4_stateid *stp) +@@ -345,7 +360,10 @@ static void release_open_stateid(struct + { + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(stp->st_vfs_file); ++ nfs4_lock_state(); + free_generic_stateid(stp); + } + +@@ -739,6 +757,8 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + ++ BUG_ON_UNLOCKED_STATE(); ++ + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -758,6 +778,7 @@ expire_client(struct nfs4_client *clp) + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } ++ pnfs_expire_client(clp); + nfsd4_set_callback_client(clp, NULL); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); +@@ -770,6 +791,13 @@ expire_client(struct nfs4_client *clp) + spin_unlock(&client_lock); + } + ++void expire_client_lock(struct nfs4_client 
*clp) ++{ ++ nfs4_lock_state(); ++ expire_client(clp); ++ nfs4_unlock_state(); ++} ++ + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) + { + memcpy(target->cl_verifier.data, source->data, +@@ -859,6 +887,11 @@ static struct nfs4_client *create_client + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++ INIT_LIST_HEAD(&clp->cl_layoutrecalls); ++ atomic_set(&clp->cl_deviceref, 0); ++#endif /* CONFIG_PNFSD */ + INIT_LIST_HEAD(&clp->cl_sessions); + INIT_LIST_HEAD(&clp->cl_lru); + clp->cl_time = get_seconds(); +@@ -908,7 +941,7 @@ move_to_confirmed(struct nfs4_client *cl + renew_client(clp); + } + +-static struct nfs4_client * ++struct nfs4_client * + find_confirmed_client(clientid_t *clid) + { + struct nfs4_client *clp; +@@ -978,6 +1011,24 @@ find_unconfirmed_client_by_str(const cha + return NULL; + } + ++int ++filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), ++ void *arg) ++{ ++ struct nfs4_client *clp, *next; ++ int i, status = 0; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i], ++ cl_strhash) { ++ status = func(clp, arg); ++ if (status) ++ break; ++ } ++ ++ return status; ++} ++ + static void + gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) + { +@@ -1110,8 +1161,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co + static void + nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) + { +- /* pNFS is not supported */ ++#if defined(CONFIG_PNFSD) ++ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS | ++ EXCHGID4_FLAG_USE_PNFS_DS; ++#else /* CONFIG_PNFSD */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; ++#endif /* CONFIG_PNFSD */ + + /* Referrals are supported, Migration is not. 
*/ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; +@@ -1301,6 +1356,13 @@ nfsd4_create_session(struct svc_rqst *rq + struct nfsd4_clid_slot *cs_slot = NULL; + int status = 0; + ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ /* XXX hack to get local ip address */ ++ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local, ++ sizeof(pnfsd_lexp_addr)); ++ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); +@@ -1340,25 +1402,26 @@ nfsd4_create_session(struct svc_rqst *rq + cs_slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + +- if (cr_ses->flags & SESSION4_BACK_CHAN) { +- unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; +- svc_xprt_get(rqstp->rq_xprt); +- rpc_copy_addr( +- (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, +- sa); +- unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); +- unconf->cl_cb_conn.cb_minorversion = +- cstate->minorversion; +- unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; +- unconf->cl_cb_seq_nr = 1; +- nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); +- } ++ if (is_ds_only_session(unconf->cl_exchange_flags)) ++ cr_ses->flags &= ~SESSION4_BACK_CHAN; ++ + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + ++ if (cr_ses->flags & SESSION4_BACK_CHAN) { ++ conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; ++ svc_xprt_get(rqstp->rq_xprt); ++ rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa); ++ conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); ++ conf->cl_cb_conn.cb_minorversion = cstate->minorversion; ++ conf->cl_cb_conn.cb_prog = cr_ses->callback_prog; ++ conf->cl_cb_seq_nr = 1; ++ nfsd4_probe_callback(conf, &conf->cl_cb_conn); ++ } ++ + /* + * We do not support RDMA or persistent sessions + */ +@@ -1746,7 +1809,7 @@ out: + + /* OPEN Share state helper functions */ + static inline struct nfs4_file * +-alloc_init_file(struct 
inode *ino) ++alloc_init_file(struct inode *ino, struct svc_fh *current_fh) + { + struct nfs4_file *fp; + unsigned int hashval = file_hashval(ino); +@@ -1760,6 +1823,16 @@ alloc_init_file(struct inode *ino) + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&fp->fi_layouts); ++ INIT_LIST_HEAD(&fp->fi_layout_states); ++ fp->fi_fsid.major = current_fh->fh_export->ex_fsid; ++ fp->fi_fsid.minor = 0; ++ fp->fi_fhlen = current_fh->fh_handle.fh_size; ++ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval)); ++ memcpy(fp->fi_fhval, &current_fh->fh_handle.fh_base, ++ fp->fi_fhlen); ++#endif /* CONFIG_PNFSD */ + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); +@@ -1768,7 +1841,7 @@ alloc_init_file(struct inode *ino) + return NULL; + } + +-static void ++void + nfsd4_free_slab(struct kmem_cache **slab) + { + if (*slab == NULL) +@@ -1784,6 +1857,7 @@ nfsd4_free_slabs(void) + nfsd4_free_slab(&file_slab); + nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); ++ nfsd4_free_pnfs_slabs(); + } + + static int +@@ -1805,6 +1879,8 @@ nfsd4_init_slabs(void) + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_nomem; ++ if (nfsd4_init_pnfs_slabs()) ++ goto out_nomem; + return 0; + out_nomem: + nfsd4_free_slabs(); +@@ -1878,6 +1954,9 @@ init_stateid(struct nfs4_stateid *stp, s + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); + INIT_LIST_HEAD(&stp->st_perfile); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); +@@ -1919,6 +1998,7 @@ find_openstateowner_str(unsigned int has + { + struct nfs4_stateowner *so = NULL; + ++ BUG_ON_UNLOCKED_STATE(); + list_for_each_entry(so, 
&ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) + return so; +@@ -1927,7 +2007,7 @@ find_openstateowner_str(unsigned int has + } + + /* search file_hashtbl[] for file */ +-static struct nfs4_file * ++struct nfs4_file * + find_file(struct inode *ino) + { + unsigned int hashval = file_hashval(ino); +@@ -1945,6 +2025,18 @@ find_file(struct inode *ino) + return NULL; + } + ++struct nfs4_file * ++find_alloc_file(struct inode *ino, struct svc_fh *current_fh) ++{ ++ struct nfs4_file *fp; ++ ++ fp = find_file(ino); ++ if (fp) ++ return fp; ++ ++ return alloc_init_file(ino, current_fh); ++} ++ + static inline int access_valid(u32 x, u32 minorversion) + { + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) +@@ -2503,7 +2595,7 @@ nfsd4_process_open2(struct svc_rqst *rqs + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; + status = nfserr_resource; +- fp = alloc_init_file(ino); ++ fp = alloc_init_file(ino, current_fh); + if (fp == NULL) + goto out; + } +@@ -2730,7 +2822,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct + return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; + } + +-static int ++int + STALE_STATEID(stateid_t *stateid) + { + if (stateid->si_boot == boot_time) +@@ -2740,6 +2832,16 @@ STALE_STATEID(stateid_t *stateid) + return 1; + } + ++__be32 ++nfs4_check_stateid(stateid_t *stateid) ++{ ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ return nfserr_bad_stateid; ++ if (STALE_STATEID(stateid)) ++ return nfserr_stale_stateid; ++ return 0; ++} ++ + static inline int + access_permit_read(unsigned long access_bmap) + { +@@ -2848,6 +2950,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + if (grace_disallows_io(ino)) + return nfserr_grace; + ++#if defined(CONFIG_PNFSD) ++ if (pnfs_fh_is_ds(&current_fh->fh_handle)) { ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ status = nfserr_bad_stateid; ++ else ++#ifdef CONFIG_GFS2_FS_LOCKING_DLM ++ { ++ dprintk("%s 
Don't check DS stateid\n", __func__); ++ return 0; ++ } ++#else /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ status = nfs4_preprocess_pnfs_ds_stateid(current_fh, ++ stateid); ++#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ goto out; ++ } ++#endif /* CONFIG_PNFSD */ ++ + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + +@@ -2924,13 +3044,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + *stpp = NULL; + *sopp = NULL; + +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { +- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); +- return nfserr_bad_stateid; +- } +- +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ return status; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; +@@ -3205,11 +3321,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + nfs4_lock_state(); +- status = nfserr_bad_stateid; +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) +- goto out; +- status = nfserr_stale_stateid; +- if (STALE_STATEID(stateid)) ++ status = nfs4_check_stateid(stateid); ++ if (status) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) +@@ -3238,26 +3351,6 @@ out: + #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) + #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +-static inline u64 +-end_offset(u64 start, u64 len) +-{ +- u64 end; +- +- end = start + len; +- return end >= start ? end: NFS4_MAX_UINT64; +-} +- +-/* last octet in a range */ +-static inline u64 +-last_byte_offset(u64 start, u64 len) +-{ +- u64 end; +- +- BUG_ON(!len); +- end = start + len; +- return end > start ? 
end - 1: NFS4_MAX_UINT64; +-} +- + #define lockownerid_hashval(id) \ + ((id) & LOCK_HASH_MASK) + +@@ -3274,7 +3367,7 @@ static struct list_head lock_ownerid_has + static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; + static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; + +-static struct nfs4_stateid * ++struct nfs4_stateid * + find_stateid(stateid_t *stid, int flags) + { + struct nfs4_stateid *local; +@@ -3303,7 +3396,7 @@ find_stateid(stateid_t *stid, int flags) + return NULL; + } + +-static struct nfs4_delegation * ++struct nfs4_delegation * + find_delegation_stateid(struct inode *ino, stateid_t *stid) + { + struct nfs4_file *fp; +@@ -3436,6 +3529,9 @@ alloc_init_lock_stateid(struct nfs4_stat + INIT_LIST_HEAD(&stp->st_perfile); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); +@@ -3998,6 +4094,9 @@ nfs4_state_init(void) + INIT_LIST_HEAD(&client_lru); + INIT_LIST_HEAD(&del_recall_lru); + reclaim_str_hashtbl_size = 0; ++#if defined(CONFIG_PNFSD) ++ nfs4_pnfs_state_init(); ++#endif /* CONFIG_PNFSD */ + return 0; + } + +@@ -4110,6 +4209,7 @@ __nfs4_state_shutdown(void) + } + + nfsd4_shutdown_recdir(); ++ nfs4_pnfs_state_shutdown(); + nfs4_init = 0; + } + +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-31 20:41:19.202150173 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-31 20:42:05.563232916 -0400 +@@ -47,9 +47,14 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +@@ -1234,6 +1239,138 @@ 
nfsd4_decode_sequence(struct nfsd4_compo + DECODE_TAIL; + } + ++#if defined(CONFIG_PNFSD) ++static __be32 ++nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16 + sizeof(nfs4_verifier)); ++ READ32(gdevl->gd_layout_type); ++ READ32(gdevl->gd_maxdevices); ++ READ64(gdevl->gd_cookie); ++ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ u32 num; ++ DECODE_HEAD; ++ ++ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid)); ++ READ64(gdev->gd_devid.sbid); ++ READ64(gdev->gd_devid.devid); ++ READ32(gdev->gd_layout_type); ++ READ32(gdev->gd_maxcount); ++ READ32(num); ++ if (num) { ++ READ_BUF(4); ++ READ32(gdev->gd_notify_types); ++ } else { ++ gdev->gd_notify_types = 0; ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(36); ++ READ32(lgp->lg_signal); ++ READ32(lgp->lg_seg.layout_type); ++ READ32(lgp->lg_seg.iomode); ++ READ64(lgp->lg_seg.offset); ++ READ64(lgp->lg_seg.length); ++ READ64(lgp->lg_minlength); ++ nfsd4_decode_stateid(argp, &lgp->lg_sid); ++ READ_BUF(4); ++ READ32(lgp->lg_maxcount); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ DECODE_HEAD; ++ u32 timechange; ++ ++ READ_BUF(20); ++ READ64(lcp->args.lc_seg.offset); ++ READ64(lcp->args.lc_seg.length); ++ READ32(lcp->args.lc_reclaim); ++ nfsd4_decode_stateid(argp, &lcp->lc_sid); ++ READ_BUF(4); ++ READ32(lcp->args.lc_newoffset); ++ if (lcp->args.lc_newoffset) { ++ READ_BUF(8); ++ READ64(lcp->args.lc_last_wr); ++ } else ++ lcp->args.lc_last_wr = 0; ++ READ_BUF(4); ++ READ32(timechange); ++ if (timechange) { ++ READ_BUF(12); ++ READ64(lcp->args.lc_mtime.seconds); ++ 
READ32(lcp->args.lc_mtime.nseconds); ++ } else { ++ lcp->args.lc_mtime.seconds = 0; ++ lcp->args.lc_mtime.nseconds = 0; ++ } ++ READ_BUF(8); ++ READ32(lcp->args.lc_seg.layout_type); ++ /* XXX: saving XDR'ed layout update. Since we don't have the ++ * current_fh yet, and therefore no export_ops, we can't call ++ * the layout specific decode routines. File and pVFS2 ++ * do not use the layout update.... ++ */ ++ READ32(lcp->args.lc_up_len); ++ if (lcp->args.lc_up_len > 0) { ++ READ_BUF(lcp->args.lc_up_len); ++ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len); ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16); ++ READ32(lrp->args.lr_reclaim); ++ READ32(lrp->args.lr_seg.layout_type); ++ READ32(lrp->args.lr_seg.iomode); ++ READ32(lrp->args.lr_return_type); ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ READ_BUF(16); ++ READ64(lrp->args.lr_seg.offset); ++ READ64(lrp->args.lr_seg.length); ++ nfsd4_decode_stateid(argp, &lrp->lr_sid); ++ READ_BUF(4); ++ READ32(lrp->args.lrf_body_len); ++ if (lrp->args.lrf_body_len > 0) { ++ READ_BUF(lrp->args.lrf_body_len); ++ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len); ++ } ++ } ++ ++ DECODE_TAIL; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) + { +@@ -1335,11 +1472,19 @@ static nfsd4_dec nfsd41_dec_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, ++#else /* 
CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, +@@ -2136,6 +2281,36 @@ out_acl: + } + WRITE64(stat.ino); + } ++#if defined(CONFIG_PNFSD) ++ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { ++ struct super_block *sb = dentry->d_inode->i_sb; ++ int type = 0; ++ ++ /* Query the filesystem for supported pNFS layout types. ++ * Currently, we only support one layout type per file system. ++ * The export_ops->layout_type() returns the pnfs_layouttype4. ++ */ ++ buflen -= 4; ++ if (buflen < 0) /* length */ ++ goto out_resource; ++ ++ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type) ++ type = sb->s_pnfs_op->layout_type(sb); ++ if (type) { ++ if ((buflen -= 4) < 0) /* type */ ++ goto out_resource; ++ WRITE32(1); /* length */ ++ WRITE32(type); /* type */ ++ } else ++ WRITE32(0); /* length */ ++ } ++ ++ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ if ((buflen -= 4) < 0) ++ goto out_resource; ++ WRITE32(stat.blksize); ++ } ++#endif /* CONFIG_PNFSD */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); +@@ -2366,6 +2541,10 @@ nfsd4_encode_commit(struct nfsd4_compoun + if (!nfserr) { + RESERVE_SPACE(8); + WRITEMEM(commit->co_verf.data, 8); ++ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n", ++ ((u32 *)(&commit->co_verf.data))[0], ++ ((u32 *)(&commit->co_verf.data))[1]); ++ + ADJUST_ARGS(); + } + return nfserr; +@@ -2620,9 +2799,20 @@ nfsd4_encode_read(struct nfsd4_compoundr + } + read->rd_vlen = v; + ++#if defined(CONFIG_SPNFS) ++ if (spnfs_enabled()) ++ nfserr = 
spnfs_read(read->rd_fhp->fh_dentry->d_inode, ++ read->rd_offset, &maxcount, read->rd_vlen, ++ resp->rqstp); ++ else /* we're not an MDS */ ++ nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, ++ read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, ++ &maxcount); ++#else + nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); ++#endif /* CONFIG_SPNFS */ + + if (nfserr == nfserr_symlink) + nfserr = nfserr_inval; +@@ -2926,6 +3116,9 @@ nfsd4_encode_write(struct nfsd4_compound + WRITE32(write->wr_bytes_written); + WRITE32(write->wr_how_written); + WRITEMEM(write->wr_verifier.data, 8); ++ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n", ++ ((u32 *)(&write->wr_verifier.data))[0], ++ ((u32 *)(&write->wr_verifier.data))[1]); + ADJUST_ARGS(); + } + return nfserr; +@@ -3069,6 +3262,343 @@ nfsd4_encode_sequence(struct nfsd4_compo + return 0; + } + ++#if defined(CONFIG_PNFSD) ++ ++/* Uses the export interface to iterate through the available devices ++ * and encodes them on the response stream. 
++ */ ++static __be32 ++nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp, ++ struct nfsd4_pnfs_getdevlist *gdevl, ++ unsigned int *dev_count) ++{ ++ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb; ++ __be32 nfserr; ++ int status; ++ __be32 *p; ++ struct nfsd4_pnfs_dev_iter_res res = { ++ .gd_cookie = gdevl->gd_cookie, ++ .gd_verf = gdevl->gd_verf, ++ .gd_eof = 0 ++ }; ++ u64 sbid; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ sbid = find_create_sbid(sb); ++ *dev_count = 0; ++ do { ++ status = sb->s_pnfs_op->get_device_iter(sb, ++ gdevl->gd_layout_type, ++ &res); ++ if (status) { ++ if (status == -ENOENT) { ++ res.gd_eof = 1; ++ /* return success */ ++ break; ++ } ++ nfserr = nfserrno(status); ++ goto out_err; ++ } ++ ++ /* Encode device id and layout type */ ++ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid)); ++ WRITE64((__be64)sbid); ++ WRITE64(res.gd_devid); /* devid minor */ ++ ADJUST_ARGS(); ++ (*dev_count)++; ++ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof); ++ gdevl->gd_cookie = res.gd_cookie; ++ gdevl->gd_verf = res.gd_verf; ++ gdevl->gd_eof = res.gd_eof; ++ nfserr = nfs_ok; ++out_err: ++ dprintk("%s: Encoded %u devices\n", __func__, *dev_count); ++ return nfserr; ++} ++ ++/* Encodes the response of get device list. 
++*/ ++static __be32 ++nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ unsigned int dev_count = 0, lead_count; ++ u32 *p_in = resp->p; ++ __be32 *p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ /* Ensure we have room for cookie, verifier, and devlist len, ++ * which we will backfill in after we encode as many devices as possible ++ */ ++ lead_count = 8 + sizeof(nfs4_verifier) + 4; ++ RESERVE_SPACE(lead_count); ++ /* skip past these values */ ++ p += XDR_QUADLEN(lead_count); ++ ADJUST_ARGS(); ++ ++ /* Iterate over as many device ids as possible on the xdr stream */ ++ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count); ++ if (nfserr) ++ goto out_err; ++ ++ /* Backfill in cookie, verf and number of devices encoded */ ++ p = p_in; ++ WRITE64(gdevl->gd_cookie); ++ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ WRITE32(dev_count); ++ ++ /* Skip over devices */ ++ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid)); ++ ADJUST_ARGS(); ++ ++ /* are we at the end of devices? */ ++ RESERVE_SPACE(4); ++ WRITE32(gdevl->gd_eof); ++ ADJUST_ARGS(); ++ ++ dprintk("%s: done.\n", __func__); ++ ++ nfserr = nfs_ok; ++out: ++ return nfserr; ++out_err: ++ p = p_in; ++ ADJUST_ARGS(); ++ goto out; ++} ++ ++/* For a given device id, have the file system retrieve and encode the ++ * associated device. For file layout, the encoding function is ++ * passed down to the file system. The file system then has the option ++ * of using this encoding function or one of its own. ++ * ++ * Note: the file system must return the XDR size of struct device_addr4 ++ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the ++ * gdir_mincount calculation. 
++ */ ++static __be32 ++nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ struct super_block *sb; ++ int maxcount = 0, type_notify_len = 12; ++ __be32 *p, *p_save = NULL, *p_in = resp->p; ++ struct exp_xdr_stream xdr; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = gdev->gd_sb; ++ ++ if (gdev->gd_maxcount != 0) { ++ /* FIXME: this will be bound by the session max response */ ++ maxcount = svc_max_payload(resp->rqstp); ++ if (maxcount > gdev->gd_maxcount) ++ maxcount = gdev->gd_maxcount; ++ ++ /* Ensure have room for type and notify field */ ++ maxcount -= type_notify_len; ++ if (maxcount < 0) { ++ nfserr = -ETOOSMALL; ++ goto toosmall; ++ } ++ } ++ ++ RESERVE_SPACE(4); ++ WRITE32(gdev->gd_layout_type); ++ ADJUST_ARGS(); ++ ++ /* If maxcount is 0 then just update notifications */ ++ if (gdev->gd_maxcount == 0) ++ goto handle_notifications; ++ ++ xdr.p = p_save = resp->p; ++ xdr.end = resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type, ++ &gdev->gd_devid); ++ if (nfserr) ++ goto err; ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ BUG_ON(xdr.p > xdr.end); ++ ++ /* Update the xdr stream with the number of bytes encoded ++ * by the file system. 
++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++handle_notifications: ++ /* Encode supported device notifications */ ++ RESERVE_SPACE(4); ++ if (sb->s_pnfs_op->set_device_notify) { ++ struct pnfs_devnotify_arg dn_args; ++ ++ dn_args.dn_layout_type = gdev->gd_layout_type; ++ dn_args.dn_devid = gdev->gd_devid; ++ dn_args.dn_notify_types = gdev->gd_notify_types; ++ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args); ++ if (nfserr) ++ goto err; ++ WRITE32(dn_args.dn_notify_types); ++ } else { ++ WRITE32(0); ++ } ++ ADJUST_ARGS(); ++ ++out: ++ return nfserrno(nfserr); ++toosmall: ++ dprintk("%s: maxcount too small\n", __func__); ++ RESERVE_SPACE(4); ++ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len); ++ ADJUST_ARGS(); ++ goto out; ++err: ++ /* Rewind to the beginning */ ++ p = p_in; ++ ADJUST_ARGS(); ++ if (nfserr == -ETOOSMALL) ++ goto toosmall; ++ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr); ++ goto out; ++} ++ ++static __be32 ++nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, ++ __be32 nfserr, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int maxcount, leadcount; ++ struct super_block *sb; ++ struct exp_xdr_stream xdr; ++ __be32 *p, *p_save, *p_start = resp->p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb; ++ maxcount = PAGE_SIZE; ++ if (maxcount > lgp->lg_maxcount) ++ maxcount = lgp->lg_maxcount; ++ ++ /* Check for space on xdr stream */ ++ leadcount = 36 + sizeof(stateid_opaque_t); ++ RESERVE_SPACE(leadcount); ++ /* encode layout metadata after file system encodes layout */ ++ p += XDR_QUADLEN(leadcount); ++ ADJUST_ARGS(); ++ ++ /* Ensure have room for ret_on_close, off, len, iomode, type */ ++ maxcount -= leadcount; ++ if (maxcount < 0) { ++ printk(KERN_ERR "%s: buffer too small\n", __func__); ++ nfserr = nfserr_toosmall; ++ goto err; ++ } ++ ++ /* Set xdr info so file system can encode layout */ ++ xdr.p = p_save = resp->p; ++ xdr.end = 
resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ /* Retrieve, encode, and merge layout; process stateid */ ++ nfserr = nfs4_pnfs_get_layout(lgp, &xdr); ++ if (nfserr) ++ goto err; ++ ++ /* Ensure file system returned enough bytes for the client ++ * to access. ++ */ ++ if (lgp->lg_seg.length < lgp->lg_minlength) { ++ nfserr = nfserr_badlayout; ++ goto err; ++ } ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ ++ /* Rewind to beginning and encode attrs */ ++ resp->p = p_start; ++ RESERVE_SPACE(4); ++ WRITE32(lgp->lg_roc); /* return on close */ ++ ADJUST_ARGS(); ++ nfsd4_encode_stateid(resp, &lgp->lg_sid); ++ RESERVE_SPACE(28); ++ /* Note: response logr_layout array count, always one for now */ ++ WRITE32(1); ++ WRITE64(lgp->lg_seg.offset); ++ WRITE64(lgp->lg_seg.length); ++ WRITE32(lgp->lg_seg.iomode); ++ WRITE32(lgp->lg_seg.layout_type); ++ ++ /* Update the xdr stream with the number of bytes written ++ * by the file system ++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++ return nfs_ok; ++err: ++ resp->p = p_start; ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lcp->res.lc_size_chg); ++ ADJUST_ARGS(); ++ if (lcp->res.lc_size_chg) { ++ RESERVE_SPACE(8); ++ WRITE64(lcp->res.lc_newsize); ++ ADJUST_ARGS(); ++ } ++out: ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lrp->lrs_present != 0); /* got stateid? 
*/ ++ ADJUST_ARGS(); ++ if (lrp->lrs_present) ++ nfsd4_encode_stateid(resp, &lrp->lr_sid); ++out: ++ return nfserr; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) + { +@@ -3129,11 +3659,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, +diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-31 20:41:19.203150982 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-31 20:42:05.565212801 -0400 +@@ -13,10 +13,15 @@ + #include + #include + #include ++#include + + #include "nfsd.h" + #include "cache.h" + ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++#include ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + /* + * We have a single directory with 9 nodes in it. 
+ */ +@@ -49,6 +54,9 @@ enum { + NFSD_Gracetime, + NFSD_RecoveryDir, + #endif ++#ifdef CONFIG_PNFSD ++ NFSD_pnfs_dlm_device, ++#endif + }; + + /* +@@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct fi + static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif ++#ifdef CONFIG_PNFSD ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size); ++#endif + + static ssize_t (*write_op[])(struct file *, char *, size_t) = { + [NFSD_Svc] = write_svc, +@@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device, ++#endif + }; + + static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +@@ -1349,6 +1363,68 @@ static ssize_t write_recoverydir(struct + + #endif + ++#ifdef CONFIG_PNFSD ++ ++static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf, ++ size_t size) ++{ ++ char *mesg = buf; ++ char *pnfs_dlm_device; ++ int max_size = NFSD_PNFS_DLM_DEVICE_MAX; ++ int len, ret = 0; ++ ++ if (size > 0) { ++ ret = -EINVAL; ++ if (size > max_size || buf[size-1] != '\n') ++ return ret; ++ buf[size-1] = 0; ++ ++ pnfs_dlm_device = mesg; ++ len = qword_get(&mesg, pnfs_dlm_device, size); ++ if (len <= 0) ++ return ret; ++ ++ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len); ++ } else ++ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT); ++ ++ return ret <= 0 ? 
ret : strlen(buf); ++} ++ ++/** ++ * write_pnfs_dlm_device - Set or report the current pNFS data server list ++ * ++ * Input: ++ * buf: ignored ++ * size: zero ++ * ++ * OR ++ * ++ * Input: ++ * buf: C string containing a block device name, ++ * a colon, and then a comma separated ++ * list of pNFS data server IPv4 addresses ++ * size: non-zero length of C string in @buf ++ * Output: ++ * On success: passed-in buffer filled with '\n'-terminated C ++ * string containing a block device name, a colon, and ++ * then a comma separated list of pNFS ++ * data server IPv4 addresses. ++ * return code is the size in bytes of the string ++ * On error: return code is a negative errno value ++ */ ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __write_pnfs_dlm_device(file, buf, size); ++ mutex_unlock(&nfsd_mutex); ++ return rv; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ + /*----------------------------------------------------------------------------*/ + /* + * populating the filesystem. 
+@@ -1383,6 +1459,10 @@ static int nfsd_fill_super(struct super_ + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops, ++ S_IWUSR|S_IRUSR}, ++#endif + /* last one */ {""} + }; + return simple_fill_super(sb, 0x6e667364, nfsd_files); +@@ -1421,6 +1501,9 @@ static int create_proc_exports_entry(voi + } + #endif + ++#if defined(CONFIG_SPNFS_BLOCK) ++int nfsd_bl_init(void); ++#endif + static int __init init_nfsd(void) + { + int retval; +@@ -1443,6 +1526,15 @@ static int __init init_nfsd(void) + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ retval = spnfs_init_proc(); ++ if (retval != 0) ++ goto out_free_idmap; ++#if defined(CONFIG_SPNFS_BLOCK) ++ nfsd_bl_init(); ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_all; +@@ -1465,7 +1557,22 @@ out_free_stat: + + static void __exit exit_nfsd(void) + { ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ remove_proc_entry("fs/nfs/spnfs/recall", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/getfh", NULL); ++ remove_proc_entry("fs/nfs/spnfs/config", NULL); ++ remove_proc_entry("fs/nfs/spnfs/ctl", NULL); ++ remove_proc_entry("fs/nfs/spnfs", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ + nfsd_export_shutdown(); ++ nfsd4_pnfs_dlm_shutdown(); + nfsd_reply_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); + 
remove_proc_entry("fs/nfs", NULL); +diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-31 20:41:19.204160960 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-31 20:42:05.565212801 -0400 +@@ -285,11 +285,17 @@ extern time_t nfsd4_grace; + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + ++#if defined(CONFIG_PNFSD) ++#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ ++ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES) ++#else /* CONFIG_PNFSD */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 ++#endif /* CONFIG_PNFSD */ + + #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ +- (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) ++ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \ ++ FATTR4_WORD2_LAYOUT_BLKSIZE) + + static inline u32 nfsd_suppattrs0(u32 minorversion) + { +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-31 20:42:05.566222921 -0400 +@@ -10,6 +10,7 @@ + #include + + #include ++#include + #include "nfsd.h" + #include "vfs.h" + #include "auth.h" +@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s + static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) + { + struct knfsd_fh *fh = &fhp->fh_handle; ++ int fsid_type; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; +@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct + return error; + if (fh->fh_auth_type != 0) + return error; +- len = key_len(fh->fh_fsid_type) / 4; ++ fsid_type = pnfs_fh_fsid_type(fh); ++ len = key_len(fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { +@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct + data_left -= len; + if (data_left < 0) + 
return error; +- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); ++ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth); + fid = (struct fid *)(fh->fh_auth + len); + } else { + __u32 tfh[2]; +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-31 20:42:05.567233002 -0400 +@@ -14,6 +14,7 @@ enum nfsd_fsid { + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, ++ FSID_MAX + }; + + enum fsid_source { +@@ -205,4 +206,42 @@ fh_unlock(struct svc_fh *fhp) + } + } + ++#if defined(CONFIG_PNFSD) ++ ++/* ++ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied ++ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how ++ * to handle a given stateid. ++ */ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return fh->fh_fsid_type >= FSID_MAX; ++} ++ ++static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh) ++{ ++ BUG_ON(fh->fh_version != 1); ++ BUG_ON(pnfs_fh_is_ds(fh)); ++ fh->fh_fsid_type += FSID_MAX; ++} ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ ++/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). 
*/ ++static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh) ++{ ++ int fsid_type = fh->fh_fsid_type; ++ ++ if (pnfs_fh_is_ds(fh)) ++ return fsid_type - FSID_MAX; ++ return fsid_type; ++} ++ + #endif /* _LINUX_NFSD_FH_INT_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-31 20:41:17.274232911 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-31 20:42:05.568144414 -0400 +@@ -115,7 +115,7 @@ struct svc_program nfsd_program = { + + }; + +-u32 nfsd_supported_minorversion; ++u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION; + + int nfsd_vers(int vers, enum vers_op change) + { +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-31 20:42:05.569090615 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-31 20:42:05.569090615 -0400 +@@ -0,0 +1,143 @@ ++/* ++ * Copyright (c) 2005 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#ifndef LINUX_NFSD_PNFSD_H ++#define LINUX_NFSD_PNFSD_H ++ ++#include ++#include ++ ++#include "state.h" ++#include "xdr4.h" ++ ++/* outstanding layout stateid */ ++struct nfs4_layout_state { ++ struct list_head ls_perfile; ++ struct list_head ls_layouts; /* list of nfs4_layouts */ ++ struct kref ls_ref; ++ struct nfs4_client *ls_client; ++ struct nfs4_file *ls_file; ++ stateid_t ls_stateid; ++}; ++ ++/* outstanding layout */ ++struct nfs4_layout { ++ struct list_head lo_perfile; /* hash by f_id */ ++ struct list_head lo_perclnt; /* hash by clientid */ ++ struct list_head lo_perstate; ++ struct nfs4_file *lo_file; /* backpointer */ ++ struct nfs4_client *lo_client; ++ struct nfs4_layout_state *lo_state; ++ struct nfsd4_layout_seg lo_seg; ++}; ++ ++struct pnfs_inval_state { ++ struct knfsd_fh mdsfh; /* needed only by invalidate all */ ++ stateid_t stid; ++ clientid_t clid; ++ u32 status; ++}; ++ ++/* pNFS Data Server state */ ++#define DS_STATEID_VALID 0 ++#define DS_STATEID_ERROR 1 ++#define DS_STATEID_NEW 2 ++ ++struct pnfs_ds_stateid { ++ struct list_head ds_hash; /* ds_stateid hash entry */ ++ struct list_head ds_perclid; /* per client hash entry */ ++ stateid_t ds_stid; ++ struct knfsd_fh 
ds_fh; ++ unsigned long ds_access; ++ u32 ds_status; /* from MDS */ ++ u32 ds_verifier[2]; /* from MDS */ ++ wait_queue_head_t ds_waitq; ++ unsigned long ds_flags; ++ struct kref ds_ref; ++ clientid_t ds_mdsclid; ++}; ++ ++struct pnfs_ds_clientid { ++ struct list_head dc_hash; /* mds_clid_hashtbl entry */ ++ struct list_head dc_stateid; /* ds_stateid head */ ++ struct list_head dc_permdsid; /* per mdsid hash entry */ ++ clientid_t dc_mdsclid; ++ struct kref dc_ref; ++ uint32_t dc_mdsid; ++}; ++ ++struct pnfs_mds_id { ++ struct list_head di_hash; /* mds_nodeid list entry */ ++ struct list_head di_mdsclid; /* mds_clientid head */ ++ uint32_t di_mdsid; ++ time_t di_mdsboot; /* mds boot time */ ++ struct kref di_ref; ++}; ++ ++/* notify device request (from exported filesystem) */ ++struct nfs4_notify_device { ++ struct nfsd4_pnfs_cb_dev_list *nd_list; ++ struct nfs4_client *nd_client; ++ struct list_head nd_perclnt; ++ ++ void *nd_args; /* nfsd internal */ ++}; ++ ++u64 find_create_sbid(struct super_block *); ++struct super_block *find_sbid_id(u64); ++__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *); ++int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *, ++ struct nfsd4_pnfs_layoutreturn *); ++int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *); ++int nfs4_pnfs_cb_change_state(struct pnfs_get_state *); ++void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++int put_layoutrecall(struct nfs4_layoutrecall *); ++void nomatching_layout(struct nfs4_layoutrecall *); ++void *layoutrecall_done(struct nfs4_layoutrecall *); ++int nfsd4_cb_layout(struct nfs4_layoutrecall *); ++int nfsd_layout_recall_cb(struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++int nfsd_device_notify_cb(struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++int nfsd4_cb_notify_device(struct nfs4_notify_device *); ++void pnfs_set_device_notify(clientid_t *, unsigned int types); ++void 
pnfs_clear_device_notify(struct nfs4_client *); ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++extern struct sockaddr pnfsd_lexp_addr; ++extern size_t pnfs_lexp_addr_len; ++ ++extern void pnfsd_lexp_init(struct inode *); ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#endif /* LINUX_NFSD_PNFSD_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-31 20:42:05.569090615 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-31 20:42:05.569090615 -0400 +@@ -0,0 +1,225 @@ ++/* ++ * linux/fs/nfsd/pnfs_lexp.c ++ * ++ * pNFS export of local filesystems. ++ * ++ * Export local file systems over the files layout type. ++ * The MDS (metadata server) functions also as a single DS (data server). ++ * This is mostly useful for development and debugging purposes. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * Copyright (C) 2008 Benny Halevy, ++ * ++ * Initial implementation was based on the pnfs-gfs2 patches done ++ * by David M. 
Richter ++ */ ++ ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++struct sockaddr pnfsd_lexp_addr; ++size_t pnfs_lexp_addr_len; ++ ++static int ++pnfsd_lexp_layout_type(struct super_block *sb) ++{ ++ int ret = LAYOUT_NFSV4_1_FILES; ++ dprintk("<-- %s: return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++pnfsd_lexp_get_device_iter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ ++ dprintk("<-- %s: return 0\n", __func__); ++ return 0; ++} ++ ++static int ++pnfsd_lexp_get_device_info(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_multipath fl_devices[1]; ++ u32 fl_stripe_indices[1] = { 0 }; ++ struct pnfs_filelayout_devaddr daddr; ++ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */ ++ char daddr_buf[8*4 + 2*3 + 10]; ++ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ memset(&fdev, '\0', sizeof(fdev)); ++ ++ if (devid->devid != 1) { ++ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 " ++ "(got: 0x%llx)\n", __func__, devid->devid); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ /* count the number of comma-delimited DS IPs */ ++ fdev.fl_device_length = 1; ++ fdev.fl_device_list = fl_devices; ++ ++ fdev.fl_stripeindices_length = fdev.fl_device_length; ++ fdev.fl_stripeindices_list = fl_stripe_indices; ++ ++ daddr.r_addr.data = daddr_buf; ++ daddr.r_addr.len = sizeof(daddr_buf); ++ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr); ++ if (err < 0) ++ goto out; ++ daddr.r_addr.len = err; ++ switch (pnfsd_lexp_addr.sa_family) 
{ ++ case AF_INET: ++ daddr.r_netid.data = "tcp"; ++ daddr.r_netid.len = 3; ++ break; ++ case AF_INET6: ++ daddr.r_netid.data = "tcp6"; ++ daddr.r_netid.len = 4; ++ break; ++ default: ++ BUG(); ++ } ++ fdev.fl_device_list[0].fl_multipath_length = 1; ++ fdev.fl_device_list[0].fl_multipath_list = &daddr; ++ ++ /* have nfsd encode the device info */ ++ err = filelayout_encode_devinfo(xdr, &fdev); ++out: ++ dprintk("<-- %s: return %d\n", __func__, err); ++ return err; ++} ++ ++static int get_stripe_unit(int blocksize) ++{ ++ if (blocksize < NFSSVC_MAXBLKSIZE) ++ blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize); ++ dprintk("%s: return %d\n", __func__, blocksize); ++ return blocksize; ++} ++ ++static enum nfsstat4 ++pnfsd_lexp_layout_get(struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ enum nfsstat4 rc = NFS4_OK; ++ struct pnfs_filelayout_layout *layout = NULL; ++ struct knfsd_fh *fhp = NULL; ++ ++ dprintk("--> %s: inode=%p\n", __func__, inode); ++ ++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ ++ layout = kzalloc(sizeof(*layout), GFP_KERNEL); ++ if (layout == NULL) { ++ rc = -ENOMEM; ++ goto error; ++ } ++ ++ /* Set file layout response args */ ++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; ++ layout->lg_stripe_type = STRIPE_SPARSE; ++ layout->lg_commit_through_mds = true; ++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); ++ layout->lg_fh_length = 1; ++ layout->device_id.sbid = arg->lg_sbid; ++ layout->device_id.devid = 1; /*FSFTEMP*/ ++ layout->lg_first_stripe_index = 0; /*FSFTEMP*/ ++ layout->lg_pattern_offset = 0; ++ ++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); ++ if (fhp == NULL) { ++ rc = -ENOMEM; ++ goto error; ++ } ++ ++ memcpy(fhp, arg->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc 
= filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ dprintk("<-- %s: return %d\n", __func__, rc); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++pnfsd_lexp_layout_commit(struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int ++pnfsd_lexp_layout_return(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh, ++ struct pnfs_get_state *p) ++{ ++ return 0; /* just use the current stateid */ ++} ++ ++static struct pnfs_export_operations pnfsd_lexp_ops = { ++ .layout_type = pnfsd_lexp_layout_type, ++ .get_device_info = pnfsd_lexp_get_device_info, ++ .get_device_iter = pnfsd_lexp_get_device_iter, ++ .layout_get = pnfsd_lexp_layout_get, ++ .layout_commit = pnfsd_lexp_layout_commit, ++ .layout_return = pnfsd_lexp_layout_return, ++ .get_state = pnfsd_lexp_get_state, ++}; ++ ++void ++pnfsd_lexp_init(struct inode *inode) ++{ ++ dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops); ++ inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c +--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-31 20:42:05.570119170 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-31 20:42:05.570119170 -0400 +@@ -0,0 +1,535 @@ ++/* ++ * fs/nfsd/spnfs_com.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * Based heavily on idmap.c ++ * ++ */ ++ ++/* ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Marius Aamodt Eriksen ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++static ssize_t spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t spnfs_pipe_downcall(struct file *, const char __user *, ++ size_t); ++static void spnfs_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops spnfs_upcall_ops = { ++ .upcall = spnfs_pipe_upcall, ++ .downcall = spnfs_pipe_downcall, ++ .destroy_msg = spnfs_pipe_destroy_msg, ++}; ++ ++/* evil global variable */ ++struct spnfs *global_spnfs; ++struct spnfs_config *spnfs_config; ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++int spnfs_use_layoutsegments; ++uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++/* ++ * Used by spnfs_enabled() ++ * Tracks if the subsystem has been initialized at some point. It doesn't ++ * matter if it's not currently initialized. ++ */ ++static int spnfs_enabled_at_some_point; ++ ++/* call this to start the ball rolling */ ++/* code it like we're going to avoid the global variable in the future */ ++int ++nfsd_spnfs_new(void) ++{ ++ struct spnfs *spnfs = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ if (global_spnfs != NULL) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL); ++ if (spnfs == NULL){ ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs, ++ &spnfs_upcall_ops, 0); ++ if (IS_ERR(spnfs->spnfs_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ ++ mutex_init(&spnfs->spnfs_lock); ++ mutex_init(&spnfs->spnfs_plock); ++ init_waitqueue_head(&spnfs->spnfs_wq); ++ ++ global_spnfs = spnfs; ++ spnfs_enabled_at_some_point = 1; ++ ++ return 0; 
++err: ++ rpc_put_mount(); ++ kfree(spnfs); ++ return rc; ++} ++ ++/* again, code it like we're going to remove the global variable */ ++void ++nfsd_spnfs_delete(void) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ ++ if (!spnfs) ++ return; ++ rpc_unlink(spnfs->spnfs_dentry); ++ rpc_put_mount(); ++ global_spnfs = NULL; ++ kfree(spnfs); ++} ++ ++/* RPC pipefs upcall/downcall routines */ ++/* looks like this code is invoked by the rpc_pipe code */ ++/* to handle upcalls on things we've queued elsewhere */ ++/* See nfs_idmap_id for an exmaple of enqueueing */ ++static ssize_t ++spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, ++ char __user *dst, size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied; ++ ssize_t left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ return mlen; ++} ++ ++static ssize_t ++spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ struct spnfs *spnfs = (struct spnfs *)rpci->private; ++ struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im; ++ int ret; ++ ++ if (mlen != sizeof(struct spnfs_msg)) ++ return -ENOSPC; ++ ++ im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im_in == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(im_in, src, mlen) != 0) ++ return -EFAULT; ++ ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ ret = mlen; ++ im->im_status = im_in->im_status; ++ /* If we got an error, terminate now, and wake up pending upcalls */ ++ if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) { ++ wake_up(&spnfs->spnfs_wq); ++ goto out; ++ } ++ ++ ret = -EINVAL; ++ /* Did we match the current upcall? 
*/ ++ /* DMXXX: do not understand the comment above, from original code */ ++ /* DMXXX: when do we _not_ match the current upcall? */ ++ /* DMXXX: anyway, let's to a simplistic check */ ++ if (im_in->im_type == im->im_type) { ++ /* copy the response into the spnfs struct */ ++ memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res)); ++ ret = mlen; ++ } else ++ dprintk("spnfs: downcall type != upcall type\n"); ++ ++ ++ wake_up(&spnfs->spnfs_wq); ++/* DMXXX handle rval processing */ ++out: ++ mutex_unlock(&spnfs->spnfs_plock); ++ kfree(im_in); ++ return ret; ++} ++ ++static void ++spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ struct spnfs_msg *im = msg->data; ++ struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im); ++ ++ if (msg->errno >= 0) ++ return; ++ mutex_lock(&spnfs->spnfs_plock); ++ im->im_status = SPNFS_STATUS_FAIL; /* DMXXX */ ++ wake_up(&spnfs->spnfs_wq); ++ mutex_unlock(&spnfs->spnfs_plock); ++} ++ ++/* generic upcall. called by functions in spnfs_ops.c */ ++int ++spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg, ++ union spnfs_msg_res *res) ++{ ++ struct rpc_pipe_msg msg; ++ struct spnfs_msg *im; ++ DECLARE_WAITQUEUE(wq, current); ++ int ret = -EIO; ++ int rval; ++ ++ im = &spnfs->spnfs_im; ++ ++ mutex_lock(&spnfs->spnfs_lock); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ memset(im, 0, sizeof(*im)); ++ memcpy(im, upmsg, sizeof(*upmsg)); ++ ++ memset(&msg, 0, sizeof(msg)); ++ msg.data = im; ++ msg.len = sizeof(*im); ++ ++ add_wait_queue(&spnfs->spnfs_wq, &wq); ++ rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&spnfs->spnfs_plock); ++ schedule(); ++ current->state = TASK_RUNNING; ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ if (im->im_status & SPNFS_STATUS_SUCCESS) { ++ /* copy our result from the upcall */ ++ memcpy(res, 
&im->im_res, sizeof(*res)); ++ ret = 0; ++ } ++ ++out: ++ memset(im, 0, sizeof(*im)); ++ mutex_unlock(&spnfs->spnfs_plock); ++ mutex_unlock(&spnfs->spnfs_lock); ++ return(ret); ++} ++ ++/* ++ * This is used to determine if the spnfsd daemon has been started at ++ * least once since the system came up. This is used to by the export ++ * mechanism to decide if spnfs is in use. ++ * ++ * Returns non-zero if the spnfsd has initialized the communication pipe ++ * at least once. ++ */ ++int spnfs_enabled(void) ++{ ++ return spnfs_enabled_at_some_point; ++} ++ ++#ifdef CONFIG_PROC_FS ++ ++/* ++ * procfs virtual files for user/kernel space communication: ++ * ++ * ctl - currently just an on/off switch...can be expanded ++ * getfh - fd to fh conversion ++ * recall - recall a layout from the command line, for example: ++ * echo > /proc/fs/spnfs/recall ++ * config - configuration info, e.g., stripe size, num ds, etc. ++ */ ++ ++/*-------------- start ctl -------------------------*/ ++static ssize_t ctl_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int cmd, rc; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (cmd) { ++ rc = nfsd_spnfs_new(); ++ if (rc != 0) ++ return rc; ++ } else ++ nfsd_spnfs_delete(); ++ ++ return count; ++} ++ ++static const struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++/*-------------- end ctl ---------------------------*/ ++ ++/*-------------- start config -------------------------*/ ++static ssize_t config_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ static struct spnfs_config cfg; ++ ++ if (copy_from_user(&cfg, buf, count)) ++ return -EFAULT; ++ ++ spnfs_config = &cfg; ++ return 0; ++} ++ ++static const struct file_operations config_ops = { ++ .write = config_write, ++}; ++/*-------------- end config ---------------------------*/ ++ ++/*-------------- start getfh -----------------------*/ ++static int 
getfh_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); ++ if (file->private_data == NULL) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static ssize_t getfh_read(struct file *file, char __user *buf, size_t count, ++ loff_t *offset) ++{ ++ if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh))) ++ return -EFAULT; ++ ++ return count; ++} ++ ++static ssize_t getfh_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int fd; ++ ++ if (copy_from_user((int *)&fd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (spnfs_getfh(fd, file->private_data) != 0) ++ return -EIO; ++ ++ return count; ++} ++ ++static int getfh_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static const struct file_operations getfh_ops = { ++ .open = getfh_open, ++ .read = getfh_read, ++ .write = getfh_write, ++ .release = getfh_release, ++}; ++/*-------------- end getfh ------------------------*/ ++ ++ ++/*-------------- start recall layout --------------*/ ++static ssize_t recall_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char input[128]; ++ char *path, *str, *p; ++ int rc; ++ u64 off = 0, len = 0; ++ ++ if (count > 128) ++ return -EINVAL; ++ ++ if (copy_from_user(input, buf, count)) ++ return -EFAULT; ++ ++ /* assumes newline-terminated path */ ++ p = memchr(input, '\n', count); ++ if (p == NULL) ++ return -EINVAL; ++ *p = '\0'; ++ ++ /* ++ * Scan for path and, optionally, an offset and length ++ * of a layout segment to be recalled; if there are two ++ * fields, they're assumed to be path and offset. 
++ */ ++ p = input; ++ path = strsep(&p, " "); ++ if (path == NULL) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &off); ++ if (rc != 0) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &len); ++ if (rc != 0) ++ return -EINVAL; ++ } ++ } ++ ++ rc = spnfs_test_layoutrecall(path, off, len); ++ if (rc != 0) ++ return rc; ++ ++ return count; ++} ++ ++static const struct file_operations recall_ops = { ++ .write = recall_write, ++}; ++/*-------------- end recall layout --------------*/ ++ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++/*-------------- start layoutseg -------------------------*/ ++static ssize_t layoutseg_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[3]; ++ ++ if (copy_from_user(cmd, buf, 1)) ++ return -EFAULT; ++ if (cmd[0] == '0') ++ spnfs_use_layoutsegments = 0; ++ else ++ spnfs_use_layoutsegments = 1; ++ ++ return count; ++} ++ ++static const struct file_operations layoutseg_ops = { ++ .write = layoutseg_write, ++}; ++/*-------------- end layoutseg ---------------------------*/ ++ ++/*-------------- start layoutsegsize -------------------------*/ ++static ssize_t layoutsegsize_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[50]; ++ ++ if (copy_from_user(cmd, buf, 49)) ++ return -EFAULT; ++ layoutsegment_size = simple_strtoull(cmd, NULL, 10); ++ ++ return count; ++} ++ ++static const struct file_operations layoutsegsize_ops = { ++ .write = layoutsegsize_write, ++}; ++/*-------------- end layoutsegsize ---------------------------*/ ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++int ++spnfs_init_proc(void) ++{ ++ struct proc_dir_entry *entry; ++ ++ entry = proc_mkdir("fs/spnfs", NULL); ++ if (!entry) ++ return -ENOMEM; ++ ++ entry = create_proc_entry("fs/spnfs/ctl", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &ctl_ops; ++ ++ 
entry = create_proc_entry("fs/spnfs/config", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &config_ops; ++ ++ entry = create_proc_entry("fs/spnfs/getfh", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &getfh_ops; ++ ++ entry = create_proc_entry("fs/spnfs/recall", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &recall_ops; ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++ entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutseg_ops; ++ ++ entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutsegsize_ops; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ return 0; ++} ++#endif /* CONFIG_PROC_FS */ +diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c +--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-31 20:42:05.571097807 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-31 20:42:05.572091128 -0400 +@@ -0,0 +1,878 @@ ++/* ++ * fs/nfsd/spnfs_ops.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * ++ */ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++/* comment out CONFIG_SPNFS_TEST for non-test behaviour */ ++/* #define CONFIG_SPNFS_TEST 1 */ ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++/* ++ * The functions that are called from elsewhere in the kernel ++ * to perform tasks in userspace ++ * ++ */ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++extern int spnfs_use_layoutsegments; ++extern uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++extern struct spnfs *global_spnfs; ++ ++int ++spnfs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++enum nfsstat4 ++spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *lg_arg, ++ struct nfsd4_pnfs_layoutget_res *lg_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct pnfs_filelayout_layout *flp = NULL; ++ int status, i; ++ enum nfsstat4 nfserr; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ 
im->im_type = SPNFS_TYPE_LAYOUTGET; ++ im->im_args.layoutget_args.inode = inode->i_ino; ++ im->im_args.layoutget_args.generation = inode->i_generation; ++ ++ /* call function to queue the msg for upcall */ ++ if (spnfs_upcall(spnfs, im, res) != 0) { ++ dprintk("failed spnfs upcall: layoutget\n"); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto layoutget_cleanup; ++ } ++ status = res->layoutget_res.status; ++ if (status != 0) { ++ /* FIXME? until user mode is fixed, translate system error */ ++ switch (status) { ++ case -E2BIG: ++ case -ETOOSMALL: ++ nfserr = NFS4ERR_TOOSMALL; ++ break; ++ case -ENOMEM: ++ case -EAGAIN: ++ case -EINTR: ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ break; ++ case -ENOENT: ++ nfserr = NFS4ERR_BADLAYOUT; ++ break; ++ default: ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ } ++ dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n", ++ status, nfserr); ++ goto layoutget_cleanup; ++ } ++ ++ lg_res->lg_return_on_close = 0; ++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ /* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */ ++ /* the amount requested by the client. 
*/ ++ if (spnfs_use_layoutsegments) { ++ if (layoutsegment_size != 0) ++ lg_res->lg_seg.length = layoutsegment_size; ++ } else ++ lg_res->lg_seg.length = NFS4_MAX_UINT64; ++#else ++ lg_res->lg_seg.length = NFS4_MAX_UINT64; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL); ++ if (flp == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ flp->device_id.sbid = lg_arg->lg_sbid; ++ flp->device_id.devid = res->layoutget_res.devid; ++ flp->lg_layout_type = 1; /* XXX */ ++ flp->lg_stripe_type = res->layoutget_res.stripe_type; ++ flp->lg_commit_through_mds = 0; ++ flp->lg_stripe_unit = res->layoutget_res.stripe_size; ++ flp->lg_first_stripe_index = 0; ++ flp->lg_pattern_offset = 0; ++ flp->lg_fh_length = res->layoutget_res.stripe_count; ++ ++ flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh), ++ GFP_KERNEL); ++ if (flp->lg_fh_list == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ /* ++ * FIX: Doing an extra copy here. Should group res.flist's fh_len ++ * and fh_val into a knfsd_fh structure. 
++ */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len; ++ memcpy(&flp->lg_fh_list[i].fh_base, ++ res->layoutget_res.flist[i].fh_val, ++ res->layoutget_res.flist[i].fh_len); ++ } ++ ++ /* encode the layoutget body */ ++ nfserr = filelayout_encode_layout(xdr, flp); ++ ++layoutget_cleanup: ++ if (flp) { ++ if (flp->lg_fh_list) ++ kfree(flp->lg_fh_list); ++ kfree(flp); ++ } ++ kfree(im); ++ kfree(res); ++ ++ return nfserr; ++} ++ ++int ++spnfs_layoutcommit(void) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutreturn(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for ino = %lu\n", ++ __func__, inode->i_ino); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* XXX figure out how to get a sb since there's no inode ptr */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = offset; ++ lr.cbl_seg.length = len; ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ lr.cbl_layoutchanged = 0; ++ ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ ++ return 0; ++} ++ ++ ++int ++spnfs_test_layoutrecall(char *path, u64 offset, u64 len) ++{ ++ struct nameidata nd; ++ struct inode *inode; ++ int type, rc; ++ ++ dprintk("%s: path=%s, offset=%llu, len=%llu\n", ++ __func__, path, offset, len); ++ ++ if (strcmp(path, "all") == 0) { ++ inode = NULL; ++ type = RETURN_ALL; ++ } else { ++ rc = path_lookup(path, 0, &nd); ++ if (rc != 0) ++ return -ENOENT; 
++ ++ /* ++ * XXX todo: add a RETURN_FSID scenario here...maybe if ++ * inode is a dir... ++ */ ++ ++ inode = nd.path.dentry->d_inode; ++ type = RETURN_FILE; ++ } ++ ++ if (len == 0) ++ len = NFS4_MAX_UINT64; ++ ++ rc = spnfs_layoutrecall(inode, type, offset, len); ++ ++ if (type != RETURN_ALL) ++ path_put(&nd.path); ++ return rc; ++} ++ ++int ++spnfs_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *gd_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* XXX keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEITER; ++ im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie; ++ im->im_args.getdeviceiter_args.verf = gd_res->gd_verf; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceiter_out; ++ } ++ status = res->getdeviceiter_res.status; ++ ++ if (res->getdeviceiter_res.eof) ++ gd_res->gd_eof = 1; ++ else { ++ gd_res->gd_devid = res->getdeviceiter_res.devid; ++ gd_res->gd_cookie = res->getdeviceiter_res.cookie; ++ gd_res->gd_verf = res->getdeviceiter_res.verf; ++ gd_res->gd_eof = 0; ++ } ++ ++getdeviceiter_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++#ifdef CONFIG_SPNFS_TEST ++/* ++ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the ++ * 1024 encoded stripe indices. ++ * ++ * Skip the devaddr4 length and encode the indicies count (1024) in the ++ * rq_res.head and set the rq_res.head length. 
++ * ++ * Set the rq_res page_len to 4096 (for the 1024 stripe indices). ++ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the ++ * rq_res head to hold the rest of the getdeviceinfo return. ++ * ++ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and ++ * rq_respages[rq_resused] contains the rq_res.pages. ++ */ ++static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ struct nfsd4_compoundres *resp = info->resp; ++ struct svc_rqst *rqstp = resp->rqstp; ++ struct xdr_buf *xb = &resp->rqstp->rq_res; ++ __be32 *p; ++ ++ p = nfsd4_xdr_reserve_space(resp, 8); ++ p++; /* Fill in length later */ ++ *p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */ ++ resp->p = p; ++ ++ xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base; ++ xb->pages = &rqstp->rq_respages[rqstp->rq_resused]; ++ xb->page_base = 0; ++ xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */ ++ xb->tail[0].iov_base = resp->p; ++ resp->end = xb->head[0].iov_base + PAGE_SIZE; ++ xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p; ++ return 0; ++} ++/* ++ * Return a stripeindices of length 1024 to test ++ * the pNFS client multipage getdeviceinfo implementation. ++ * ++ * Encode a page of stripe indices. 
++ */ ++static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev, ++ struct spnfs_device *dev, ++ struct pnfs_devinfo_arg *info) ++{ ++ struct svc_rqst *rqstp = info->xdr.resp->rqstp; ++ __be32 *p; ++ int i, j = 0; ++ ++ p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]); ++ fldev->fl_stripeindices_length = 1024; ++ /* round-robin the data servers device index into the stripe indicie */ ++ for (i = 0; i < 1024; i++) { ++ *p++ = cpu_to_be32(j); ++ if (j < dev->dscount - 1) ++ j++; ++ else ++ j = 0; ++ } ++ fldev->fl_stripeindices_list = NULL; ++} ++#endif /* CONFIG_SPNFS_TEST */ ++ ++int ++spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct spnfs_device *dev; ++ struct pnfs_filelayout_device *fldev = NULL; ++ struct pnfs_filelayout_multipath *mp = NULL; ++ struct pnfs_filelayout_devaddr *fldap = NULL; ++ int status = 0, i, len; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEINFO; ++ /* XXX FIX: figure out what to do about fsid */ ++ im->im_args.getdeviceinfo_args.devid = devid->devid; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceinfo_out; ++ } ++ status = res->getdeviceinfo_res.status; ++ if (status != 0) ++ goto getdeviceinfo_out; ++ ++ dev = &res->getdeviceinfo_res.devinfo; ++ ++ /* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */ ++ fldev = kzalloc(sizeof(struct pnfs_filelayout_device), 
GFP_KERNEL); ++ if (fldev == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ /* ++ * Stripe count is the same as data server count for our purposes ++ */ ++ fldev->fl_stripeindices_length = dev->dscount; ++ fldev->fl_device_length = dev->dscount; ++ ++ /* Set stripe indices */ ++#ifdef CONFIG_SPNFS_TEST ++ spnfs_set_test_indices(fldev, dev, info); ++ fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr; ++#else /* CONFIG_SPNFS_TEST */ ++ fldev->fl_stripeindices_list = ++ kmalloc(fldev->fl_stripeindices_length * sizeof(u32), ++ GFP_KERNEL); ++ if (fldev->fl_stripeindices_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ for (i = 0; i < fldev->fl_stripeindices_length; i++) ++ fldev->fl_stripeindices_list[i] = i; ++#endif /* CONFIG_SPNFS_TEST */ ++ ++ /* ++ * Set the device's data server addresses No multipath for spnfs, ++ * so mp length is always 1. ++ * ++ */ ++ fldev->fl_device_list = ++ kmalloc(fldev->fl_device_length * ++ sizeof(struct pnfs_filelayout_multipath), ++ GFP_KERNEL); ++ if (fldev->fl_device_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ for (i = 0; i < fldev->fl_device_length; i++) { ++ mp = &fldev->fl_device_list[i]; ++ mp->fl_multipath_length = 1; ++ mp->fl_multipath_list = ++ kmalloc(sizeof(struct pnfs_filelayout_devaddr), ++ GFP_KERNEL); ++ if (mp->fl_multipath_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ fldap = mp->fl_multipath_list; ++ ++ /* ++ * Copy the netid into the device address, for example: "tcp" ++ */ ++ len = strlen(dev->dslist[i].netid); ++ fldap->r_netid.data = kmalloc(len, GFP_KERNEL); ++ if (fldap->r_netid.data == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ memcpy(fldap->r_netid.data, dev->dslist[i].netid, len); ++ fldap->r_netid.len = len; ++ ++ /* ++ * Copy the network address into the device address, ++ * for example: "10.35.9.16.08.01" ++ */ ++ len = strlen(dev->dslist[i].addr); ++ fldap->r_addr.data = 
kmalloc(len, GFP_KERNEL); ++ if (fldap->r_addr.data == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ memcpy(fldap->r_addr.data, dev->dslist[i].addr, len); ++ fldap->r_addr.len = len; ++ } ++ ++ /* encode the device data */ ++ status = filelayout_encode_devinfo(xdr, fldev); ++ ++getdeviceinfo_out: ++ if (fldev) { ++ kfree(fldev->fl_stripeindices_list); ++ if (fldev->fl_device_list) { ++ for (i = 0; i < fldev->fl_device_length; i++) { ++ fldap = ++ fldev->fl_device_list[i].fl_multipath_list; ++ kfree(fldap->r_netid.data); ++ kfree(fldap->r_addr.data); ++ kfree(fldap); ++ } ++ kfree(fldev->fl_device_list); ++ } ++ kfree(fldev); ++ } ++ ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++int ++spnfs_setattr(void) ++{ ++ return 0; ++} ++ ++int ++spnfs_open(struct inode *inode, struct nfsd4_open *open) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto open_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto open_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_OPEN; ++ im->im_args.open_args.inode = inode->i_ino; ++ im->im_args.open_args.generation = inode->i_generation; ++ im->im_args.open_args.create = open->op_create; ++ im->im_args.open_args.createmode = open->op_createmode; ++ im->im_args.open_args.truncate = open->op_truncate; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto open_out; ++ } ++ status = res->open_res.status; ++ ++open_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++int ++spnfs_create(void) ++{ ++ return 0; ++} ++ ++/* ++ * Invokes the spnfsd with the inode number of the object to 
remove. ++ * The file has already been removed on the MDS, so all the spnsfd ++ * daemon does is remove the stripes. ++ * Returns 0 on success otherwise error code ++ */ ++int ++spnfs_remove(unsigned long ino, unsigned long generation) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_REMOVE; ++ im->im_args.remove_args.inode = ino; ++ im->im_args.remove_args.generation = generation; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto remove_out; ++ } ++ status = res->remove_res.status; ++ ++remove_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++static int ++read_one(struct inode *inode, loff_t offset, size_t len, char *buf, ++ struct file **filp) ++{ ++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; ++ size_t iolen; ++ int completed = 0, ds, err; ++ ++ while (len > 0) { ++ tmp = offset; ++ soff = do_div(tmp, spnfs_config->stripe_size); ++ snum = tmp; ++ ds = do_div(tmp, spnfs_config->num_ds); ++ if (spnfs_config->dense_striping == 0) ++ soffset = offset; ++ else { ++ tmp = snum; ++ do_div(tmp, spnfs_config->num_ds); ++ soffset = tmp * spnfs_config->stripe_size + soff; ++ } ++ if (len < spnfs_config->stripe_size - soff) ++ iolen = len; ++ else ++ iolen = spnfs_config->stripe_size - soff; ++ ++ pos = soffset; ++ err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos); ++ if (err < 0) ++ return -EIO; ++ if (err == 0) ++ break; ++ filp[ds]->f_pos = pos; ++ iolen = err; ++ completed += iolen; ++ len -= iolen; ++ 
offset += iolen; ++ bufoffset += iolen; ++ } ++ ++ return completed; ++} ++ ++static __be32 ++read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ int i, vnum, err, bytecount = 0; ++ char path[128]; ++ struct file *filp[SPNFS_MAX_DATA_SERVERS]; ++ size_t iolen; ++ __be32 status = nfs_ok; ++ ++ /* ++ * XXX We should just be doing this at open time, but it gets ++ * kind of messy storing this info in nfsd's state structures ++ * and piggybacking its path through the various state handling ++ * functions. Revisit this. ++ */ ++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], ++ inode->i_ino, inode->i_generation); ++ filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0); ++ if (filp[i] == NULL) { ++ status = nfserr_io; ++ goto read_out; ++ } ++ get_file(filp[i]); ++ } ++ ++ for (vnum = 0 ; vnum < vlen ; vnum++) { ++ iolen = rqstp->rq_vec[vnum].iov_len; ++ err = read_one(inode, offset + bytecount, iolen, ++ (char *)rqstp->rq_vec[vnum].iov_base, filp); ++ if (err < 0) { ++ status = nfserr_io; ++ goto read_out; ++ } ++ if (err < iolen) { ++ bytecount += err; ++ goto read_out; ++ } ++ bytecount += rqstp->rq_vec[vnum].iov_len; ++ } ++ ++read_out: ++ *lenp = bytecount; ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ if (filp[i]) { ++ filp_close(filp[i], current->files); ++ fput(filp[i]); ++ } ++ } ++ return status; ++} ++ ++__be32 ++spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ if (spnfs_config) ++ return read(inode, offset, lenp, vlen, rqstp); ++ else { ++ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); ++ return nfserr_notsupp; ++ } ++} ++ ++static int ++write_one(struct inode *inode, loff_t offset, size_t len, char *buf, ++ struct file **filp) ++{ ++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; ++ size_t iolen; ++ int 
completed = 0, ds, err; ++ ++ while (len > 0) { ++ tmp = offset; ++ soff = do_div(tmp, spnfs_config->stripe_size); ++ snum = tmp; ++ ds = do_div(tmp, spnfs_config->num_ds); ++ if (spnfs_config->dense_striping == 0) ++ soffset = offset; ++ else { ++ tmp = snum; ++ do_div(tmp, spnfs_config->num_ds); ++ soffset = tmp * spnfs_config->stripe_size + soff; ++ } ++ if (len < spnfs_config->stripe_size - soff) ++ iolen = len; ++ else ++ iolen = spnfs_config->stripe_size - soff; ++ ++ pos = soffset; ++ err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos); ++ if (err < 0) ++ return -EIO; ++ filp[ds]->f_pos = pos; ++ iolen = err; ++ completed += iolen; ++ len -= iolen; ++ offset += iolen; ++ bufoffset += iolen; ++ } ++ ++ return completed; ++} ++ ++static __be32 ++write(struct inode *inode, loff_t offset, size_t len, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ int i, vnum, err, bytecount = 0; ++ char path[128]; ++ struct file *filp[SPNFS_MAX_DATA_SERVERS]; ++ size_t iolen; ++ __be32 status = nfs_ok; ++ ++ /* ++ * XXX We should just be doing this at open time, but it gets ++ * kind of messy storing this info in nfsd's state structures ++ * and piggybacking its path through the various state handling ++ * functions. Revisit this. 
++ */ ++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], ++ inode->i_ino, inode->i_generation); ++ filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0); ++ if (filp[i] == NULL) { ++ status = nfserr_io; ++ goto write_out; ++ } ++ get_file(filp[i]); ++ } ++ ++ for (vnum = 0; vnum < vlen; vnum++) { ++ iolen = rqstp->rq_vec[vnum].iov_len; ++ err = write_one(inode, offset + bytecount, iolen, ++ (char *)rqstp->rq_vec[vnum].iov_base, filp); ++ if (err != iolen) { ++ dprintk("spnfs_write: err=%d expected %Zd\n", err, len); ++ status = nfserr_io; ++ goto write_out; ++ } ++ bytecount += rqstp->rq_vec[vnum].iov_len; ++ } ++ ++write_out: ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ if (filp[i]) { ++ filp_close(filp[i], current->files); ++ fput(filp[i]); ++ } ++ } ++ ++ return status; ++} ++ ++__be32 ++spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ if (spnfs_config) ++ return write(inode, offset, len, vlen, rqstp); ++ else { ++ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); ++ return nfserr_notsupp; ++ } ++} ++ ++int ++spnfs_commit(void) ++{ ++ return 0; ++} ++ ++/* ++ * Return the state for this object. 
++ * At this time simply return 0 to indicate success and use the existing state ++ */ ++int ++spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg) ++{ ++ return 0; ++} ++ ++/* ++ * Return the filehandle for the specified file descriptor ++ */ ++int ++spnfs_getfh(int fd, struct nfs_fh *fh) ++{ ++ struct file *file; ++ ++ file = fget(fd); ++ if (file == NULL) ++ return -EIO; ++ ++ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh)); ++ fput(file); ++ return 0; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-31 20:41:19.205016844 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-31 20:42:05.572091128 -0400 +@@ -242,6 +242,12 @@ struct nfs4_client { + u32 cl_cb_seq_nr; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ ++#if defined(CONFIG_PNFSD) ++ struct list_head cl_layouts; /* outstanding layouts */ ++ struct list_head cl_layoutrecalls; /* outstanding layoutrecall ++ callbacks */ ++ atomic_t cl_deviceref; /* Num outstanding devs */ ++#endif /* CONFIG_PNFSD */ + }; + + static inline void +@@ -342,12 +348,31 @@ struct nfs4_file { + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_stateids; + struct list_head fi_delegations; ++#if defined(CONFIG_PNFSD) ++ struct list_head fi_layouts; ++ struct list_head fi_layout_states; ++#endif /* CONFIG_PNFSD */ + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; ++#if defined(CONFIG_PNFSD) ++ /* used by layoutget / layoutrecall */ ++ struct nfs4_fsid fi_fsid; ++ u32 fi_fhlen; ++ u8 fi_fhval[NFS4_FHSIZE]; ++#endif /* CONFIG_PNFSD */ + }; + ++#if defined(CONFIG_PNFSD) ++/* pNFS Metadata server state */ ++ ++struct pnfs_ds_dev_entry { ++ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */ ++ u32 dd_dsid; ++}; ++#endif /* CONFIG_PNFSD 
*/ ++ + /* + * nfs4_stateid can either be an open stateid or (eventually) a lock stateid + * +@@ -370,6 +395,9 @@ struct nfs4_stateid { + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; ++#if defined(CONFIG_PNFSD) ++ struct list_head st_pnfs_ds_id; ++#endif /* CONFIG_PNFSD */ + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; +@@ -421,6 +449,34 @@ extern void nfsd4_recdir_purge_old(void) + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + extern void release_session_client(struct nfsd4_session *); ++extern void nfsd4_free_slab(struct kmem_cache **); ++extern struct nfs4_file *find_file(struct inode *); ++extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *); ++extern void put_nfs4_file(struct nfs4_file *); ++extern void get_nfs4_file(struct nfs4_file *); ++extern struct nfs4_client *find_confirmed_client(clientid_t *); ++extern struct nfs4_stateid *find_stateid(stateid_t *, int flags); ++extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *); ++extern __be32 nfs4_check_stateid(stateid_t *); ++extern void expire_client_lock(struct nfs4_client *); ++extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *); ++ ++#if defined(CONFIG_PNFSD) ++extern int nfsd4_init_pnfs_slabs(void); ++extern void nfsd4_free_pnfs_slabs(void); ++extern void pnfs_expire_client(struct nfs4_client *); ++extern void release_pnfs_ds_dev_list(struct nfs4_stateid *); ++extern void nfs4_pnfs_state_init(void); ++extern void nfs4_pnfs_state_shutdown(void); ++extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *); ++#else /* CONFIG_PNFSD */ ++static inline void nfsd4_free_pnfs_slabs(void) {} ++static inline int nfsd4_init_pnfs_slabs(void) { return 0; } ++static 
inline void pnfs_expire_client(struct nfs4_client *clp) {} ++static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {} ++static inline void nfs4_pnfs_state_shutdown(void) {} ++#endif /* CONFIG_PNFSD */ + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +@@ -434,4 +490,24 @@ nfs4_get_stateowner(struct nfs4_stateown + kref_get(&so->so_ref); + } + ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1 : NFS4_MAX_UINT64; ++} ++ + #endif /* NFSD4_STATE_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-31 20:41:17.275233561 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-31 20:42:05.573121119 -0400 +@@ -37,7 +37,12 @@ + #ifdef CONFIG_NFSD_V4 + #include + #include ++#include ++#include + #endif /* CONFIG_NFSD_V4 */ ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif + + #include "nfsd.h" + #include "vfs.h" +@@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, str + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, 0)) { ++ err = bl_layoutrecall(inode, RETURN_FILE, ++ iap->ia_size, inode->i_size - iap->ia_size); ++ } ++#endif /* CONFIG_SPNFS_BLOCK */ + } + + /* +@@ -1703,6 +1714,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru + struct inode *fdir, *tdir; + __be32 err; + int host_err; ++#ifdef CONFIG_SPNFS ++ unsigned long ino = 0; ++ unsigned long generation = 0; ++ unsigned int nlink = 0; ++#endif /* CONFIG_SPNFS */ + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) +@@ -1766,7 +1782,26 @@ nfsd_rename(struct svc_rqst *rqstp, stru + if (host_err) + goto out_dput_new; + 
++#ifdef CONFIG_SPNFS ++ /* ++ * if the target is a preexisting regular file, remember the ++ * inode number and generation so we can delete the stripes; ++ * save the link count as well so that the stripes only get ++ * get deleted when the last link is deleted ++ */ ++ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) { ++ ino = ndentry->d_inode->i_ino; ++ generation = ndentry->d_inode->i_generation; ++ nlink = ndentry->d_inode->i_nlink; ++ } ++#endif /* CONFIG_SPNFS */ ++ + host_err = vfs_rename(fdir, odentry, tdir, ndentry); ++#ifdef CONFIG_SPNFS ++ if (spnfs_enabled() && (!host_err && ino && nlink == 1)) ++ spnfs_remove(ino, generation); ++#endif /* CONFIG_SPNFS */ ++ + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) +@@ -1807,6 +1842,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + struct inode *dirp; + __be32 err; + int host_err; ++#if defined(CONFIG_SPNFS) ++ unsigned long ino; ++ unsigned long generation; ++ unsigned int nlink; ++#endif /* defined(CONFIG_SPNFS) */ + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) +@@ -1830,6 +1870,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + goto out; + } + ++#if defined(CONFIG_SPNFS) ++ /* ++ * Remember the inode number to communicate to the spnfsd ++ * for removal of stripes; save the link count as well so that ++ * the stripes only get get deleted when the last link is deleted ++ */ ++ ino = rdentry->d_inode->i_ino; ++ generation = rdentry->d_inode->i_generation; ++ nlink = rdentry->d_inode->i_nlink; ++#endif /* defined(CONFIG_SPNFS) */ ++ + if (!type) + type = rdentry->d_inode->i_mode & S_IFMT; + +@@ -1854,6 +1905,29 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + if (!host_err) + host_err = commit_metadata(fhp); + ++#if defined(CONFIG_SPNFS) ++ /* ++ * spnfs: notify spnfsd of removal to destroy stripes ++ */ ++/* ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (sb->s_export_op->spnfs_remove) { ++*/ ++ dprintk("%s check if spnfs_enabled\n", __FUNCTION__); ++ 
if (spnfs_enabled() && nlink == 1) { ++ BUG_ON(ino == 0); ++ dprintk("%s calling spnfs_remove inumber=%ld\n", ++ __FUNCTION__, ino); ++ if (spnfs_remove(ino, generation) == 0) { ++ dprintk("%s spnfs_remove success\n", __FUNCTION__); ++ } else { ++ /* XXX How do we make this atomic? */ ++ printk(KERN_WARNING "nfsd: pNFS could not " ++ "remove stripes for inode: %ld\n", ino); ++ } ++ } ++#endif /* defined(CONFIG_SPNFS) */ ++ + mnt_drop_write(fhp->fh_export->ex_path.mnt); + out_nfserr: + err = nfserrno(host_err); +diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-31 20:41:19.206170424 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-31 20:42:05.575139084 -0400 +@@ -37,6 +37,8 @@ + #ifndef _LINUX_NFSD_XDR4_H + #define _LINUX_NFSD_XDR4_H + ++#include ++ + #include "state.h" + #include "nfsd.h" + +@@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete { + u32 rca_one_fs; + }; + ++struct nfsd4_pnfs_getdevinfo { ++ struct nfsd4_pnfs_deviceid gd_devid; /* request */ ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxcount; /* request */ ++ u32 gd_notify_types;/* request */ ++ struct super_block *gd_sb; ++}; ++ ++struct nfsd4_pnfs_getdevlist { ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxdevices; /* request */ ++ u64 gd_cookie; /* request - response */ ++ u64 gd_verf; /* request - response */ ++ struct svc_fh *gd_fhp; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutget { ++ u64 lg_minlength; /* request */ ++ u32 lg_signal; /* request */ ++ u32 lg_maxcount; /* request */ ++ struct svc_fh *lg_fhp; /* request */ ++ stateid_t lg_sid; /* request/response */ ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_roc; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit { ++ struct nfsd4_pnfs_layoutcommit_arg args; ++ stateid_t lc_sid; /* request */ ++ struct nfsd4_pnfs_layoutcommit_res res; ++}; ++ ++enum layoutreturn_flags { ++ LR_FLAG_INTERN 
= 1 << 0, /* internal return */ ++ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */ ++}; ++ ++struct nfsd4_pnfs_layoutreturn { ++ struct nfsd4_pnfs_layoutreturn_arg args; ++ u32 lr_flags; ++ stateid_t lr_sid; /* request/resopnse */ ++ u32 lrs_present; /* response */ ++}; ++ + struct nfsd4_op { + int opnum; + __be32 status; +@@ -426,6 +473,13 @@ struct nfsd4_op { + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; ++#if defined(CONFIG_PNFSD) ++ struct nfsd4_pnfs_getdevlist pnfs_getdevlist; ++ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo; ++ struct nfsd4_pnfs_layoutget pnfs_layoutget; ++ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit; ++ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn; ++#endif /* CONFIG_PNFSD */ + } u; + struct nfs4_replay * replay; + }; +diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-31 20:41:19.146161064 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-31 20:42:05.515139585 -0400 +@@ -36,6 +36,7 @@ + #include "internal.h" + #include "iostat.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_FILE + +@@ -388,12 +389,17 @@ static int nfs_write_begin(struct file * + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + ++ pnfs_update_layout(mapping->host, ++ nfs_file_open_context(file), ++ 0, NFS4_MAX_UINT64, IOMODE_RW, ++ &lseg); + start: + /* + * Prevent starvation issues if someone is doing a consistency +@@ -402,17 +408,22 @@ start: + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) +- return ret; ++ goto out; 
+ + page = grab_cache_page_write_begin(mapping, index, flags); +- if (!page) +- return -ENOMEM; ++ if (!page) { ++ ret = -ENOMEM; ++ goto out; ++ } + *pagep = page; + +- ret = nfs_flush_incompatible(file, page); ++ ret = nfs_flush_incompatible(file, page, lseg); + if (ret) { + unlock_page(page); + page_cache_release(page); ++ *pagep = NULL; ++ *fsdata = NULL; ++ goto out; + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; +@@ -421,6 +432,12 @@ start: + if (!ret) + goto start; + } ++ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); ++ out: ++ if (ret) { ++ put_lseg(lseg); ++ *fsdata = NULL; ++ } + return ret; + } + +@@ -430,6 +447,7 @@ static int nfs_write_end(struct file *fi + { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, +@@ -456,10 +474,17 @@ static int nfs_write_end(struct file *fi + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + +- status = nfs_updatepage(file, page, offset, copied); ++ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); ++ status = pnfs_write_end(file, page, pos, len, copied, lseg); ++ if (status) ++ goto out; ++ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); + ++ out: + unlock_page(page); + page_cache_release(page); ++ pnfs_write_end_cleanup(file, fsdata); ++ put_lseg(lseg); + + if (status < 0) + return status; +@@ -570,6 +595,8 @@ static int nfs_vm_page_mkwrite(struct vm + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + ++ /* XXX Do we want to call pnfs_update_layout here? 
*/ ++ + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) +@@ -580,11 +607,11 @@ static int nfs_vm_page_mkwrite(struct vm + if (pagelen == 0) + goto out_unlock; + +- ret = nfs_flush_incompatible(filp, page); ++ ret = nfs_flush_incompatible(filp, page, NULL); + if (ret != 0) + goto out_unlock; + +- ret = nfs_updatepage(filp, page, 0, pagelen); ++ ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL); + out_unlock: + if (!ret) + return VM_FAULT_LOCKED; +diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-31 20:41:19.149170418 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-31 20:42:05.516222809 -0400 +@@ -48,6 +48,7 @@ + #include "internal.h" + #include "fscache.h" + #include "dns_resolve.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -278,7 +279,7 @@ nfs_fhget(struct super_block *sb, struct + */ + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { +- inode->i_fop = &nfs_file_operations; ++ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; + inode->i_data.a_ops = &nfs_file_aops; + inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; + } else if (S_ISDIR(inode->i_mode)) { +@@ -530,6 +531,68 @@ out: + return err; + } + ++static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ atomic_set(&l_ctx->count, 1); ++ l_ctx->lockowner = current->files; ++ l_ctx->pid = current->tgid; ++ INIT_LIST_HEAD(&l_ctx->list); ++} ++ ++static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) ++{ ++ struct nfs_lock_context *pos; ++ ++ list_for_each_entry(pos, &ctx->lock_context.list, list) { ++ if (pos->lockowner != current->files) ++ continue; ++ if (pos->pid != current->tgid) ++ continue; ++ atomic_inc(&pos->count); ++ return pos; ++ } ++ return NULL; ++} ++ ++struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context 
*ctx) ++{ ++ struct nfs_lock_context *res, *new = NULL; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ spin_unlock(&inode->i_lock); ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (new == NULL) ++ return NULL; ++ nfs_init_lock_context(new); ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ list_add_tail(&new->list, &ctx->lock_context.list); ++ new->open_context = ctx; ++ res = new; ++ new = NULL; ++ } ++ } ++ spin_unlock(&inode->i_lock); ++ kfree(new); ++ return res; ++} ++ ++void nfs_put_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ struct nfs_open_context *ctx = l_ctx->open_context; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) ++ return; ++ list_del(&l_ctx->list); ++ spin_unlock(&inode->i_lock); ++ kfree(l_ctx); ++} ++ + /** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context +@@ -566,11 +629,11 @@ static struct nfs_open_context *alloc_nf + path_get(&ctx->path); + ctx->cred = get_rpccred(cred); + ctx->state = NULL; +- ctx->lockowner = current->files; + ctx->flags = 0; + ctx->error = 0; + ctx->dir_cookie = 0; +- atomic_set(&ctx->count, 1); ++ nfs_init_lock_context(&ctx->lock_context); ++ ctx->lock_context.open_context = ctx; + } + return ctx; + } +@@ -578,15 +641,16 @@ static struct nfs_open_context *alloc_nf + struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) + { + if (ctx != NULL) +- atomic_inc(&ctx->count); ++ atomic_inc(&ctx->lock_context.count); + return ctx; + } ++EXPORT_SYMBOL(get_nfs_open_context); + + static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) + { + struct inode *inode = ctx->path.dentry->d_inode; + +- if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) ++ if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; 
+ list_del(&ctx->list); + spin_unlock(&inode->i_lock); +@@ -933,6 +997,7 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + } ++EXPORT_SYMBOL(nfs_fattr_init); + + struct nfs_fattr *nfs_alloc_fattr(void) + { +@@ -1142,6 +1207,14 @@ static int nfs_update_inode(struct inode + server->fsid = fattr->fsid; + + /* ++ * file needs layout commit, server attributes may be stale ++ */ ++ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) { ++ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n", ++ __func__, inode->i_sb->s_id, inode->i_ino); ++ return 0; ++ } ++ /* + * Update the read time so we don't revalidate too often. + */ + nfsi->read_cache_jiffies = fattr->time_start; +@@ -1340,9 +1413,10 @@ static int nfs_update_inode(struct inode + */ + void nfs4_clear_inode(struct inode *inode) + { ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + /* If we are holding a delegation, return it! 
*/ + nfs_inode_return_delegation_noreclaim(inode); +- /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); + } + #endif +@@ -1367,7 +1441,10 @@ struct inode *nfs_alloc_inode(struct sup + + void nfs_destroy_inode(struct inode *inode) + { +- kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); ++ struct nfs_inode *nfsi = NFS_I(inode); ++ ++ pnfs_destroy_layout(nfsi); ++ kmem_cache_free(nfs_inode_cachep, nfsi); + } + + static inline void nfs4_init_once(struct nfs_inode *nfsi) +@@ -1377,6 +1454,11 @@ static inline void nfs4_init_once(struct + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); ++#ifdef CONFIG_NFS_V4_1 ++ init_waitqueue_head(&nfsi->lo_waitq); ++ nfsi->pnfs_layout_suspend = 0; ++ nfsi->layout = NULL; ++#endif /* CONFIG_NFS_V4_1 */ + #endif + } + +@@ -1488,6 +1570,12 @@ static int __init init_nfs_fs(void) + if (err) + goto out0; + ++#ifdef CONFIG_NFS_V4_1 ++ err = pnfs_initialize(); ++ if (err) ++ goto out00; ++#endif /* CONFIG_NFS_V4_1 */ ++ + #ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); + #endif +@@ -1498,6 +1586,10 @@ out: + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++out00: ++ pnfs_uninitialize(); ++#endif /* CONFIG_NFS_V4_1 */ + nfs_destroy_directcache(); + out0: + nfs_destroy_writepagecache(); +@@ -1531,6 +1623,9 @@ static void __exit exit_nfs_fs(void) + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_uninitialize(); ++#endif + unregister_nfs_fs(); + nfs_fs_proc_exit(); + nfsiod_stop(); +diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-31 20:41:19.149170418 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-31 20:42:05.517099944 -0400 +@@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv + struct nfs_fattr *); + extern void nfs_mark_client_ready(struct nfs_client *clp, int state); + extern 
int nfs4_check_client_ready(struct nfs_client *clp); ++extern int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++ const struct sockaddr *sa2); ++extern int nfs4_set_client(struct nfs_server *server, ++ const char *hostname, ++ const struct sockaddr *addr, ++ const size_t addrlen, ++ const char *ip_addr, ++ rpc_authflavor_t authflavour, ++ int proto, const struct rpc_timeout *timeparms, ++ u32 minorversion); + #ifdef CONFIG_PROC_FS + extern int __init nfs_fs_proc_init(void); + extern void nfs_fs_proc_exit(void); +@@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead + extern struct rpc_procinfo nfs4_procedures[]; + #endif + ++extern int nfs4_recover_expired_lease(struct nfs_client *clp); ++ + /* proc.c */ + void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + +@@ -248,10 +260,31 @@ extern int nfs4_get_rootfh(struct nfs_se + #endif + + /* read.c */ ++extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); ++extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); + extern void nfs_read_prepare(struct rpc_task *task, void *calldata); + + /* write.c */ ++extern int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs); + extern void nfs_write_prepare(struct rpc_task *task, void *calldata); ++extern void nfs_mark_list_commit(struct list_head *head); + #ifdef CONFIG_MIGRATION + extern int nfs_migrate_page(struct 
address_space *, + struct page *, struct page *); +diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig +--- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-31 20:42:05.500123860 -0400 +@@ -79,10 +79,48 @@ config NFS_V4_1 + depends on NFS_V4 && EXPERIMENTAL + help + This option enables support for minor version 1 of the NFSv4 protocol +- (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. ++ (RFC5661) including support for the parallel NFS (pNFS) features ++ in the kernel's NFS client. + + Unless you're an NFS developer, say N. + ++config PNFS_FILE_LAYOUT ++ tristate "NFS client support for the pNFS nfs-files layout (DEVELOPER ONLY)" ++ depends on NFS_FS && NFS_V4_1 ++ default y ++ help ++ This option enables support for the pNFS nfs-files layout. ++ ++ Unless you're an NFS developer, say N. ++ ++config PNFS_OBJLAYOUT ++ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD ++ help ++ Say M here if you want your pNFS client to support the Objects Layout Driver. ++ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and ++ upper level driver (SCSI_OSD_ULD). ++ ++ If unsure, say N. ++ ++config PNFS_PANLAYOUT ++ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on PNFS_OBJLAYOUT ++ help ++ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver. ++ ++ If unsure, say N. ++ ++config PNFS_BLOCK ++ tristate "Provide a pNFS block client (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 ++ select MD ++ select BLK_DEV_DM ++ help ++ Say M or y here if you want your pNfs client to support the block protocol ++ ++ If unsure, say N. 
++ + config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP +diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile +--- linux-2.6.34.noarch/fs/nfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-31 20:42:05.501268752 -0400 +@@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x + delegation.o idmap.o \ + callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o ++nfs-$(CONFIG_NFS_V4_1) += pnfs.o + nfs-$(CONFIG_SYSCTL) += sysctl.o + nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o ++ ++obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o ++nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o ++ ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ ++obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-31 20:41:19.152180625 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-31 20:42:05.518232887 -0400 +@@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs3_dir_inode_operations, + .file_inode_ops = &nfs3_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-31 20:42:05.519163219 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-31 20:42:05.520222923 -0400 +@@ -0,0 +1,768 @@ ++/* ++ * linux/fs/nfs/nfs4filelayout.c ++ * ++ * Module for the pnfs nfs4 file layout driver. ++ * Defines all I/O and Policy interface operations, plus code ++ * to register itself with the pNFS client. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. 
++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfs4filelayout.h" ++#include "nfs4_fs.h" ++#include "internal.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Dean Hildebrand "); ++MODULE_DESCRIPTION("The NFSv4 file layout driver"); ++ ++/* Callback operations to the pNFS client */ ++struct pnfs_client_operations *pnfs_callback_ops; ++ ++/* Forward declaration */ ++struct layoutdriver_io_operations filelayout_io_operations; ++ ++int ++filelayout_initialize_mountpoint(struct nfs_server *nfss, ++ const struct nfs_fh *mntfh) ++{ ++ int status = nfs4_alloc_init_deviceid_cache(nfss->nfs_client, ++ nfs4_fl_free_deviceid_callback); ++ if (status) { ++ printk(KERN_WARNING "%s: deviceid cache could not be " ++ "initialized\n", __func__); ++ return status; ++ } ++ dprintk("%s: deviceid cache has been initialized successfully\n", ++ __func__); ++ return 0; ++} ++ ++/* Uninitialize a mountpoint by destroying its device list */ ++int ++filelayout_uninitialize_mountpoint(struct nfs_server *nfss) ++{ ++ dprintk("--> %s\n", __func__); ++ ++ if (nfss->pnfs_curr_ld && nfss->nfs_client->cl_devid_cache) ++ nfs4_put_deviceid_cache(nfss->nfs_client); ++ return 0; ++} ++ ++/* This function is used by the layout driver to calculate the ++ * offset of the file on the dserver based on whether the ++ * layout type is STRIPE_DENSE or STRIPE_SPARSE ++ */ ++static loff_t ++filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ ++ switch (flseg->stripe_type) { ++ case STRIPE_SPARSE: ++ return offset; ++ ++ case STRIPE_DENSE: ++ { ++ u32 stripe_width; ++ u64 tmp, off; ++ u32 unit = flseg->stripe_unit; ++ ++ stripe_width = unit * FILE_DSADDR(lseg)->stripe_count; ++ tmp = off = offset - flseg->pattern_offset; ++ do_div(tmp, 
stripe_width); ++ return tmp * unit + do_div(off, unit); ++ } ++ default: ++ BUG(); ++ } ++ ++ /* We should never get here... just to stop the gcc warning */ ++ return 0; ++} ++ ++/* ++ * Call ops for the async read/write cases ++ * In the case of dense layouts, the offset needs to be reset to its ++ * original value. ++ */ ++static void filelayout_read_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ if (rdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ rdata->args.offset, rdata->fldata.orig_offset); ++ rdata->args.offset = rdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ rdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_read_release(void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ put_lseg(rdata->pdata.lseg); ++ rdata->pdata.lseg = NULL; ++ rdata->pdata.call_ops->rpc_release(data); ++} ++ ++static void filelayout_write_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ if (wdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ wdata->args.offset, wdata->fldata.orig_offset); ++ wdata->args.offset = wdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_write_release(void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ put_lseg(wdata->pdata.lseg); ++ wdata->pdata.lseg = NULL; ++ wdata->pdata.call_ops->rpc_release(data); ++} ++ ++struct rpc_call_ops filelayout_read_call_ops = { ++ .rpc_call_prepare = nfs_read_prepare, ++ .rpc_call_done = filelayout_read_call_done, ++ .rpc_release = filelayout_read_release, ++}; ++ ++struct rpc_call_ops filelayout_write_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ 
.rpc_call_done = filelayout_write_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* Perform sync or async reads. ++ * ++ * An optimization for the NFS file layout driver ++ * allows the original read/write data structs to be passed in the ++ * last argument. ++ * ++ * TODO: join with write_pagelist? ++ */ ++static enum pnfs_try_status ++filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n", ++ __func__, data->inode->i_ino, nr_pages, ++ data->args.pgbase, (size_t)data->args.count, offset); ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s USE DS:ip %x %s\n", __func__, ++ htonl(ds->ds_ip_addr), ds->r_addr); ++ ++ /* just try the first data server for the index..*/ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ ++ /* ++ * Now get the file offset on the dserver ++ * Set the read offset to this offset, and ++ * save the original offset in orig_offset ++ * In the case of aync reads, the offset will be reset in the ++ * call_ops->rpc_call_done() routine. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* Perform an asynchronous read */ ++ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_read_call_ops); ++ ++ data->pdata.pnfs_error = 0; ++ ++ return PNFS_ATTEMPTED; ++} ++ ++/* Perform async writes. 
*/ ++static enum pnfs_try_status ++filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu %s\n", __func__, ++ data->inode->i_ino, sync, (size_t) data->args.count, offset, ++ htonl(ds->ds_ip_addr), ntohs(ds->ds_port), ds->r_addr); ++ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ /* ++ * Get the file offset on the dserver. Set the write offset to ++ * this offset and save the original offset. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* ++ * Perform an asynchronous write The offset will be reset in the ++ * call_ops->rpc_call_done() routine ++ */ ++ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_write_call_ops, sync); ++ ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++} ++ ++/* ++ * Create a filelayout layout structure and return it. The pNFS client ++ * will use the pnfs_layout_hdr type to refer to the layout for this ++ * inode from now on. ++ */ ++static struct pnfs_layout_hdr * ++filelayout_alloc_layout(struct inode *inode) ++{ ++ struct nfs4_filelayout *flp; ++ ++ dprintk("NFS_FILELAYOUT: allocating layout\n"); ++ flp = kzalloc(sizeof(struct nfs4_filelayout), GFP_KERNEL); ++ return flp ? 
&flp->fl_layout : NULL; ++} ++ ++/* Free a filelayout layout structure */ ++static void ++filelayout_free_layout(struct pnfs_layout_hdr *lo) ++{ ++ dprintk("NFS_FILELAYOUT: freeing layout\n"); ++ kfree(FILE_LO(lo)); ++} ++ ++/* ++ * filelayout_check_layout() ++ * ++ * Make sure layout segment parameters are sane WRT the device. ++ * ++ * Notes: ++ * 1) current code insists that # stripe index = # data servers in ds_list ++ * which is wrong. ++ * 2) pattern_offset is ignored and must == 0 which is wrong; ++ * 3) the pattern_offset needs to be a mutliple of the stripe unit. ++ * 4) stripe unit is multiple of page size ++ */ ++ ++static int ++filelayout_check_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs4_filelayout_segment *fl = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ int status = -EINVAL; ++ struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo)); ++ ++ dprintk("--> %s\n", __func__); ++ /* find in list or get from server and reference the deviceid */ ++ dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, &fl->dev_id); ++ if (dsaddr == NULL) { ++ dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id); ++ if (dsaddr == NULL) { ++ dprintk("%s NO device for dev_id %s\n", ++ __func__, deviceid_fmt(&fl->dev_id)); ++ goto out; ++ } ++ } ++ if (fl->first_stripe_index < 0 || ++ fl->first_stripe_index > dsaddr->stripe_count) { ++ dprintk("%s Bad first_stripe_index %d\n", ++ __func__, fl->first_stripe_index); ++ goto out_put; ++ } ++ ++ if (fl->pattern_offset != 0) { ++ dprintk("%s Unsupported no-zero pattern_offset %Ld\n", ++ __func__, fl->pattern_offset); ++ goto out_put; ++ } ++ ++ if (fl->stripe_unit % PAGE_SIZE) { ++ dprintk("%s Stripe unit (%u) not page aligned\n", ++ __func__, fl->stripe_unit); ++ goto out_put; ++ } ++ ++ /* XXX only support SPARSE packing. 
Don't support use MDS open fh */ ++ if (!(fl->num_fh == 1 || fl->num_fh == dsaddr->ds_num)) { ++ dprintk("%s num_fh %u not equal to 1 or ds_num %u\n", ++ __func__, fl->num_fh, dsaddr->ds_num); ++ goto out_put; ++ } ++ ++ if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { ++ dprintk("%s Stripe unit (%u) not aligned with rsize %u " ++ "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, ++ nfss->wsize); ++ } ++ ++ nfs4_set_layout_deviceid(lseg, &dsaddr->deviceid); ++ ++ status = 0; ++out: ++ dprintk("--> %s returns %d\n", __func__, status); ++ return status; ++out_put: ++ nfs4_put_unset_layout_deviceid(lseg, &dsaddr->deviceid, ++ nfs4_fl_free_deviceid_callback); ++ goto out; ++} ++ ++static void _filelayout_free_lseg(struct pnfs_layout_segment *lseg); ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl); ++ ++/* Decode layout and store in layoutid. Overwrite any existing layout ++ * information for this file. ++ */ ++static int ++filelayout_set_layout(struct nfs4_filelayout *flo, ++ struct nfs4_filelayout_segment *fl, ++ struct nfs4_layoutget_res *lgr) ++{ ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t nfl_util; ++ int i; ++ ++ dprintk("%s: set_layout_map Begin\n", __func__); ++ ++ memcpy(&fl->dev_id, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ nfl_util = be32_to_cpup(p++); ++ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) ++ fl->commit_through_mds = 1; ++ if (nfl_util & NFL4_UFLG_DENSE) ++ fl->stripe_type = STRIPE_DENSE; ++ else ++ fl->stripe_type = STRIPE_SPARSE; ++ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; ++ ++ if (!flo->stripe_unit) ++ flo->stripe_unit = fl->stripe_unit; ++ else if (flo->stripe_unit != fl->stripe_unit) { ++ printk(KERN_NOTICE "%s: updating strip_unit from %u to %u\n", ++ __func__, flo->stripe_unit, fl->stripe_unit); ++ flo->stripe_unit = fl->stripe_unit; ++ } ++ ++ fl->first_stripe_index = be32_to_cpup(p++); ++ p = xdr_decode_hyper(p, &fl->pattern_offset); ++ 
fl->num_fh = be32_to_cpup(p++); ++ ++ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu dev_id %s\n", ++ __func__, nfl_util, fl->num_fh, fl->first_stripe_index, ++ fl->pattern_offset, deviceid_fmt(&fl->dev_id)); ++ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) { ++ fl->fh_array = vmalloc(fl->num_fh * sizeof(struct nfs_fh)); ++ if (fl->fh_array) ++ memset(fl->fh_array, 0, ++ fl->num_fh * sizeof(struct nfs_fh)); ++ } else { ++ fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh), ++ GFP_KERNEL); ++ } ++ if (!fl->fh_array) ++ return -ENOMEM; ++ ++ for (i = 0; i < fl->num_fh; i++) { ++ /* fh */ ++ fl->fh_array[i].size = be32_to_cpup(p++); ++ if (sizeof(struct nfs_fh) < fl->fh_array[i].size) { ++ printk(KERN_ERR "Too big fh %d received %d\n", ++ i, fl->fh_array[i].size); ++ /* Layout is now invalid, pretend it doesn't exist */ ++ filelayout_free_fh_array(fl); ++ fl->num_fh = 0; ++ break; ++ } ++ memcpy(fl->fh_array[i].data, p, fl->fh_array[i].size); ++ p += XDR_QUADLEN(fl->fh_array[i].size); ++ dprintk("DEBUG: %s: fh len %d\n", __func__, ++ fl->fh_array[i].size); ++ } ++ ++ return 0; ++} ++ ++static struct pnfs_layout_segment * ++filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(layoutid); ++ struct pnfs_layout_segment *lseg; ++ int rc; ++ ++ dprintk("--> %s\n", __func__); ++ lseg = kzalloc(sizeof(struct pnfs_layout_segment) + ++ sizeof(struct nfs4_filelayout_segment), GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ ++ rc = filelayout_set_layout(flo, LSEG_LD_DATA(lseg), lgr); ++ ++ if (rc != 0 || filelayout_check_layout(layoutid, lseg)) { ++ _filelayout_free_lseg(lseg); ++ lseg = NULL; ++ } ++ return lseg; ++} ++ ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) ++{ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) ++ vfree(fl->fh_array); ++ else ++ kfree(fl->fh_array); ++ ++ fl->fh_array = NULL; ++} ++ ++static void 
++_filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ filelayout_free_fh_array(LSEG_LD_DATA(lseg)); ++ kfree(lseg); ++} ++ ++static void ++filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("--> %s\n", __func__); ++ nfs4_put_unset_layout_deviceid(lseg, lseg->deviceid, ++ nfs4_fl_free_deviceid_callback); ++ _filelayout_free_lseg(lseg); ++} ++ ++/* Allocate a new nfs_write_data struct and initialize */ ++static struct nfs_write_data * ++filelayout_clone_write_data(struct nfs_write_data *old) ++{ ++ static struct nfs_write_data *new; ++ ++ new = nfs_commitdata_alloc(); ++ if (!new) ++ goto out; ++ kref_init(&new->refcount); ++ new->parent = old; ++ kref_get(&old->refcount); ++ new->inode = old->inode; ++ new->cred = old->cred; ++ new->args.offset = 0; ++ new->args.count = 0; ++ new->res.count = 0; ++ new->res.fattr = &new->fattr; ++ nfs_fattr_init(&new->fattr); ++ new->res.verf = &new->verf; ++ new->args.context = get_nfs_open_context(old->args.context); ++ new->pdata.lseg = NULL; ++ new->pdata.call_ops = old->pdata.call_ops; ++ new->pdata.how = old->pdata.how; ++out: ++ return new; ++} ++ ++static void filelayout_commit_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static struct rpc_call_ops filelayout_commit_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ .rpc_call_done = filelayout_commit_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* ++ * Execute a COMMIT op to the MDS or to each data server on which a page ++ * in 'pages' exists. ++ * Invoke the pnfs_commit_complete callback. 
/*
 * Execute a COMMIT op to the MDS or to each data server on which a page
 * in data->pages exists.
 *
 * The requests on data->pages are regrouped by destination index (a data
 * server index from nfs4_fl_calc_ds_index(), or NFS4_PNFS_MAX_MULTI_CNT for
 * "send through the MDS").  One nfs_write_data is used per destination:
 * clones of @data for all but the last destination, and @data itself for the
 * last one.  Each is then handed to nfs_initiate_commit().
 *
 * @data: commit descriptor whose ->pages list holds the requests to commit
 * @sync: passed through unchanged to nfs_initiate_commit()
 *
 * Returns PNFS_ATTEMPTED on every path, including allocation failure; on
 * allocation failure the requests are re-marked for commit and @data (plus
 * any clones already made) is released through pdata.call_ops->rpc_release.
 */
enum pnfs_try_status
filelayout_commit(struct nfs_write_data *data, int sync)
{
	LIST_HEAD(head);
	struct nfs_page *req;
	loff_t file_offset = 0;
	u16 idx, i;
	struct list_head **ds_page_list = NULL;
	u16 *indices_used;
	int num_indices_seen = 0;
	const struct rpc_call_ops *call_ops;
	struct rpc_clnt *clnt;
	struct nfs_write_data **clone_list = NULL;
	struct nfs_write_data *dsdata;
	struct nfs4_pnfs_ds *ds;

	dprintk("%s data %p sync %d\n", __func__, data, sync);

	/*
	 * Alloc room for both in one go: an array of
	 * NFS4_PNFS_MAX_MULTI_CNT + 1 list pointers (ds_page_list), followed
	 * in the same allocation by the same number of u16 slots
	 * (indices_used).
	 */
	ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) *
			       (sizeof(u16) + sizeof(struct list_head *)),
			       GFP_KERNEL);
	if (!ds_page_list)
		goto mem_error;
	indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1);
	/*
	 * Sort pages based on which ds to send to.
	 * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT.
	 * Note we are assuming there is only a single lseg in play.
	 * When that is not true, we could first sort on lseg, then
	 * sort within each as we do here.
	 *
	 * ds_page_list[idx] always points at the most recently queued
	 * request for idx.  A request for an already-seen idx is inserted
	 * directly after that entry, so all requests for one idx form a
	 * contiguous run on 'head', and the runs appear in the same order
	 * as the entries of indices_used[].
	 *
	 * NOTE(review): idx comes from nfs4_fl_calc_ds_index(), which
	 * returns a u8 stripe index; nothing here checks it against the
	 * NFS4_PNFS_MAX_MULTI_CNT + 1 slots allocated above - confirm the
	 * device decoder bounds it.
	 */
	while (!list_empty(&data->pages)) {
		req = nfs_list_entry(data->pages.next);
		nfs_list_remove_request(req);
		if (!req->wb_lseg ||
		    ((struct nfs4_filelayout_segment *)
		     LSEG_LD_DATA(req->wb_lseg))->commit_through_mds)
			idx = NFS4_PNFS_MAX_MULTI_CNT;
		else {
			file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
			idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset);
		}
		if (ds_page_list[idx]) {
			/* Already seen this idx */
			list_add(&req->wb_list, ds_page_list[idx]);
		} else {
			/* New idx not seen so far */
			list_add_tail(&req->wb_list, &head);
			indices_used[num_indices_seen++] = idx;
		}
		ds_page_list[idx] = &req->wb_list;
	}
	/*
	 * Once created, clone must be released via call_op
	 *
	 * NOTE(review): if data->pages was empty, num_indices_seen is 0,
	 * this is a zero-sized allocation and 'clone_list[i] = data' below
	 * still stores to slot 0 - confirm callers never pass an empty list.
	 */
	clone_list = kzalloc(num_indices_seen *
			     sizeof(struct nfs_write_data *), GFP_KERNEL);
	if (!clone_list)
		goto mem_error;
	for (i = 0; i < num_indices_seen - 1; i++) {
		clone_list[i] = filelayout_clone_write_data(data);
		if (!clone_list[i])
			goto mem_error;
	}
	/* The original descriptor serves the last destination */
	clone_list[i] = data;
	/*
	 * Now send off the RPCs to each ds.  Note that it is important
	 * that any RPC to the MDS be sent last (or at least after all
	 * clones have been made.)
	 */
	for (i = 0; i < num_indices_seen; i++) {
		dsdata = clone_list[i];
		idx = indices_used[i];
		/*
		 * Move the leading run of 'head' (everything up to and
		 * including the last request queued for idx) onto this
		 * descriptor's page list.
		 */
		list_cut_position(&dsdata->pages, &head, ds_page_list[idx]);
		if (idx == NFS4_PNFS_MAX_MULTI_CNT) {
			/* MDS destination: caller's call_ops and the inode's client */
			call_ops = data->pdata.call_ops;;
			clnt = NFS_CLIENT(dsdata->inode);
			ds = NULL;
		} else {
			struct nfs_fh *fh;

			call_ops = &filelayout_commit_call_ops;
			req = nfs_list_entry(dsdata->pages.next);
			ds = nfs4_fl_prepare_ds(req->wb_lseg, idx);
			if (!ds) {
				/* Trigger retry of this chunk through MDS */
				dsdata->task.tk_status = -EIO;
				data->pdata.call_ops->rpc_release(dsdata);
				continue;
			}
			clnt = ds->ds_clp->cl_rpcclient;
			dsdata->fldata.ds_nfs_client = ds->ds_clp;
			file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
			/* Only override the file handle when the layout supplies one */
			fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset);
			if (fh)
				dsdata->args.fh = fh;
		}
		dprintk("%s: Initiating commit: %llu USE DS:\n",
			__func__, file_offset);
		print_ds(ds);

		/* Send COMMIT to data server */
		nfs_initiate_commit(dsdata, clnt, call_ops, sync);
	}
	kfree(clone_list);
	kfree(ds_page_list);
	data->pdata.pnfs_error = 0;
	return PNFS_ATTEMPTED;

 mem_error:
	if (clone_list) {
		/*
		 * Release the clones made so far; the array was zeroed, so
		 * the first NULL slot marks where cloning stopped.
		 */
		for (i = 0; i < num_indices_seen - 1; i++) {
			if (!clone_list[i])
				break;
			data->pdata.call_ops->rpc_release(clone_list[i]);
		}
		kfree(clone_list);
	}
	kfree(ds_page_list);
	/* One of these will be empty, but doesn't hurt to do both */
	nfs_mark_list_commit(&head);
	nfs_mark_list_commit(&data->pages);
	data->pdata.call_ops->rpc_release(data);
	return PNFS_ATTEMPTED;
}
Called by nfs_can_coalesce_requests() ++ * ++ * return 1 : coalesce page ++ * return 0 : don't coalesce page ++ */ ++int ++filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ u64 p_stripe, r_stripe; ++ ++ if (pgio->pg_boundary == 0) ++ return 1; ++ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; ++ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; ++ ++ do_div(p_stripe, pgio->pg_boundary); ++ do_div(r_stripe, pgio->pg_boundary); ++ ++ return (p_stripe == r_stripe); ++} ++ ++struct layoutdriver_io_operations filelayout_io_operations = { ++ .commit = filelayout_commit, ++ .read_pagelist = filelayout_read_pagelist, ++ .write_pagelist = filelayout_write_pagelist, ++ .alloc_layout = filelayout_alloc_layout, ++ .free_layout = filelayout_free_layout, ++ .alloc_lseg = filelayout_alloc_lseg, ++ .free_lseg = filelayout_free_lseg, ++ .initialize_mountpoint = filelayout_initialize_mountpoint, ++ .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, ++}; ++ ++struct layoutdriver_policy_operations filelayout_policy_operations = { ++ .flags = PNFS_USE_RPC_CODE, ++ .get_stripesize = filelayout_get_stripesize, ++ .pg_test = filelayout_pg_test, ++}; ++ ++struct pnfs_layoutdriver_type filelayout_type = { ++ .id = LAYOUT_NFSV4_1_FILES, ++ .name = "LAYOUT_NFSV4_1_FILES", ++ .ld_io_ops = &filelayout_io_operations, ++ .ld_policy_ops = &filelayout_policy_operations, ++}; ++ ++static int __init nfs4filelayout_init(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", ++ __func__); ++ ++ /* ++ * Need to register file_operations struct with global list to indicate ++ * that NFS4 file layout is a possible pNFS I/O module ++ */ ++ pnfs_callback_ops = pnfs_register_layoutdriver(&filelayout_type); ++ ++ return 0; ++} ++ ++static void __exit nfs4filelayout_exit(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", ++ __func__); ++ ++ /* Unregister NFS4 file layout driver 
with pNFS client*/ ++ pnfs_unregister_layoutdriver(&filelayout_type); ++} ++ ++module_init(nfs4filelayout_init); ++module_exit(nfs4filelayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-31 20:42:05.521233147 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-31 20:42:05.521233147 -0400 +@@ -0,0 +1,635 @@ ++/* ++ * linux/fs/nfs/nfs4filelayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 file layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * Garth Goodson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include "nfs4filelayout.h" ++#include "internal.h" ++#include "nfs4_fs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++DEFINE_SPINLOCK(nfs4_ds_cache_lock); ++static LIST_HEAD(nfs4_data_server_cache); ++ ++void ++print_ds(struct nfs4_pnfs_ds *ds) ++{ ++ if (ds == NULL) { ++ dprintk("%s NULL device \n", __func__); ++ return; ++ } ++ dprintk(" ip_addr %x\n", ntohl(ds->ds_ip_addr)); ++ dprintk(" port %hu\n", ntohs(ds->ds_port)); ++ dprintk(" client %p\n", ds->ds_clp); ++ dprintk(" ref count %d\n", atomic_read(&ds->ds_count)); ++ if (ds->ds_clp) ++ dprintk(" cl_exchange_flags %x\n", ++ ds->ds_clp->cl_exchange_flags); ++ dprintk(" ip:port %s\n", ds->r_addr); ++} ++ ++void ++print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ int i; ++ ++ dprintk("%s dsaddr->ds_num %d\n", __func__, ++ dsaddr->ds_num); ++ for (i = 0; i < dsaddr->ds_num; i++) ++ print_ds(dsaddr->ds_list[i]); ++} ++ ++/* Debugging function assuming a 64bit major/minor split of the deviceid */ ++char * ++deviceid_fmt(const struct pnfs_deviceid *dev_id) ++{ ++ static char buf[17]; ++ uint32_t *p = (uint32_t *)dev_id->data; ++ uint64_t major, minor; ++ ++ p = xdr_decode_hyper(p, &major); ++ p = xdr_decode_hyper(p, &minor); ++ ++ sprintf(buf, "%08llu %08llu", major, minor); ++ return buf; ++} ++ ++/* nfs4_ds_cache_lock is held */ ++static inline struct 
nfs4_pnfs_ds * ++_data_server_lookup(u32 ip_addr, u32 port) ++{ ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", ++ ntohl(ip_addr), ntohs(port)); ++ ++ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { ++ if (ds->ds_ip_addr == ip_addr && ++ ds->ds_port == port) { ++ return ds; ++ } ++ } ++ return NULL; ++} ++ ++/* Create an rpc to the data server defined in 'dev_list' */ ++static int ++nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) ++{ ++ struct nfs_server *tmp; ++ struct sockaddr_in sin; ++ struct rpc_clnt *mds_clnt = mds_srv->client; ++ struct nfs_client *clp = mds_srv->nfs_client; ++ struct sockaddr *mds_addr; ++ int err = 0; ++ ++ dprintk("--> %s ip:port %s au_flavor %d\n", __func__, ++ ds->r_addr, mds_clnt->cl_auth->au_flavor); ++ ++ sin.sin_family = AF_INET; ++ sin.sin_addr.s_addr = ds->ds_ip_addr; ++ sin.sin_port = ds->ds_port; ++ ++ /* ++ * If this DS is also the MDS, use the MDS session only if the ++ * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role. ++ */ ++ mds_addr = (struct sockaddr *)&clp->cl_addr; ++ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) { ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data " ++ "Server\n", ds->r_addr); ++ err = -ENODEV; ++ } else { ++ atomic_inc(&clp->cl_count); ++ ds->ds_clp = clp; ++ dprintk("%s Using MDS Session for DS\n", __func__); ++ } ++ goto out; ++ } ++ ++ /* Temporay server for nfs4_set_client */ ++ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); ++ if (!tmp) ++ goto out; ++ ++ /* ++ * Set a retrans, timeout interval, and authflavor equual to the MDS ++ * values. Use the MDS nfs_client cl_ipaddr field so as to use the ++ * same co_ownerid as the MDS. 
++ */ ++ err = nfs4_set_client(tmp, ++ mds_srv->nfs_client->cl_hostname, ++ (struct sockaddr *)&sin, ++ sizeof(struct sockaddr), ++ mds_srv->nfs_client->cl_ipaddr, ++ mds_clnt->cl_auth->au_flavor, ++ IPPROTO_TCP, ++ mds_clnt->cl_xprt->timeout, ++ 1 /* minorversion */); ++ if (err < 0) ++ goto out_free; ++ ++ clp = tmp->nfs_client; ++ ++ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */ ++ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp); ++ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS; ++ ++ err = nfs4_recover_expired_lease(clp); ++ if (!err) ++ err = nfs4_check_client_ready(clp); ++ if (err) ++ goto out_put; ++ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data Server\n", ++ ds->r_addr); ++ err = -ENODEV; ++ goto out_put; ++ } ++ /* ++ * Mask the (possibly) returned EXCHGID4_FLAG_USE_PNFS_MDS pNFS role ++ * The is_ds_only_session depends on this. ++ */ ++ clp->cl_exchange_flags &= ~EXCHGID4_FLAG_USE_PNFS_MDS; ++ /* ++ * Set DS lease equal to the MDS lease, renewal is scheduled in ++ * create_session ++ */ ++ spin_lock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; ++ spin_unlock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_last_renewal = jiffies; ++ ++ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); ++ ds->ds_clp = clp; ++ ++ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), ++ clp->cl_rpcclient); ++out_free: ++ kfree(tmp); ++out: ++ dprintk("%s Returns %d\n", __func__, err); ++ return err; ++out_put: ++ nfs_put_client(clp); ++ goto out_free; ++} ++ ++static void ++destroy_ds(struct nfs4_pnfs_ds *ds) ++{ ++ dprintk("--> %s\n", __func__); ++ print_ds(ds); ++ ++ if (ds->ds_clp) ++ nfs_put_client(ds->ds_clp); ++ kfree(ds); ++} ++ ++static void ++nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ struct nfs4_pnfs_ds *ds; ++ int i; ++ ++ dprintk("%s: device id=%s\n", 
__func__, ++ deviceid_fmt(&dsaddr->deviceid.de_id)); ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ ds = dsaddr->ds_list[i]; ++ if (ds != NULL) { ++ if (atomic_dec_and_lock(&ds->ds_count, ++ &nfs4_ds_cache_lock)) { ++ list_del_init(&ds->ds_node); ++ spin_unlock(&nfs4_ds_cache_lock); ++ destroy_ds(ds); ++ } ++ } ++ } ++ kfree(dsaddr->stripe_indices); ++ kfree(dsaddr); ++} ++ ++void ++nfs4_fl_free_deviceid_callback(struct kref *kref) ++{ ++ struct nfs4_deviceid *device = ++ container_of(kref, struct nfs4_deviceid, de_kref); ++ struct nfs4_file_layout_dsaddr *dsaddr = ++ container_of(device, struct nfs4_file_layout_dsaddr, deviceid); ++ ++ nfs4_fl_free_deviceid(dsaddr); ++} ++ ++static void ++nfs4_pnfs_ds_add(struct inode *inode, struct nfs4_pnfs_ds **dsp, ++ u32 ip_addr, u32 port, char *r_addr, int len) ++{ ++ struct nfs4_pnfs_ds *tmp_ds, *ds; ++ ++ *dsp = NULL; ++ ++ ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); ++ if (!ds) ++ return; ++ ++ spin_lock(&nfs4_ds_cache_lock); ++ tmp_ds = _data_server_lookup(ip_addr, port); ++ if (tmp_ds == NULL) { ++ ds->ds_ip_addr = ip_addr; ++ ds->ds_port = port; ++ strncpy(ds->r_addr, r_addr, len); ++ atomic_set(&ds->ds_count, 1); ++ INIT_LIST_HEAD(&ds->ds_node); ++ ds->ds_clp = NULL; ++ list_add(&ds->ds_node, &nfs4_data_server_cache); ++ *dsp = ds; ++ dprintk("%s add new data server ip 0x%x\n", __func__, ++ ds->ds_ip_addr); ++ spin_unlock(&nfs4_ds_cache_lock); ++ } else { ++ atomic_inc(&tmp_ds->ds_count); ++ *dsp = tmp_ds; ++ dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", ++ __func__, tmp_ds->ds_ip_addr, ++ atomic_read(&tmp_ds->ds_count)); ++ spin_unlock(&nfs4_ds_cache_lock); ++ kfree(ds); ++ } ++} ++ ++static struct nfs4_pnfs_ds * ++decode_and_add_ds(uint32_t **pp, struct inode *inode) ++{ ++ struct nfs4_pnfs_ds *ds = NULL; ++ char r_addr[29]; /* max size of ip/port string */ ++ int len; ++ u32 ip_addr, port; ++ int tmp[6]; ++ uint32_t *p = *pp; ++ ++ dprintk("%s enter\n", __func__); ++ /* check and skip r_netid */ 
++ len = be32_to_cpup(p++); ++ /* "tcp" */ ++ if (len != 3) { ++ printk("%s: ERROR: non TCP r_netid len %d\n", ++ __func__, len); ++ goto out_err; ++ } ++ /* ++ * Read the bytes into a temporary buffer ++ * XXX: should probably sanity check them ++ */ ++ tmp[0] = be32_to_cpup(p++); ++ ++ len = be32_to_cpup(p++); ++ if (len >= sizeof(r_addr)) { ++ printk("%s: ERROR: Device ip/port too long (%d)\n", ++ __func__, len); ++ goto out_err; ++ } ++ memcpy(r_addr, p, len); ++ p += XDR_QUADLEN(len); ++ *pp = p; ++ r_addr[len] = '\0'; ++ sscanf(r_addr, "%d.%d.%d.%d.%d.%d", &tmp[0], &tmp[1], ++ &tmp[2], &tmp[3], &tmp[4], &tmp[5]); ++ ip_addr = htonl((tmp[0]<<24) | (tmp[1]<<16) | (tmp[2]<<8) | (tmp[3])); ++ port = htons((tmp[4] << 8) | (tmp[5])); ++ ++ nfs4_pnfs_ds_add(inode, &ds, ip_addr, port, r_addr, len); ++ ++ dprintk("%s: addr:port string = %s\n", __func__, r_addr); ++ return ds; ++out_err: ++ dprintk("%s returned NULL\n", __func__); ++ return NULL; ++} ++ ++/* Decode opaque device data and return the result */ ++static struct nfs4_file_layout_dsaddr* ++decode_device(struct inode *ino, struct pnfs_device *pdev) ++{ ++ int i, dummy; ++ u32 cnt, num; ++ u8 *indexp; ++ uint32_t *p = (u32 *)pdev->area, *indicesp; ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ /* Get the stripe count (number of stripe index) */ ++ cnt = be32_to_cpup(p++); ++ dprintk("%s stripe count %d\n", __func__, cnt); ++ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { ++ printk(KERN_WARNING "%s: stripe count %d greater than " ++ "supported maximum %d\n", __func__, ++ cnt, NFS4_PNFS_MAX_STRIPE_CNT); ++ goto out_err; ++ } ++ ++ /* Check the multipath list count */ ++ indicesp = p; ++ p += XDR_QUADLEN(cnt << 2); ++ num = be32_to_cpup(p++); ++ dprintk("%s ds_num %u\n", __func__, num); ++ if (num > NFS4_PNFS_MAX_MULTI_CNT) { ++ printk(KERN_WARNING "%s: multipath count %d greater than " ++ "supported maximum %d\n", __func__, ++ num, NFS4_PNFS_MAX_MULTI_CNT); ++ goto out_err; ++ } ++ dsaddr = kzalloc(sizeof(*dsaddr) + ++ 
(sizeof(struct nfs4_pnfs_ds *) * (num - 1)), ++ GFP_KERNEL); ++ if (!dsaddr) ++ goto out_err; ++ ++ dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); ++ if (!dsaddr->stripe_indices) ++ goto out_err_free; ++ ++ dsaddr->stripe_count = cnt; ++ dsaddr->ds_num = num; ++ ++ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ ++ /* Go back an read stripe indices */ ++ p = indicesp; ++ indexp = &dsaddr->stripe_indices[0]; ++ for (i = 0; i < dsaddr->stripe_count; i++) { ++ dummy = be32_to_cpup(p++); ++ *indexp = dummy; /* bound by NFS4_PNFS_MAX_MULTI_CNT */ ++ indexp++; ++ } ++ /* Skip already read multipath list count */ ++ p++; ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ int j; ++ ++ dummy = be32_to_cpup(p++); /* multipath count */ ++ if (dummy > 1) { ++ printk(KERN_WARNING ++ "%s: Multipath count %d not supported, " ++ "skipping all greater than 1\n", __func__, ++ dummy); ++ } ++ for (j = 0; j < dummy; j++) { ++ if (j == 0) { ++ dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); ++ if (dsaddr->ds_list[i] == NULL) ++ goto out_err_free; ++ } else { ++ u32 len; ++ /* skip extra multipath */ ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ continue; ++ } ++ } ++ } ++ nfs4_init_deviceid_node(&dsaddr->deviceid); ++ ++ return dsaddr; ++ ++out_err_free: ++ nfs4_fl_free_deviceid(dsaddr); ++out_err: ++ dprintk("%s ERROR: returning NULL\n", __func__); ++ return NULL; ++} ++ ++/* ++ * Decode the opaque device specified in 'dev' ++ * and add it to the list of available devices. ++ * If the deviceid is already cached, nfs4_add_deviceid will return ++ * a pointer to the cached struct and throw away the new. 
++ */ ++static struct nfs4_file_layout_dsaddr* ++decode_and_add_device(struct inode *inode, struct pnfs_device *dev) ++{ ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ struct nfs4_deviceid *d; ++ ++ dsaddr = decode_device(inode, dev); ++ if (!dsaddr) { ++ printk(KERN_WARNING "%s: Could not decode or add device\n", ++ __func__); ++ return NULL; ++ } ++ ++ d = nfs4_add_get_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, ++ &dsaddr->deviceid); ++ ++ return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Retrieve the information for dev_id, add it to the list ++ * of available devices, and return it. ++ */ ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id) ++{ ++ struct pnfs_device *pdev = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ struct nfs4_file_layout_dsaddr *dsaddr = NULL; ++ int rc, i; ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s inode %p max_resp_sz %u max_pages %d\n", ++ __func__, inode, max_resp_sz, max_pages); ++ ++ pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); ++ if (pdev == NULL) ++ return NULL; ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(pdev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set pdev->area */ ++ pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!pdev->area) ++ goto out_free; ++ ++ memcpy(&pdev->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE); ++ pdev->layout_type = LAYOUT_NFSV4_1_FILES; ++ pdev->pages = pages; ++ pdev->pgbase = 0; ++ pdev->pglen = PAGE_SIZE * max_pages; ++ pdev->mincount = 0; ++ /* TODO: 
Update types when CB_NOTIFY_DEVICEID is available */ ++ pdev->dev_notify_types = 0; ++ ++ rc = pnfs_callback_ops->nfs_getdeviceinfo(server, pdev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ /* ++ * Found new device, need to decode it and then add it to the ++ * list of known devices for this mountpoint. ++ */ ++ dsaddr = decode_and_add_device(inode, pdev); ++out_free: ++ if (pdev->area != NULL) ++ vunmap(pdev->area); ++ for (i = 0; i < max_pages; i++) ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(pdev); ++ dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); ++ return dsaddr; ++} ++ ++struct nfs4_file_layout_dsaddr * ++nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ ++ d = nfs4_find_get_deviceid(clp->cl_devid_cache, id); ++ dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__, ++ deviceid_fmt(id), d); ++ return (d == NULL) ? NULL : ++ container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit ++ * Then: ((res + fsi) % dsaddr->stripe_count) ++ */ ++static inline u32 ++_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u64 tmp; ++ ++ tmp = offset - flseg->pattern_offset; ++ do_div(tmp, flseg->stripe_unit); ++ tmp += flseg->first_stripe_index; ++ return do_div(tmp, FILE_DSADDR(lseg)->stripe_count); ++} ++ ++u32 ++nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ u32 j; ++ ++ j = _nfs4_fl_calc_j_index(lseg, offset); ++ return FILE_DSADDR(lseg)->stripe_indices[j]; ++} ++ ++struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u32 i; ++ ++ if (flseg->stripe_type == STRIPE_SPARSE) { ++ if (flseg->num_fh == 1) ++ i = 0; ++ else if (flseg->num_fh == 0) ++ 
return NULL; ++ else ++ i = nfs4_fl_calc_ds_index(lseg, offset); ++ } else ++ i = _nfs4_fl_calc_j_index(lseg, offset); ++ return &flseg->fh_array[i]; ++} ++ ++struct nfs4_pnfs_ds * ++nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ dsaddr = FILE_DSADDR(lseg); ++ if (dsaddr->ds_list[ds_idx] == NULL) { ++ printk(KERN_ERR "%s: No data server for device id (%s)!!\n", ++ __func__, deviceid_fmt(&flseg->dev_id)); ++ return NULL; ++ } ++ ++ if (!dsaddr->ds_list[ds_idx]->ds_clp) { ++ int err; ++ ++ err = nfs4_pnfs_ds_create(PNFS_NFS_SERVER(lseg->layout), ++ dsaddr->ds_list[ds_idx]); ++ if (err) { ++ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n", ++ __func__, err); ++ return NULL; ++ } ++ } ++ dprintk("%s: dev_id=%s, ds_idx=%u\n", ++ __func__, deviceid_fmt(&flseg->dev_id), ds_idx); ++ ++ return dsaddr->ds_list[ds_idx]; ++} ++ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-31 20:42:05.520222923 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-31 20:42:05.520222923 -0400 +@@ -0,0 +1,96 @@ ++/* ++ * pnfs_nfs4filelayout.h ++ * ++ * NFSv4 file layout driver data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
/*
 * Individual ip address: one data server endpoint.
 *
 * Entries are shared between device address structures: they are looked up
 * by (ds_ip_addr, ds_port), reference counted through ds_count, and
 * destroyed when the count drops to zero.
 */
struct nfs4_pnfs_ds {
	struct list_head	ds_node;  /* link on the data server cache list */
	u32			ds_ip_addr;	/* IPv4 address, stored as produced by htonl() */
	u32			ds_port;	/* port, stored as produced by htons() */
	struct nfs_client	*ds_clp;	/* NULL until a client has been created */
	atomic_t		ds_count;	/* references held by device address structures */
	char r_addr[29];	/* NUL-terminated address string as received */
};

/*
 * Decoded device address for one deviceid: a stripe-index table plus the
 * list of data servers the indices refer to.
 */
struct nfs4_file_layout_dsaddr {
	struct nfs4_deviceid	deviceid;	/* embedded node; container_of() recovers this struct */
	u32			stripe_count;	/* number of entries in stripe_indices */
	u8			*stripe_indices; /* separately allocated; each entry indexes ds_list[] */
	u32			ds_num;		/* number of entries in ds_list */
	struct nfs4_pnfs_ds	*ds_list[1];	/* really ds_num entries; allocated past the struct end */
};
nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset); ++struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, ++ u32 ds_idx); ++extern struct nfs4_file_layout_dsaddr * ++nfs4_fl_find_get_deviceid(struct nfs_client *, struct pnfs_deviceid *dev_id); ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id); ++ ++#endif /* FS_NFS_NFS4FILELAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-31 20:41:19.154160465 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-31 20:42:05.519163219 -0400 +@@ -45,8 +45,28 @@ enum nfs4_client_state { + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, + NFS4CLNT_SESSION_RESET, +- NFS4CLNT_SESSION_DRAINING, + NFS4CLNT_RECALL_SLOT, ++ NFS4CLNT_LAYOUT_RECALL, ++}; ++ ++enum nfs4_session_state { ++ NFS4_SESSION_INITING, ++ NFS4_SESSION_DRAINING, ++}; ++ ++struct nfs4_minor_version_ops { ++ u32 minor_version; ++ ++ int (*call_sync)(struct nfs_server *server, ++ struct rpc_message *msg, ++ struct nfs4_sequence_args *args, ++ struct nfs4_sequence_res *res, ++ int cache_reply); ++ int (*validate_stateid)(struct nfs_delegation *, ++ const nfs4_stateid *); ++ const struct nfs4_state_recovery_ops *reboot_recovery_ops; ++ const struct nfs4_state_recovery_ops *nograce_recovery_ops; ++ const struct nfs4_state_maintenance_ops *state_renewal_ops; + }; + + /* +@@ -89,7 +109,6 @@ struct nfs_unique_id { + */ + struct nfs4_state_owner { + struct nfs_unique_id so_owner_id; +- struct nfs_client *so_client; + struct nfs_server *so_server; + struct rb_node so_client_node; + +@@ -99,7 +118,6 @@ struct nfs4_state_owner { + atomic_t so_count; + unsigned long so_flags; + struct list_head so_states; +- struct list_head so_delegations; + struct nfs_seqid_counter so_seqid; + struct rpc_sequence so_sequence; + }; +@@ -125,10 +143,20 @@ enum { + * LOCK: one nfs4_state (LOCK) to 
hold the lock stateid nfs4_state(OPEN) + */ + ++struct nfs4_lock_owner { ++ unsigned int lo_type; ++#define NFS4_ANY_LOCK_TYPE (0U) ++#define NFS4_FLOCK_LOCK_TYPE (1U << 0) ++#define NFS4_POSIX_LOCK_TYPE (1U << 1) ++ union { ++ fl_owner_t posix_owner; ++ pid_t flock_owner; ++ } lo_u; ++}; ++ + struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ +- fl_owner_t ls_owner; /* POSIX lock owner */ + #define NFS_LOCK_INITIALIZED 1 + int ls_flags; + struct nfs_seqid_counter ls_seqid; +@@ -136,6 +164,7 @@ struct nfs4_lock_state { + struct nfs_unique_id ls_id; + nfs4_stateid ls_stateid; + atomic_t ls_count; ++ struct nfs4_lock_owner ls_owner; + }; + + /* bits for nfs4_state->flags */ +@@ -219,22 +248,34 @@ extern int nfs4_open_revalidate(struct i + extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); + extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations *fs_locations, struct page *page); ++extern void nfs4_release_lockowner(const struct nfs4_lock_state *); + +-extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[]; +-extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[]; + #if defined(CONFIG_NFS_V4_1) +-extern int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return server->nfs_client->cl_session; ++} ++ ++extern int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); + extern void nfs4_destroy_session(struct nfs4_session *session); + extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); ++extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_create_session(struct nfs_client *); + 
extern int nfs4_proc_destroy_session(struct nfs4_session *); + extern int nfs4_init_session(struct nfs_server *server); + extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); + #else /* CONFIG_NFS_v4_1 */ +-static inline int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return NULL; ++} ++ ++static inline int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task) + { +@@ -247,12 +288,12 @@ static inline int nfs4_init_session(stru + } + #endif /* CONFIG_NFS_V4_1 */ + +-extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; ++extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; + + extern const u32 nfs4_fattr_bitmap[2]; + extern const u32 nfs4_statfs_bitmap[2]; + extern const u32 nfs4_pathconf_bitmap[2]; +-extern const u32 nfs4_fsinfo_bitmap[2]; ++extern const u32 nfs4_fsinfo_bitmap[3]; + extern const u32 nfs4_fs_locations_bitmap[2]; + + /* nfs4renewd.c */ +@@ -284,7 +325,7 @@ extern void nfs41_handle_sequence_flag_e + extern void nfs41_handle_recall_slot(struct nfs_client *clp); + extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); + extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); +-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); ++extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); + + extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); + extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); +@@ -293,6 +334,7 @@ extern void nfs_increment_lock_seqid(int + extern void nfs_release_seqid(struct nfs_seqid *seqid); + extern void nfs_free_seqid(struct nfs_seqid *seqid); + ++/* write.c */ + extern const 
nfs4_stateid zero_stateid; + + /* nfs4xdr.c */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-31 20:41:19.157140145 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-31 20:42:05.524099925 -0400 +@@ -49,12 +49,14 @@ + #include + #include + #include ++#include + + #include "nfs4_fs.h" + #include "delegation.h" + #include "internal.h" + #include "iostat.h" + #include "callback.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PROC + +@@ -67,7 +69,7 @@ struct nfs4_opendata; + static int _nfs4_proc_open(struct nfs4_opendata *data); + static int _nfs4_recover_proc_open(struct nfs4_opendata *data); + static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); ++static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, +@@ -125,11 +127,16 @@ const u32 nfs4_pathconf_bitmap[2] = { + 0 + }; + +-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE ++const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, ++#ifdef CONFIG_NFS_V4_1 ++ FATTR4_WORD1_FS_LAYOUT_TYPES, ++ FATTR4_WORD2_LAYOUT_BLKSIZE ++#else /* CONFIG_NFS_V4_1 */ + 0 ++#endif /* CONFIG_NFS_V4_1 */ + }; + + const u32 nfs4_fs_locations_bitmap[2] = { +@@ -356,7 +363,7 @@ static void nfs41_check_drain_session_co + { + struct rpc_task *task; + +- if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { ++ if 
(!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); + if (task) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +@@ -370,12 +377,11 @@ static void nfs41_check_drain_session_co + complete(&ses->complete); + } + +-static void nfs41_sequence_free_slot(const struct nfs_client *clp, +- struct nfs4_sequence_res *res) ++static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) + { + struct nfs4_slot_table *tbl; + +- tbl = &clp->cl_session->fc_slot_table; ++ tbl = &res->sr_session->fc_slot_table; + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { + /* just wake up the next guy waiting since + * we may have not consumed a slot after all */ +@@ -385,18 +391,17 @@ static void nfs41_sequence_free_slot(con + + spin_lock(&tbl->slot_tbl_lock); + nfs4_free_slot(tbl, res->sr_slotid); +- nfs41_check_drain_session_complete(clp->cl_session); ++ nfs41_check_drain_session_complete(res->sr_session); + spin_unlock(&tbl->slot_tbl_lock); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + } + +-static void nfs41_sequence_done(struct nfs_client *clp, +- struct nfs4_sequence_res *res, +- int rpc_status) ++static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) + { + unsigned long timestamp; + struct nfs4_slot_table *tbl; + struct nfs4_slot *slot; ++ struct nfs_client *clp; + + /* + * sr_status remains 1 if an RPC level error occurred. 
The server +@@ -411,13 +416,16 @@ static void nfs41_sequence_done(struct n + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) + goto out; + ++ tbl = &res->sr_session->fc_slot_table; ++ slot = tbl->slots + res->sr_slotid; ++ + /* Check the SEQUENCE operation status */ +- if (res->sr_status == 0) { +- tbl = &clp->cl_session->fc_slot_table; +- slot = tbl->slots + res->sr_slotid; ++ switch (res->sr_status) { ++ case 0: + /* Update the slot's sequence and clientid lease timer */ + ++slot->seq_nr; + timestamp = res->sr_renewal_time; ++ clp = res->sr_session->clp; + spin_lock(&clp->cl_lock); + if (time_before(clp->cl_last_renewal, timestamp)) + clp->cl_last_renewal = timestamp; +@@ -425,11 +433,39 @@ static void nfs41_sequence_done(struct n + /* Check sequence flags */ + if (atomic_read(&clp->cl_count) > 1) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); ++ break; ++ case -NFS4ERR_DELAY: ++ /* The server detected a resend of the RPC call and ++ * returned NFS4ERR_DELAY as per Section 2.10.6.2 ++ * of RFC5661. ++ */ ++ dprintk("%s: slot=%d seq=%d: Operation in progress\n", ++ __func__, res->sr_slotid, slot->seq_nr); ++ goto out_retry; ++ default: ++ /* Just update the slot sequence no. */ ++ ++slot->seq_nr; + } + out: + /* The session may be reset by one of the error handlers. 
*/ + dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); +- nfs41_sequence_free_slot(clp, res); ++ nfs41_sequence_free_slot(res); ++ return 1; ++out_retry: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ rpc_restart_call(task); ++ /* FIXME: rpc_restart_call() should be made to return success/fail */ ++ if (RPC_ASSASSINATED(task)) ++ goto out; ++ return 0; ++} ++ ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ if (res->sr_session == NULL) ++ return 1; ++ return nfs41_sequence_done(task, res); + } + + /* +@@ -480,12 +516,11 @@ static int nfs41_setup_sequence(struct n + if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) + return 0; + +- memset(res, 0, sizeof(*res)); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + tbl = &session->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && ++ if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + /* + * The state manager will wait until the slot table is empty. +@@ -525,6 +560,7 @@ static int nfs41_setup_sequence(struct n + res->sr_session = session; + res->sr_slotid = slotid; + res->sr_renewal_time = jiffies; ++ res->sr_status_flags = 0; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. 
+@@ -533,33 +569,36 @@ static int nfs41_setup_sequence(struct n + return 0; + } + +-int nfs4_setup_sequence(struct nfs_client *clp, ++int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + struct rpc_task *task) + { ++ struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + ++ if (ds_session) ++ session = ds_session; ++ if (session == NULL) { ++ args->sa_session = NULL; ++ res->sr_session = NULL; ++ goto out; ++ } ++ + dprintk("--> %s clp %p session %p sr_slotid %d\n", +- __func__, clp, clp->cl_session, res->sr_slotid); ++ __func__, session->clp, session, res->sr_slotid); + +- if (!nfs4_has_session(clp)) +- goto out; +- ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, ++ ret = nfs41_setup_sequence(session, args, res, cache_reply, + task); +- if (ret && ret != -EAGAIN) { +- /* terminate rpc task */ +- task->tk_status = ret; +- task->tk_action = NULL; +- } + out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; + } + + struct nfs41_call_sync_data { +- struct nfs_client *clp; ++ const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; + int cache_reply; +@@ -569,9 +608,9 @@ static void nfs41_call_sync_prepare(stru + { + struct nfs41_call_sync_data *data = calldata; + +- dprintk("--> %s data->clp->cl_session %p\n", __func__, +- data->clp->cl_session); +- if (nfs4_setup_sequence(data->clp, data->seq_args, ++ dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); ++ ++ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args, + data->seq_res, data->cache_reply, task)) + return; + rpc_call_start(task); +@@ -587,7 +626,7 @@ static void nfs41_call_sync_done(struct + { + struct nfs41_call_sync_data *data = calldata; + +- nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); ++ nfs41_sequence_done(task, 
data->seq_res); + } + + struct rpc_call_ops nfs41_call_sync_ops = { +@@ -600,8 +639,7 @@ struct rpc_call_ops nfs41_call_priv_sync + .rpc_call_done = nfs41_call_sync_done, + }; + +-static int nfs4_call_sync_sequence(struct nfs_client *clp, +- struct rpc_clnt *clnt, ++static int nfs4_call_sync_sequence(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, +@@ -611,13 +649,13 @@ static int nfs4_call_sync_sequence(struc + int ret; + struct rpc_task *task; + struct nfs41_call_sync_data data = { +- .clp = clp, ++ .seq_server = server, + .seq_args = args, + .seq_res = res, + .cache_reply = cache_reply, + }; + struct rpc_task_setup task_setup = { +- .rpc_client = clnt, ++ .rpc_client = server->client, + .rpc_message = msg, + .callback_ops = &nfs41_call_sync_ops, + .callback_data = &data +@@ -642,10 +680,15 @@ int _nfs4_call_sync_session(struct nfs_s + struct nfs4_sequence_res *res, + int cache_reply) + { +- return nfs4_call_sync_sequence(server->nfs_client, server->client, +- msg, args, res, cache_reply, 0); ++ return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); + } + ++#else ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ return 1; ++} + #endif /* CONFIG_NFS_V4_1 */ + + int _nfs4_call_sync(struct nfs_server *server, +@@ -659,18 +702,9 @@ int _nfs4_call_sync(struct nfs_server *s + } + + #define nfs4_call_sync(server, msg, args, res, cache_reply) \ +- (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ ++ (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ + &(res)->seq_res, (cache_reply)) + +-static void nfs4_sequence_done(const struct nfs_server *server, +- struct nfs4_sequence_res *res, int rpc_status) +-{ +-#ifdef CONFIG_NFS_V4_1 +- if (nfs4_has_session(server->nfs_client)) +- nfs41_sequence_done(server->nfs_client, res, rpc_status); +-#endif /* CONFIG_NFS_V4_1 */ +-} +- + static void 
update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) + { + struct nfs_inode *nfsi = NFS_I(dir); +@@ -745,19 +779,14 @@ static struct nfs4_opendata *nfs4_openda + p->o_arg.server = server; + p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; +- if (flags & O_EXCL) { +- if (nfs4_has_persistent_session(server->nfs_client)) { +- /* GUARDED */ +- p->o_arg.u.attrs = &p->attrs; +- memcpy(&p->attrs, attrs, sizeof(p->attrs)); +- } else { /* EXCLUSIVE4_1 */ +- u32 *s = (u32 *) p->o_arg.u.verifier.data; +- s[0] = jiffies; +- s[1] = current->pid; +- } +- } else if (flags & O_CREAT) { ++ if (flags & O_CREAT) { ++ u32 *s; ++ + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); ++ s = (u32 *) p->o_arg.u.verifier.data; ++ s[0] = jiffies; ++ s[1] = current->pid; + } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; +@@ -851,8 +880,10 @@ static void update_open_stateflags(struc + static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) + { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); +- memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); ++ memcpy(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)); ++ memcpy(state->open_stateid.u.data, stateid->u.data, ++ sizeof(state->open_stateid.u.data)); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); +@@ -880,7 +911,8 @@ static void __update_open_stateid(struct + */ + write_seqlock(&state->seqlock); + if (deleg_stateid != NULL) { +- memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, deleg_stateid->u.data, ++ sizeof(state->stateid.u.data)); + set_bit(NFS_DELEGATED_STATE, &state->flags); + } + if (open_stateid != NULL) +@@ -911,7 +943,8 @@ static int 
update_open_stateid(struct nf + + if (delegation == NULL) + delegation = &deleg_cur->stateid; +- else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) ++ else if (memcmp(deleg_cur->stateid.u.data, delegation->u.data, ++ NFS4_STATEID_SIZE) != 0) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); +@@ -973,7 +1006,8 @@ static struct nfs4_state *nfs4_try_open_ + break; + } + /* Save the delegation */ +- memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); ++ memcpy(stateid.u.data, delegation->stateid.u.data, ++ sizeof(stateid.u.data)); + rcu_read_unlock(); + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) +@@ -1127,10 +1161,13 @@ static int nfs4_open_recover(struct nfs4 + * Check if we need to update the current stateid. + */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && +- memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { ++ memcmp(state->stateid.u.data, state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)) != 0) { + write_seqlock(&state->seqlock); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, ++ state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)); + write_sequnlock(&state->seqlock); + } + return 0; +@@ -1199,8 +1236,8 @@ static int _nfs4_open_delegation_recall( + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; +- memcpy(opendata->o_arg.u.delegation.data, stateid->data, +- sizeof(opendata->o_arg.u.delegation.data)); ++ memcpy(opendata->o_arg.u.delegation.u.data, stateid->u.data, ++ sizeof(opendata->o_arg.u.delegation.u.data)); + ret = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return ret; +@@ -1258,8 +1295,8 @@ static void nfs4_open_confirm_done(struc + if (RPC_ASSASSINATED(task)) 
+ return; + if (data->rpc_status == 0) { +- memcpy(data->o_res.stateid.data, data->c_res.stateid.data, +- sizeof(data->o_res.stateid.data)); ++ memcpy(data->o_res.stateid.u.data, data->c_res.stateid.u.data, ++ sizeof(data->o_res.stateid.u.data)); + nfs_confirm_seqid(&data->owner->so_seqid, 0); + renew_lease(data->o_res.server, data->timestamp); + data->rpc_done = 1; +@@ -1356,13 +1393,13 @@ static void nfs4_open_prepare(struct rpc + } + /* Update sequence id. */ + data->o_arg.id = sp->so_owner_id.id; +- data->o_arg.clientid = sp->so_client->cl_clientid; ++ data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->o_arg.server->nfs_client, ++ if (nfs4_setup_sequence(data->o_arg.server, NULL, + &data->o_arg.seq_args, + &data->o_res.seq_res, 1, task)) + return; +@@ -1385,8 +1422,8 @@ static void nfs4_open_done(struct rpc_ta + + data->rpc_status = task->tk_status; + +- nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->o_res.seq_res)) ++ return; + + if (RPC_ASSASSINATED(task)) + return; +@@ -1539,9 +1576,8 @@ static int _nfs4_proc_open(struct nfs4_o + return 0; + } + +-static int nfs4_recover_expired_lease(struct nfs_server *server) ++int nfs4_recover_expired_lease(struct nfs_client *clp) + { +- struct nfs_client *clp = server->nfs_client; + unsigned int loop; + int ret; + +@@ -1557,6 +1593,7 @@ static int nfs4_recover_expired_lease(st + } + return ret; + } ++EXPORT_SYMBOL(nfs4_recover_expired_lease); + + /* + * OPEN_EXPIRED: +@@ -1646,7 +1683,7 @@ static int _nfs4_do_open(struct inode *d + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } +- status = nfs4_recover_expired_lease(server); ++ status = 
nfs4_recover_expired_lease(server->nfs_client); + if (status != 0) + goto err_put_state_owner; + if (path->dentry->d_inode != NULL) +@@ -1773,7 +1810,7 @@ static int _nfs4_do_setattr(struct inode + if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { + /* Use that stateid */ + } else if (state != NULL) { +- nfs4_copy_stateid(&arg.stateid, state, current->files); ++ nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); + } else + memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); + +@@ -1838,7 +1875,8 @@ static void nfs4_close_done(struct rpc_t + struct nfs4_state *state = calldata->state; + struct nfs_server *server = NFS_SERVER(calldata->inode); + +- nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + /* hmm. we are done with the inode, and in the process of freeing +@@ -1858,7 +1896,7 @@ static void nfs4_close_done(struct rpc_t + if (calldata->arg.fmode == 0) + break; + default: +- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) ++ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } + nfs_release_seqid(calldata->arg.seqid); +@@ -1903,7 +1941,7 @@ static void nfs4_close_prepare(struct rp + + nfs_fattr_init(calldata->res.fattr); + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, ++ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL, + &calldata->arg.seq_args, &calldata->res.seq_res, + 1, task)) + return; +@@ -2323,6 +2361,9 @@ nfs4_proc_setattr(struct dentry *dentry, + struct nfs4_state *state = NULL; + int status; + ++ if (pnfs_ld_layoutret_on_setattr(inode)) ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + nfs_fattr_init(fattr); + + /* Search for an existing open(O_WRITE) file */ +@@ -2648,8 +2689,9 @@ static int nfs4_proc_unlink_done(struct + { + struct nfs_removeres 
*res = task->tk_msg.rpc_resp; + +- nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); +- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) ++ if (!nfs4_sequence_done(task, &res->seq_res)) ++ return 0; ++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); + nfs_post_op_update_inode(dir, res->dir_attr); +@@ -3090,18 +3132,31 @@ static int nfs4_proc_pathconf(struct nfs + static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) + { + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; + + dprintk("--> %s\n", __func__); + +- nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; + +- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, server->nfs_client); ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } + + nfs_invalidate_atime(data->inode); +- if (task->tk_status > 0) ++ if (task->tk_status > 0 && client == server->nfs_client) + renew_lease(server, data->timestamp); + return 0; + } +@@ -3112,20 +3167,56 @@ static void nfs4_proc_read_setup(struct + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + } + ++static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ + static int 
nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs_client *client = server->nfs_client; + +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++#ifdef CONFIG_NFS_V4_1 ++ /* restore original count after retry? */ ++ if (data->pdata.orig_count) { ++ dprintk("%s: restoring original count %u\n", __func__, ++ data->pdata.orig_count); ++ data->args.count = data->pdata.orig_count; ++ } ++ ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } ++ ++ /* ++ * MDS write: renew lease ++ * DS write: update lastbyte written, mark for layout commit ++ */ + if (task->tk_status >= 0) { +- renew_lease(NFS_SERVER(inode), data->timestamp); +- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ if (client == server->nfs_client) { ++ renew_lease(server, data->timestamp); ++ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ } else ++ pnfs4_update_write_done(NFS_I(inode), data); + } + return 0; + } +@@ -3138,20 +3229,42 @@ static void nfs4_proc_write_setup(struct + data->res.server = server; + data->timestamp = jiffies; + ++#ifdef CONFIG_NFS_V4_1 ++ /* writes to DS use pnfs vector */ ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = 
&nfs4_procedures[NFSPROC4_CLNT_WRITE]; + } + + static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { ++ struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; ++ ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS commit\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) { + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + return -EAGAIN; + } +- nfs_refresh_inode(inode, data->res.fattr); ++ if (client == server->nfs_client) ++ nfs_refresh_inode(inode, data->res.fattr); + return 0; + } + +@@ -3161,6 +3274,12 @@ static void nfs4_proc_commit_setup(struc + + data->args.bitmask = server->cache_consistency_bitmask; + data->res.server = server; ++#if defined(CONFIG_NFS_V4_1) ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + } + +@@ -3464,9 +3583,12 @@ static int nfs4_proc_set_acl(struct inod + } + + static int +-_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state) ++nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp) + { +- if (!clp || task->tk_status >= 0) ++ if (!clp) ++ clp = server->nfs_client; ++ ++ if (task->tk_status >= 0) + return 0; + switch(task->tk_status) { + 
case -NFS4ERR_ADMIN_REVOKED: +@@ -3491,8 +3613,9 @@ _nfs4_async_handle_error(struct rpc_task + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: +- dprintk("%s ERROR %d, Reset session\n", __func__, +- task->tk_status); ++ dprintk("%s ERROR %d, Reset session. Exchangeid " ++ "flags 0x%x\n", __func__, task->tk_status, ++ clp->cl_exchange_flags); + nfs4_schedule_state_recovery(clp); + task->tk_status = 0; + return -EAGAIN; +@@ -3512,6 +3635,8 @@ _nfs4_async_handle_error(struct rpc_task + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; + do_state_recovery: ++ if (is_ds_only_client(clp)) ++ return 0; + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + nfs4_schedule_state_recovery(clp); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) +@@ -3520,12 +3645,6 @@ do_state_recovery: + return -EAGAIN; + } + +-static int +-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) +-{ +- return _nfs4_async_handle_error(task, server, server->nfs_client, state); +-} +- + int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, + unsigned short port, struct rpc_cred *cred, + struct nfs4_setclientid_res *res) +@@ -3641,8 +3760,8 @@ static void nfs4_delegreturn_done(struct + { + struct nfs4_delegreturndata *data = calldata; + +- nfs4_sequence_done(data->res.server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + switch (task->tk_status) { + case -NFS4ERR_STALE_STATEID: +@@ -3651,8 +3770,8 @@ static void nfs4_delegreturn_done(struct + renew_lease(data->res.server, data->timestamp); + break; + default: +- if (nfs4_async_handle_error(task, data->res.server, NULL) == +- -EAGAIN) { ++ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) ++ == -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } +@@ -3672,7 +3791,7 @@ static void nfs4_delegreturn_prepare(str 
+ + d_data = (struct nfs4_delegreturndata *)data; + +- if (nfs4_setup_sequence(d_data->res.server->nfs_client, ++ if (nfs4_setup_sequence(d_data->res.server, NULL, + &d_data->args.seq_args, + &d_data->res.seq_res, 1, task)) + return; +@@ -3892,15 +4011,16 @@ static void nfs4_locku_done(struct rpc_t + { + struct nfs4_unlockdata *calldata = data; + +- nfs4_sequence_done(calldata->server, &calldata->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + switch (task->tk_status) { + case 0: +- memcpy(calldata->lsp->ls_stateid.data, +- calldata->res.stateid.data, +- sizeof(calldata->lsp->ls_stateid.data)); ++ memcpy(calldata->lsp->ls_stateid.u.data, ++ calldata->res.stateid.u.data, ++ sizeof(calldata->lsp->ls_stateid.u. ++ data)); + renew_lease(calldata->server, calldata->timestamp); + break; + case -NFS4ERR_BAD_STATEID: +@@ -3909,7 +4029,7 @@ static void nfs4_locku_done(struct rpc_t + case -NFS4ERR_EXPIRED: + break; + default: +- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN) + nfs_restart_rpc(task, + calldata->server->nfs_client); + } +@@ -3927,7 +4047,7 @@ static void nfs4_locku_prepare(struct rp + return; + } + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence(calldata->server->nfs_client, ++ if (nfs4_setup_sequence(calldata->server, NULL, + &calldata->arg.seq_args, + &calldata->res.seq_res, 1, task)) + return; +@@ -4082,7 +4202,8 @@ static void nfs4_lock_prepare(struct rpc + } else + data->arg.new_lock_owner = 0; + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, ++ if (nfs4_setup_sequence(data->server, NULL, ++ &data->arg.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +@@ -4101,8 +4222,8 @@ static void nfs4_lock_done(struct rpc_ta + + dprintk("%s: begin!\n", __func__); + +- 
nfs4_sequence_done(data->server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) +@@ -4114,8 +4235,8 @@ static void nfs4_lock_done(struct rpc_ta + goto out; + } + if (data->rpc_status == 0) { +- memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, +- sizeof(data->lsp->ls_stateid.data)); ++ memcpy(data->lsp->ls_stateid.u.data, data->res.stateid.u.data, ++ sizeof(data->lsp->ls_stateid.u.data)); + data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); + } +@@ -4424,6 +4545,34 @@ out: + return err; + } + ++static void nfs4_release_lockowner_release(void *calldata) ++{ ++ kfree(calldata); ++} ++ ++const struct rpc_call_ops nfs4_release_lockowner_ops = { ++ .rpc_release = nfs4_release_lockowner_release, ++}; ++ ++void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) ++{ ++ struct nfs_server *server = lsp->ls_state->owner->so_server; ++ struct nfs_release_lockowner_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], ++ }; ++ ++ if (server->nfs_client->cl_mvops->minor_version != 0) ++ return; ++ args = kmalloc(sizeof(*args), GFP_NOFS); ++ if (!args) ++ return; ++ args->lock_owner.clientid = server->nfs_client->cl_clientid; ++ args->lock_owner.id = lsp->ls_id.id; ++ msg.rpc_argp = args; ++ rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); ++} ++ + #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" + + int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, +@@ -4526,7 +4675,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + nfs4_verifier verifier; + struct nfs41_exchange_id_args args = { + .client = clp, +- .flags = clp->cl_exchange_flags, ++ .flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R, + }; + struct nfs41_exchange_id_res res = { + .client = clp, +@@ -4574,6 
+4723,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + dprintk("<-- %s status= %d\n", __func__, status); + return status; + } ++EXPORT_SYMBOL(nfs4_proc_exchange_id); + + struct nfs4_get_lease_time_data { + struct nfs4_get_lease_time_args *args; +@@ -4611,7 +4761,8 @@ static void nfs4_get_lease_time_done(str + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); ++ if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) ++ return; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: +@@ -4805,13 +4956,6 @@ struct nfs4_session *nfs4_alloc_session( + if (!session) + return NULL; + +- /* +- * The create session reply races with the server back +- * channel probe. Mark the client NFS_CS_SESSION_INITING +- * so that the client back channel can find the +- * nfs_client struct +- */ +- clp->cl_cons_state = NFS_CS_SESSION_INITING; + init_completion(&session->complete); + + tbl = &session->fc_slot_table; +@@ -4824,6 +4968,8 @@ struct nfs4_session *nfs4_alloc_session( + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + ++ session->session_state = 1<<NFS4_SESSION_INITING; ++ + session->clp = clp; + return session; + } +@@ -5040,6 +5186,10 @@ int nfs4_init_session(struct nfs_server + if (!nfs4_has_session(clp)) + return 0; + ++ session = clp->cl_session; ++ if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) ++ return 0; ++ + rsize = server->rsize; + if (rsize == 0) + rsize = NFS_MAX_FILE_IO_SIZE; +@@ -5047,11 +5197,10 @@ int nfs4_init_session(struct nfs_server + if (wsize == 0) + wsize = NFS_MAX_FILE_IO_SIZE; + +- session = clp->cl_session; + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + +- ret = nfs4_recover_expired_lease(server); ++ ret =
nfs4_check_client_ready(clp); + return ret; +@@ -5060,69 +5209,70 @@ int nfs4_init_session(struct nfs_server + /* + * Renew the cl_session lease. + */ +-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +-{ ++struct nfs4_sequence_data { ++ struct nfs_client *clp; + struct nfs4_sequence_args args; + struct nfs4_sequence_res res; +- +- struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], +- .rpc_argp = &args, +- .rpc_resp = &res, +- .rpc_cred = cred, +- }; +- +- args.sa_cache_this = 0; +- +- return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, +- &res, args.sa_cache_this, 1); +-} ++}; + + static void nfs41_sequence_release(void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); ++ kfree(calldata); ++} ++ ++static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; + } + + static void nfs41_sequence_call_done(struct rpc_task *task, void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + +- nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status); ++ if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) ++ return; + + if (task->tk_status < 0) { + dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; + +- if (_nfs4_async_handle_error(task, NULL, clp, NULL) +- == -EAGAIN) { +- nfs_restart_rpc(task, clp); ++ if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); + 
return; + } + } + dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); + out: +- kfree(task->tk_msg.rpc_argp); +- kfree(task->tk_msg.rpc_resp); +- + dprintk("<-- %s\n", __func__); + } + + static void nfs41_sequence_prepare(struct rpc_task *task, void *data) + { +- struct nfs_client *clp; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_args *args; + struct nfs4_sequence_res *res; + +- clp = (struct nfs_client *)data; + args = task->tk_msg.rpc_argp; + res = task->tk_msg.rpc_resp; + +- if (nfs4_setup_sequence(clp, args, res, 0, task)) ++ if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) + return; + rpc_call_start(task); + } +@@ -5133,32 +5283,67 @@ static const struct rpc_call_ops nfs41_s + .rpc_release = nfs41_sequence_release, + }; + +-static int nfs41_proc_async_sequence(struct nfs_client *clp, +- struct rpc_cred *cred) ++static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) + { +- struct nfs4_sequence_args *args; +- struct nfs4_sequence_res *res; ++ struct nfs4_sequence_data *calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], + .rpc_cred = cred, + }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = clp->cl_rpcclient, ++ .rpc_message = &msg, ++ .callback_ops = &nfs41_sequence_ops, ++ .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, ++ }; + + if (!atomic_inc_not_zero(&clp->cl_count)) +- return -EIO; +- args = kzalloc(sizeof(*args), GFP_NOFS); +- res = kzalloc(sizeof(*res), GFP_NOFS); +- if (!args || !res) { +- kfree(args); +- kfree(res); ++ return ERR_PTR(-EIO); ++ calldata = kmalloc(sizeof(*calldata), GFP_NOFS); ++ if (calldata == NULL) { + nfs_put_client(clp); +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + } +- res->sr_slotid = NFS4_MAX_SLOT_TABLE; +- msg.rpc_argp = args; +- msg.rpc_resp = res; ++ calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ msg.rpc_argp = &calldata->args; ++ 
msg.rpc_resp = &calldata->res; ++ calldata->clp = clp; ++ task_setup_data.callback_data = calldata; + +- return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, +- &nfs41_sequence_ops, (void *)clp); ++ return rpc_run_task(&task_setup_data); ++} ++ ++static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret = 0; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) ++ ret = PTR_ERR(task); ++ else ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; ++} ++ ++static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ ret = rpc_wait_for_completion_task(task); ++ if (!ret) ++ ret = task->tk_status; ++ rpc_put_task(task); ++out: ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; + } + + struct nfs4_reclaim_complete_data { +@@ -5172,13 +5357,31 @@ static void nfs4_reclaim_complete_prepar + struct nfs4_reclaim_complete_data *calldata = data; + + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +- if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, ++ if (nfs41_setup_sequence(calldata->clp->cl_session, ++ &calldata->arg.seq_args, + &calldata->res.seq_res, 0, task)) + return; + + rpc_call_start(task); + } + ++static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case 0: ++ case -NFS4ERR_COMPLETE_ALREADY: ++ case -NFS4ERR_WRONG_CRED: /* What to do here? 
*/ ++ break; ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; ++} ++ + static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) + { + struct nfs4_reclaim_complete_data *calldata = data; +@@ -5186,32 +5389,13 @@ static void nfs4_reclaim_complete_done(s + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(clp, res, task->tk_status); +- switch (task->tk_status) { +- case 0: +- case -NFS4ERR_COMPLETE_ALREADY: +- break; +- case -NFS4ERR_BADSESSION: +- case -NFS4ERR_DEADSESSION: +- /* +- * Handle the session error, but do not retry the operation, as +- * we have no way of telling whether the clientid had to be +- * reset before we got our reply. If reset, a new wave of +- * reclaim operations will follow, containing their own reclaim +- * complete. We don't want our retry to get on the way of +- * recovery by incorrectly indicating to the server that we're +- * done reclaiming state since the process had to be restarted. 
+- */ +- _nfs4_async_handle_error(task, NULL, clp, NULL); +- break; +- default: +- if (_nfs4_async_handle_error( +- task, NULL, clp, NULL) == -EAGAIN) { +- rpc_restart_call_prepare(task); +- return; +- } +- } ++ if (!nfs41_sequence_done(task, res)) ++ return; + ++ if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); ++ return; ++ } + dprintk("<-- %s\n", __func__); + } + +@@ -5268,6 +5452,404 @@ out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; + } ++ ++static void ++nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args, ++ &lgp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lgp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ pnfs_get_layout_done(lgp, task->tk_status); ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ lgp->status = task->tk_status; ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_layoutget_release(void *calldata) ++{ ++ struct nfs4_layoutget *lgp = calldata; ++ ++ dprintk("--> %s\n", __func__); ++ pnfs_layout_release(NFS_I(lgp->args.inode)->layout, NULL); ++ if (lgp->res.layout.buf != NULL) ++ free_page((unsigned long) lgp->res.layout.buf); ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_layoutget_call_ops = { ++ .rpc_call_prepare = nfs4_layoutget_prepare, ++ .rpc_call_done = 
nfs4_layoutget_done, ++ .rpc_release = nfs4_layoutget_release, ++}; ++ ++/* FIXME: We need to call nfs4_handle_exception ++ * and deal with retries. ++ * Currently we can't since we release lgp and its contents. ++ */ ++static int _nfs4_proc_layoutget(struct nfs4_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], ++ .rpc_argp = &lgp->args, ++ .rpc_resp = &lgp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutget_call_ops, ++ .callback_data = lgp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); ++ if (lgp->res.layout.buf == NULL) { ++ nfs4_layoutget_release(lgp); ++ return -ENOMEM; ++ } ++ ++ lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = lgp->status; ++ if (status != 0) ++ goto out; ++ status = pnfs_layout_process(lgp); ++out: ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, _nfs4_proc_layoutget(lgp), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data) ++{ ++ struct nfs4_layoutcommit_data *ldata = ++ (struct nfs4_layoutcommit_data *)data; ++ struct nfs_server *server = NFS_SERVER(ldata->args.inode); ++ ++ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args, ++ &ldata->res.seq_res, 
1, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void ++nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutcommit_data *data = ++ (struct nfs4_layoutcommit_data *)calldata; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ data->status = task->tk_status; ++} ++ ++static void nfs4_layoutcommit_release(void *lcdata) ++{ ++ struct nfs4_layoutcommit_data *data = ++ (struct nfs4_layoutcommit_data *)lcdata; ++ ++ put_rpccred(data->cred); ++ pnfs_cleanup_layoutcommit(lcdata); ++ pnfs_layoutcommit_free(lcdata); ++ /* Matched by get_layout in pnfs_layoutcommit_inode */ ++ put_layout(data->args.inode); ++} ++ ++static const struct rpc_call_ops nfs4_layoutcommit_ops = { ++ .rpc_call_prepare = nfs4_layoutcommit_prepare, ++ .rpc_call_done = nfs4_layoutcommit_done, ++ .rpc_release = nfs4_layoutcommit_release, ++}; ++ ++/* Execute a layoutcommit to the server */ ++static int ++_nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync) ++{ ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], ++ .rpc_argp = &data->args, ++ .rpc_resp = &data->res, ++ .rpc_cred = data->cred, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .task = &data->task, ++ .rpc_client = NFS_CLIENT(data->args.inode), ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutcommit_ops, ++ .callback_data = data, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ struct rpc_task *task; ++ int status = 0; ++ ++ dprintk("NFS: %4d initiating layoutcommit call. 
%llu@%llu lbw: %llu " ++ "type: %d issync %d\n", ++ data->task.tk_pid, ++ data->args.range.length, ++ data->args.range.offset, ++ data->args.lastbytewritten, ++ data->args.layout_type, issync); ++ ++ data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = data->status; ++out: ++ dprintk("%s: status %d\n", __func__, status); ++ rpc_put_task(task); ++ return 0; ++} ++ ++int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync) ++{ ++ struct nfs4_exception exception = { }; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_proc_layoutcommit(data, issync), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void ++nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args, ++ &lrp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lrp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_layoutreturn_release(void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ struct pnfs_layout_hdr *lo = 
NFS_I(lrp->args.inode)->layout; ++ ++ dprintk("--> %s return_type %d lo %p\n", __func__, ++ lrp->args.return_type, lo); ++ ++ if (lrp->args.return_type == RETURN_FILE) { ++ if (!lrp->res.lrs_present) ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ pnfs_layout_release(lo, &lrp->args.range); ++ } ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { ++ .rpc_call_prepare = nfs4_layoutreturn_prepare, ++ .rpc_call_done = nfs4_layoutreturn_done, ++ .rpc_release = nfs4_layoutreturn_release, ++}; ++ ++int _nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync) ++{ ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], ++ .rpc_argp = &lrp->args, ++ .rpc_resp = &lrp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutreturn_call_ops, ++ .callback_data = lrp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = task->tk_status; ++out: ++ dprintk("<-- %s\n", __func__); ++ rpc_put_task(task); ++ return status; ++} ++ ++int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync) ++{ ++ struct nfs_server *server = NFS_SERVER(lrp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_proc_layoutreturn(lrp, issync), ++ &exception); ++ } while (exception.retry); ++ ++ return err; ++} ++ ++/* ++ * Retrieve the list of Data Server devices from the MDS. 
++ */ ++static int _nfs4_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_getdevicelist_args args = { ++ .fh = fh, ++ .layoutclass = server->pnfs_curr_ld->id, ++ }; ++ struct nfs4_getdevicelist_res res = { ++ .devlist = devlist, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], ++ .rpc_argp = &args, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int nfs4_proc_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_getdevicelist(server, fh, devlist), ++ &exception); ++ } while (exception.retry); ++ ++ dprintk("nfs4_pnfs_getdevlist: err=%d, num_devs=%u\n", ++ err, devlist->num_devs); ++ ++ return err; ++} ++ ++int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) ++{ ++ struct nfs4_getdeviceinfo_args args = { ++ .pdev = pdev, ++ }; ++ struct nfs4_getdeviceinfo_res res = { ++ .pdev = pdev, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], ++ .rpc_argp = &args, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ ++ return status; ++} ++ + #endif /* CONFIG_NFS_V4_1 */ + + struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { +@@ -5325,28 +5907,30 @@ struct nfs4_state_maintenance_ops nfs41_ + }; + #endif + +-/* +- * Per minor version reboot and network partition recovery ops +- */ +- +-struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = { +- &nfs40_reboot_recovery_ops, +-#if 
defined(CONFIG_NFS_V4_1) +- &nfs41_reboot_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { ++ .minor_version = 0, ++ .call_sync = _nfs4_call_sync, ++ .validate_stateid = nfs4_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs40_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs40_nograce_recovery_ops, ++ .state_renewal_ops = &nfs40_state_renewal_ops, + }; + +-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = { +- &nfs40_nograce_recovery_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_nograce_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { ++ .minor_version = 1, ++ .call_sync = _nfs4_call_sync_session, ++ .validate_stateid = nfs41_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs41_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs41_nograce_recovery_ops, ++ .state_renewal_ops = &nfs41_state_renewal_ops, + }; ++#endif + +-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { +- &nfs40_state_renewal_ops, ++const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { ++ [0] = &nfs_v4_0_minor_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_state_renewal_ops, ++ [1] = &nfs_v4_1_minor_ops, + #endif + }; + +@@ -5364,6 +5948,7 @@ const struct nfs_rpc_ops nfs_v4_clientop + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, + .file_inode_ops = &nfs4_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs4_proc_get_root, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c +--- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-31 20:42:05.526213255 -0400 +@@ -54,17 +54,17 @@ + void + nfs4_renew_state(struct work_struct *work) + { +- struct nfs4_state_maintenance_ops *ops; ++ const struct 
nfs4_state_maintenance_ops *ops; + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_renewd.work); + struct rpc_cred *cred; + long lease; + unsigned long last, now; + +- ops = nfs4_state_renewal_ops[clp->cl_minorversion]; ++ ops = clp->cl_mvops->state_renewal_ops; + dprintk("%s: start\n", __func__); + /* Are there any active superblocks? */ +- if (list_empty(&clp->cl_superblocks)) ++ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) + goto out; + spin_lock(&clp->cl_lock); + lease = clp->cl_lease_time; +diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-31 20:41:19.158078621 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-31 20:42:05.527232994 -0400 +@@ -48,11 +48,13 @@ + #include + #include + #include ++#include + + #include "nfs4_fs.h" + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include "pnfs.h" + + #define OPENOWNER_POOL_SIZE 8 + +@@ -126,6 +128,11 @@ static int nfs41_setup_state_renewal(str + int status; + struct nfs_fsinfo fsinfo; + ++ if (is_ds_only_client(clp)) { ++ nfs4_schedule_state_renewal(clp); ++ return 0; ++ } ++ + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ +@@ -145,7 +152,9 @@ static void nfs4_end_drain_session(struc + struct nfs4_session *ses = clp->cl_session; + int max_slots; + +- if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { ++ if (ses == NULL) ++ return; ++ if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + spin_lock(&ses->fc_slot_table.slot_tbl_lock); + max_slots = ses->fc_slot_table.max_slots; + while (max_slots--) { +@@ -167,7 +176,7 @@ static int nfs4_begin_drain_session(stru + struct nfs4_slot_table *tbl = &ses->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); ++ set_bit(NFS4_SESSION_DRAINING, 
&ses->session_state); + if (tbl->highest_used_slotid != -1) { + INIT_COMPLETION(ses->complete); + spin_unlock(&tbl->slot_tbl_lock); +@@ -371,7 +380,6 @@ nfs4_alloc_state_owner(void) + return NULL; + spin_lock_init(&sp->so_lock); + INIT_LIST_HEAD(&sp->so_states); +- INIT_LIST_HEAD(&sp->so_delegations); + rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); + sp->so_seqid.sequence = &sp->so_sequence; + spin_lock_init(&sp->so_sequence.lock); +@@ -384,7 +392,7 @@ static void + nfs4_drop_state_owner(struct nfs4_state_owner *sp) + { + if (!RB_EMPTY_NODE(&sp->so_client_node)) { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + rb_erase(&sp->so_client_node, &clp->cl_state_owners); +@@ -406,7 +414,6 @@ struct nfs4_state_owner *nfs4_get_state_ + new = nfs4_alloc_state_owner(); + if (new == NULL) + return NULL; +- new->so_client = clp; + new->so_server = server; + new->so_cred = cred; + spin_lock(&clp->cl_lock); +@@ -423,7 +430,7 @@ struct nfs4_state_owner *nfs4_get_state_ + + void nfs4_put_state_owner(struct nfs4_state_owner *sp) + { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + struct rpc_cred *cred = sp->so_cred; + + if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) +@@ -583,8 +590,24 @@ static void __nfs4_close(struct path *pa + if (!call_close) { + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); +- } else ++ } else { ++ u32 roc_iomode; ++ struct nfs_inode *nfsi = NFS_I(state->inode); ++ ++ if (has_layout(nfsi) && ++ (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) { ++ struct pnfs_layout_range range = { ++ .iomode = roc_iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ pnfs_return_layout(state->inode, &range, NULL, ++ RETURN_FILE, wait); ++ } ++ + nfs4_do_close(path, state, gfp_mask, wait); ++ } + } + + void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) +@@ -602,12 
+625,21 @@ void nfs4_close_sync(struct path *path, + * that is compatible with current->files + */ + static struct nfs4_lock_state * +-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *pos; + list_for_each_entry(pos, &state->lock_states, ls_locks) { +- if (pos->ls_owner != fl_owner) ++ if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) + continue; ++ switch (pos->ls_owner.lo_type) { ++ case NFS4_POSIX_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.posix_owner != fl_owner) ++ continue; ++ break; ++ case NFS4_FLOCK_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.flock_owner != fl_pid) ++ continue; ++ } + atomic_inc(&pos->ls_count); + return pos; + } +@@ -619,10 +651,10 @@ __nfs4_find_lock_state(struct nfs4_state + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *lsp; +- struct nfs_client *clp = state->owner->so_client; ++ struct nfs_client *clp = state->owner->so_server->nfs_client; + + lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) +@@ -633,7 +665,18 @@ static struct nfs4_lock_state *nfs4_allo + lsp->ls_seqid.sequence = &lsp->ls_sequence; + atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; +- lsp->ls_owner = fl_owner; ++ lsp->ls_owner.lo_type = type; ++ switch (lsp->ls_owner.lo_type) { ++ case NFS4_FLOCK_LOCK_TYPE: ++ lsp->ls_owner.lo_u.flock_owner = fl_pid; ++ break; ++ case NFS4_POSIX_LOCK_TYPE: ++ lsp->ls_owner.lo_u.posix_owner = fl_owner; ++ break; ++ default: ++ kfree(lsp); ++ return NULL; ++ } + spin_lock(&clp->cl_lock); + nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); + spin_unlock(&clp->cl_lock); +@@ -643,7 +686,7 @@ 
static struct nfs4_lock_state *nfs4_allo + + static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) + { +- struct nfs_client *clp = lsp->ls_state->owner->so_client; ++ struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); +@@ -657,13 +700,13 @@ static void nfs4_free_lock_state(struct + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) ++static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) + { + struct nfs4_lock_state *lsp, *new = NULL; + + for(;;) { + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, owner); ++ lsp = __nfs4_find_lock_state(state, owner, pid, type); + if (lsp != NULL) + break; + if (new != NULL) { +@@ -674,7 +717,7 @@ static struct nfs4_lock_state *nfs4_get_ + break; + } + spin_unlock(&state->state_lock); +- new = nfs4_alloc_lock_state(state, owner); ++ new = nfs4_alloc_lock_state(state, owner, pid, type); + if (new == NULL) + return NULL; + } +@@ -701,6 +744,8 @@ void nfs4_put_lock_state(struct nfs4_loc + if (list_empty(&state->lock_states)) + clear_bit(LK_STATE_IN_USE, &state->flags); + spin_unlock(&state->state_lock); ++ if (lsp->ls_flags & NFS_LOCK_INITIALIZED) ++ nfs4_release_lockowner(lsp); + nfs4_free_lock_state(lsp); + } + +@@ -728,7 +773,12 @@ int nfs4_set_lock_state(struct nfs4_stat + + if (fl->fl_ops != NULL) + return 0; +- lsp = nfs4_get_lock_state(state, fl->fl_owner); ++ if (fl->fl_flags & FL_POSIX) ++ lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); ++ else if (fl->fl_flags & FL_FLOCK) ++ lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); ++ else ++ return -EINVAL; + if (lsp == NULL) + return -ENOMEM; + fl->fl_u.nfs4_fl.owner = lsp; +@@ -740,7 +790,7 @@ int nfs4_set_lock_state(struct nfs4_stat + * 
Byte-range lock aware utility to initialize the stateid of read/write + * requests. + */ +-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) ++void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) + { + struct nfs4_lock_state *lsp; + int seq; +@@ -753,7 +803,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst + return; + + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, fl_owner); ++ lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); + if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + spin_unlock(&state->state_lock); +@@ -1031,8 +1081,8 @@ restart: + * Open state on this file cannot be recovered + * All we can do is revert to using the zero stateid. + */ +- memset(state->stateid.data, 0, +- sizeof(state->stateid.data)); ++ memset(state->stateid.u.data, 0, ++ sizeof(state->stateid.u.data)); + /* Mark the file as being 'closed' */ + state->state = 0; + break; +@@ -1041,11 +1091,11 @@ restart: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: +@@ -1120,8 +1170,7 @@ static void nfs4_state_end_reclaim_reboo + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return; + +- nfs4_reclaim_complete(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); + + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { + sp = rb_entry(pos, struct 
nfs4_state_owner, so_client_node); +@@ -1211,8 +1260,8 @@ restart: + static int nfs4_check_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_maintenance_ops *ops = +- nfs4_state_renewal_ops[clp->cl_minorversion]; ++ const struct nfs4_state_maintenance_ops *ops = ++ clp->cl_mvops->state_renewal_ops; + int status = -NFS4ERR_EXPIRED; + + /* Is the client already known to have an expired lease? */ +@@ -1235,8 +1284,8 @@ out: + static int nfs4_reclaim_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_recovery_ops *ops = +- nfs4_reboot_recovery_ops[clp->cl_minorversion]; ++ const struct nfs4_state_recovery_ops *ops = ++ clp->cl_mvops->reboot_recovery_ops; + int status = -ENOENT; + + cred = ops->get_clid_cred(clp); +@@ -1421,6 +1470,7 @@ static void nfs4_state_manager(struct nf + } + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); ++ pnfs_destroy_all_layouts(clp); + } + + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { +@@ -1444,7 +1494,7 @@ static void nfs4_state_manager(struct nf + /* First recover reboot state... */ + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->reboot_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + continue; +@@ -1458,7 +1508,7 @@ static void nfs4_state_manager(struct nf + /* Now recover expired state... 
*/ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_nograce_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->nograce_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || + test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) +diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-31 20:41:19.160150207 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-31 20:42:05.530092192 -0400 +@@ -50,8 +50,10 @@ + #include + #include + #include ++#include + #include "nfs4_fs.h" + #include "internal.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_XDR + +@@ -89,7 +91,7 @@ static int nfs4_stat_to_errno(int); + #define encode_getfh_maxsz (op_encode_hdr_maxsz) + #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +-#define nfs4_fattr_bitmap_maxsz 3 ++#define nfs4_fattr_bitmap_maxsz 4 + #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) + #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +@@ -111,7 +113,11 @@ static int nfs4_stat_to_errno(int); + #define encode_restorefh_maxsz (op_encode_hdr_maxsz) + #define decode_restorefh_maxsz (op_decode_hdr_maxsz) + #define encode_fsinfo_maxsz (encode_getattr_maxsz) +-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) ++/* The 5 accounts for the PNFS attributes, and assumes that at most three ++ * layout types will be returned. 
++ */ ++#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ ++ nfs4_fattr_bitmap_maxsz + 8 + 5) + #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) + #define decode_renew_maxsz (op_decode_hdr_maxsz) + #define encode_setclientid_maxsz \ +@@ -202,14 +208,17 @@ static int nfs4_stat_to_errno(int); + #define encode_link_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) + #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) ++#define encode_lockowner_maxsz (7) + #define encode_lock_maxsz (op_encode_hdr_maxsz + \ + 7 + \ +- 1 + encode_stateid_maxsz + 8) ++ 1 + encode_stateid_maxsz + 1 + \ ++ encode_lockowner_maxsz) + #define decode_lock_denied_maxsz \ + (8 + decode_lockowner_maxsz) + #define decode_lock_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +-#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) ++#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ ++ encode_lockowner_maxsz) + #define decode_lockt_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) + #define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ +@@ -217,6 +226,11 @@ static int nfs4_stat_to_errno(int); + 4) + #define decode_locku_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) ++#define encode_release_lockowner_maxsz \ ++ (op_encode_hdr_maxsz + \ ++ encode_lockowner_maxsz) ++#define decode_release_lockowner_maxsz \ ++ (op_decode_hdr_maxsz) + #define encode_access_maxsz (op_encode_hdr_maxsz + 1) + #define decode_access_maxsz (op_decode_hdr_maxsz + 2) + #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ +@@ -302,6 +316,35 @@ static int nfs4_stat_to_errno(int); + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) + #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) + #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) ++#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ ++ encode_verifier_maxsz) ++#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + \ ++ decode_verifier_maxsz + \ ++ 
XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ ++ NFS4_PNFS_DEVICEID4_SIZE)) ++#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ ++ XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE)) ++#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ ++ 4 /*layout type */ + \ ++ 4 /* opaque devaddr4 length */ +\ ++ 4 /* notification bitmap length */ + \ ++ 4 /* notification bitmap */) ++#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ ++ encode_stateid_maxsz) ++#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ ++ decode_stateid_maxsz + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) ++#define encode_layoutcommit_maxsz (18 + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ ++ op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz) ++#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz) ++#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz + \ ++ 1 /* FIXME: opaque lrf_body always empty at ++ *the moment */) ++#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ ++ 1 + decode_stateid_maxsz) + #else /* CONFIG_NFS_V4_1 */ + #define encode_sequence_maxsz 0 + #define decode_sequence_maxsz 0 +@@ -471,6 +514,12 @@ static int nfs4_stat_to_errno(int); + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_locku_maxsz) ++#define NFS4_enc_release_lockowner_sz \ ++ (compound_encode_hdr_maxsz + \ ++ encode_release_lockowner_maxsz) /* opcode word + lock owner */ ++#define NFS4_dec_release_lockowner_sz \ ++ (compound_decode_hdr_maxsz + \ ++ decode_release_lockowner_maxsz) /* reply is the op header only */ + #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ +@@ -685,6 +734,60 @@ static int nfs4_stat_to_errno(int); + #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) ++#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_getdevicelist_maxsz) ++#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz +
\ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_getdevicelist_maxsz) ++#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_getdeviceinfo_maxsz) ++#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_getdeviceinfo_maxsz) ++#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutget_maxsz) ++#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutget_maxsz) ++#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_layoutcommit_maxsz + \ ++ encode_getattr_maxsz) ++#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutcommit_maxsz + \ ++ decode_getattr_maxsz) ++#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutreturn_maxsz) ++#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutreturn_maxsz) ++#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_write_maxsz) ++#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_write_maxsz) ++#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + encode_putfh_maxsz + \ ++ encode_commit_maxsz) /* SEQUENCE + PUTFH + COMMIT, as encoded by nfs4_xdr_enc_dscommit */ ++#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + decode_putfh_maxsz + \ ++ decode_commit_maxsz) /* reply mirrors the SEQUENCE + PUTFH + COMMIT request */ + + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + +@@ -915,7 +1018,7 @@ static void encode_close(struct xdr_stre + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); +- xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_close_maxsz; + } +@@ -989,6 +1092,35 @@ static void encode_getattr_two(struct xd + hdr->replen += decode_getattr_maxsz; + } + ++static void ++encode_getattr_three(struct xdr_stream *xdr, ++ uint32_t bm0, uint32_t bm1, uint32_t bm2, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_GETATTR); ++ if (bm2) { ++ p = reserve_space(xdr, 16); ++ *p++ = cpu_to_be32(3); ++ *p++ = cpu_to_be32(bm0); ++ *p++ = cpu_to_be32(bm1); ++ *p = cpu_to_be32(bm2); ++ } else if (bm1) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(2); ++ *p++ = cpu_to_be32(bm0); ++ *p = cpu_to_be32(bm1); ++ } else { ++ p = reserve_space(xdr, 8); ++ *p++ = cpu_to_be32(1); ++ *p = cpu_to_be32(bm0); ++ } ++ hdr->nops++; ++ hdr->replen += decode_getattr_maxsz; ++} ++ + static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { + encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], +@@ -997,8 +1129,11 @@ static void encode_getfattr(struct xdr_s + + static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { +- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], +- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); ++ encode_getattr_three(xdr, ++ bitmask[0] & nfs4_fsinfo_bitmap[0], ++ bitmask[1] & nfs4_fsinfo_bitmap[1], ++ bitmask[2] & nfs4_fsinfo_bitmap[2], ++ hdr); + } + + static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +@@ -1042,6 +1177,17 @@ static inline uint64_t nfs4_lock_length( + return fl->fl_end - fl->fl_start + 1; + } + ++static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) ++{ ++ __be32 *p; ++ ++ p = 
reserve_space(xdr, 28); ++ p = xdr_encode_hyper(p, lowner->clientid); ++ *p++ = cpu_to_be32(16); ++ p = xdr_encode_opaque_fixed(p, "lock id:", 8); ++ xdr_encode_hyper(p, lowner->id); ++} ++ + /* + * opcode,type,reclaim,offset,length,new_lock_owner = 32 + * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 +@@ -1058,18 +1204,16 @@ static void encode_lock(struct xdr_strea + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); + if (args->new_lock_owner){ +- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); ++ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->open_stateid->u.data, ++ NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + } + else { + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); +- p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->lock_stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); + } + hdr->nops++; +@@ -1080,15 +1224,12 @@ static void encode_lockt(struct xdr_stre + { + __be32 *p; + +- p = reserve_space(xdr, 52); ++ p = reserve_space(xdr, 24); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + hdr->nops++; + 
hdr->replen += decode_lockt_maxsz; + } +@@ -1101,13 +1242,25 @@ static void encode_locku(struct xdr_stre + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->stateid->u.data, ++ NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + hdr->nops++; + hdr->replen += decode_locku_maxsz; + } + ++static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); ++ encode_lockowner(xdr, lowner); ++ hdr->nops++; ++ hdr->replen += decode_release_lockowner_maxsz; ++} ++ + static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) + { + int len = name->len; +@@ -1172,7 +1325,7 @@ static inline void encode_createmode(str + break; + default: + clp = arg->server->nfs_client; +- if (clp->cl_minorversion > 0) { ++ if (clp->cl_mvops->minor_version > 0) { + if (nfs4_has_persistent_session(clp)) { + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); +@@ -1251,7 +1404,7 @@ static inline void encode_claim_delegate + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + encode_string(xdr, name->len, name->name); + } + +@@ -1282,7 +1435,7 @@ static void encode_open_confirm(struct x + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = 
cpu_to_be32(arg->seqid->sequence->counter); + hdr->nops++; + hdr->replen += decode_open_confirm_maxsz; +@@ -1294,7 +1447,7 @@ static void encode_open_downgrade(struct + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_share_access(xdr, arg->fmode); + hdr->nops++; +@@ -1324,17 +1477,17 @@ static void encode_putrootfh(struct xdr_ + hdr->replen += decode_putrootfh_maxsz; + } + +-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) ++static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) + { + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, NFS4_STATEID_SIZE); + if (ctx->state != NULL) { +- nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); +- xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); ++ nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); ++ xdr_encode_opaque_fixed(p, stateid.u.data, NFS4_STATEID_SIZE); + } else +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + } + + static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +@@ -1344,7 +1497,7 @@ static void encode_read(struct xdr_strea + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); +@@ -1448,7 +1601,7 @@ encode_setacl(struct xdr_stream *xdr, st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 
++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); +@@ -1479,7 +1632,7 @@ static void encode_setattr(struct xdr_st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid.u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_setattr_maxsz; + encode_attrs(xdr, arg->iap, server); +@@ -1523,7 +1676,7 @@ static void encode_write(struct xdr_stre + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); +@@ -1542,7 +1695,7 @@ static void encode_delegreturn(struct xd + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + + *p++ = cpu_to_be32(OP_DELEGRETURN); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_delegreturn_maxsz; + } +@@ -1696,6 +1849,162 @@ static void encode_sequence(struct xdr_s + #endif /* CONFIG_NFS_V4_1 */ + } + ++#ifdef CONFIG_NFS_V4_1 ++static void ++encode_getdevicelist(struct xdr_stream *xdr, ++ const struct nfs4_getdevicelist_args *args, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ nfs4_verifier dummy = { ++ .data = "dummmmmy", ++ }; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_GETDEVICELIST); ++ *p++ = cpu_to_be32(args->layoutclass); ++ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); ++ xdr_encode_hyper(p, 0ULL); /* cookie */ ++ encode_nfs4_verifier(xdr, &dummy); ++ hdr->nops++; ++} ++ ++static void ++encode_getdeviceinfo(struct xdr_stream *xdr, ++ const struct nfs4_getdeviceinfo_args *args, ++ struct compound_hdr *hdr) ++{ ++ int has_bitmap = (args->pdev->dev_notify_types != 0); ++ 
int len = 16 + NFS4_PNFS_DEVICEID4_SIZE + (has_bitmap * 4); ++ __be32 *p; ++ ++ p = reserve_space(xdr, len); ++ *p++ = cpu_to_be32(OP_GETDEVICEINFO); ++ p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ *p++ = cpu_to_be32(args->pdev->layout_type); ++ *p++ = cpu_to_be32(args->pdev->pglen + len); /* gdia_maxcount */ ++ *p++ = cpu_to_be32(has_bitmap); /* bitmap length [01] */ ++ if (has_bitmap) ++ *p = cpu_to_be32(args->pdev->dev_notify_types); ++ hdr->nops++; ++} ++ ++static void ++encode_layoutget(struct xdr_stream *xdr, ++ const struct nfs4_layoutget_args *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(OP_LAYOUTGET); ++ *p++ = cpu_to_be32(0); /* Signal layout available */ ++ *p++ = cpu_to_be32(args->type); ++ *p++ = cpu_to_be32(args->range.iomode); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ p = xdr_encode_hyper(p, args->minlength); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, NFS4_STATEID_SIZE); ++ *p = cpu_to_be32(args->maxcount); ++ ++ dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", ++ __func__, ++ args->type, ++ args->range.iomode, ++ (unsigned long)args->range.offset, ++ (unsigned long)args->range.length, ++ args->maxcount); ++ hdr->nops++; ++ hdr->replen += decode_layoutget_maxsz; ++} ++ ++static int ++encode_layoutcommit(struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args, ++ struct compound_hdr *hdr) ++{ ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ __be32 *p; ++ ++ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__, ++ args->range.length, args->range.offset, args->lastbytewritten, ++ args->layout_type); ++ ++ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE); ++ *p++ = 
cpu_to_be32(OP_LAYOUTCOMMIT); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ *p++ = cpu_to_be32(0); /* reclaim */ ++ p = xdr_encode_opaque_fixed(p, args->stateid.u.data, NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(1); /* newoffset = TRUE */ ++ p = xdr_encode_hyper(p, args->lastbytewritten); ++ *p = cpu_to_be32(args->time_modify_changed != 0); ++ if (args->time_modify_changed) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(0); ++ *p++ = cpu_to_be32(args->time_modify.tv_sec); ++ *p = cpu_to_be32(args->time_modify.tv_nsec); ++ } ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(args->layout_type); ++ ++ if (ld_io_ops->encode_layoutcommit) { ++ ld_io_ops->encode_layoutcommit(NFS_I(args->inode)->layout, ++ xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ xdr_encode_opaque(p, NULL, 0); ++ } ++ ++ hdr->nops++; ++ hdr->replen += decode_layoutcommit_maxsz; ++ return 0; ++} ++ ++static void ++encode_layoutreturn(struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_LAYOUTRETURN); ++ *p++ = cpu_to_be32(args->reclaim); ++ *p++ = cpu_to_be32(args->layout_type); ++ *p++ = cpu_to_be32(args->range.iomode); ++ *p = cpu_to_be32(args->return_type); ++ if (args->return_type == RETURN_FILE) { ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ ++ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, ++ NFS4_STATEID_SIZE); ++ dprintk("%s: call %pF\n", __func__, ++ ld_io_ops->encode_layoutreturn); ++ if (ld_io_ops->encode_layoutreturn) { ++ ld_io_ops->encode_layoutreturn( ++ NFS_I(args->inode)->layout, xdr, args); ++ } 
else { ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(0); ++ } ++ } ++ hdr->nops++; ++ hdr->replen += decode_layoutreturn_maxsz; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" ENCODE ROUTINES. + */ +@@ -1704,7 +2013,7 @@ static u32 nfs4_xdr_minorversion(const s + { + #if defined(CONFIG_NFS_V4_1) + if (args->sa_session) +- return args->sa_session->clp->cl_minorversion; ++ return args->sa_session->clp->cl_mvops->minor_version; + #endif /* CONFIG_NFS_V4_1 */ + return 0; + } +@@ -2048,6 +2357,20 @@ static int nfs4_xdr_enc_locku(struct rpc + return 0; + } + ++static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = 0, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_release_lockowner(&xdr, &args->lock_owner, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ + /* + * Encode a READLINK request + */ +@@ -2330,7 +2653,7 @@ static int nfs4_xdr_enc_setclientid_conf + struct compound_hdr hdr = { + .nops = 0, + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2395,7 +2718,7 @@ static int nfs4_xdr_enc_exchange_id(stru + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2413,7 +2736,7 @@ static int nfs4_xdr_enc_create_session(s + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2431,7 +2754,7 @@ static int nfs4_xdr_enc_destroy_session( + { + struct 
xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = session->clp->cl_minorversion, ++ .minorversion = session->clp->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2469,7 +2792,7 @@ static int nfs4_xdr_enc_get_lease_time(s + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2499,6 +2822,159 @@ static int nfs4_xdr_enc_reclaim_complete + return 0; + } + ++/* ++ * Encode GETDEVICELIST request ++ */ ++static int ++nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_getdevicelist_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_getdevicelist(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode GETDEVICEINFO request ++ */ ++static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_getdeviceinfo_args *args) ++{ ++ struct xdr_stream xdr; ++ struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ int replen; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_getdeviceinfo(&xdr, args, &hdr); ++ ++ /* set up reply kvec. 
Subtract notification bitmap max size (8) ++ * so that notification bitmap is put in xdr_buf tail */ ++ replen = (RPC_REPHDRSIZE + auth->au_rslack + ++ NFS4_dec_getdeviceinfo_sz - 8) << 2; ++ xdr_inline_pages(&req->rq_rcv_buf, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", ++ __func__, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTGET request ++ */ ++static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutget_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutget(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTCOMMIT request ++ */ ++static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutcommit_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_layoutcommit(&xdr, args, &hdr); ++ encode_getfattr(&xdr, args->bitmask, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTRETURN request ++ */ ++static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutreturn_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ 
encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutreturn(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server WRITE request ++ */ ++static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_write(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server COMMIT request ++ */ ++static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_commit(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} + #endif /* CONFIG_NFS_V4_1 */ + + static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +@@ -2599,14 +3075,17 @@ static int decode_attr_bitmap(struct xdr + goto out_overflow; + bmlen = be32_to_cpup(p); + +- bitmap[0] = bitmap[1] = 0; ++ bitmap[0] = bitmap[1] = bitmap[2] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); +- if (bmlen > 1) +- bitmap[1] = be32_to_cpup(p); ++ if (bmlen > 1) { ++ bitmap[1] = be32_to_cpup(p++); ++ if (bmlen > 2) ++ bitmap[2] = be32_to_cpup(p); ++ } + } + return 0; + out_overflow: +@@ -2635,8 +3114,9 @@ static int decode_attr_supported(struct + decode_attr_bitmap(xdr, 
bitmask); + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else +- bitmask[0] = bitmask[1] = 0; +- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); ++ bitmask[0] = bitmask[1] = bitmask[2] = 0; ++ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, ++ bitmask[0], bitmask[1], bitmask[2]); + return 0; + } + +@@ -3565,7 +4045,7 @@ static int decode_opaque_fixed(struct xd + + static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) + { +- return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); ++ return decode_opaque_fixed(xdr, stateid->u.data, NFS4_STATEID_SIZE); + } + + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +@@ -3621,7 +4101,7 @@ out_overflow: + static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3647,7 +4127,7 @@ xdr_error: + static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3679,7 +4159,7 @@ xdr_error: + static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3705,7 +4185,7 @@ static int decode_getfattr(struct xdr_st + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}, ++ bitmap[3] = {0}, + type; + int status; + umode_t fmode = 0; +@@ -3824,24 +4304,101 @@ xdr_error: + return status; + } + +- +-static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * Decode potentially multiple layout types. 
Currently we only support ++ * one layout driver per file system. ++ */ ++static int decode_pnfs_list(struct xdr_stream *xdr, uint32_t *layoutclass) + { +- __be32 *savep; +- uint32_t attrlen, bitmap[2]; +- int status; ++ __be32 *p; /* big-endian XDR words, as returned by xdr_inline_decode() */ ++ int num; + +- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +- goto xdr_error; +- if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) +- goto xdr_error; +- if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) +- goto xdr_error; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ num = be32_to_cpup(p); + +- fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ /* pNFS is not supported by the underlying file system */ ++ if (num == 0) { ++ *layoutclass = 0; ++ return 0; ++ } + +- if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) +- goto xdr_error; ++ /* TODO: We will eventually support multiple layout drivers ? */ ++ if (num > 1) ++ printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " ++ "per filesystem not supported\n", __func__); ++ ++ /* Decode and set first layout type */ ++ p = xdr_inline_decode(xdr, num * 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ *layoutclass = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++/* ++ * The type of file system exported ++ */ ++static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *layoutclass) ++{ ++ int status = 0; ++ ++ dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); ++ if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) ++ return -EIO; ++ if (likely(bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES)) { ++ status = decode_pnfs_list(xdr, layoutclass); ++ bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; ++ } ++ return status; ++} ++ ++/* ++ * The preferred block size for layout directed io ++ */ ++static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *res) ++{ ++ __be32 *p; ++
dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); ++ *res = 0; ++ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) { ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++ } ++ *res = be32_to_cpup(p); ++ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; ++ } ++ return 0; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ ++static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++{ ++ __be32 *savep; ++ uint32_t attrlen, bitmap[3]; ++ int status; ++ ++ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) ++ goto xdr_error; ++ ++ fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ ++ if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) ++ goto xdr_error; + if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0) + goto xdr_error; + if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0) +@@ -3850,6 +4407,14 @@ static int decode_fsinfo(struct xdr_stre + if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) + goto xdr_error; + fsinfo->wtpref = fsinfo->wtmax; ++#if defined(CONFIG_NFS_V4_1) ++ status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); ++ if (status) ++ goto xdr_error; ++ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); ++ if (status) ++ goto xdr_error; ++#endif /* CONFIG_NFS_V4_1 */ + + status = verify_attr_len(xdr, savep, attrlen); + xdr_error: +@@ -3973,6 +4538,11 @@ static int decode_locku(struct xdr_strea + return status; + } + ++static int decode_release_lockowner(struct xdr_stream *xdr) ++{ ++ return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); ++} ++ + static int decode_lookup(struct xdr_stream *xdr) + { + return decode_op_hdr(xdr, OP_LOOKUP); +@@ -4333,7 +4903,7 @@ static int decode_getacl(struct xdr_stre + { + __be32 *savep; + 
uint32_t attrlen, +- bitmap[2] = {0}; ++ bitmap[3] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + +@@ -4682,6 +5252,226 @@ out_overflow: + #endif /* CONFIG_NFS_V4_1 */ + } + ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * TODO: Need to handle case when EOF != true; ++ */ ++static int decode_getdevicelist(struct xdr_stream *xdr, ++ struct pnfs_devicelist *res) ++{ ++ __be32 *p; ++ int status, i; ++ struct nfs_writeverf verftemp; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICELIST); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 8 + 8 + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ ++ /* TODO: Skip cookie for now */ ++ p += 2; ++ ++ /* Read verifier */ ++ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); ++ ++ res->num_devs = be32_to_cpup(p); ++ ++ dprintk("%s: num_dev %d\n", __func__, res->num_devs); ++ ++ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) ++ return -NFS4ERR_REP_TOO_BIG; ++ ++ p = xdr_inline_decode(xdr, ++ res->num_devs * NFS4_PNFS_DEVICEID4_SIZE + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ for (i = 0; i < res->num_devs; i++) ++ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ res->eof = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_getdeviceinfo(struct xdr_stream *xdr, ++ struct pnfs_device *pdev) ++{ ++ __be32 *p; ++ uint32_t len, type; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICEINFO); ++ if (status) { ++ if (status == -ETOOSMALL) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->mincount = be32_to_cpup(p); ++ dprintk("%s: Min count too small. 
mincnt = %u\n", ++ __func__, pdev->mincount); ++ } ++ return status; ++ } ++ ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ type = be32_to_cpup(p++); ++ if (type != pdev->layout_type) { ++ dprintk("%s: layout mismatch req: %u pdev: %u\n", ++ __func__, pdev->layout_type, type); ++ return -EINVAL; ++ } ++ /* ++ * Get the length of the opaque device_addr4. xdr_read_pages places ++ * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) ++ * and places the remaining xdr data in xdr_buf->tail ++ */ ++ pdev->mincount = be32_to_cpup(p); ++ xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ ++ ++ /* At most one bitmap word */ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ len = be32_to_cpup(p); ++ if (len) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->dev_notify_types = be32_to_cpup(p); ++ } else ++ pdev->dev_notify_types = 0; ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, ++ struct nfs4_layoutget_res *res) ++{ ++ __be32 *p; ++ int status; ++ u32 layout_count, dummy; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTGET); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->return_on_close = be32_to_cpup(p++); ++ p = xdr_decode_opaque_fixed(p, res->stateid.u.data, NFS4_STATEID_SIZE); ++ layout_count = be32_to_cpup(p); ++ if (!layout_count) { ++ dprintk("%s: server responded with empty layout array\n", ++ __func__); ++ return -EINVAL; ++ } ++ ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ p = xdr_decode_hyper(p, &res->range.offset); ++ p = xdr_decode_hyper(p, &res->range.length); ++ res->range.iomode = be32_to_cpup(p++); ++ res->type = be32_to_cpup(p++); ++ ++ status = decode_opaque_inline(xdr, 
&res->layout.len, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ ++ dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", ++ __func__, ++ (unsigned long)res->range.offset, ++ (unsigned long)res->range.length, ++ res->range.iomode, ++ res->type, ++ res->layout.len); ++ ++ /* presuambly, nfs4_proc_layoutget allocated a single page */ ++ if (res->layout.len > PAGE_SIZE) ++ return -ENOMEM; ++ memcpy(res->layout.buf, p, res->layout.len); ++ ++ /* FIXME: the whole layout array should be passed up to the pnfs ++ * client */ ++ if (layout_count > 1) { ++ dprintk("%s: server responded with %d layouts, dropping tail\n", ++ __func__, layout_count); ++ ++ while (--layout_count) { ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ status = decode_opaque_inline(xdr, &dummy, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ } ++ } ++ ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutreturn(struct xdr_stream *xdr, ++ struct nfs4_layoutreturn_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTRETURN); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->lrs_present = be32_to_cpup(p); ++ if (res->lrs_present) ++ status = decode_stateid(xdr, &res->stateid); ++ return status; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutcommit(struct xdr_stream *xdr, ++ struct rpc_rqst *req, ++ struct nfs4_layoutcommit_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->sizechanged = be32_to_cpup(p); ++ ++ if (res->sizechanged) { ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ xdr_decode_hyper(p, &res->newsize); ++ } ++ 
return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" DECODE ROUTINES. + */ +@@ -5259,6 +6049,19 @@ out: + return status; + } + ++static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (!status) ++ status = decode_release_lockowner(&xdr); ++ return status; ++} ++ + /* + * Decode READLINK response + */ +@@ -5696,6 +6499,186 @@ static int nfs4_xdr_dec_reclaim_complete + status = decode_reclaim_complete(&xdr, (void *)NULL); + return status; + } ++ ++/* ++ * Decode GETDEVICELIST response: SEQUENCE, PUTFH, then GETDEVICELIST ++ */ ++static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_getdevicelist_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ dprintk("decoding getdevicelist!\n"); ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status != 0) ++ goto out; ++ status = decode_getdevicelist(&xdr, res->devlist); ++out: ++ return status; ++} ++ ++/* ++ * Decode GETDEVICEINFO response: SEQUENCE, then GETDEVICEINFO ++ */ ++static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_getdeviceinfo_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_getdeviceinfo(&xdr, res->pdev); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTGET response ++ */ ++static int 
nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_layoutget_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutget(&xdr, rqstp, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTRETURN response ++ */ ++static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_layoutreturn_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutreturn(&xdr, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTCOMMIT response ++ */ ++static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_layoutcommit_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutcommit(&xdr, rqstp, res); ++ if (status) ++ goto out; ++ decode_getfattr(&xdr, res->fattr, res->server, ++ !RPC_IS_ASYNC(rqstp->rq_task)); ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server WRITE response ++ */ ++static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream xdr; ++ 
struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_write(&xdr, res); ++ if (!status) ++ return res->count; ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server COMMIT response ++ */ ++static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_commit(&xdr, res); ++out: ++ return status; ++} + #endif /* CONFIG_NFS_V4_1 */ + + __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +@@ -5866,6 +6849,7 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), ++ PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + #if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), +@@ -5873,6 +6857,13 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), ++ PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), ++ PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), ++ PROC(LAYOUTGET, enc_layoutget, dec_layoutget), ++ PROC(LAYOUTCOMMIT, enc_layoutcommit, 
dec_layoutcommit), ++ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), ++ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite), ++ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit), + #endif /* CONFIG_NFS_V4_1 */ + }; + +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-31 20:42:05.532213157 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-31 20:42:05.532213157 -0400 +@@ -0,0 +1,11 @@ ++# ++# Makefile for the pNFS Objects Layout Driver kernel module ++# ++objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o ++ ++# ++# Panasas pNFS Layout Driver kernel module ++# ++panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o ++obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-31 20:42:05.533243491 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-31 20:42:05.534105468 -0400 +@@ -0,0 +1,1087 @@ ++/* ++ * objio_osd.c ++ * ++ * pNFS Objects layout implementation over open-osd initiator library ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++#define _LLU(x) ((unsigned long long)x) ++ ++enum { BIO_MAX_PAGES_KMALLOC = ++ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), ++}; ++ ++/* A per mountpoint struct currently for device cache */ ++struct objio_mount_type { ++ struct list_head dev_list; ++ spinlock_t dev_list_lock; ++}; ++ ++struct _dev_ent { ++ struct list_head list; ++ struct pnfs_deviceid d_id; ++ struct osd_dev *od; ++}; ++ ++static void _dev_list_remove_all(struct objio_mount_type *omt) ++{ ++ spin_lock(&omt->dev_list_lock); ++ ++ while (!list_empty(&omt->dev_list)) { ++ struct _dev_ent *de = list_entry(omt->dev_list.next, ++ struct _dev_ent, list); ++ ++ list_del_init(&de->list); ++ osduld_put_device(de->od); ++ kfree(de); ++ } ++ ++ spin_unlock(&omt->dev_list_lock); ++} ++ ++static struct osd_dev *___dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct list_head *le; ++ ++ list_for_each(le, &omt->dev_list) { ++ struct _dev_ent *de = list_entry(le, struct _dev_ent, list); ++ ++ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id))) ++ return de->od; ++ } ++ ++ return NULL; ++} ++ ++static struct osd_dev *_dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct osd_dev *od; ++ ++ spin_lock(&omt->dev_list_lock); ++ od = ___dev_list_find(omt, d_id); ++ spin_unlock(&omt->dev_list_lock); ++ return od; ++} ++ ++static int _dev_list_add(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id, struct osd_dev *od) ++{ ++ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL); ++ ++ if (!de) ++ return -ENOMEM; ++ ++ spin_lock(&omt->dev_list_lock); ++ ++ if (___dev_list_find(omt, d_id)) { ++ kfree(de); ++ goto out; ++ } ++ ++ de->d_id = *d_id; ++ de->od = od; ++ list_add(&de->list, &omt->dev_list); ++ ++out: ++ spin_unlock(&omt->dev_list_lock); ++ return 0; ++} ++ ++struct 
objio_segment { ++ struct pnfs_osd_layout *layout; ++ ++ unsigned mirrors_p1; ++ unsigned stripe_unit; ++ unsigned group_width; /* Data stripe_units without integrity comps */ ++ u64 group_depth; ++ unsigned group_count; ++ ++ unsigned num_comps; ++ /* variable length */ ++ struct osd_dev *ods[1]; ++}; ++ ++struct objio_state; ++typedef ssize_t (*objio_done_fn)(struct objio_state *ios); ++ ++struct objio_state { ++ /* Generic layer */ ++ struct objlayout_io_state ol_state; ++ ++ struct objio_segment *objio_seg; ++ ++ struct kref kref; ++ objio_done_fn done; ++ void *private; ++ ++ unsigned long length; ++ unsigned numdevs; /* Actually used devs in this IO */ ++ /* A per-device variable array of size numdevs */ ++ struct _objio_per_comp { ++ struct bio *bio; ++ struct osd_request *or; ++ unsigned long length; ++ u64 offset; ++ unsigned dev; ++ } per_dev[]; ++}; ++ ++/* Send and wait for a get_device_info of devices in the layout, ++ then look them up with the osd_initiator library */ ++static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg, unsigned comp) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ struct pnfs_osd_deviceaddr *deviceaddr; ++ struct pnfs_deviceid *d_id; ++ struct osd_dev *od; ++ struct osd_dev_info odi; ++ struct objio_mount_type *omt = PNFS_NFS_SERVER(pnfslay)->pnfs_ld_data; ++ int err; ++ ++ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id; ++ ++ od = _dev_list_find(omt, d_id); ++ if (od) ++ return od; ++ ++ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr); ++ if (unlikely(err)) { ++ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err); ++ return ERR_PTR(err); ++ } ++ ++ odi.systemid_len = deviceaddr->oda_systemid.len; ++ if (odi.systemid_len > sizeof(odi.systemid)) { ++ err = -EINVAL; ++ goto out; ++ } else if (odi.systemid_len) ++ memcpy(odi.systemid, deviceaddr->oda_systemid.data, ++ odi.systemid_len); ++ odi.osdname_len = deviceaddr->oda_osdname.len; 
++ odi.osdname = (u8 *)deviceaddr->oda_osdname.data; ++ ++ if (!odi.osdname_len && !odi.systemid_len) { ++ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", ++ __func__); ++ err = -ENODEV; ++ goto out; ++ } ++ ++ od = osduld_info_lookup(&odi); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ dprintk("%s: osduld_info_lookup => %d\n", __func__, err); ++ goto out; ++ } ++ ++ _dev_list_add(omt, d_id, od); ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ objlayout_put_deviceinfo(deviceaddr); ++ return err ? ERR_PTR(err) : od; ++} ++ ++static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ unsigned i, num_comps = layout->olo_num_comps; ++ int err; ++ ++ /* lookup all devices */ ++ for (i = 0; i < num_comps; i++) { ++ struct osd_dev *od; ++ ++ od = _device_lookup(pnfslay, objio_seg, i); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ goto out; ++ } ++ objio_seg->ods[i] = od; ++ } ++ objio_seg->num_comps = num_comps; ++ err = 0; ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ return err; ++} ++ ++static int _verify_data_map(struct pnfs_osd_layout *layout) ++{ ++ struct pnfs_osd_data_map *data_map = &layout->olo_map; ++ u64 stripe_length; ++ u32 group_width; ++ ++/* FIXME: Only raid0 for now. 
if not go through MDS */ ++ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { ++ printk(KERN_ERR "Only RAID_0 for now\n"); ++ return -ENOTSUPP; ++ } ++ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { ++ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", ++ data_map->odm_num_comps, data_map->odm_mirror_cnt); ++ return -EINVAL; ++ } ++ ++ if (data_map->odm_group_width) ++ group_width = data_map->odm_group_width; ++ else ++ group_width = data_map->odm_num_comps / ++ (data_map->odm_mirror_cnt + 1); ++ ++ /* ++ * objio_alloc_lseg() copies this geometry into the segment and ++ * _calc_stripe_info() divides by stripe_unit, by ++ * stripe_unit * group_width, and by that product scaled with ++ * group_depth and group_count, so none of them may be zero. ++ * group_count is zero when group_width exceeds the number of ++ * components per mirror. ++ */ ++ if (!data_map->odm_stripe_unit || !group_width || ++ group_width > data_map->odm_num_comps / ++ (data_map->odm_mirror_cnt + 1) || ++ (data_map->odm_group_width && !data_map->odm_group_depth)) { ++ printk(KERN_ERR "Data Map wrong, stripe_unit=0x%llx" ++ " group_width=%u group_depth=%llu\n", ++ _LLU(data_map->odm_stripe_unit), group_width, ++ _LLU(data_map->odm_group_depth)); ++ return -EINVAL; ++ } ++ ++ stripe_length = (u64)data_map->odm_stripe_unit * group_width; ++ if (stripe_length >= (1ULL << 32)) { ++ printk(KERN_ERR "Total Stripe length(0x%llx)" ++ " >= 32bit is not supported\n", _LLU(stripe_length)); ++ return -ENOTSUPP; ++ } ++ ++ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { ++ printk(KERN_ERR "Stripe Unit(0x%llx)" ++ " must be Multples of PAGE_SIZE(0x%lx)\n", ++ _LLU(data_map->odm_stripe_unit), PAGE_SIZE); ++ return -ENOTSUPP; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Validate @layout's data map, allocate an objio_segment with one ++ * osd_dev slot per layout component, look up every component's device ++ * and cache the striping geometry in the segment. ++ * ++ * Returns 0 with *outp pointing at the new segment, or a negative errno. ++ * *outp is set to NULL only when the device lookup fails; on validation ++ * or allocation failure it is left untouched. ++ */ ++int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) ++{ ++ struct objio_segment *objio_seg; ++ int err; ++ ++ err = _verify_data_map(layout); ++ if (unlikely(err)) ++ return err; ++ ++ objio_seg = kzalloc(sizeof(*objio_seg) + ++ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]), ++ GFP_KERNEL); ++ if (!objio_seg) ++ return -ENOMEM; ++ ++ objio_seg->layout = layout; ++ err = objio_devices_lookup(pnfslay, objio_seg); ++ if (err) ++ goto free_seg; ++ ++ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1; ++ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit; ++ if (layout->olo_map.odm_group_width) { ++ objio_seg->group_width = layout->olo_map.odm_group_width; ++ objio_seg->group_depth = layout->olo_map.odm_group_depth; ++ objio_seg->group_count = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1 / ++ objio_seg->group_width; ++ } else { ++ 
objio_seg->group_width = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1; ++ objio_seg->group_depth = -1; ++ objio_seg->group_count = 1; ++ } ++ ++ *outp = objio_seg; ++ return 0; ++ ++free_seg: ++ dprintk("%s: Error: return %d\n", __func__, err); ++ kfree(objio_seg); ++ *outp = NULL; ++ return err; ++} ++ ++void objio_free_lseg(void *p) ++{ ++ struct objio_segment *objio_seg = p; ++ ++ kfree(objio_seg); ++} ++ ++int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct objio_segment *objio_seg = seg; ++ struct objio_state *ios; ++ const unsigned first_size = sizeof(*ios) + ++ objio_seg->num_comps * sizeof(ios->per_dev[0]); ++ const unsigned sec_size = objio_seg->num_comps * ++ sizeof(ios->ol_state.ioerrs[0]); ++ ++ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps); ++ ios = kzalloc(first_size + sec_size, GFP_KERNEL); ++ if (unlikely(!ios)) ++ return -ENOMEM; ++ ++ ios->objio_seg = objio_seg; ++ ios->ol_state.ioerrs = ((void *)ios) + first_size; ++ ios->ol_state.num_comps = objio_seg->num_comps; ++ ++ *outp = &ios->ol_state; ++ return 0; ++} ++ ++void objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ ++ kfree(ios); ++} ++ ++enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) ++{ ++ switch (oep) { ++ case OSD_ERR_PRI_NO_ERROR: ++ return (enum pnfs_osd_errno)0; ++ ++ case OSD_ERR_PRI_CLEAR_PAGES: ++ BUG_ON(1); ++ return 0; ++ ++ case OSD_ERR_PRI_RESOURCE: ++ return PNFS_OSD_ERR_RESOURCE; ++ case OSD_ERR_PRI_BAD_CRED: ++ return PNFS_OSD_ERR_BAD_CRED; ++ case OSD_ERR_PRI_NO_ACCESS: ++ return PNFS_OSD_ERR_NO_ACCESS; ++ case OSD_ERR_PRI_UNREACHABLE: ++ return PNFS_OSD_ERR_UNREACHABLE; ++ case OSD_ERR_PRI_NOT_FOUND: ++ return PNFS_OSD_ERR_NOT_FOUND; ++ case OSD_ERR_PRI_NO_SPACE: ++ return PNFS_OSD_ERR_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case OSD_ERR_PRI_EIO: ++ return PNFS_OSD_ERR_EIO; ++ 
} ++} ++ ++static void _clear_bio(struct bio *bio) ++{ ++ struct bio_vec *bv; ++ unsigned i; ++ ++ __bio_for_each_segment(bv, bio, i, 0) { ++ unsigned this_count = bv->bv_len; ++ ++ if (likely(PAGE_SIZE == this_count)) ++ clear_highpage(bv->bv_page); ++ else ++ zero_user(bv->bv_page, bv->bv_offset, this_count); ++ } ++} ++ ++static int _io_check(struct objio_state *ios, bool is_write) ++{ ++ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; ++ int lin_ret = 0; ++ int i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_sense_info osi; ++ struct osd_request *or = ios->per_dev[i].or; ++ int ret; ++ ++ if (!or) ++ continue; ++ ++ ret = osd_req_decode_sense(or, &osi); ++ if (likely(!ret)) ++ continue; ++ ++ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { ++ /* start read offset passed endof file */ ++ BUG_ON(is_write); ++ _clear_bio(ios->per_dev[i].bio); ++ dprintk("%s: start read offset passed end of file " ++ "offset=0x%llx, length=0x%lx\n", __func__, ++ _LLU(ios->per_dev[i].offset), ++ ios->per_dev[i].length); ++ ++ continue; /* we recovered */ ++ } ++ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev, ++ osd_pri_2_pnfs_err(osi.osd_err_pri), ++ ios->per_dev[i].offset, ++ ios->per_dev[i].length, ++ is_write); ++ ++ if (osi.osd_err_pri >= oep) { ++ oep = osi.osd_err_pri; ++ lin_ret = ret; ++ } ++ } ++ ++ return lin_ret; ++} ++ ++/* ++ * Common IO state helpers. 
++ */ ++static void _io_free(struct objio_state *ios) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[i]; ++ ++ if (per_dev->or) { ++ osd_end_request(per_dev->or); ++ per_dev->or = NULL; ++ } ++ ++ if (per_dev->bio) { ++ bio_put(per_dev->bio); ++ per_dev->bio = NULL; ++ } ++ } ++} ++ ++struct osd_dev * _io_od(struct objio_state *ios, unsigned dev) ++{ ++ unsigned min_dev = ios->objio_seg->layout->olo_comps_index; ++ unsigned max_dev = min_dev + ios->ol_state.num_comps; ++ ++ BUG_ON(dev < min_dev || max_dev <= dev); ++ return ios->objio_seg->ods[dev - min_dev]; ++} ++ ++struct _striping_info { ++ u64 obj_offset; ++ u64 group_length; ++ u64 total_group_length; ++ u64 Major; ++ unsigned dev; ++ unsigned unit_off; ++}; ++ ++static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, ++ struct _striping_info *si) ++{ ++ u32 stripe_unit = ios->objio_seg->stripe_unit; ++ u32 group_width = ios->objio_seg->group_width; ++ u64 group_depth = ios->objio_seg->group_depth; ++ u32 U = stripe_unit * group_width; ++ ++ u64 T = U * group_depth; ++ u64 S = T * ios->objio_seg->group_count; ++ u64 M = div64_u64(file_offset, S); ++ ++ /* ++ G = (L - (M * S)) / T ++ H = (L - (M * S)) % T ++ */ ++ u64 LmodU = file_offset - M * S; ++ u32 G = div64_u64(LmodU, T); ++ u64 H = LmodU - G * T; ++ ++ u32 N = div_u64(H, U); ++ ++ div_u64_rem(file_offset, stripe_unit, &si->unit_off); ++ si->obj_offset = si->unit_off + (N * stripe_unit) + ++ (M * group_depth * stripe_unit); ++ ++ /* "H - (N * U)" is just "H % U" so it's bound to u32 */ ++ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; ++ si->dev *= ios->objio_seg->mirrors_p1; ++ ++ si->group_length = T - H; ++ si->total_group_length = T; ++ si->Major = M; ++} ++ ++static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, ++ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len) ++{ ++ unsigned pg = *cur_pg; ++ struct request_queue *q = ++ 
osd_request_queue(_io_od(ios, per_dev->dev)); ++ ++ per_dev->length += cur_len; ++ ++ if (per_dev->bio == NULL) { ++ unsigned stripes = ios->ol_state.num_comps / ++ ios->objio_seg->mirrors_p1; ++ unsigned pages_in_stripe = stripes * ++ (ios->objio_seg->stripe_unit / PAGE_SIZE); ++ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / ++ stripes; ++ ++ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); ++ if (unlikely(!per_dev->bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", bio_size); ++ return -ENOMEM; ++ } ++ } ++ ++ while (cur_len > 0) { ++ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); ++ unsigned added_len; ++ ++ BUG_ON(ios->ol_state.nr_pages <= pg); ++ cur_len -= pglen; ++ ++ added_len = bio_add_pc_page(q, per_dev->bio, ++ ios->ol_state.pages[pg], pglen, pgbase); ++ if (unlikely(pglen != added_len)) ++ return -ENOMEM; ++ pgbase = 0; ++ ++pg; ++ } ++ BUG_ON(cur_len); ++ ++ *cur_pg = pg; ++ return 0; ++} ++ ++static int _prepare_one_group(struct objio_state *ios, u64 length, ++ struct _striping_info *si, unsigned first_comp, ++ unsigned *last_pg) ++{ ++ unsigned stripe_unit = ios->objio_seg->stripe_unit; ++ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1; ++ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1; ++ unsigned dev = si->dev; ++ unsigned first_dev = dev - (dev % devs_in_group); ++ unsigned comp = first_comp + (dev - first_dev); ++ unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; ++ unsigned cur_pg = *last_pg; ++ int ret = 0; ++ ++ while (length) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[comp]; ++ unsigned cur_len, page_off = 0; ++ ++ if (!per_dev->length) { ++ per_dev->dev = dev; ++ if (dev < si->dev) { ++ per_dev->offset = si->obj_offset + stripe_unit - ++ si->unit_off; ++ cur_len = stripe_unit; ++ } else if (dev == si->dev) { ++ per_dev->offset = si->obj_offset; ++ cur_len = stripe_unit - si->unit_off; ++ page_off = si->unit_off & ~PAGE_MASK; ++ BUG_ON(page_off && ++ (page_off != ios->ol_state.pgbase)); ++ } else { /* dev > si->dev */ ++ per_dev->offset = si->obj_offset - si->unit_off; ++ cur_len = stripe_unit; ++ } ++ ++ if (max_comp < comp) ++ max_comp = comp; ++ ++ dev += mirrors_p1; ++ dev = (dev % devs_in_group) + first_dev; ++ } else { ++ cur_len = stripe_unit; ++ } ++ if (cur_len >= length) ++ cur_len = length; ++ ++ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, ++ cur_len); ++ if (unlikely(ret)) ++ goto out; ++ ++ comp += mirrors_p1; ++ comp = (comp % devs_in_group) + first_comp; ++ ++ length -= cur_len; ++ ios->length += cur_len; ++ } ++out: ++ ios->numdevs = max_comp + mirrors_p1; ++ *last_pg = cur_pg; ++ return ret; ++} ++ ++static int _io_rw_pagelist(struct objio_state *ios) ++{ ++ u64 length = ios->ol_state.count; ++ struct _striping_info si; ++ unsigned devs_in_group = ios->objio_seg->group_width * ++ ios->objio_seg->mirrors_p1; ++ unsigned first_comp = 0; ++ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps; ++ unsigned last_pg = 0; ++ int ret = 0; ++ ++ _calc_stripe_info(ios, ios->ol_state.offset, &si); ++ while (length) { ++ if (length < si.group_length) ++ si.group_length = length; ++ ++ ret = _prepare_one_group(ios, si.group_length, &si, first_comp, ++ &last_pg); ++ if (unlikely(ret)) ++ goto out; ++ ++ length -= si.group_length; ++ ++ si.group_length = si.total_group_length; ++ si.unit_off = 0; ++ ++si.Major; ++ si.obj_offset = si.Major * 
ios->objio_seg->stripe_unit * ++ ios->objio_seg->group_depth; ++ ++ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; ++ si.dev %= num_comps; ++ ++ first_comp += devs_in_group; ++ first_comp %= num_comps; ++ } ++ ++out: ++ if (!ios->length) ++ return ret; ++ ++ return 0; ++} ++ ++static ssize_t _sync_done(struct objio_state *ios) ++{ ++ struct completion *waiting = ios->private; ++ ++ complete(waiting); ++ return 0; ++} ++ ++static void _last_io(struct kref *kref) ++{ ++ struct objio_state *ios = container_of(kref, struct objio_state, kref); ++ ++ ios->done(ios); ++} ++ ++static void _done_io(struct osd_request *or, void *p) ++{ ++ struct objio_state *ios = p; ++ ++ kref_put(&ios->kref, _last_io); ++} ++ ++static ssize_t _io_exec(struct objio_state *ios) ++{ ++ DECLARE_COMPLETION_ONSTACK(wait); ++ ssize_t status = 0; /* sync status */ ++ unsigned i; ++ objio_done_fn saved_done_fn = ios->done; ++ bool sync = ios->ol_state.sync; ++ ++ if (sync) { ++ ios->done = _sync_done; ++ ios->private = &wait; ++ } ++ ++ kref_init(&ios->kref); ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_request *or = ios->per_dev[i].or; ++ ++ if (!or) ++ continue; ++ ++ kref_get(&ios->kref); ++ osd_execute_request_async(or, _done_io, ios); ++ } ++ ++ kref_put(&ios->kref, _last_io); ++ ++ if (sync) { ++ wait_for_completion(&wait); ++ status = saved_done_fn(ios); ++ } ++ ++ return status; ++} ++ ++/* ++ * read ++ */ ++static ssize_t _read_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, false); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) ++ status = ios->length; ++ else ++ status = ret; ++ ++ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct osd_request *or = NULL; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = per_dev->dev; ++ struct pnfs_osd_object_cred *cred = ++ 
&ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ int ret; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ ++err: ++ return ret; ++} ++ ++static ssize_t _read_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = _read_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _read_done; ++ return _io_exec(ios); /* In sync mode exec returns the io status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _read_exec(ios); ++} ++ ++/* ++ * write ++ */ ++static ssize_t _write_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, true); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) { ++ /* FIXME: should be based on the OSD's persistence model ++ * See OSD2r05 Section 4.13 Data persistence model */ ++ ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC; ++ status = ios->length; ++ } else { ++ status = ret; ++ } ++ ++ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int 
_write_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = ios->per_dev[cur_comp].dev; ++ unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1; ++ int ret; ++ ++ for (; cur_comp < last_comp; ++cur_comp, ++dev) { ++ struct osd_request *or = NULL; ++ struct pnfs_osd_object_cred *cred = ++ &ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ struct bio *bio; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ if (per_dev != master_dev) { ++ bio = bio_kmalloc(GFP_KERNEL, ++ master_dev->bio->bi_max_vecs); ++ if (unlikely(!bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", ++ master_dev->bio->bi_max_vecs); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ __bio_clone(bio, master_dev->bio); ++ bio->bi_bdev = NULL; ++ bio->bi_next = NULL; ++ per_dev->bio = bio; ++ per_dev->dev = dev; ++ per_dev->length = master_dev->length; ++ per_dev->offset = master_dev->offset; ++ } else { ++ bio = master_dev->bio; ++ /* FIXME: bio_set_dir() */ ++ bio->bi_rw |= (1 << BIO_RW); ++ } ++ ++ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ } ++ ++err: ++ return ret; ++} ++ ++static ssize_t _write_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = 
_write_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _write_done; ++ return _io_exec(ios); /* In sync mode exec returns the io->status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ /* TODO: ios->stable = stable; */ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _write_exec(ios); ++} ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++objlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zx\n", __func__, maxsz); ++ return maxsz; ++} ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++objlayout_get_blocksize(void) ++{ ++ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE; ++ ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations objlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. 
++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = objlayout_get_stripesize, ++ .get_blocksize = objlayout_get_blocksize, ++}; ++ ++static struct pnfs_layoutdriver_type objlayout_type = { ++ .id = LAYOUT_OSD2_OBJECTS, ++ .name = "LAYOUT_OSD2_OBJECTS", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &objlayout_policy_operations, ++}; ++ ++void *objio_init_mt(void) ++{ ++ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL); ++ ++ if (!omt) ++ return ERR_PTR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&omt->dev_list); ++ spin_lock_init(&omt->dev_list_lock); ++ return omt; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++ _dev_list_remove_all(mountid); ++ kfree(mountid); ++} ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++objlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++objlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", ++ __func__); ++} ++ ++module_init(objlayout_init); ++module_exit(objlayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-31 20:42:05.535059115 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-31 20:42:05.535059115 -0400 +@@ -0,0 +1,790 @@ ++/* ++ * objlayout.c ++ * ++ * pNFS layout driver for Panasas OSDs ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. 
++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pnfs_client_operations *pnfs_client_ops; ++ ++/* ++ * Create a objlayout layout structure for the given inode and return it. 
++ */ ++static struct pnfs_layout_hdr * ++objlayout_alloc_layout(struct inode *inode) ++{ ++ struct objlayout *objlay; ++ ++ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL); ++ if (objlay) { ++ spin_lock_init(&objlay->lock); ++ INIT_LIST_HEAD(&objlay->err_list); ++ } ++ dprintk("%s: Return %p\n", __func__, objlay); ++ return &objlay->pnfs_layout; ++} ++ ++/* ++ * Free an objlayout layout structure ++ */ ++static void ++objlayout_free_layout(struct pnfs_layout_hdr *lo) ++{ ++ struct objlayout *objlay = OBJLAYOUT(lo); ++ ++ dprintk("%s: objlay %p\n", __func__, objlay); ++ ++ WARN_ON(!list_empty(&objlay->err_list)); ++ kfree(objlay); ++} ++ ++/* ++ * Unmarshall layout and store it in pnfslay. ++ */ ++static struct pnfs_layout_segment * ++objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, ++ struct nfs4_layoutget_res *lgr) ++{ ++ int status; ++ void *layout = lgr->layout.buf; ++ struct pnfs_layout_segment *lseg; ++ struct objlayout_segment *objlseg; ++ struct pnfs_osd_layout *pnfs_osd_layout; ++ ++ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout); ++ ++ BUG_ON(!layout); ++ ++ status = -ENOMEM; ++ lseg = kzalloc(sizeof(*lseg) + sizeof(*objlseg) + ++ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL); ++ if (!lseg) ++ goto err; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout); ++ ++ status = objio_alloc_lseg(&objlseg->internal, pnfslay, lseg, ++ pnfs_osd_layout); ++ if (status) ++ goto err; ++ ++ dprintk("%s: Return %p\n", __func__, lseg); ++ return lseg; ++ ++ err: ++ kfree(lseg); ++ return ERR_PTR(status); ++} ++ ++/* ++ * Free a layout segement ++ */ ++static void ++objlayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ struct objlayout_segment *objlseg; ++ ++ dprintk("%s: freeing layout segment %p\n", __func__, lseg); ++ ++ if (unlikely(!lseg)) ++ return; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++ 
objio_free_lseg(objlseg->internal); ++ kfree(lseg); ++} ++ ++/* ++ * I/O Operations ++ */ ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1 : NFS4_MAX_UINT64; ++} ++ ++static struct objlayout_io_state * ++objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ++ struct page **pages, ++ unsigned pgbase, ++ unsigned nr_pages, ++ loff_t offset, ++ size_t count, ++ struct pnfs_layout_segment *lseg, ++ void *rpcdata) ++{ ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct objlayout_io_state *state; ++ u64 lseg_end_offset; ++ size_t size_nr_pages; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ if (objio_alloc_io_state(objlseg->internal, &state)) ++ return NULL; ++ ++ BUG_ON(offset < lseg->range.offset); ++ lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length); ++ BUG_ON(offset >= lseg_end_offset); ++ if (offset + count > lseg_end_offset) { ++ count = lseg->range.length - (offset - lseg->range.offset); ++ dprintk("%s: truncated count %Zd\n", __func__, count); ++ } ++ ++ if (pgbase > PAGE_SIZE) { ++ unsigned n = pgbase >> PAGE_SHIFT; ++ ++ pgbase &= ~PAGE_MASK; ++ pages += n; ++ nr_pages -= n; ++ } ++ ++ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ BUG_ON(nr_pages < size_nr_pages); ++ if (nr_pages > size_nr_pages) ++ nr_pages = size_nr_pages; ++ ++ INIT_LIST_HEAD(&state->err_list); ++ state->lseg = lseg; ++ state->rpcdata = rpcdata; ++ state->pages = pages; ++ state->pgbase = pgbase; ++ state->nr_pages = nr_pages; ++ state->offset = offset; ++ state->count = count; ++ state->sync = 0; ++ ++ return state; ++} ++ ++static void ++objlayout_free_io_state(struct objlayout_io_state *state) ++{ ++ dprintk("%s: freeing io_state\n", 
__func__); ++ if (unlikely(!state)) ++ return; ++ ++ objio_free_io_state(state); ++} ++ ++/* ++ * I/O done common code: free state on success, else queue it on err_list ++ */ ++static void ++objlayout_iodone(struct objlayout_io_state *state) ++{ ++ dprintk("%s: state %p status\n", __func__, state); ++ ++ if (likely(state->status >= 0)) { ++ objlayout_free_io_state(state); ++ } else { ++ struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); ++ ++ spin_lock(&objlay->lock); ++ objlay->delta_space_valid = OBJ_DSU_INVALID; ++ list_add(&state->err_list, &objlay->err_list); ++ spin_unlock(&objlay->lock); ++ } ++} ++ ++/* ++ * objlayout_io_set_result - Set an osd_error code on a specific osd comp. ++ * ++ * The @index component IO failed (error returned from target). Register ++ * the error for later reporting at layout-return. ++ */ ++void ++objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, ++ int osd_error, u64 offset, u64 length, bool is_write) ++{ ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; ++ ++ BUG_ON(index >= state->num_comps); ++ if (osd_error) { ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(state->lseg); ++ struct pnfs_osd_layout *layout = ++ (typeof(layout))objlseg->pnfs_osd_layout; ++ ++ ioerr->oer_component = layout->olo_comps[index].oc_object_id; ++ ioerr->oer_comp_offset = offset; ++ ioerr->oer_comp_length = length; ++ ioerr->oer_iswrite = is_write; ++ ioerr->oer_errno = osd_error; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " ++ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", ++ __func__, index, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ } else { ++ /* User need not call if no error is reported */ ++ ioerr->oer_errno = 0; ++ } ++} ++ ++static void _rpc_commit_complete(struct work_struct *work) ++{ 
++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_client_ops->nfs_commit_complete(wdata); ++} ++ ++/* ++ * Commit data remotely on OSDs ++ */ ++enum pnfs_try_status ++objlayout_commit(struct nfs_write_data *wdata, int how) ++{ ++ int status = PNFS_ATTEMPTED; ++ ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ dprintk("%s: Return %d\n", __func__, status); ++ return status; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). ++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_read_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++} ++ ++void ++objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) ++{ ++ int eof = state->eof; ++ struct nfs_read_data *rdata; ++ ++ state->status = status; ++ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); ++ rdata = state->rpcdata; ++ rdata->task.tk_status = status; ++ if (status >= 0) { ++ rdata->res.count = status; ++ rdata->res.eof = eof; ++ } ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++ else { ++ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); ++ schedule_work(&rdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async reads. 
++ */ ++enum pnfs_try_status ++objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages) ++{ ++ loff_t offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct objlayout_io_state *state; ++ ssize_t status = 0; ++ loff_t eof; ++ ++ dprintk("%s: Begin inode %p offset %llu count %d\n", ++ __func__, rdata->inode, offset, (int)count); ++ ++ eof = i_size_read(rdata->inode); ++ if (unlikely(offset + count > eof)) { ++ if (offset >= eof) { ++ status = 0; ++ rdata->res.count = 0; ++ rdata->res.eof = 1; ++ goto out; ++ } ++ count = eof - offset; ++ } ++ ++ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, ++ rdata->args.pages, rdata->args.pgbase, ++ nr_pages, offset, count, ++ rdata->pdata.lseg, rdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->eof = state->offset + state->count >= eof; ++ ++ status = objio_read_pagelist(state); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ rdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). 
++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_write_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_client_ops->nfs_writelist_complete(wdata); ++} ++ ++void ++objlayout_write_done(struct objlayout_io_state *state, ssize_t status, ++ bool sync) ++{ ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s: Begin\n", __func__); ++ wdata = state->rpcdata; ++ state->status = status; ++ wdata->task.tk_status = status; ++ if (status >= 0) { ++ wdata->res.count = status; ++ wdata->verf.committed = state->committed; ++ dprintk("%s: Return status %d committed %d\n", ++ __func__, wdata->task.tk_status, ++ wdata->verf.committed); ++ } else ++ dprintk("%s: Return status %d\n", ++ __func__, wdata->task.tk_status); ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_client_ops->nfs_writelist_complete(wdata); ++ else { ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async writes. 
++ */ ++enum pnfs_try_status ++objlayout_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int how) ++{ ++ struct objlayout_io_state *state; ++ ssize_t status; ++ ++ dprintk("%s: Begin inode %p offset %llu count %u\n", ++ __func__, wdata->inode, wdata->args.offset, wdata->args.count); ++ ++ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, ++ wdata->args.pages, ++ wdata->args.pgbase, ++ nr_pages, ++ wdata->args.offset, ++ wdata->args.count, ++ wdata->pdata.lseg, wdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->sync = how & FLUSH_SYNC; ++ ++ status = objio_write_pagelist(state, how & FLUSH_STABLE); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ wdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++void ++objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct pnfs_osd_layoutupdate lou; ++ __be32 *start; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ spin_lock(&objlay->lock); ++ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); ++ lou.dsu_delta = objlay->delta_space_used; ++ objlay->delta_space_used = 0; ++ objlay->delta_space_valid = OBJ_DSU_INIT; ++ lou.olu_ioerr_flag = !list_empty(&objlay->err_list); ++ spin_unlock(&objlay->lock); ++ ++ start = xdr_reserve_space(xdr, 4); ++ ++ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ ++ dprintk("%s: Return delta_space_used %lld err %d\n", __func__, ++ lou.dsu_delta, lou.olu_ioerr_flag); ++} ++ ++static int ++err_prio(u32 oer_errno) ++{ ++ switch (oer_errno) { ++ case 0: ++ return 0; ++ ++ case PNFS_OSD_ERR_RESOURCE: ++ return OSD_ERR_PRI_RESOURCE; ++ case PNFS_OSD_ERR_BAD_CRED: ++ return OSD_ERR_PRI_BAD_CRED; ++ case PNFS_OSD_ERR_NO_ACCESS: ++ return OSD_ERR_PRI_NO_ACCESS; ++ case PNFS_OSD_ERR_UNREACHABLE: 
++ return OSD_ERR_PRI_UNREACHABLE; ++ case PNFS_OSD_ERR_NOT_FOUND: ++ return OSD_ERR_PRI_NOT_FOUND; ++ case PNFS_OSD_ERR_NO_SPACE: ++ return OSD_ERR_PRI_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case PNFS_OSD_ERR_EIO: ++ return OSD_ERR_PRI_EIO; ++ } ++} ++ ++static void ++merge_ioerr(struct pnfs_osd_ioerr *dest_err, ++ const struct pnfs_osd_ioerr *src_err) ++{ ++ u64 dest_end, src_end; ++ ++ if (!dest_err->oer_errno) { ++ *dest_err = *src_err; ++ /* accumulated device must be blank */ ++ memset(&dest_err->oer_component.oid_device_id, 0, ++ sizeof(dest_err->oer_component.oid_device_id)); ++ ++ return; ++ } ++ ++ if (dest_err->oer_component.oid_partition_id != ++ src_err->oer_component.oid_partition_id) ++ dest_err->oer_component.oid_partition_id = 0; ++ ++ if (dest_err->oer_component.oid_object_id != ++ src_err->oer_component.oid_object_id) ++ dest_err->oer_component.oid_object_id = 0; ++ /* take both ends before lowering the start so the union is kept */ ++ dest_end = end_offset(dest_err->oer_comp_offset, ++ dest_err->oer_comp_length); ++ src_end = end_offset(src_err->oer_comp_offset, ++ src_err->oer_comp_length); ++ if (dest_end < src_end) ++ dest_end = src_end; ++ ++ if (dest_err->oer_comp_offset > src_err->oer_comp_offset) ++ dest_err->oer_comp_offset = src_err->oer_comp_offset; ++ ++ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; ++ ++ if ((src_err->oer_iswrite == dest_err->oer_iswrite) && ++ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { ++ dest_err->oer_errno = src_err->oer_errno; ++ } else if (src_err->oer_iswrite && !dest_err->oer_iswrite) { ++ dest_err->oer_iswrite = true; ++ dest_err->oer_errno = src_err->oer_errno; ++ } ++} ++ ++static void ++encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr) ++{ ++ struct objlayout_io_state *state, *tmp; ++ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ ++ for (i = 0; i < state->num_comps; i++) { ++ struct pnfs_osd_ioerr *ioerr = 
&state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ merge_ioerr(&accumulated_err, ioerr); ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++ ++ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err)); ++} ++ ++void ++objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct objlayout_io_state *state, *tmp; ++ __be32 *start, *uninitialized_var(last_xdr); ++ ++ dprintk("%s: Begin\n", __func__); ++ start = xdr_reserve_space(xdr, 4); ++ BUG_ON(!start); ++ ++ spin_lock(&objlay->lock); ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ int res = 0; ++ ++ for (i = 0; i < state->num_comps && !res; i++) { ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ last_xdr = xdr->p; ++ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]); ++ } ++ if (unlikely(res)) { ++ /* no space for even one error descriptor */ ++ BUG_ON(last_xdr == start + 1); ++ ++ /* we've 
encountered a situation with lots and lots of ++ * errors and no space to encode them all. Use the last ++ * available slot to report the union of all the ++ * remaining errors. ++ */ ++ xdr_rewind_stream(xdr, last_xdr - ++ pnfs_osd_ioerr_xdr_sz() / 4); ++ encode_accumulated_error(objlay, xdr); ++ goto loop_done; ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++loop_done: ++ spin_unlock(&objlay->lock); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ dprintk("%s: Return\n", __func__); ++} ++ ++struct objlayout_deviceinfo { ++ struct page *page; ++ struct pnfs_osd_deviceaddr da; /* This must be last */ ++}; ++ ++/* Initialize and call nfs_getdeviceinfo, then decode and return a ++ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() ++ * should be called. ++ */ ++int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi; ++ struct pnfs_device pd; ++ struct super_block *sb; ++ struct page *page; ++ size_t sz; ++ u32 *p; ++ int err; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ pd.area = page_address(page); ++ ++ memcpy(&pd.dev_id, d_id, sizeof(*d_id)); ++ pd.layout_type = LAYOUT_OSD2_OBJECTS; ++ pd.dev_notify_types = 0; ++ pd.pages = &page; ++ pd.pgbase = 0; ++ pd.pglen = PAGE_SIZE; ++ pd.mincount = 0; ++ ++ sb = PNFS_INODE(pnfslay)->i_sb; ++ err = pnfs_client_ops->nfs_getdeviceinfo(PNFS_NFS_SERVER(pnfslay), &pd); ++ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); ++ if (err) ++ goto err_out; ++ ++ p = pd.area; ++ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p); ++ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL); ++ if (!odi) { ++ err = -ENOMEM; ++ goto err_out; ++ } ++ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); ++ odi->page = page; ++ *deviceaddr = &odi->da; ++ return 0; ++ ++err_out: ++ __free_page(page); ++ return err; ++} ++ ++void 
objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi = container_of(deviceaddr, ++ struct objlayout_deviceinfo, ++ da); ++ ++ __free_page(odi->page); ++ kfree(odi); ++} ++ ++/* ++ * Initialize a mountpoint by retrieving the list of ++ * available devices for it. ++ * Return the pnfs_mount_type structure so the ++ * pNFS_client can refer to the mount point later on. ++ */ ++static int ++objlayout_initialize_mountpoint(struct nfs_server *server, ++ const struct nfs_fh *mntfh) ++{ ++ void *data; ++ ++ data = objio_init_mt(); ++ if (IS_ERR(data)) { ++ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n", ++ __func__, PTR_ERR(data)); ++ return PTR_ERR(data); ++ } ++ server->pnfs_ld_data = data; ++ ++ dprintk("%s: Return data=%p\n", __func__, data); ++ return 0; ++} ++ ++/* ++ * Uninitialize a mountpoint ++ */ ++static int ++objlayout_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data); ++ objio_fini_mt(server->pnfs_ld_data); ++ return 0; ++} ++ ++struct layoutdriver_io_operations objlayout_io_operations = { ++ .commit = objlayout_commit, ++ .read_pagelist = objlayout_read_pagelist, ++ .write_pagelist = objlayout_write_pagelist, ++ .alloc_layout = objlayout_alloc_layout, ++ .free_layout = objlayout_free_layout, ++ .alloc_lseg = objlayout_alloc_lseg, ++ .free_lseg = objlayout_free_lseg, ++ .encode_layoutcommit = objlayout_encode_layoutcommit, ++ .encode_layoutreturn = objlayout_encode_layoutreturn, ++ .initialize_mountpoint = objlayout_initialize_mountpoint, ++ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, ++}; +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-31 20:42:05.535059115 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-31 20:42:05.535059115 -0400 +@@ -0,0 +1,171 @@ ++/* ++ * 
objlayout.h ++ * ++ * Data types and function declarations for interfacing with the ++ * pNFS standard object layout driver. ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#ifndef _OBJLAYOUT_H ++#define _OBJLAYOUT_H ++ ++#include ++#include ++#include ++ ++/* ++ * in-core layout segment ++ */ ++struct objlayout_segment { ++ void *internal; /* for provider internal use */ ++ u8 pnfs_osd_layout[]; ++}; ++ ++/* ++ * per-inode layout ++ */ ++struct objlayout { ++ struct pnfs_layout_hdr pnfs_layout; ++ ++ /* for layout_commit */ ++ enum osd_delta_space_valid_enum { ++ OBJ_DSU_INIT = 0, ++ OBJ_DSU_VALID, ++ OBJ_DSU_INVALID, ++ } delta_space_valid; ++ s64 delta_space_used; /* consumed by write ops */ ++ ++ /* for layout_return */ ++ spinlock_t lock; ++ struct list_head err_list; ++}; ++ ++static inline struct objlayout * ++OBJLAYOUT(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct objlayout, pnfs_layout); ++} ++ ++/* ++ * per-I/O operation state ++ * embedded in objects provider io_state data structure ++ */ ++struct objlayout_io_state { ++ struct pnfs_layout_segment *lseg; ++ ++ struct page **pages; ++ unsigned pgbase; ++ unsigned nr_pages; ++ unsigned long count; ++ loff_t offset; ++ bool sync; ++ ++ void *rpcdata; ++ int status; /* res */ ++ int eof; /* res */ ++ int committed; /* res */ ++ ++ /* Error reporting (layout_return) */ ++ struct list_head err_list; ++ unsigned num_comps; ++ /* Pointer to array of error descriptors of size num_comps. ++ * It should contain as many entries as devices in the osd_layout ++ * that participate in the I/O. It is up to the io_engine to allocate ++ * needed space and set num_comps. 
++ */ ++ struct pnfs_osd_ioerr *ioerrs; ++}; ++ ++/* ++ * Raid engine I/O API ++ */ ++extern void *objio_init_mt(void); ++extern void objio_fini_mt(void *mt); ++ ++extern int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout); ++extern void objio_free_lseg(void *p); ++ ++extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp); ++extern void objio_free_io_state(struct objlayout_io_state *state); ++ ++extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); ++extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable); ++ ++/* ++ * callback API ++ */ ++extern void objlayout_io_set_result(struct objlayout_io_state *state, ++ unsigned index, int osd_error, ++ u64 offset, u64 length, bool is_write); ++ ++static inline void ++objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) ++{ ++ struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); ++ ++ /* If one of the I/Os errored out and the delta_space_used was ++ * invalid we render the complete report as invalid. Protocol mandate ++ * the DSU be accurate or not reported. 
++ */ ++ spin_lock(&objlay->lock); ++ if (objlay->delta_space_valid != OBJ_DSU_INVALID) { ++ objlay->delta_space_valid = OBJ_DSU_VALID; ++ objlay->delta_space_used += space_used; ++ } ++ spin_unlock(&objlay->lock); ++} ++ ++extern void objlayout_read_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++extern void objlayout_write_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++ ++extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr); ++extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); ++ ++/* ++ * exported generic objects function vectors ++ */ ++extern struct layoutdriver_io_operations objlayout_io_operations; ++extern struct pnfs_client_operations *pnfs_client_ops; ++ ++#endif /* _OBJLAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-31 20:42:05.536110535 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-31 20:42:05.536110535 -0400 +@@ -0,0 +1,734 @@ ++/* ++ * panfs_shim.c ++ * ++ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "objlayout.h" ++#include "panfs_shim.h" ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct panfs_export_operations *panfs_export_ops; ++ ++void * ++objio_init_mt(void) ++{ ++ return panfs_export_ops == NULL ? 
ERR_PTR(-EAGAIN) : NULL; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++} ++ ++static int ++panfs_shim_conv_raid01(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) { ++ hdr->type = PAN_AGG_RAID1; ++ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1; ++ } else if (layout->olo_num_comps > 1) { ++ hdr->type = PAN_AGG_RAID0; ++ hdr->hdr.raid0.num_comps = layout->olo_num_comps; ++ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit; ++ } else ++ hdr->type = PAN_AGG_SIMPLE; ++ return 0; ++} ++ ++static int ++panfs_shim_conv_raid5(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) ++ goto err; ++ ++ if (lo_map->odm_group_width || lo_map->odm_group_depth) { ++ if (!lo_map->odm_group_width || !lo_map->odm_group_depth) ++ goto err; ++ ++ hdr->type = PAN_AGG_GRP_RAID5_LEFT; ++ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit; ++ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width; ++ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth; ++ /* this is a guess; the Panasas server is not supposed to ++ hand out a layout otherwise */ ++ hdr->hdr.grp_raid5_left.group_layout_policy = ++ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN; ++ } else { ++ hdr->type = PAN_AGG_RAID5_LEFT; ++ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.raid5_left.stripe_unit2 = ++ hdr->hdr.raid5_left.stripe_unit1 = ++ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit; ++ } ++ ++ return 0; ++err: ++ return -EINVAL; ++} ++ ++/* ++ * Convert a pnfs_osd data map into Panasas aggregation layout header ++ */ ++static int ++panfs_shim_conv_pnfs_osd_data_map( ++ struct pnfs_osd_layout 
*layout, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ int status = -EINVAL; ++ struct pnfs_osd_data_map *lo_map = &layout->olo_map; ++ ++ if (!layout->olo_num_comps) { ++ dprintk("%s: !!layout.n_comps(%u)\n", __func__, ++ layout->olo_num_comps); ++ goto err; ++ } ++ ++ switch (lo_map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_0 " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid01(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_5: ++ if (!lo_map->odm_group_width) { ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width " ++ "layout.n_comps(%u)!=map.n_comps(%u) " ++ "|| comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ } else if ((layout->olo_num_comps != lo_map->odm_num_comps && ++ layout->olo_num_comps > lo_map->odm_group_width) || ++ (layout->olo_comps_index % lo_map->odm_group_width)){ ++ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ lo_map->odm_group_width, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid5(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_PQ: ++ default: ++ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__, ++ lo_map->odm_raid_algorithm); ++ goto err; ++ } ++ ++ return status; /* propagate the RAID converter's result */ ++ ++err: ++ return status; ++} ++ ++/* ++ * Convert pnfs_osd layout into Panasas map and caps type ++ */ ++int ++objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) 
++{ ++ int i, total_comps; ++ int status; ++ struct pnfs_osd_object_cred *lo_comp; ++ pan_size_t alloc_sz, local_sz; ++ pan_sm_map_cap_t *mcs = NULL; ++ u8 *buf; ++ pan_agg_comp_obj_t *pan_comp; ++ pan_sm_sec_t *pan_sec; ++ ++ status = -EINVAL; ++ if (layout->olo_num_comps < layout->olo_map.odm_group_width) { ++ total_comps = layout->olo_comps_index + layout->olo_num_comps; ++ } else { ++ /* allocate full map, otherwise SAM gets confused */ ++ total_comps = layout->olo_map.odm_num_comps; ++ } ++ alloc_sz = total_comps * ++ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t)); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p = layout->olo_comps[i].oc_cap.cred; ++ if (panfs_export_ops->sm_sec_t_get_size_otw( ++ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL)) ++ goto err; ++ alloc_sz += local_sz; ++ } ++ ++ status = -ENOMEM; ++ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL); ++ if (!mcs) ++ goto err; ++ buf = (u8 *)&mcs[1]; ++ ++ mcs->offset = lseg->range.offset; ++ mcs->length = lseg->range.length; ++#if 0 ++ /* FIXME: for now */ ++ mcs->expiration_time.ts_sec = 0; ++ mcs->expiration_time.ts_nsec = 0; ++#endif ++ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL; ++ status = panfs_shim_conv_pnfs_osd_data_map(layout, ++ &mcs->full_map.layout_hdr); ++ if (status) ++ goto err; ++ ++ mcs->full_map.components.size = total_comps; ++ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf; ++ buf += total_comps * sizeof(pan_agg_comp_obj_t); ++ ++ mcs->secs.size = total_comps; ++ mcs->secs.data = (pan_sm_sec_t *)buf; ++ buf += total_comps * sizeof(pan_sm_sec_t); ++ ++ lo_comp = layout->olo_comps; ++ pan_comp = mcs->full_map.components.data + layout->olo_comps_index; ++ pan_sec = mcs->secs.data + layout->olo_comps_index; ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p; ++ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id; ++ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id; ++ u64 dev_id = __be64_to_cpup( ++ (__be64 
*)oc_obj_id->oid_device_id.data + 1); ++ ++ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n", ++ __func__, i, ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data), ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1), ++ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id); ++ ++ if (i == 0) { ++ /* make up mgr_id to calm sam down */ ++ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0, ++ &obj_id->dev_id); ++ obj_id->grp_id = oc_obj_id->oid_partition_id; ++ obj_id->obj_id = oc_obj_id->oid_object_id; ++ } ++ ++ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) { ++ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n", ++ __func__, i, (u64)obj_id->grp_id, ++ lo_comp->oc_object_id.oid_partition_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) { ++ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n", ++ __func__, i, obj_id->obj_id, ++ lo_comp->oc_object_id.oid_object_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ pan_comp->dev_id = dev_id; ++ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) { ++ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n", ++ __func__, i, pan_comp->dev_id); ++ status = -EINVAL; ++ goto err; ++ } ++ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) { ++ dprintk("%s: degraded maps not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL; ++ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) { ++ dprintk("%s: cap key security not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ ++ p = lo_comp->oc_cap.cred; ++ panfs_export_ops->sm_sec_t_unmarshall( ++ (pan_sm_sec_otw_t *)&p, ++ pan_sec, ++ buf, ++ alloc_sz, ++ NULL, ++ &local_sz); ++ buf += local_sz; ++ alloc_sz -= local_sz; ++ ++ lo_comp++; ++ pan_comp++; ++ pan_sec++; ++ } ++ ++ *outp = mcs; ++ dprintk("%s:Return mcs=%p\n", __func__, mcs); ++ return 0; ++ ++err: ++ 
objio_free_lseg(mcs); ++ dprintk("%s:Error %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Free a Panasas map and caps type ++ */ ++void ++objio_free_lseg(void *p) ++{ ++ kfree(p); ++} ++ ++/* ++ * I/O routines ++ */ ++int ++objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct panfs_shim_io_state *p; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ p = kzalloc(sizeof(*p), GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ *outp = &p->ol_state; ++ return 0; ++} ++ ++/* ++ * Free an I/O state ++ */ ++void ++objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ int i; ++ ++ dprintk("%s: freeing io_state\n", __func__); ++ for (i = 0; i < state->ol_state.nr_pages; i++) ++ kunmap(state->ol_state.pages[i]); ++ ++ if (state->ucreds) ++ panfs_export_ops->ucreds_put(state->ucreds); ++ kfree(state->sg_list); ++ kfree(state); ++} ++ ++static int ++panfs_shim_pages_to_sg( ++ struct panfs_shim_io_state *state, ++ struct page **pages, ++ unsigned int pgbase, ++ unsigned nr_pages, ++ size_t count) ++{ ++ unsigned i, n; ++ pan_sg_entry_t *sg; ++ ++ dprintk("%s pgbase %u nr_pages %u count %d " ++ "pg0 %p flags 0x%x index %llu\n", ++ __func__, pgbase, nr_pages, (int)count, pages[0], ++ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index); ++ ++ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL); ++ if (sg == NULL) ++ return -ENOMEM; ++ ++ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n", ++ __func__, sg, pages, pgbase, nr_pages); ++ ++ for (i = 0; i < nr_pages; i++) { ++ sg[i].buffer = (char *)kmap(pages[i]) + pgbase; ++ n = PAGE_SIZE - pgbase; ++ pgbase = 0; ++ if (n > count) ++ n = count; ++ sg[i].chunk_size = n; ++ count -= n; ++ if (likely(count)) { ++ sg[i].next = &sg[i+1]; ++ } else { ++ /* we're done */ ++ sg[i].next = NULL; ++ break; ++ } ++ } ++ BUG_ON(count); ++ ++ state->sg_list = sg; ++ return 0; 
++} ++ ++/* ++ * Callback function for async reads ++ */ ++static void ++panfs_shim_read_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.read.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++ status = res_p->length; ++ WARN_ON(status < 0); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_read rc %d: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_read_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.read.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP, ++ &state->u.read.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? 
++ NULL : panfs_shim_read_done, ++ state, NULL, ++ &state->u.read.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_read_done(state, NULL, &state->u.read.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Callback function for async writes ++ */ ++static void ++panfs_shim_write_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.write.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++/* state->ol_state.committed = NFS_FILE_SYNC;*/ ++ state->ol_state.committed = NFS_UNSTABLE; ++ status = res_p->length; ++ WARN_ON(status < 0); ++ ++ objlayout_add_delta_space_used(&state->ol_state, ++ res_p->delta_capacity_used); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_write rc %u: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_write_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable /* unused, PanOSD writes are stable */) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ 
state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.write.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE, ++ &state->u.write.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? ++ NULL : panfs_shim_write_done, ++ state, ++ NULL, ++ &state->u.write.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_write_done(state, NULL, &state->u.write.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++int ++panfs_shim_register(struct panfs_export_operations *ops) ++{ ++ if (panfs_export_ops) { ++ printk(KERN_INFO ++ "%s: panfs already registered (panfs ops %p)\n", ++ __func__, panfs_export_ops); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: registering panfs ops %p\n", ++ __func__, ops); ++ ++ panfs_export_ops = ops; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_register); ++ ++int ++panfs_shim_unregister(void) ++{ ++ if (!panfs_export_ops) { ++ printk(KERN_INFO "%s: panfs is not registered\n", __func__); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: unregistering panfs ops %p\n", ++ __func__, panfs_export_ops); ++ ++ panfs_export_ops = NULL; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_unregister); ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++panlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *panlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)panlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: 
++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ n *= 8; /* FIXME: until we have 2-D coalescing */ ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zd\n", __func__, maxsz); ++ return maxsz; ++} ++ ++#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024) ++#define PANLAYOUT_DEF_STRIPE_WIDTH 9 ++#define PANLAYOUT_MAX_STRIPE_WIDTH 11 ++#define PANLAYOUT_MAX_GATHER_STRIPES 8 ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++panlayout_get_blocksize(void) ++{ ++ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) * ++ PANLAYOUT_DEF_STRIPE_UNIT * ++ PANLAYOUT_MAX_GATHER_STRIPES; ++ dprintk("%s: Return %Zd\n", __func__, sz); ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations panlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. ++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = panlayout_get_stripesize, ++ .get_blocksize = panlayout_get_blocksize, ++}; ++ ++#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS) ++ ++static struct pnfs_layoutdriver_type panlayout_type = { ++ .id = PNFS_LAYOUT_PANOSD, ++ .name = "PNFS_LAYOUT_PANOSD", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &panlayout_policy_operations, ++}; ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++panlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++panlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++} ++ 
++module_init(panlayout_init); ++module_exit(panlayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-31 20:42:05.537124598 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-31 20:42:05.537124598 -0400 +@@ -0,0 +1,482 @@ ++/* ++ * panfs_shim.h ++ * ++ * Data types and external function declerations for interfacing with ++ * panfs (Panasas DirectFlow) I/O stack ++ * ++ * Copyright (C) 2007 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#ifndef _PANLAYOUT_PANFS_SHIM_H ++#define _PANLAYOUT_PANFS_SHIM_H ++ ++typedef s8 pan_int8_t; ++typedef u8 pan_uint8_t; ++typedef s16 pan_int16_t; ++typedef u16 pan_uint16_t; ++typedef s32 pan_int32_t; ++typedef u32 pan_uint32_t; ++typedef s64 pan_int64_t; ++typedef u64 pan_uint64_t; ++ ++/* ++ * from pan_base_types.h ++ */ ++typedef pan_uint64_t pan_rpc_none_t; ++typedef pan_uint32_t pan_rpc_arrdim_t; ++typedef pan_uint32_t pan_status_t; ++typedef pan_uint8_t pan_otw_t; ++typedef pan_uint8_t pan_pad_t; ++ ++typedef pan_uint32_t pan_timespec_sec_t; ++typedef pan_uint32_t pan_timespec_nsec_t; ++ ++typedef struct pan_timespec_s pan_timespec_t; ++struct pan_timespec_s { ++ pan_timespec_sec_t ts_sec; ++ pan_timespec_nsec_t ts_nsec; ++}; ++ ++/* ++ * from pan_std_types.h ++ */ ++typedef pan_uint32_t pan_size_t; ++typedef int pan_bool_t; ++ ++/* ++ * from pan_common_error.h ++ */ ++#define PAN_SUCCESS ((pan_status_t)0) ++#define PAN_ERR_IN_PROGRESS ((pan_status_t)55) ++ ++/* ++ * from pan_sg.h ++ */ ++typedef struct pan_sg_entry_s pan_sg_entry_t; ++struct pan_sg_entry_s { ++ void *buffer; /* pointer to memory */ ++ pan_uint32_t chunk_size; /* size of each chunk (bytes) */ ++ pan_sg_entry_t *next; ++}; ++ ++/* ++ * from pan_storage.h ++ */ ++typedef pan_uint64_t pan_stor_dev_id_t; ++typedef pan_uint32_t pan_stor_obj_grp_id_t; ++typedef 
pan_uint64_t pan_stor_obj_uniq_t; ++typedef pan_uint32_t pan_stor_action_t; ++typedef pan_uint8_t pan_stor_cap_key_t[20]; ++ ++typedef pan_uint8_t pan_stor_key_type_t; ++typedef pan_uint64_t pan_stor_len_t; ++typedef pan_int64_t pan_stor_delta_len_t; ++typedef pan_uint64_t pan_stor_offset_t; ++typedef pan_uint16_t pan_stor_op_t; ++ ++typedef pan_uint16_t pan_stor_sec_level_t; ++ ++struct pan_stor_obj_id_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_obj_uniq_t obj_id; ++ pan_stor_obj_grp_id_t grp_id; ++}; ++ ++typedef struct pan_stor_obj_id_s pan_stor_obj_id_t; ++ ++#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U) ++#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U) ++#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U) ++#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U) ++#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U) ++#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U) ++#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U) ++#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U) ++ ++/* ++ * from pan_aggregation_map.h ++ */ ++typedef pan_uint8_t pan_agg_type_t; ++typedef pan_uint64_t pan_agg_map_version_t; ++typedef pan_uint8_t pan_agg_obj_state_t; ++typedef pan_uint8_t pan_agg_comp_state_t; ++typedef pan_uint8_t pan_agg_comp_flag_t; ++ ++#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00) ++#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01) ++#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02) ++#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03) ++#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04) ++#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05) ++#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06) ++#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07) ++#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00) ++#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01) ++#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02) ++#define 
PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03) ++#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00) ++#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01) ++#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02) ++#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04) ++ ++struct pan_aggregation_map_s { ++ pan_agg_map_version_t version; ++ pan_agg_obj_state_t avail_state; ++ pan_stor_obj_id_t obj_id; ++}; ++ ++typedef struct pan_aggregation_map_s pan_aggregation_map_t; ++ ++struct pan_agg_comp_obj_s { ++ pan_stor_dev_id_t dev_id; ++ pan_agg_comp_state_t avail_state; ++ pan_agg_comp_flag_t comp_flags; ++}; ++ ++typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t; ++ ++struct pan_agg_simple_header_s { ++ pan_uint8_t unused; ++}; ++ ++typedef struct pan_agg_simple_header_s pan_agg_simple_header_t; ++ ++struct pan_agg_raid1_header_s { ++ pan_uint16_t num_comps; ++}; ++ ++typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t; ++ ++struct pan_agg_raid0_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++}; ++ ++typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t; ++ ++struct pan_agg_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit0; ++ pan_uint32_t stripe_unit1; ++ pan_uint32_t stripe_unit2; ++}; ++ ++typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t; ++ ++typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t; ++ ++struct pan_agg_grp_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++ pan_uint16_t rg_width; ++ pan_uint16_t rg_depth; ++ pan_uint8_t group_layout_policy; ++}; ++ ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00) ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01) ++ ++#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00) ++#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02) ++#define PAN_AGG_RAID0 
((pan_agg_type_t) 0x03) ++#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04) ++#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06) ++#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06) ++ ++struct pan_agg_layout_hdr_s { ++ pan_agg_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_uint64_t null; ++ pan_agg_simple_header_t simple; ++ pan_agg_raid1_header_t raid1; ++ pan_agg_raid0_header_t raid0; ++ pan_agg_raid5_left_header_t raid5_left; ++ pan_agg_grp_raid5_left_header_t grp_raid5_left; ++ } hdr; ++}; ++ ++typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t; ++ ++struct pan_agg_comp_obj_a_s { ++ pan_rpc_arrdim_t size; ++ pan_agg_comp_obj_t *data; ++}; ++typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a; ++ ++struct pan_agg_full_map_s { ++ pan_aggregation_map_t map_hdr; ++ pan_agg_layout_hdr_t layout_hdr; ++ pan_agg_comp_obj_a components; ++}; ++ ++typedef struct pan_agg_full_map_s pan_agg_full_map_t; ++ ++/* ++ * from pan_obsd_rpc_types.h ++ */ ++typedef pan_uint8_t pan_obsd_security_key_a[16]; ++ ++typedef pan_uint8_t pan_obsd_capability_key_a[20]; ++ ++typedef pan_uint8_t pan_obsd_key_holder_id_t; ++ ++#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01) ++#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02) ++ ++struct pan_obsd_key_holder_s { ++ pan_obsd_key_holder_id_t select; ++ pan_pad_t pad[3]; ++ union { ++ pan_obsd_security_key_a basis_key; ++ pan_obsd_capability_key_a cap_key; ++ } key; ++}; ++ ++typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t; ++ ++/* ++ * from pan_sm_sec.h ++ */ ++typedef pan_uint8_t pan_sm_sec_type_t; ++typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t; ++ ++struct pan_obsd_capability_generic_otw_t_s { ++ pan_rpc_arrdim_t size; ++ pan_uint8_t *data; ++}; ++typedef struct pan_obsd_capability_generic_otw_t_s ++ pan_obsd_capability_generic_otw_t; ++ ++struct pan_sm_sec_obsd_s { ++ pan_obsd_key_holder_t key; ++ 
pan_obsd_capability_generic_otw_t cap_otw; ++ pan_sm_sec_otw_allo_mode_t allo_mode; ++}; ++ ++typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t; ++ ++struct pan_sm_sec_s { ++ pan_sm_sec_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_rpc_none_t none; ++ pan_sm_sec_obsd_t obsd; ++ } variant; ++}; ++ ++typedef struct pan_sm_sec_s pan_sm_sec_t; ++ ++struct pan_sm_sec_a_s { ++ pan_rpc_arrdim_t size; ++ pan_sm_sec_t *data; ++}; ++typedef struct pan_sm_sec_a_s pan_sm_sec_a; ++typedef pan_otw_t *pan_sm_sec_otw_t; ++ ++/* ++ * from pan_sm_types.h ++ */ ++typedef pan_uint64_t pan_sm_cap_handle_t; ++ ++struct pan_sm_map_cap_s { ++ pan_agg_full_map_t full_map; ++ pan_stor_offset_t offset; ++ pan_stor_len_t length; ++ pan_sm_sec_a secs; ++ pan_sm_cap_handle_t handle; ++ pan_timespec_t expiration_time; ++ pan_stor_action_t action_mask; ++ pan_uint32_t flags; ++}; ++ ++typedef struct pan_sm_map_cap_s pan_sm_map_cap_t; ++ ++/* ++ * from pan_sm_ops.h ++ */ ++typedef pan_rpc_none_t pan_sm_cache_ptr_t; ++ ++/* ++ * from pan_sam_api.h ++ */ ++typedef pan_uint32_t pan_sam_access_flags_t; ++ ++typedef struct pan_sam_dev_error_s pan_sam_dev_error_t; ++struct pan_sam_dev_error_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_op_t stor_op; ++ pan_status_t error; ++}; ++ ++typedef struct pan_sam_ext_status_s pan_sam_ext_status_t; ++struct pan_sam_ext_status_s { ++ pan_uint32_t available; ++ pan_uint32_t size; ++ pan_sam_dev_error_t *errors; ++}; ++ ++enum pan_sam_rpc_sec_sel_e { ++ PAN_SAM_RPC_SEC_DEFAULT, ++ PAN_SAM_RPC_SEC_ATLEAST, ++ PAN_SAM_RPC_SEC_EXACTLY ++}; ++typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t; ++ ++typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t; ++struct pan_sam_obj_sec_s { ++ pan_stor_sec_level_t min_security; ++ pan_sm_map_cap_t *map_ccaps; ++}; ++ ++typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t; ++struct pan_sam_rpc_sec_s { ++ pan_sam_rpc_sec_sel_t selector; ++}; ++ ++typedef struct pan_sam_read_args_s pan_sam_read_args_t; ++struct 
pan_sam_read_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ void *return_attr; ++ void *checksum; ++ pan_stor_offset_t offset; ++ pan_uint16_t sm_options; ++ void *callout; ++ void *callout_arg; ++}; ++ ++typedef struct pan_sam_read_res_s pan_sam_read_res_t; ++struct pan_sam_read_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ void *attr; ++ void *checksum; ++}; ++ ++typedef void (*pan_sam_read_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t status); ++ ++#define PAN_SAM_ACCESS_NONE 0x0000 ++#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020 ++ ++typedef struct pan_sam_write_args_s pan_sam_write_args_t; ++struct pan_sam_write_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ pan_stor_offset_t offset; ++ void *attr; ++ void *return_attr; ++}; ++ ++typedef struct pan_sam_write_res_s pan_sam_write_res_t; ++struct pan_sam_write_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ pan_stor_delta_len_t delta_capacity_used; ++ pan_bool_t parity_dirty; ++ void *attr; ++}; ++ ++typedef void (*pan_sam_write_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t status); ++ ++/* ++ * from pan_mgr_types.h ++ */ ++#define PAN_MGR_ID_TYPE_SHIFT 56 ++#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL) ++#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL) ++ ++typedef pan_uint16_t pan_mgr_type_t; ++typedef pan_uint64_t pan_mgr_id_t; ++ ++#define PAN_MGR_SM ((pan_mgr_type_t) 2U) ++#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U) ++ ++/* ++ * from pan_mgr_types_c.h ++ */ ++#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \ ++ pan_mgr_id_t _id1, _id2; \ ++\ ++ _id1 = (_mgr_type_); \ ++ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \ ++ _id1 &= PAN_MGR_ID_TYPE_MASK; \ ++ _id2 = (_mgr_uniq_); \ ++ _id2 &= PAN_MGR_ID_UNIQ_MASK; \ 
++ _id1 |= _id2; \ ++ *(_mgr_id_p_) = _id1; \ ++} ++ ++/* ++ * from pan_storage_c.h ++ */ ++#define pan_stor_is_device_id_an_obsd_id(_device_id_) \ ++ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \ ++ == PAN_MGR_OBSD) ++ ++/* ++ * pnfs_shim internal definitions ++ */ ++ ++struct panfs_shim_io_state { ++ struct objlayout_io_state ol_state; ++ ++ pan_sg_entry_t *sg_list; ++ pan_sam_obj_sec_t obj_sec; ++ void *ucreds; ++ union { ++ struct { ++ pan_sam_read_args_t args; ++ pan_sam_read_res_t res; ++ } read; ++ struct { ++ pan_sam_write_args_t args; ++ pan_sam_write_res_t res; ++ } write; ++ } u; ++}; ++ ++#endif /* _PANLAYOUT_PANFS_SHIM_H */ +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-31 20:42:05.538121971 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-31 20:42:05.538121971 -0400 +@@ -0,0 +1,435 @@ ++/* ++ * pnfs_osd_xdr.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* ++ * The following implementation is based on these Internet Drafts: ++ * ++ * draft-ietf-nfsv4-minorversion-21 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid) ++{ ++ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data)); ++ READ64(objid->oid_partition_id); ++ READ64(objid->oid_object_id); ++ return p; ++} ++ ++static inline u32 * ++pnfs_osd_xdr_decode_opaque_cred(u32 *p, ++ struct pnfs_osd_opaque_cred *opaque_cred) ++{ ++ READ32(opaque_cred->cred_len); ++ COPYMEM(opaque_cred->cred, opaque_cred->cred_len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct 
pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp, ++ u8 **credp) ++{ ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id); ++ READ32(comp->oc_osd_version); ++ READ32(comp->oc_cap_key_sec); ++ ++ cred = *credp; ++ comp->oc_cap_key.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len)); ++ comp->oc_cap.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len)); ++ *credp = cred; ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map) ++{ ++ READ32(data_map->odm_num_comps); ++ READ64(data_map->odm_stripe_unit); ++ READ32(data_map->odm_group_width); ++ READ32(data_map->odm_group_depth); ++ READ32(data_map->odm_mirror_cnt); ++ READ32(data_map->odm_raid_algorithm); ++ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " ++ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", ++ __func__, ++ data_map->odm_num_comps, ++ (unsigned long long)data_map->odm_stripe_unit, ++ data_map->odm_group_width, ++ data_map->odm_group_depth, ++ data_map->odm_mirror_cnt, ++ data_map->odm_raid_algorithm); ++ return p; ++} ++ ++struct pnfs_osd_layout * ++pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p) ++{ ++ int i; ++ u32 *start = p; ++ struct pnfs_osd_object_cred *comp; ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map); ++ READ32(layout->olo_comps_index); ++ READ32(layout->olo_num_comps); ++ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1); ++ comp = 
layout->olo_comps; ++ cred = (u8 *)(comp + layout->olo_num_comps); ++ dprintk("%s: comps_index=%u num_comps=%u\n", ++ __func__, layout->olo_comps_index, layout->olo_num_comps); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred); ++ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "key_len=%u cap_len=%u\n", ++ __func__, i, ++ _DEVID_LO(&comp->oc_object_id.oid_device_id), ++ _DEVID_HI(&comp->oc_object_id.oid_device_id), ++ comp->oc_object_id.oid_partition_id, ++ comp->oc_object_id.oid_object_id, ++ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); ++ comp++; ++ } ++ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__, ++ (char *)p - (char *)start, cred, (char *)cred - (char *)layout); ++ return layout; ++} ++ ++/* ++ * Get Device Information Decoding ++ * ++ * Note: since Device Information is currently done synchronously, most ++ * of the actual fields are left inside the rpc buffer and are only ++ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer ++ * should not be freed while the returned information is in use. 
++ */ ++ ++u32 *__xdr_read_calc_nfs4_string( ++ u32 *p, struct nfs4_string *str, u8 **freespace) ++{ ++ u32 len; ++ char *data; ++ bool need_copy; ++ ++ READ32(len); ++ data = (char *)p; ++ ++ if (data[len]) { /* Not null terminated we'll need extra space */ ++ data = *freespace; ++ *freespace += len + 1; ++ need_copy = true; ++ } else { ++ need_copy = false; ++ } ++ ++ if (str) { ++ str->len = len; ++ str->data = data; ++ if (need_copy) { ++ memcpy(data, p, len); ++ data[len] = 0; ++ } ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++u32 *__xdr_read_calc_u8_opaque( ++ u32 *p, struct nfs4_string *str) ++{ ++ u32 len; ++ ++ READ32(len); ++ ++ if (str) { ++ str->len = len; ++ str->data = (char *)p; ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetid { ++ * u32 oti_type; ++ * struct nfs4_string oti_scsi_device_id; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetid( ++ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace) ++{ ++ u32 oti_type; ++ ++ READ32(oti_type); ++ if (targetid) ++ targetid->oti_type = oti_type; ++ ++ switch (oti_type) { ++ case OBJ_TARGET_SCSI_NAME: ++ case OBJ_TARGET_SCSI_DEVICE_ID: ++ p = __xdr_read_calc_u8_opaque(p, ++ targetid ? &targetid->oti_scsi_device_id : NULL); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_net_addr { ++ * struct nfs4_string r_netid; ++ * struct nfs4_string r_addr; ++ * }; ++ */ ++u32 *__xdr_read_calc_net_addr( ++ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace) ++{ ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_netid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? 
&netaddr->r_addr : NULL, ++ freespace); ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetaddr { ++ * u32 ota_available; ++ * struct pnfs_osd_net_addr ota_netaddr; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetaddr( ++ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace) ++{ ++ u32 ota_available; ++ ++ READ32(ota_available); ++ if (targetaddr) ++ targetaddr->ota_available = ota_available; ++ ++ if (ota_available) { ++ p = __xdr_read_calc_net_addr(p, ++ targetaddr ? &targetaddr->ota_netaddr : NULL, ++ freespace); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++u32 *__xdr_read_calc_deviceaddr( ++ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace) ++{ ++ p = __xdr_read_calc_targetid(p, ++ deviceaddr ? &deviceaddr->oda_targetid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_targetaddr(p, ++ deviceaddr ? &deviceaddr->oda_targetaddr : NULL, ++ freespace); ++ ++ if (deviceaddr) ++ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun)); ++ else ++ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun)); ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? &deviceaddr->oda_systemid : NULL); ++ ++ if (deviceaddr) { ++ p = pnfs_osd_xdr_decode_object_cred(p, ++ &deviceaddr->oda_root_obj_cred, freespace); ++ } else { ++ *freespace += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? 
&deviceaddr->oda_osdname : NULL); ++ ++ return p; ++} ++ ++size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p) ++{ ++ u8 *null_freespace = NULL; ++ size_t sz; ++ ++ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace); ++ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace; ++ ++ return sz; ++} ++ ++void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p) ++{ ++ u8 *freespace = (u8 *)(deviceaddr + 1); ++ ++ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace); ++} ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; (encoded only when dsu_valid is non-zero) ++ * u32 olu_ioerr_flag; ++ * }; ++ * Returns 0 on success, -E2BIG if the stream has no room left. */ ++int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou) ++{ ++ __be32 *p = xdr_reserve_space(xdr, lou->dsu_valid ? 16 : 8); /* reserve exactly what is written below */ ++ ++ if (!p) ++ return -E2BIG; ++ ++ *p++ = cpu_to_be32(lou->dsu_valid); ++ if (lou->dsu_valid) ++ p = xdr_encode_hyper(p, lou->dsu_delta); ++ *p++ = cpu_to_be32(lou->olu_ioerr_flag); ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ */ ++static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p; ++ ++ p = xdr_reserve_space(xdr, 32); ++ if (!p) ++ return -E2BIG; ++ ++ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, ++ sizeof(object_id->oid_device_id.data)); ++ p = xdr_encode_hyper(p, object_id->oid_partition_id); ++ p = xdr_encode_hyper(p, object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, ++ struct pnfs_osd_ioerr *ioerr) ++{ ++ __be32 *p; ++ int ret; ++ ++ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component); ++ if (ret) ++ return ret; ++ ++ p = xdr_reserve_space(xdr, 24); ++ if
(!p) ++ return -E2BIG; ++ ++ p = xdr_encode_hyper(p, ioerr->oer_comp_offset); ++ p = xdr_encode_hyper(p, ioerr->oer_comp_length); ++ *p++ = cpu_to_be32(ioerr->oer_iswrite); ++ *p = cpu_to_be32(ioerr->oer_errno); ++ ++ return 0; ++} +diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-31 20:41:19.162150222 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-31 20:42:05.539131687 -0400 +@@ -20,6 +20,7 @@ + #include + + #include "internal.h" ++#include "pnfs.h" + + static struct kmem_cache *nfs_page_cachep; + +@@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p) + struct nfs_page * + nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + +@@ -79,7 +81,11 @@ nfs_create_request(struct nfs_open_conte + req->wb_pgbase = offset; + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); ++ req->wb_lock_context = nfs_get_lock_context(ctx); + kref_init(&req->wb_kref); ++ req->wb_lseg = lseg; ++ if (lseg) ++ get_lseg(lseg); + return req; + } + +@@ -141,18 +147,26 @@ void nfs_clear_request(struct nfs_page * + { + struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; ++ struct nfs_lock_context *l_ctx = req->wb_lock_context; + + if (page != NULL) { + page_cache_release(page); + req->wb_page = NULL; + } ++ if (l_ctx != NULL) { ++ nfs_put_lock_context(l_ctx); ++ req->wb_lock_context = NULL; ++ } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } ++ if (req->wb_lseg != NULL) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } + } + +- + /** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release +@@ -231,11 +245,12 @@ void nfs_pageio_init(struct nfs_pageio_d + * Return 'true' 
if this is the case, else return 'false'. + */ + static int nfs_can_coalesce_requests(struct nfs_page *prev, +- struct nfs_page *req) ++ struct nfs_page *req, ++ struct nfs_pageio_descriptor *pgio) + { + if (req->wb_context->cred != prev->wb_context->cred) + return 0; +- if (req->wb_context->lockowner != prev->wb_context->lockowner) ++ if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) + return 0; + if (req->wb_context->state != prev->wb_context->state) + return 0; +@@ -245,6 +260,12 @@ static int nfs_can_coalesce_requests(str + return 0; + if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return 0; ++ if (req->wb_lseg != prev->wb_lseg) ++ return 0; ++#ifdef CONFIG_NFS_V4_1 ++ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) ++ return 0; ++#endif /* CONFIG_NFS_V4_1 */ + return 1; + } + +@@ -277,7 +298,7 @@ static int nfs_pageio_do_add_request(str + if (newlen > desc->pg_bsize) + return 0; + prev = nfs_list_entry(desc->pg_list.prev); +- if (!nfs_can_coalesce_requests(prev, req)) ++ if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + } else + desc->pg_base = req->wb_pgbase; +@@ -366,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for ++ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver + * + * Moves elements from one of the inode request lists. 
+ * If the number of requests is set to 0, the entire address_space +@@ -375,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs + */ + int nfs_scan_list(struct nfs_inode *nfsi, + struct list_head *dst, pgoff_t idx_start, +- unsigned int npages, int tag) ++ unsigned int npages, int tag, int *use_pnfs) + { + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; +@@ -406,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); + nfs_list_add_request(req, dst); ++ if (req->wb_lseg) ++ *use_pnfs = 1; + res++; + if (res == INT_MAX) + goto out; +diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-31 20:42:05.541150301 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-31 20:42:05.541150301 -0400 +@@ -0,0 +1,2037 @@ ++/* ++ * linux/fs/nfs/pnfs.c ++ * ++ * pNFS functions to call and manage layout drivers. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "internal.h" ++#include "nfs4_fs.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS ++ ++#define MIN_POOL_LC (4) ++ ++static int pnfs_initialized; ++ ++static void pnfs_free_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range); ++static inline void get_layout(struct pnfs_layout_hdr *lo); ++ ++/* Locking: ++ * ++ * pnfs_spinlock: ++ * protects pnfs_modules_tbl. ++ */ ++static spinlock_t pnfs_spinlock = __SPIN_LOCK_UNLOCKED(pnfs_spinlock); ++ ++/* ++ * pnfs_modules_tbl holds all pnfs modules ++ */ ++static struct list_head pnfs_modules_tbl; ++static struct kmem_cache *pnfs_cachep; ++static mempool_t *pnfs_layoutcommit_mempool; ++ ++static inline struct nfs4_layoutcommit_data *pnfs_layoutcommit_alloc(void) ++{ ++ struct nfs4_layoutcommit_data *p = ++ mempool_alloc(pnfs_layoutcommit_mempool, GFP_NOFS); ++ if (p) ++ memset(p, 0, sizeof(*p)); ++ ++ return p; ++} ++ ++void pnfs_layoutcommit_free(struct nfs4_layoutcommit_data *p) ++{ ++ mempool_free(p, pnfs_layoutcommit_mempool); ++} ++ ++/* ++ * struct pnfs_module - One per pNFS device module. 
++ */ ++struct pnfs_module { ++ struct pnfs_layoutdriver_type *pnfs_ld_type; ++ struct list_head pnfs_tblid; ++}; ++ ++int ++pnfs_initialize(void) ++{ ++ INIT_LIST_HEAD(&pnfs_modules_tbl); ++ ++ pnfs_cachep = kmem_cache_create("nfs4_layoutcommit_data", ++ sizeof(struct nfs4_layoutcommit_data), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (pnfs_cachep == NULL) ++ return -ENOMEM; ++ ++ pnfs_layoutcommit_mempool = mempool_create(MIN_POOL_LC, ++ mempool_alloc_slab, ++ mempool_free_slab, ++ pnfs_cachep); ++ if (pnfs_layoutcommit_mempool == NULL) { ++ kmem_cache_destroy(pnfs_cachep); ++ return -ENOMEM; ++ } ++ ++ pnfs_initialized = 1; ++ return 0; ++} ++ ++void pnfs_uninitialize(void) ++{ ++ mempool_destroy(pnfs_layoutcommit_mempool); ++ kmem_cache_destroy(pnfs_cachep); ++} ++ ++/* search pnfs_modules_tbl for right pnfs module */ ++static int ++find_pnfs(u32 id, struct pnfs_module **module) { ++ struct pnfs_module *local = NULL; ++ ++ dprintk("PNFS: %s: Searching for %u\n", __func__, id); ++ list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) { ++ if (local->pnfs_ld_type->id == id) { ++ *module = local; ++ return(1); ++ } ++ } ++ return 0; ++} ++ ++/* Set cred to indicate we require a layoutcommit ++ * If we don't even have a layout, we don't need to commit it. ++ */ ++void ++pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx) ++{ ++ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (has_layout(nfsi) && ++ !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state)) { ++ nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred); ++ __set_bit(NFS_INO_LAYOUTCOMMIT, ++ &nfsi->layout->state); ++ nfsi->change_attr++; ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s: Set layoutcommit\n", __func__); ++ return; ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Update last_write_offset for layoutcommit. 
++ * TODO: We should only use commited extents, but the current nfs ++ * implementation does not calculate the written range in nfs_commit_done. ++ * We therefore update this field in writeback_done. ++ */ ++void ++pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent) ++{ ++ loff_t end_pos; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (offset < nfsi->layout->write_begin_pos) ++ nfsi->layout->write_begin_pos = offset; ++ end_pos = offset + extent - 1; /* inclusive: offset of the last byte written */ ++ if (end_pos > nfsi->layout->write_end_pos) ++ nfsi->layout->write_end_pos = end_pos; ++ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n", ++ __func__, ++ (unsigned long) extent, ++ (unsigned long) offset , ++ (unsigned long) nfsi->layout->write_begin_pos, ++ (unsigned long) nfsi->layout->write_end_pos); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Uninitialize a mountpoint in a layout driver, if the driver provides the op */ ++void ++unmount_pnfs_layoutdriver(struct nfs_server *nfss) ++{ ++ if (PNFS_EXISTS_LDIO_OP(nfss, uninitialize_mountpoint)) ++ nfss->pnfs_curr_ld->ld_io_ops->uninitialize_mountpoint(nfss); ++} ++ ++/* ++ * Set the server pnfs module to the registered layout driver matching id, ++ * requesting its module if not yet registered. No-op if a driver is already set. ++ */ ++void ++set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, ++ u32 id) ++{ ++ struct pnfs_module *mod = NULL; ++ ++ if (server->pnfs_curr_ld) ++ return; ++ ++ if (!find_pnfs(id, &mod)) { ++ request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); ++ find_pnfs(id, &mod); ++ } ++ ++ if (!mod) { ++ dprintk("%s: No pNFS module found for %u. ", __func__, id); ++ goto out_err; ++ } ++ ++ server->pnfs_curr_ld = mod->pnfs_ld_type; ++ if (mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint( ++ server, mntfh)) { ++ printk(KERN_ERR "%s: Error initializing mount point " ++ "for layout driver %u. 
", __func__, id); ++ goto out_err; ++ } ++ ++ dprintk("%s: pNFS module for %u set\n", __func__, id); ++ return; ++ ++out_err: ++ dprintk("Using NFSv4 I/O\n"); ++ server->pnfs_curr_ld = NULL; ++} ++ ++/* Allow I/O module to set its functions structure; returns NULL on any failure */ ++struct pnfs_client_operations* ++pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) ++{ ++ struct pnfs_module *pnfs_mod; ++ struct layoutdriver_io_operations *io_ops = ld_type->ld_io_ops; ++ ++ if (!pnfs_initialized) { ++ printk(KERN_ERR "%s Registration failure. " ++ "pNFS not initialized.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops || !io_ops->alloc_layout || !io_ops->free_layout) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_layout and free_layout.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->alloc_lseg || !io_ops->free_lseg) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_lseg and free_lseg.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->read_pagelist || !io_ops->write_pagelist || ++ !io_ops->commit) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "read_pagelist, write_pagelist, and commit.\n", ++ __func__); ++ return NULL; ++ } ++ ++ pnfs_mod = kmalloc(sizeof(*pnfs_mod), GFP_KERNEL); ++ if (pnfs_mod != NULL) { ++ dprintk("%s Registering id:%u name:%s\n", ++ __func__, ++ ld_type->id, ++ ld_type->name); ++ pnfs_mod->pnfs_ld_type = ld_type; ++ INIT_LIST_HEAD(&pnfs_mod->pnfs_tblid); ++ ++ spin_lock(&pnfs_spinlock); ++ list_add(&pnfs_mod->pnfs_tblid, &pnfs_modules_tbl); ++ spin_unlock(&pnfs_spinlock); ++ } ++ ++ return pnfs_mod ? &pnfs_ops : NULL; /* allocation failure: nothing was registered */ ++} ++ ++/* Remove a previously registered layout driver from pnfs_modules_tbl */ ++void ++pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) ++{ ++ struct pnfs_module *pnfs_mod; ++ ++ if (find_pnfs(ld_type->id, &pnfs_mod)) { ++ dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); ++ spin_lock(&pnfs_spinlock); ++ list_del(&pnfs_mod->pnfs_tblid); ++ spin_unlock(&pnfs_spinlock); ++
kfree(pnfs_mod); ++ } ++} ++ ++/* ++ * pNFS client layout cache ++ */ ++#if defined(CONFIG_SMP) ++#define BUG_ON_UNLOCKED_INO(ino) \ ++ BUG_ON(!spin_is_locked(&ino->i_lock)) ++#define BUG_ON_UNLOCKED_LO(lo) \ ++ BUG_ON_UNLOCKED_INO(PNFS_INODE(lo)) ++#else /* CONFIG_SMP */ ++#define BUG_ON_UNLOCKED_INO(lo) do {} while (0) ++#define BUG_ON_UNLOCKED_LO(lo) do {} while (0) ++#endif /* CONFIG_SMP */ ++ ++static inline void ++get_layout(struct pnfs_layout_hdr *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ lo->refcount++; ++} ++ ++static inline void ++put_layout_locked(struct pnfs_layout_hdr *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ BUG_ON(lo->refcount <= 0); ++ ++ lo->refcount--; ++ if (!lo->refcount) { ++ struct layoutdriver_io_operations *io_ops = PNFS_LD_IO_OPS(lo); ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ dprintk("%s: freeing layout cache %p\n", __func__, lo); ++ WARN_ON(!list_empty(&lo->layouts)); ++ io_ops->free_layout(lo); ++ nfsi->layout = NULL; ++ } ++} ++ ++void ++put_layout(struct inode *inode) ++{ ++ spin_lock(&inode->i_lock); ++ put_layout_locked(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++ ++} ++ ++void ++pnfs_layout_release(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (range) ++ pnfs_free_layout(lo, range); ++ /* ++ * Matched in _pnfs_update_layout for layoutget ++ * and by get_layout in _pnfs_return_layout for layoutreturn ++ */ ++ put_layout_locked(lo); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ wake_up_all(&nfsi->lo_waitq); ++} ++ ++void ++pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_ANY, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ lo = nfsi->layout; ++ if (lo) { ++ pnfs_free_layout(lo, &range); ++ WARN_ON(!list_empty(&nfsi->layout->segs)); ++ 
WARN_ON(!list_empty(&nfsi->layout->layouts)); ++ ++ if (nfsi->layout->refcount != 1) ++ printk(KERN_WARNING "%s: layout refcount not=1 %d\n", ++ __func__, nfsi->layout->refcount); ++ WARN_ON(nfsi->layout->refcount != 1); ++ ++ /* Matched by refcount set to 1 in alloc_init_layout */ ++ put_layout_locked(lo); ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* ++ * Called by the state manager to remove all layouts established under an ++ * expired lease. ++ */ ++void ++pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++ struct pnfs_layout_hdr *lo; ++ ++ while (!list_empty(&clp->cl_layouts)) { ++ lo = list_entry(clp->cl_layouts.next, struct pnfs_layout_hdr, ++ layouts); ++ dprintk("%s freeing layout for inode %lu\n", __func__, ++ lo->inode->i_ino); ++ pnfs_destroy_layout(NFS_I(lo->inode)); ++ } ++} ++ ++static inline void ++init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) ++{ ++ INIT_LIST_HEAD(&lseg->fi_list); ++ kref_init(&lseg->kref); ++ lseg->valid = true; ++ lseg->layout = lo; ++} ++ ++static void destroy_lseg(struct kref *kref) /* kref release callback for a layout segment */ ++{ ++ struct pnfs_layout_segment *lseg = ++ container_of(kref, struct pnfs_layout_segment, kref); ++ struct pnfs_layout_hdr *lo = lseg->layout; ++ ++ dprintk("--> %s\n", __func__); ++ PNFS_LD_IO_OPS(lo)->free_lseg(lseg); /* before the put below: that put may free lo */ ++ /* Matched by get_layout in pnfs_insert_layout */ ++ put_layout_locked(lo); ++} ++ ++static void ++put_lseg_locked(struct pnfs_layout_segment *lseg) ++{ ++ bool do_wake_up; ++ struct nfs_inode *nfsi; ++ ++ if (!lseg) ++ return; ++ ++ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, ++ atomic_read(&lseg->kref.refcount), lseg->valid); ++ do_wake_up = !lseg->valid; ++ nfsi = PNFS_NFS_INODE(lseg->layout); ++ kref_put(&lseg->kref, destroy_lseg); ++ if (do_wake_up) ++ wake_up(&nfsi->lo_waitq); ++} ++ ++void ++put_lseg(struct pnfs_layout_segment *lseg) ++{ ++ bool do_wake_up; ++ struct nfs_inode *nfsi; ++ ++ if (!lseg) ++ return; ++ ++ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, ++
atomic_read(&lseg->kref.refcount), lseg->valid); ++ do_wake_up = !lseg->valid; ++ nfsi = PNFS_NFS_INODE(lseg->layout); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ kref_put(&lseg->kref, destroy_lseg); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ if (do_wake_up) ++ wake_up(&nfsi->lo_waitq); ++} ++EXPORT_SYMBOL(put_lseg); ++ ++void get_lseg(struct pnfs_layout_segment *lseg) ++{ ++ kref_get(&lseg->kref); ++} ++EXPORT_SYMBOL(get_lseg); ++ ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end: NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1: NFS4_MAX_UINT64; ++} ++ ++/* ++ * is l2 fully contained in l1? ++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_contained(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (start1 <= start2) && (end1 >= end2); ++} ++ ++/* ++ * is l1 and l2 intersecting? 
++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_intersecting(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (end1 == NFS4_MAX_UINT64 || end1 > start2) && ++ (end2 == NFS4_MAX_UINT64 || end2 > start1); ++} ++ ++void ++pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, ++ const nfs4_stateid *stateid) ++{ ++ write_seqlock(&lo->seqlock); ++ memcpy(lo->stateid.u.data, stateid->u.data, sizeof(lo->stateid.u.data)); ++ write_sequnlock(&lo->seqlock); ++} ++ ++void ++pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ do { ++ seq = read_seqbegin(&lo->seqlock); ++ memcpy(dst->u.data, lo->stateid.u.data, ++ sizeof(lo->stateid.u.data)); ++ } while (read_seqretry(&lo->seqlock, seq)); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void ++pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, ++ struct nfs4_state *state) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ write_seqlock(&lo->seqlock); ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) ++ do { ++ seq = read_seqbegin(&state->seqlock); ++ memcpy(lo->stateid.u.data, state->stateid.u.data, ++ sizeof(state->stateid.u.data)); ++ } while (read_seqretry(&state->seqlock, seq)); ++ write_sequnlock(&lo->seqlock); ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++* Get layout from server. ++* for now, assume that whole file layouts are requested. 
++* arg->offset: 0 ++* arg->length: all ones ++*/ ++static int ++send_layoutget(struct inode *ino, ++ struct nfs_open_context *ctx, ++ struct pnfs_layout_range *range, ++ struct pnfs_layout_segment **lsegpp, ++ struct pnfs_layout_hdr *lo) ++{ ++ int status; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct nfs4_layoutget *lgp; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); ++ if (lgp == NULL) { ++ pnfs_layout_release(lo, NULL); ++ return -ENOMEM; ++ } ++ lgp->args.minlength = NFS4_MAX_UINT64; ++ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; ++ lgp->args.range.iomode = range->iomode; ++ lgp->args.range.offset = 0; ++ lgp->args.range.length = NFS4_MAX_UINT64; ++ lgp->args.type = server->pnfs_curr_ld->id; ++ lgp->args.inode = ino; ++ lgp->lsegpp = lsegpp; ++ ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) { ++ struct nfs_open_context *oldctx = ctx; ++ ++ if (!oldctx) { ++ ctx = nfs_find_open_context(ino, NULL, ++ (range->iomode == IOMODE_READ) ? 
++ FMODE_READ: FMODE_WRITE); ++ BUG_ON(!ctx); ++ } ++ /* Set the layout stateid from the open stateid */ ++ pnfs_layout_from_open_stateid(NFS_I(ino)->layout, ctx->state); ++ if (!oldctx) ++ put_nfs_open_context(ctx); ++ } ++ ++ /* Retrieve layout information from server */ ++ status = nfs4_proc_layoutget(lgp); ++ ++ dprintk("<-- %s status %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW false ++ */ ++static inline int ++should_free_lseg(struct pnfs_layout_segment *lseg, ++ struct pnfs_layout_range *range) ++{ ++ return (range->iomode == IOMODE_ANY || ++ lseg->range.iomode == range->iomode) && ++ lo_seg_intersecting(&lseg->range, range); ++} ++ ++static struct pnfs_layout_segment * ++has_layout_to_return(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *out = NULL, *lseg; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) ++ if (should_free_lseg(lseg, range)) { ++ out = lseg; ++ break; ++ } ++ ++ dprintk("%s:Return lseg=%p\n", __func__, out); ++ return out; ++} ++ ++static inline bool ++_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg) ++{ ++ return atomic_read(&lseg->kref.refcount) == 1; ++} ++ ++ ++static void ++pnfs_free_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *lseg, *next; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry_safe (lseg, next, &lo->segs, fi_list) { ++ if (!should_free_lseg(lseg, range) || ++ !_pnfs_can_return_lseg(lseg)) ++ continue; ++ dprintk("%s: freeing lseg %p 
iomode %d " ++ "offset %llu length %llu\n", __func__, ++ lseg, lseg->range.iomode, lseg->range.offset, ++ lseg->range.length); ++ list_del(&lseg->fi_list); ++ put_lseg_locked(lseg); ++ } ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp; ++ ++ clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ spin_lock(&clp->cl_lock); ++ list_del_init(&lo->layouts); ++ spin_unlock(&clp->cl_lock); ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ } ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++static bool ++pnfs_return_layout_barrier(struct nfs_inode *nfsi, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *lseg; ++ bool ret = false; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) { ++ if (!should_free_lseg(lseg, range)) ++ continue; ++ lseg->valid = false; ++ if (!_pnfs_can_return_lseg(lseg)) { ++ dprintk("%s: wait on lseg %p refcount %d\n", ++ __func__, lseg, ++ atomic_read(&lseg->kref.refcount)); ++ ret = true; ++ } ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s:Return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++return_layout(struct inode *ino, struct pnfs_layout_range *range, ++ enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo, ++ bool wait) ++{ ++ struct nfs4_layoutreturn *lrp; ++ struct nfs_server *server = NFS_SERVER(ino); ++ int status = -ENOMEM; ++ ++ dprintk("--> %s\n", __func__); ++ ++ BUG_ON(type != RETURN_FILE); ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (lrp == NULL) { ++ if (lo && (type == RETURN_FILE)) ++ pnfs_layout_release(lo, NULL); ++ goto out; ++ } ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = server->pnfs_curr_ld->id; ++ lrp->args.return_type = type; ++ lrp->args.range = *range; ++ lrp->args.inode = ino; ++ ++ status = nfs4_proc_layoutreturn(lrp, wait); ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++} ++ ++int ++_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, ++ const 
nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct pnfs_layout_hdr *lo = NULL; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_range arg; ++ int status = 0; ++ ++ dprintk("--> %s type %d\n", __func__, type); ++ ++ ++ arg.iomode = range ? range->iomode : IOMODE_ANY; ++ arg.offset = 0; ++ arg.length = NFS4_MAX_UINT64; ++ ++ if (type == RETURN_FILE) { ++ spin_lock(&ino->i_lock); ++ lo = nfsi->layout; ++ if (lo && !has_layout_to_return(lo, &arg)) { ++ lo = NULL; ++ } ++ if (!lo) { ++ spin_unlock(&ino->i_lock); ++ dprintk("%s: no layout segments to return\n", __func__); ++ goto out; ++ } ++ ++ /* Reference for layoutreturn matched in pnfs_layout_release */ ++ get_layout(lo); ++ ++ spin_unlock(&ino->i_lock); ++ ++ if (pnfs_return_layout_barrier(nfsi, &arg)) { ++ if (stateid) { /* callback */ ++ status = -EAGAIN; ++ goto out_put; ++ } ++ dprintk("%s: waiting\n", __func__); ++ wait_event(nfsi->lo_waitq, ++ !pnfs_return_layout_barrier(nfsi, &arg)); ++ } ++ ++ if (layoutcommit_needed(nfsi)) { ++ if (stateid && !wait) { /* callback */ ++ dprintk("%s: layoutcommit pending\n", __func__); ++ status = -EAGAIN; ++ goto out_put; ++ } ++ status = pnfs_layoutcommit_inode(ino, wait); ++ if (status) { ++ /* Return layout even if layoutcommit fails */ ++ dprintk("%s: layoutcommit failed, status=%d. 
" ++ "Returning layout anyway\n", ++ __func__, status); ++ } ++ } ++ ++ if (!stateid) ++ status = return_layout(ino, &arg, type, lo, wait); ++ else ++ pnfs_layout_release(lo, &arg); ++ } ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++out_put: ++ put_layout(ino); ++ goto out; ++} ++ ++/* ++ * cmp two layout segments for sorting into layout cache ++ */ ++static inline s64 ++cmp_layout(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ s64 d; ++ ++ /* higher offset > lower offset */ ++ d = l1->offset - l2->offset; ++ if (d) ++ return d; ++ ++ /* longer length > shorter length */ ++ d = l1->length - l2->length; ++ if (d) ++ return d; ++ ++ /* read > read/write */ ++ return (int)(l1->iomode == IOMODE_READ) - ++ (int)(l2->iomode == IOMODE_READ); ++} ++ ++static void ++pnfs_insert_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct pnfs_layout_segment *lp; ++ int found = 0; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ ++ spin_lock(&clp->cl_lock); ++ BUG_ON(!list_empty(&lo->layouts)); ++ list_add_tail(&lo->layouts, &clp->cl_layouts); ++ spin_unlock(&clp->cl_lock); ++ } ++ list_for_each_entry (lp, &lo->segs, fi_list) { ++ if (cmp_layout(&lp->range, &lseg->range) > 0) ++ continue; ++ list_add_tail(&lseg->fi_list, &lp->fi_list); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu before " ++ "lp %p iomode %d offset %llu length %llu\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length, ++ lp, lp->range.iomode, lp->range.offset, ++ lp->range.length); ++ found = 1; ++ break; ++ } ++ if (!found) { ++ list_add_tail(&lseg->fi_list, &lo->segs); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu at tail\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length); ++ } ++ 
get_layout(lo); ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++/* ++ * Each layoutdriver embeds pnfs_layout_hdr as the first field in it's ++ * per-layout type layout cache structure and returns it ZEROed ++ * from layoutdriver_io_ops->alloc_layout ++ */ ++static struct pnfs_layout_hdr * ++alloc_init_layout(struct inode *ino) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct layoutdriver_io_operations *io_ops; ++ ++ io_ops = NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops; ++ lo = io_ops->alloc_layout(ino); ++ if (!lo) { ++ printk(KERN_ERR ++ "%s: out of memory: io_ops->alloc_layout failed\n", ++ __func__); ++ return NULL; ++ } ++ lo->refcount = 1; ++ INIT_LIST_HEAD(&lo->layouts); ++ INIT_LIST_HEAD(&lo->segs); ++ seqlock_init(&lo->seqlock); ++ lo->inode = ino; ++ return lo; ++} ++ ++/* ++ * Retrieve and possibly allocate the inode layout ++ * ++ * ino->i_lock must be taken by the caller. ++ */ ++static struct pnfs_layout_hdr * ++pnfs_alloc_layout(struct inode *ino) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_hdr *new = NULL; ++ ++ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); ++ ++ BUG_ON_UNLOCKED_INO(ino); ++ if (likely(nfsi->layout)) ++ return nfsi->layout; ++ ++ spin_unlock(&ino->i_lock); ++ new = alloc_init_layout(ino); ++ spin_lock(&ino->i_lock); ++ ++ if (likely(nfsi->layout == NULL)) { /* Won the race? 
*/ ++ nfsi->layout = new; ++ } else if (new) { ++ /* Reference the layout across i_lock release and grab */ ++ get_layout(nfsi->layout); ++ spin_unlock(&ino->i_lock); ++ NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops->free_layout(new); ++ spin_lock(&ino->i_lock); ++ put_layout_locked(nfsi->layout); ++ } ++ return nfsi->layout; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW true ++ */ ++static inline int ++has_matching_lseg(struct pnfs_layout_segment *lseg, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_range range1; ++ ++ if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) || ++ !lo_seg_intersecting(&lseg->range, range)) ++ return 0; ++ ++ /* range1 covers only the first byte in the range */ ++ range1 = *range; ++ range1.length = 1; ++ return lo_seg_contained(&lseg->range, &range1); ++} ++ ++/* ++ * lookup range in layout ++ */ ++static struct pnfs_layout_segment * ++pnfs_has_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range, ++ bool take_ref, ++ bool only_valid) ++{ ++ struct pnfs_layout_segment *lseg, *ret = NULL; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) { ++ if (has_matching_lseg(lseg, range) && ++ (lseg->valid || !only_valid)) { ++ ret = lseg; ++ if (take_ref) ++ get_lseg(ret); ++ break; ++ } ++ if (cmp_layout(range, &lseg->range) > 0) ++ break; ++ } ++ ++ dprintk("%s:Return lseg %p take_ref %d ref %d valid %d\n", ++ __func__, ret, take_ref, ++ ret ? atomic_read(&ret->kref.refcount) : 0, ++ ret ? ret->valid : 0); ++ return ret; ++} ++ ++/* Update the file's layout for the given range and iomode. ++ * Layout is retrieved from the server if needed. ++ * If lsegpp is given, the appropriate layout segment is referenced and ++ * returned to the caller.
++ */ ++void ++_pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, ++ u64 count, ++ enum pnfs_iomode iomode, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct pnfs_layout_range arg = { ++ .iomode = iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layout_segment *lseg = NULL; ++ bool take_ref = (lsegpp != NULL); ++ ++ if (take_ref) ++ *lsegpp = NULL; ++ spin_lock(&ino->i_lock); ++ lo = pnfs_alloc_layout(ino); ++ if (lo == NULL) { ++ dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); ++ goto out_unlock; ++ } ++ ++ /* Check to see if the layout for the given range already exists */ ++ lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref); ++ if (lseg && !lseg->valid) { ++ if (take_ref) ++ put_lseg_locked(lseg); ++ /* someone is cleaning the layout */ ++ lseg = NULL; ++ goto out_unlock; ++ } ++ ++ if (lseg) { ++ dprintk("%s: Using cached lseg %p for %llu@%llu iomode %d)\n", ++ __func__, ++ lseg, ++ arg.length, ++ arg.offset, ++ arg.iomode); ++ ++ goto out_unlock; ++ } ++ ++ /* if get layout already failed once goto out */ ++ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) { ++ if (unlikely(nfsi->pnfs_layout_suspend && ++ get_seconds() >= nfsi->pnfs_layout_suspend)) { ++ dprintk("%s: layout_get resumed\n", __func__); ++ clear_bit(lo_fail_bit(iomode), ++ &nfsi->layout->state); ++ nfsi->pnfs_layout_suspend = 0; ++ } else ++ goto out_unlock; ++ } ++ ++ /* Reference the layout for layoutget matched in pnfs_layout_release */ ++ get_layout(lo); ++ spin_unlock(&ino->i_lock); ++ ++ send_layoutget(ino, ctx, &arg, lsegpp, lo); ++out: /* nfsi->layout is NULL here when pnfs_alloc_layout() failed */ ++ dprintk("%s end, state 0x%lx lseg %p\n", __func__, ++ nfsi->layout ? nfsi->layout->state : 0UL, lseg); ++ return; ++out_unlock: ++ if (lsegpp) ++ *lsegpp = lseg; ++ spin_unlock(&ino->i_lock); ++ goto out; ++} ++ ++void ++pnfs_get_layout_done(struct nfs4_layoutget *lgp, int rpc_status) ++{ ++ struct
pnfs_layout_segment *lseg = NULL; ++ struct nfs_inode *nfsi = NFS_I(lgp->args.inode); ++ time_t suspend = 0; ++ ++ dprintk("-->%s\n", __func__); ++ ++ lgp->status = rpc_status; ++ if (likely(!rpc_status)) { ++ if (unlikely(lgp->res.layout.len <= 0)) { /* reject an empty layout body, as the message says */ ++ printk(KERN_ERR ++ "%s: ERROR Returned layout size is ZERO\n", __func__); ++ lgp->status = -EIO; ++ } ++ goto out; ++ } ++ ++ dprintk("%s: ERROR retrieving layout %d\n", __func__, rpc_status); ++ switch (rpc_status) { ++ case -NFS4ERR_BADLAYOUT: ++ lgp->status = -ENOENT; ++ /* FALLTHROUGH */ ++ case -EACCES: /* NFS4ERR_ACCESS */ ++ /* transient error, don't mark with NFS_INO_LAYOUT_FAILED */ ++ goto out; ++ ++ case -NFS4ERR_LAYOUTTRYLATER: ++ case -NFS4ERR_RECALLCONFLICT: ++ case -NFS4ERR_OLD_STATEID: ++ case -EAGAIN: /* NFS4ERR_LOCKED */ ++ lgp->status = -NFS4ERR_DELAY; /* for nfs4_handle_exception */ ++ /* FALLTHROUGH */ ++ case -NFS4ERR_GRACE: ++ case -NFS4ERR_DELAY: ++ goto out; ++ ++ case -NFS4ERR_ADMIN_REVOKED: ++ case -NFS4ERR_DELEG_REVOKED: ++ /* The layout is expected to be returned at this point.
++ * This should clear the layout stateid as well */ ++ suspend = get_seconds() + 1; ++ break; ++ ++ case -NFS4ERR_LAYOUTUNAVAILABLE: ++ lgp->status = -ENOTSUPP; ++ break; ++ ++ case -NFS4ERR_REP_TOO_BIG: ++ case -NFS4ERR_REP_TOO_BIG_TO_CACHE: ++ lgp->status = -E2BIG; ++ break; ++ ++ /* Leave the following errors untranslated */ ++ case -NFS4ERR_DEADSESSION: ++ case -NFS4ERR_DQUOT: ++ case -EINVAL: /* NFS4ERR_INVAL */ ++ case -EIO: /* NFS4ERR_IO */ ++ case -NFS4ERR_FHEXPIRED: ++ case -NFS4ERR_MOVED: ++ case -NFS4ERR_NOSPC: ++ case -ESERVERFAULT: /* NFS4ERR_SERVERFAULT */ ++ case -ESTALE: /* NFS4ERR_STALE */ ++ case -ETOOSMALL: /* NFS4ERR_TOOSMALL */ ++ break; ++ ++ /* The following errors are our fault and should never happen */ ++ case -NFS4ERR_BADIOMODE: ++ case -NFS4ERR_BADXDR: ++ case -NFS4ERR_REQ_TOO_BIG: ++ case -NFS4ERR_UNKNOWN_LAYOUTTYPE: ++ case -NFS4ERR_WRONG_TYPE: ++ lgp->status = -EINVAL; ++ /* FALLTHROUGH */ ++ case -NFS4ERR_BAD_STATEID: ++ case -NFS4ERR_NOFILEHANDLE: ++ case -ENOTSUPP: /* NFS4ERR_NOTSUPP */ ++ case -NFS4ERR_OPENMODE: ++ case -NFS4ERR_OP_NOT_IN_SESSION: ++ case -NFS4ERR_TOO_MANY_OPS: ++ dprintk("%s: error %d: should never happen\n", __func__, ++ rpc_status); ++ break; ++ ++ /* The following errors are the server's fault */ ++ default: ++ dprintk("%s: illegal error %d\n", __func__, rpc_status); ++ lgp->status = -EIO; ++ break; ++ } ++ ++ /* remember that get layout failed and suspend trying */ ++ nfsi->pnfs_layout_suspend = suspend; ++ set_bit(lo_fail_bit(lgp->args.range.iomode), ++ &nfsi->layout->state); ++ dprintk("%s: layout_get suspended until %ld\n", ++ __func__, suspend); ++out: ++ dprintk("%s end (err:%d) state 0x%lx lseg %p\n", ++ __func__, lgp->status, nfsi->layout->state, lseg); ++ return; ++} ++ ++int ++pnfs_layout_process(struct nfs4_layoutget *lgp) ++{ ++ struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; ++ struct nfs4_layoutget_res *res = &lgp->res; ++ struct pnfs_layout_segment *lseg; ++ struct inode *ino = 
PNFS_INODE(lo); ++ int status = 0; ++ ++ /* Inject layout blob into I/O device driver */ ++ lseg = PNFS_LD_IO_OPS(lo)->alloc_lseg(lo, res); ++ if (!lseg || IS_ERR(lseg)) { ++ if (!lseg) ++ status = -ENOMEM; ++ else ++ status = PTR_ERR(lseg); ++ dprintk("%s: Could not allocate layout: error %d\n", ++ __func__, status); ++ goto out; ++ } ++ ++ spin_lock(&ino->i_lock); ++ init_lseg(lo, lseg); ++ lseg->range = res->range; ++ if (lgp->lsegpp) { ++ get_lseg(lseg); ++ *lgp->lsegpp = lseg; ++ } ++ pnfs_insert_layout(lo, lseg); ++ ++ if (res->return_on_close) { ++ lo->roc_iomode |= res->range.iomode; ++ if (!lo->roc_iomode) ++ lo->roc_iomode = IOMODE_ANY; ++ } ++ ++ /* Done processing layoutget. Set the layout stateid */ ++ pnfs_set_layout_stateid(lo, &res->stateid); ++ spin_unlock(&ino->i_lock); ++out: ++ return status; ++} ++ ++void ++readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset, ++ size_t *count) ++{ ++ struct page *first, *last; ++ loff_t foff, i_size = i_size_read(inode); ++ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; ++ size_t range; ++ ++ ++ first = list_entry((pages)->prev, struct page, lru); ++ last = list_entry((pages)->next, struct page, lru); ++ ++ foff = (loff_t)first->index << PAGE_CACHE_SHIFT; ++ ++ range = (last->index - first->index) * PAGE_CACHE_SIZE; ++ if (last->index == end_index) ++ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; ++ else ++ range += PAGE_CACHE_SIZE; ++ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff, ++ range); ++ *offset = foff; ++ *count = range; ++} ++ ++void ++pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layoutdriver_type *ld; ++ ++ pgio->pg_test = NULL; ++ ++ lo = NFS_I(inode)->layout; ++ ld = NFS_SERVER(inode)->pnfs_curr_ld; ++ if (!pnfs_enabled_sb(NFS_SERVER(inode)) || !lo) ++ return; ++ ++ if (ld->ld_policy_ops) ++ pgio->pg_test = ld->ld_policy_ops->pg_test; ++} ++ ++static u32 
++pnfs_getboundary(struct inode *inode) ++{ ++ u32 stripe_size = 0; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct layoutdriver_policy_operations *policy_ops; ++ ++ if (!nfss->pnfs_curr_ld) ++ goto out; ++ ++ policy_ops = nfss->pnfs_curr_ld->ld_policy_ops; ++ if (!policy_ops || !policy_ops->get_stripesize) ++ goto out; ++ ++ /* The default is to not gather across stripes */ ++ if (pnfs_ld_gather_across_stripes(nfss->pnfs_curr_ld)) ++ goto out; ++ ++ spin_lock(&inode->i_lock); ++ if (NFS_I(inode)->layout) ++ stripe_size = policy_ops->get_stripesize(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++out: ++ return stripe_size; ++} ++ ++/* ++ * rsize is already set by caller to MDS rsize. ++ */ ++void ++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, ++ struct inode *inode, ++ struct nfs_open_context *ctx, ++ struct list_head *pages, ++ size_t *rsize) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ size_t count = 0; ++ loff_t loff; ++ ++ pgio->pg_iswrite = 0; ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ pgio->pg_lseg = NULL; ++ ++ if (!pnfs_enabled_sb(nfss)) ++ return; ++ ++ /* Calculate the total read-ahead count */ ++ readahead_range(inode, pages, &loff, &count); ++ ++ if (count > 0) { ++ _pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ, ++ &pgio->pg_lseg); ++ if (!pgio->pg_lseg) ++ return; ++ ++ *rsize = NFS_SERVER(inode)->ds_rsize; ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ if (pgio->pg_boundary) ++ pnfs_set_pg_test(inode, pgio); ++ } ++} ++ ++void ++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, ++ size_t *wsize) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ pgio->pg_iswrite = 1; ++ if (!pnfs_enabled_sb(server)) { ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ return; ++ } ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ pnfs_set_pg_test(inode, pgio); ++ *wsize = server->ds_wsize; ++} ++ ++/* Return I/O buffer size for a layout driver ++ * This value 
will determine what size reads and writes ++ * will be gathered into and sent to the data servers. ++ * blocksize must be a multiple of the page cache size. ++ */ ++unsigned int ++pnfs_getiosize(struct nfs_server *server) ++{ ++ if (!PNFS_EXISTS_LDPOLICY_OP(server, get_blocksize)) ++ return 0; ++ return server->pnfs_curr_ld->ld_policy_ops->get_blocksize(); ++} ++ ++void ++pnfs_set_ds_iosize(struct nfs_server *server) ++{ ++ unsigned dssize = pnfs_getiosize(server); ++ ++ /* Set buffer size for data servers */ ++ if (dssize > 0) { ++ server->ds_rsize = server->ds_wsize = ++ nfs_block_size(dssize, NULL); ++ } else { ++ server->ds_wsize = server->wsize; ++ server->ds_rsize = server->rsize; ++ } ++} ++ ++static int ++pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data) ++{ ++ put_lseg(pdata->lseg); ++ pdata->lseg = NULL; ++ pdata->call_ops->rpc_call_done(task, data); ++ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN) ++ return -EAGAIN; ++ if (pdata->pnfsflags & PNFS_NO_RPC) { ++ pdata->call_ops->rpc_release(data); ++ } else { ++ /* ++ * just restore original rpc call ops ++ * rpc_release will be called later by the rpc scheduling layer. ++ */ ++ task->tk_ops = pdata->call_ops; ++ } ++ return 0; ++} ++ ++/* Post-write completion function ++ * Invoked by all layout drivers when write_pagelist is done. ++ * ++ * NOTE: callers set data->pnfsflags PNFS_NO_RPC ++ * so that the NFS cleanup routines perform only the page cache ++ * cleanup. 
++ */ ++static void ++pnfs_write_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ struct pnfs_layout_range range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = wdata->args.offset; ++ range.length = wdata->args.count; ++ _pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true); ++ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode), ++ wdata->pdata.call_ops, wdata->pdata.how); ++} ++ ++static void ++pnfs_writeback_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ /* update last write offset and need layout commit ++ * for non-files layout types (files layout calls ++ * pnfs4_write_done for this) ++ */ ++ if ((pdata->pnfsflags & PNFS_NO_RPC) && ++ data->task.tk_status >= 0 && data->res.count > 0) { ++ struct nfs_inode *nfsi = NFS_I(data->inode); ++ ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++ } ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++ ++static void _pnfs_clear_lseg_from_pages(struct list_head *head) ++{ ++ struct nfs_page *req; ++ ++ list_for_each_entry(req, head, wb_list) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem write function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. ++ * ++ * TODO: Is wdata->how and wdata->args.stable always the same value? 
++ * TODO: It seems in NFS, the server may not do a stable write even ++ * though it was requested (and vice-versa?). To check, it looks ++ * in data->res.verf->committed. Do we need this ability ++ * for non-file layout drivers? ++ */ ++enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *wdata, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ struct inode *inode = wdata->inode; ++ enum pnfs_try_status trypnfs; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg; ++ ++ wdata->pdata.call_ops = call_ops; ++ wdata->pdata.pnfs_error = 0; ++ wdata->pdata.how = how; ++ ++ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, ++ inode->i_ino, wdata->args.count, wdata->args.offset, how); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ wdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ wdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->write_pagelist(wdata, ++ nfs_page_array_len(wdata->args.pgbase, wdata->args.count), ++ how); ++ ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ wdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&wdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* Post-read completion function. 
Invoked by all layout drivers when ++ * read_pagelist is done ++ */ ++static void ++pnfs_read_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ struct pnfs_layout_range range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = rdata->args.offset; ++ range.length = rdata->args.count; ++ _pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true); ++ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode), ++ rdata->pdata.call_ops); ++} ++ ++static void ++pnfs_read_done(struct nfs_read_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem read function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. 
++ */ ++enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *rdata, ++ const struct rpc_call_ops *call_ops) ++{ ++ struct inode *inode = rdata->inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg; ++ enum pnfs_try_status trypnfs; ++ ++ rdata->pdata.call_ops = call_ops; ++ rdata->pdata.pnfs_error = 0; ++ ++ dprintk("%s: Reading ino:%lu %u@%llu\n", ++ __func__, inode->i_ino, rdata->args.count, rdata->args.offset); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ rdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ rdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->read_pagelist(rdata, ++ nfs_page_array_len(rdata->args.pgbase, rdata->args.count)); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ rdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&rdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_READ); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* ++ * This gives the layout driver an opportunity to read in page "around" ++ * the data to be written. It returns 0 on success, otherwise an error code ++ * which will either be passed up to user, or ignored if ++ * some previous part of write succeeded. ++ * Note the range [pos, pos+len-1] is entirely within the page. 
++ */ ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata) ++{ ++ struct pnfs_fsdata *data; ++ int status = 0; ++ ++ dprintk("--> %s: pos=%llu len=%u\n", ++ __func__, (unsigned long long)pos, len); ++ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL); ++ if (!data) { ++ status = -ENOMEM; ++ goto out; ++ } ++ data->lseg = lseg; /* refcount passed into data to be managed there */ ++ status = NFS_SERVER(inode)->pnfs_curr_ld->ld_io_ops->write_begin( ++ lseg, page, pos, len, data); ++ if (status) { ++ kfree(data); ++ data = NULL; ++ } ++out: ++ *fsdata = data; ++ dprintk("<-- %s: status=%d\n", __func__, status); ++ return status; ++} ++ ++/* Return 0 on success, negative on failure */ ++/* CAREFUL - what happens if copied < len??? */ ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status; ++ ++ status = nfss->pnfs_curr_ld->ld_io_ops->write_end(inode, page, ++ pos, len, copied, lseg); ++ return status; ++} ++ ++/* pNFS Commit callback function for all layout drivers */ ++static void ++pnfs_commit_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_RW, ++ .offset = data->args.offset, ++ .length = data->args.count, ++ }; ++ dprintk("%s: retrying\n", __func__); ++ _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, ++ true); ++ pnfs_initiate_commit(data, NFS_CLIENT(data->inode), ++ pdata->call_ops, pdata->how, 1); ++ } ++} ++ ++enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int sync) ++{ ++ struct inode *inode =
data->inode; ++ struct nfs_server *nfss = NFS_SERVER(data->inode); ++ enum pnfs_try_status trypnfs; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ /* We need to account for possibility that ++ * each nfs_page can point to a different lseg (or be NULL). ++ * For the immediate case of whole-file-only layouts, we at ++ * least know there can be only a single lseg. ++ * We still have to account for the possibility of some being NULL. ++ * This will be done by passing the buck to the layout driver. ++ */ ++ data->pdata.call_ops = call_ops; ++ data->pdata.pnfs_error = 0; ++ data->pdata.how = sync; ++ data->pdata.lseg = NULL; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->commit(data, sync); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ data->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ _pnfs_clear_lseg_from_pages(&data->pages); ++ } else ++ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT); ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) ++{ ++ struct nfs_server *nfss = NFS_SERVER(data->args.inode); ++ ++ /* TODO: Maybe we should avoid this by allowing the layout driver ++ * to directly xdr its layout on the wire. ++ */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit) ++ nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit( ++ NFS_I(data->args.inode)->layout, ++ &data->args, data->status); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int ++pnfs_layoutcommit_setup(struct inode *inode, ++ struct nfs4_layoutcommit_data *data, ++ loff_t write_begin_pos, loff_t write_end_pos) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int result = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ data->args.inode = inode; ++ data->args.fh = NFS_FH(inode); ++ data->args.layout_type = nfss->pnfs_curr_ld->id; ++ data->res.fattr = &data->fattr; ++ nfs_fattr_init(&data->fattr); ++ ++ /* TODO: Need to determine the correct values */ ++ data->args.time_modify_changed = 0; ++ ++ /* Set values from inode so it can be reset ++ */ ++ data->args.range.iomode = IOMODE_RW; ++ data->args.range.offset = write_begin_pos; ++ data->args.range.length = write_end_pos - write_begin_pos + 1; ++ data->args.lastbytewritten = min(write_end_pos, ++ i_size_read(inode) - 1); ++ data->args.bitmask = nfss->attr_bitmask; ++ data->res.server = nfss; ++ ++ /* Call layout driver to set the arguments */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit) ++ result = nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit( ++ NFS_I(inode)->layout, &data->args); ++ ++ dprintk("<-- %s Status %d\n", __func__, result); ++ return result; ++} ++ ++/* Issue an async layoutcommit for an inode. 
++ */ ++int ++pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ struct nfs4_layoutcommit_data *data; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ ++ int status = 0; ++ ++ dprintk("%s Begin (sync:%d)\n", __func__, sync); ++ ++ BUG_ON(!has_layout(nfsi)); ++ ++ data = pnfs_layoutcommit_alloc(); ++ if (!data) ++ return -ENOMEM; ++ ++ spin_lock(&inode->i_lock); ++ if (!layoutcommit_needed(nfsi)) { ++ spin_unlock(&inode->i_lock); ++ goto out_free; ++ } ++ ++ /* Clear layoutcommit properties in the inode so ++ * new lc info can be generated ++ */ ++ write_begin_pos = nfsi->layout->write_begin_pos; ++ write_end_pos = nfsi->layout->write_end_pos; ++ data->cred = nfsi->layout->cred; ++ nfsi->layout->write_begin_pos = 0; ++ nfsi->layout->write_end_pos = 0; ++ nfsi->layout->cred = NULL; ++ __clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state); ++ pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout); ++ ++ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */ ++ get_layout(NFS_I(inode)->layout); ++ ++ spin_unlock(&inode->i_lock); ++ ++ /* Set up layout commit args */ ++ status = pnfs_layoutcommit_setup(inode, data, write_begin_pos, ++ write_end_pos); ++ if (status) { ++ /* The layout driver failed to setup the layoutcommit */ ++ put_rpccred(data->cred); ++ put_layout(inode); ++ goto out_free; ++ } ++ status = nfs4_proc_layoutcommit(data, sync); ++out: ++ dprintk("%s end (err:%d)\n", __func__, status); ++ return status; ++out_free: ++ pnfs_layoutcommit_free(data); ++ goto out; ++} ++ ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata) ++{ ++ if (fsdata) { ++ /* lseg refcounting handled directly in nfs_write_end */ ++ kfree(fsdata); ++ } ++} ++ ++/* Callback operations for layout drivers. 
++ */ ++struct pnfs_client_operations pnfs_ops = { ++ .nfs_getdevicelist = nfs4_proc_getdevicelist, ++ .nfs_getdeviceinfo = nfs4_proc_getdeviceinfo, ++ .nfs_readlist_complete = pnfs_read_done, ++ .nfs_writelist_complete = pnfs_writeback_done, ++ .nfs_commit_complete = pnfs_commit_done, ++}; ++ ++EXPORT_SYMBOL(pnfs_unregister_layoutdriver); ++EXPORT_SYMBOL(pnfs_register_layoutdriver); ++ ++ ++/* Device ID cache. Supports one layout type per struct nfs_client */ ++int ++nfs4_alloc_init_deviceid_cache(struct nfs_client *clp, ++ void (*free_callback)(struct kref *)) ++{ ++ struct nfs4_deviceid_cache *c; ++ ++ c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL); ++ if (!c) ++ return -ENOMEM; ++ spin_lock(&clp->cl_lock); ++ if (clp->cl_devid_cache != NULL) { ++ kref_get(&clp->cl_devid_cache->dc_kref); ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [kref [%d]]\n", __func__, ++ atomic_read(&clp->cl_devid_cache->dc_kref.refcount)); ++ kfree(c); ++ } else { ++ int i; ++ ++ spin_lock_init(&c->dc_lock); ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++) ++ INIT_HLIST_HEAD(&c->dc_deviceids[i]); ++ kref_init(&c->dc_kref); ++ c->dc_free_callback = free_callback; ++ clp->cl_devid_cache = c; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [new]\n", __func__); ++ } ++ return 0; ++} ++EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache); ++ ++void ++nfs4_init_deviceid_node(struct nfs4_deviceid *d) ++{ ++ INIT_HLIST_NODE(&d->de_node); ++ kref_init(&d->de_kref); ++} ++EXPORT_SYMBOL(nfs4_init_deviceid_node); ++ ++/* Called from layoutdriver_io_operations->alloc_lseg */ ++void ++nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = d; ++} ++EXPORT_SYMBOL(nfs4_set_layout_deviceid); ++ ++/* Called from layoutdriver_io_operations->free_lseg */ ++void ++nfs4_put_unset_layout_deviceid(struct pnfs_layout_segment *l, ++ struct nfs4_deviceid *d, ++ void (*free_callback)(struct kref 
*)) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = NULL; ++ kref_put(&d->de_kref, free_callback); ++} ++EXPORT_SYMBOL(nfs4_put_unset_layout_deviceid); ++ ++/* Find and reference a deviceid */ ++struct nfs4_deviceid * ++nfs4_find_get_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ if (!atomic_inc_not_zero(&d->de_kref.refcount)) { ++ goto fail; ++ } else { ++ rcu_read_unlock(); ++ return d; ++ } ++ } ++ } ++fail: ++ rcu_read_unlock(); ++ return NULL; ++} ++EXPORT_SYMBOL(nfs4_find_get_deviceid); ++ ++/* ++ * Add and kref_get a deviceid. ++ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new ++ */ ++struct nfs4_deviceid * ++nfs4_add_get_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(&new->de_id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ kref_get(&d->de_kref); ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [discard]\n", __func__); ++ c->dc_free_callback(&new->de_kref); ++ return d; ++ } ++ } ++ hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); ++ kref_get(&new->de_kref); ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [new]\n", __func__); ++ return new; ++} ++EXPORT_SYMBOL(nfs4_add_get_deviceid); ++ ++/* ++ * Remove the first deviceid from a hash bucket, or return 0 if bucket list ++ * is empty. 
++ */ ++static int ++nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash, ++ struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (id && memcmp(id, &d->de_id, NFS4_PNFS_DEVICEID4_SIZE)) ++ continue; ++ hlist_del_rcu(&d->de_node); ++ spin_unlock(&c->dc_lock); ++ synchronize_rcu(); ++ dprintk("%s [%d]\n", __func__, ++ atomic_read(&d->de_kref.refcount)); ++ kref_put(&d->de_kref, c->dc_free_callback); ++ return 1; ++ } ++ spin_unlock(&c->dc_lock); ++ return 0; ++} ++ ++void ++nfs4_delete_device(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ long hash = nfs4_deviceid_hash(id); ++ ++ nfs4_remove_deviceid(c, hash, id); ++} ++EXPORT_SYMBOL(nfs4_delete_device); ++ ++static void ++nfs4_free_deviceid_cache(struct kref *kref) ++{ ++ struct nfs4_deviceid_cache *cache = ++ container_of(kref, struct nfs4_deviceid_cache, dc_kref); ++ long i; ++ ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) ++ while (nfs4_remove_deviceid(cache, i, NULL)) ++ ; ++ kfree(cache); ++} ++ ++void ++nfs4_put_deviceid_cache(struct nfs_client *clp) ++{ ++ struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache; ++ int refcount; ++ ++ dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); ++ spin_lock(&clp->cl_lock); ++ refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount); ++ if (refcount == 1) ++ clp->cl_devid_cache = NULL; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [%d]\n", __func__, refcount); ++ kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache); ++} ++EXPORT_SYMBOL(nfs4_put_deviceid_cache); +diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h +--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-31 20:42:05.542222767 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-31 20:42:05.542222767 -0400 +@@ -0,0 +1,354 @@ ++/* ++ * fs/nfs/pnfs.h ++ 
* ++ * pNFS client data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_PNFS_H ++#define FS_NFS_PNFS_H ++ ++#include ++ ++#ifdef CONFIG_NFS_V4_1 ++ ++#include ++#include ++#include "iostat.h" ++ ++/* nfs4proc.c */ ++extern int nfs4_proc_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, ++ struct pnfs_device *dev); ++extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); ++extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, ++ int issync); ++extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait); ++ ++/* pnfs.c */ ++extern const nfs4_stateid zero_stateid; ++ ++void _pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp); ++ ++int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type, bool wait); ++void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); ++void unmount_pnfs_layoutdriver(struct nfs_server *); ++enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, ++ const struct rpc_call_ops *); ++int pnfs_initialize(void); ++void pnfs_uninitialize(void); ++void pnfs_layoutcommit_free(struct nfs4_layoutcommit_data *data); ++void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); ++int pnfs_layoutcommit_inode(struct inode *inode, int sync); ++void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent); ++void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx); ++unsigned int 
pnfs_getiosize(struct nfs_server *server); ++void pnfs_set_ds_iosize(struct nfs_server *server); ++enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, ++ struct nfs_open_context *, struct list_head *, ++ size_t *); ++void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, ++ size_t *); ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); ++void pnfs_get_layout_done(struct nfs4_layoutget *, int rpc_status); ++int pnfs_layout_process(struct nfs4_layoutget *lgp); ++void pnfs_layout_release(struct pnfs_layout_hdr *, struct pnfs_layout_range *range); ++void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, ++ const nfs4_stateid *stateid); ++void pnfs_destroy_layout(struct nfs_inode *); ++void pnfs_destroy_all_layouts(struct nfs_client *); ++void put_layout(struct inode *inode); ++void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo); ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata); ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ ++#define PNFS_EXISTS_LDIO_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_io_ops && \ ++ (srv)->pnfs_curr_ld->ld_io_ops->opname) ++#define PNFS_EXISTS_LDPOLICY_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops->opname) ++ ++#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" ++ ++static inline int lo_fail_bit(u32 iomode) ++{ ++ return iomode == IOMODE_RW ? 
++ NFS_INO_RW_LAYOUT_FAILED : NFS_INO_RO_LAYOUT_FAILED; ++} ++ ++/* Return true if a layout driver is being used for this mountpoint */ ++static inline int pnfs_enabled_sb(struct nfs_server *nfss) ++{ ++ return nfss->pnfs_curr_ld != NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || ++ !fsdata->bypass_eof; ++} ++ ++/* Should the pNFS client commit and return the layout upon a setattr */ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ if (!pnfs_enabled_sb(NFS_SERVER(inode))) ++ return false; ++ return NFS_SERVER(inode)->pnfs_curr_ld->ld_policy_ops->flags & ++ PNFS_LAYOUTRET_ON_SETATTR; ++} ++ ++/* Should the pNFS client commit and return the layout on close ++ */ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout->roc_iomode; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status = 0; ++ ++ *fsdata = lseg; ++ if (lseg && PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ status = _pnfs_write_begin(inode, page, pos, len, lseg, ++ (struct pnfs_fsdata **) fsdata); ++ return status; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_end)) ++ return _pnfs_write_end(inode, page, pos, len, copied, lseg); ++ else ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ 
if (PNFS_EXISTS_LDIO_OP(nfss, write_end_cleanup)) ++ nfss->pnfs_curr_ld->ld_io_ops->write_end_cleanup(filp, fsdata); ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ pnfs_free_fsdata(fsdata); ++ } ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss) && ++ (type != RETURN_FILE || has_layout(nfsi))) ++ return _pnfs_return_layout(ino, range, stateid, type, wait); ++ ++ return 0; ++} ++ ++static inline void pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss)) ++ _pnfs_update_layout(ino, ctx, pos, count, access_type, lsegpp); ++ else { ++ if (lsegpp) ++ *lsegpp = NULL; ++ } ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ if (pnfs_enabled_sb(nfss)) ++ return pnfs_ld_use_rpc_code(nfss->pnfs_curr_ld); ++ ++ return 1; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ return ((struct pnfs_fsdata *) fsdata)->lseg; ++ } ++ return fsdata; ++} ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++} ++ ++static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++} ++ ++static inline void get_lseg(struct 
pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void put_lseg(struct pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void ++pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ if (lsegpp) ++ *lsegpp = NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return 1; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ *fsdata = NULL; ++ return 0; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ return 1; ++} ++ ++static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ return 0; ++} ++ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ return false; ++} ++ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode 
*nfsi) ++{ ++ return 0; ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ return 0; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ return NULL; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++#endif /* FS_NFS_PNFS_H */ +diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-31 20:41:19.163155499 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-31 20:42:05.543103394 -0400 +@@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) +- goto out; ++ goto out_free; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); +@@ -455,7 +455,7 @@ nfs_proc_symlink(struct inode *dir, stru + */ + if (status == 0) + status = nfs_instantiate(dentry, fh, fattr); +- ++out_free: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); + out: +@@ -694,6 +694,7 @@ const struct nfs_rpc_ops nfs_v2_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, + .file_inode_ops = &nfs_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-31 20:41:19.164160482 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-31 20:42:05.544233042 -0400 +@@ -18,8 +18,12 @@ + #include + #include + #include ++#include ++#include + + #include ++#include ++#include "pnfs.h" + + #include "nfs4_fs.h" + #include "internal.h" +@@ -117,11 +121,14 @@ int nfs_readpage_async(struct nfs_open_c + LIST_HEAD(one_request); + struct nfs_page *new; + 
unsigned int len; ++ struct pnfs_layout_segment *lseg; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); +- new = nfs_create_request(ctx, inode, page, 0, len); ++ pnfs_update_layout(inode, ctx, 0, NFS4_MAX_UINT64, IOMODE_READ, &lseg); ++ new = nfs_create_request(ctx, inode, page, 0, len, lseg); ++ put_lseg(lseg); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); +@@ -155,24 +162,20 @@ static void nfs_readpage_release(struct + nfs_release_request(req); + } + +-/* +- * Set up the NFS read request struct +- */ +-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset) ++int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, +@@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_ + .flags = RPC_TASK_ASYNC | swap_flags, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->read_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_read); ++ ++int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) ++ return pnfs_get_read_status(data); ++ ++ return nfs_initiate_read(data, clnt, call_ops); ++} ++ ++/* ++ * Set up the NFS read request struct ++ */ ++static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + data->req = req; + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -190,27 +230,14 @@ static int nfs_read_rpcsetup(struct nfs_ + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops); + } + + static void +@@ -354,7 +381,14 @@ static void nfs_readpage_retry(struct rp + { + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; ++ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client; + ++#ifdef CONFIG_NFS_V4_1 ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + if (resp->eof || resp->count == argp->count) + return; + +@@ -368,7 +402,10 @@ static void nfs_readpage_retry(struct rp + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; +- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + } + + /* +@@ -409,13 +446,19 @@ static void nfs_readpage_release_partial + void nfs_read_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_read_data *data = calldata; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, + &data->args.seq_args, &data->res.seq_res, + 0, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_read_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_read_partial_ops = { +@@ -568,7 +611,8 
@@ readpage_async_filler(void *data, struct + if (len == 0) + return nfs_return_empty_page(page); + +- new = nfs_create_request(desc->ctx, inode, page, 0, len); ++ new = nfs_create_request(desc->ctx, inode, page, 0, len, ++ desc->pgio->pg_lseg); + if (IS_ERR(new)) + goto out_error; + +@@ -624,6 +668,9 @@ int nfs_readpages(struct file *filp, str + if (ret == 0) + goto read_complete; /* all pages were read */ + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize); ++#endif /* CONFIG_NFS_V4_1 */ + if (rsize < PAGE_CACHE_SIZE) + nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); + else +@@ -632,6 +679,7 @@ int nfs_readpages(struct file *filp, str + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + + nfs_pageio_complete(&pgio); ++ put_lseg(pgio.pg_lseg); + npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); + read_complete: +diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-31 20:41:19.165170508 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-31 20:42:05.545114737 -0400 +@@ -64,6 +64,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -676,6 +677,28 @@ static int nfs_show_options(struct seq_f + + return 0; + } ++#ifdef CONFIG_NFS_V4_1 ++void show_sessions(struct seq_file *m, struct nfs_server *server) ++{ ++ if (nfs4_has_session(server->nfs_client)) ++ seq_printf(m, ",sessions"); ++} ++#else ++void show_sessions(struct seq_file *m, struct nfs_server *server) {} ++#endif ++ ++#ifdef CONFIG_NFS_V4_1 ++void show_pnfs(struct seq_file *m, struct nfs_server *server) ++{ ++ seq_printf(m, ",pnfs="); ++ if (server->pnfs_curr_ld) ++ seq_printf(m, "%s", server->pnfs_curr_ld->name); ++ else ++ seq_printf(m, "not configured"); ++} ++#else /* CONFIG_NFS_V4_1 */ ++void 
show_pnfs(struct seq_file *m, struct nfs_server *server) {} ++#endif /* CONFIG_NFS_V4_1 */ + + /* + * Present statistical information for this VFS mountpoint +@@ -714,6 +737,8 @@ static int nfs_show_stats(struct seq_fil + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); ++ show_sessions(m, nfss); ++ show_pnfs(m, nfss); + } + #endif + +diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-31 20:41:19.166151095 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-31 20:42:05.546131839 -0400 +@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task + struct nfs_unlinkdata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->dir); + +- if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-31 20:41:17.273213379 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-31 20:42:05.548212682 -0400 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + +@@ -28,6 +29,7 @@ + #include "iostat.h" + #include "nfs4_fs.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PAGECACHE + +@@ -59,6 +61,7 @@ struct nfs_write_data *nfs_commitdata_al + } + return p; + } ++EXPORT_SYMBOL(nfs_commitdata_alloc); + + void nfs_commit_free(struct nfs_write_data *p) + { +@@ -66,6 +69,7 @@ void nfs_commit_free(struct nfs_write_da + kfree(p->pagevec); + mempool_free(p, nfs_commit_mempool); + } ++EXPORT_SYMBOL(nfs_commit_free); + + struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) + { +@@ -418,6 +422,17 @@ static void nfs_inode_remove_request(str + 
nfs_clear_request(req); + nfs_release_request(req); + } ++static void ++nfs_mark_request_nopnfs(struct nfs_page *req) ++{ ++ struct pnfs_layout_segment *lseg = req->wb_lseg; ++ ++ if (req->wb_lseg == NULL) ++ return; ++ req->wb_lseg = NULL; ++ put_lseg(lseg); ++ dprintk(" retry through MDS\n"); ++} + + static void + nfs_mark_request_dirty(struct nfs_page *req) +@@ -523,7 +538,7 @@ nfs_need_commit(struct nfs_inode *nfsi) + * The requests are *not* checked to ensure that they form a contiguous set. + */ + static int +-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) ++nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs) + { + struct nfs_inode *nfsi = NFS_I(inode); + int ret; +@@ -531,7 +546,8 @@ nfs_scan_commit(struct inode *inode, str + if (!nfs_need_commit(nfsi)) + return 0; + +- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); ++ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT, ++ use_pnfs); + if (ret > 0) + nfsi->ncommit -= ret; + if (nfs_need_commit(NFS_I(inode))) +@@ -560,7 +576,8 @@ static inline int nfs_scan_commit(struct + static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int bytes) ++ unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + unsigned int rqend; +@@ -585,8 +602,8 @@ static struct nfs_page *nfs_try_to_updat + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ +- if (offset > rqend +- || end < req->wb_offset) ++ if (offset > rqend || end < req->wb_offset || ++ req->wb_lseg != lseg) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) +@@ -634,16 +651,17 @@ out_err: + * already called nfs_flush_incompatible() if necessary. 
+ */ + static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, +- struct page *page, unsigned int offset, unsigned int bytes) ++ struct page *page, unsigned int offset, unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + +- req = nfs_try_to_update_request(inode, page, offset, bytes); ++ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg); + if (req != NULL) + goto out; +- req = nfs_create_request(ctx, inode, page, offset, bytes); ++ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); +@@ -656,23 +674,27 @@ out: + } + + static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, ++ void *fsdata) + { + struct nfs_page *req; + +- req = nfs_setup_write_request(ctx, page, offset, count); ++ req = nfs_setup_write_request(ctx, page, offset, count, lseg); + if (IS_ERR(req)) + return PTR_ERR(req); + nfs_mark_request_dirty(req); + /* Update file length */ +- nfs_grow_file(page, offset, count); ++ if (pnfs_grow_ok(lseg, fsdata)) ++ nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + return 0; + } + +-int nfs_flush_incompatible(struct file *file, struct page *page) ++int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_page *req; +@@ -689,7 +711,10 @@ int nfs_flush_incompatible(struct file * + req = nfs_page_find_request(page); + if (req == NULL) + return 0; +- do_flush = req->wb_page != page || req->wb_context != ctx; ++ do_flush = req->wb_page != page || req->wb_context != ctx || ++ 
req->wb_lock_context->lockowner != current->files || ++ req->wb_lock_context->pid != current->tgid || ++ req->wb_lseg != lseg; + nfs_release_request(req); + if (!do_flush) + return 0; +@@ -716,7 +741,8 @@ static int nfs_write_pageuptodate(struct + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ + int nfs_updatepage(struct file *file, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page->mapping->host; +@@ -741,7 +767,7 @@ int nfs_updatepage(struct file *file, st + offset = 0; + } + +- status = nfs_writepage_setup(ctx, page, offset, count); ++ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); + if (status < 0) + nfs_set_pageerror(page); + +@@ -771,25 +797,21 @@ static int flush_task_priority(int how) + return RPC_PRIORITY_NORMAL; + } + +-/* +- * Set up the argument/result storage required for the RPC call. 
+- */ +-static int nfs_write_rpcsetup(struct nfs_page *req, +- struct nfs_write_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset, +- int how) ++int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, +@@ -800,12 +822,62 @@ static int nfs_write_rpcsetup(struct nfs + }; + int ret = 0; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->write_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ if (how & FLUSH_SYNC) { ++ ret = rpc_wait_for_completion_task(task); ++ if (ret == 0) ++ ret = task->tk_status; ++ } ++ rpc_put_task(task); ++out: ++ return ret; ++} ++EXPORT_SYMBOL(nfs_initiate_write); ++ ++int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_write(data, clnt, call_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int nfs_write_rpcsetup(struct nfs_page *req, ++ struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset, ++ int how) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + data->req = req; + data->inode = inode = req->wb_context->path.dentry->d_inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -813,6 +885,7 @@ static int nfs_write_rpcsetup(struct nfs + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + if (how & FLUSH_STABLE) { + data->args.stable = NFS_DATA_SYNC; +@@ -825,30 +898,7 @@ static int nfs_write_rpcsetup(struct nfs + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- dprintk("NFS: %5u initiated write call " +- "(req %s/%lld, %u bytes @ offset %llu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) { +- ret = PTR_ERR(task); +- goto out; +- } +- if (how & FLUSH_SYNC) { +- ret = rpc_wait_for_completion_task(task); +- if (ret == 0) +- ret = task->tk_status; +- } +- rpc_put_task(task); +-out: +- return ret; ++ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); + } + + /* If a nfs_flush_* function fails, it should remove reqs from @head and +@@ -859,6 +909,7 @@ static void nfs_redirty_request(struct n + { + struct page *page = req->wb_page; + ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +@@ -971,6 +1022,10 @@ static void nfs_pageio_init_write(struct + { + size_t wsize = NFS_SERVER(inode)->wsize; + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_write(pgio, inode, &wsize); ++#endif /* CONFIG_NFS_V4_1 */ ++ + if (wsize < PAGE_CACHE_SIZE) + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); + else +@@ -1036,13 +1091,27 @@ out: + void nfs_write_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_write_data *data = calldata; +- struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(clp, &data->args.seq_args, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) { ++ /* retrying via MDS? 
*/ ++ data->pdata.orig_count = data->args.count; ++ data->args.count = NFS_SERVER(data->inode)->wsize; ++ dprintk("%s: trimmed count %u to wsize %u\n", __func__, ++ data->pdata.orig_count, data->args.count); ++ } else ++ data->pdata.orig_count = 0; ++ ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, ++ &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_write_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_write_partial_ops = { +@@ -1126,10 +1195,11 @@ int nfs_writeback_done(struct rpc_task * + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *clp = server->nfs_client; + int status; + +- dprintk("NFS: %5u nfs_writeback_done (status %d)\n", +- task->tk_pid, task->tk_status); ++ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n", ++ task->tk_pid, task->tk_status, resp->count); + + /* + * ->write_done will attempt to use post-op attributes to detect +@@ -1142,6 +1212,13 @@ int nfs_writeback_done(struct rpc_task * + if (status != 0) + return status; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); ++#ifdef CONFIG_NFS_V4_1 ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && task->tk_status >= 0) { +@@ -1158,7 +1235,7 @@ int nfs_writeback_done(struct rpc_task * + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", +- server->nfs_client->cl_hostname, ++ clp->cl_hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } +@@ -1168,6 +1245,9 @@ int nfs_writeback_done(struct rpc_task * + if (task->tk_status >= 
0 && resp->count < argp->count) { + static unsigned long complain; + ++ dprintk("NFS: short write:" ++ " (resp->count %u) < (argp->count = %u)\n", ++ resp->count, argp->count); + nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + + /* Has the server at least made some progress? */ +@@ -1184,7 +1264,10 @@ int nfs_writeback_done(struct rpc_task * + */ + argp->stable = NFS_FILE_SYNC; + } +- nfs_restart_rpc(task, server->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + return -EAGAIN; + } + if (time_before(complain, jiffies)) { +@@ -1228,40 +1311,73 @@ static void nfs_commitdata_release(void + nfs_commit_free(wdata); + } + +-/* +- * Set up the argument/result storage required for the RPC call. +- */ +-static int nfs_commit_rpcsetup(struct list_head *head, +- struct nfs_write_data *data, +- int how) ++int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct nfs_page *first = nfs_list_entry(head->next); +- struct inode *inode = first->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = first->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, +- .callback_ops = &nfs_commit_ops, ++ .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->commit_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_commit); ++ ++ ++int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs) ++{ ++ if (pnfs && ++ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. ++ */ ++static int nfs_commit_rpcsetup(struct list_head *head, ++ struct nfs_write_data *data, ++ int how, int pnfs) ++{ ++ struct nfs_page *first = nfs_list_entry(head->next); ++ struct inode *inode = first->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + list_splice_init(head, &data->pages); + + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = first->wb_context->cred; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ +@@ -1272,45 +1388,47 @@ static int nfs_commit_rpcsetup(struct li + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); ++ kref_init(&data->refcount); ++ data->parent = NULL; ++ data->args.context = first->wb_context; /* used by commit done */ + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->commit_setup(data, &msg); ++ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops, ++ how, pnfs); ++} + +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++/* Handle memory error during commit */ ++void nfs_mark_list_commit(struct list_head *head) ++{ ++ struct nfs_page *req; + +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ while (!list_empty(head)) { ++ req = nfs_list_entry(head->next); ++ nfs_list_remove_request(req); ++ nfs_mark_request_commit(req); ++ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); ++ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, ++ BDI_RECLAIMABLE); ++ nfs_clear_page_tag_locked(req); ++ } + } ++EXPORT_SYMBOL(nfs_mark_list_commit); + + /* + * Commit dirty pages + */ + static int +-nfs_commit_list(struct inode *inode, struct list_head *head, int how) ++nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs) + { + struct nfs_write_data *data; +- struct nfs_page *req; + + data = nfs_commitdata_alloc(); +- + if (!data) + goto out_bad; + + /* Set up the argument struct */ +- return nfs_commit_rpcsetup(head, data, how); ++ return nfs_commit_rpcsetup(head, data, how, pnfs); + out_bad: +- while (!list_empty(head)) { +- req = nfs_list_entry(head->next); +- nfs_list_remove_request(req); +- nfs_mark_request_commit(req); +- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); +- dec_bdi_stat(req->wb_page->mapping->backing_dev_info, +- BDI_RECLAIMABLE); +- nfs_clear_page_tag_locked(req); +- } ++ nfs_mark_list_commit(head); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; + } +@@ -1330,6 +1448,19 @@ static void nfs_commit_done(struct rpc_t + return; + } + ++static inline void nfs_commit_cleanup(struct kref *kref) ++{ ++ struct nfs_write_data *data; ++ ++ data = container_of(kref, struct nfs_write_data, refcount); ++ /* Clear lock only when all cloned commits are finished */ ++ if 
(data->parent) ++ kref_put(&data->parent->refcount, nfs_commit_cleanup); ++ else ++ nfs_commit_clear_lock(NFS_I(data->inode)); ++ nfs_commitdata_release(data); ++} ++ + static void nfs_commit_release(void *calldata) + { + struct nfs_write_data *data = calldata; +@@ -1347,6 +1478,11 @@ static void nfs_commit_release(void *cal + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { ++ if (req->wb_lseg) { ++ nfs_mark_request_nopnfs(req); ++ nfs_mark_request_dirty(req); ++ goto next; ++ } + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); +@@ -1363,12 +1499,12 @@ static void nfs_commit_release(void *cal + } + /* We have a mismatch. Write the page again */ + dprintk(" mismatch\n"); ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + next: + nfs_clear_page_tag_locked(req); + } +- nfs_commit_clear_lock(NFS_I(data->inode)); +- nfs_commitdata_release(calldata); ++ kref_put(&data->refcount, nfs_commit_cleanup); + } + + static const struct rpc_call_ops nfs_commit_ops = { +@@ -1384,21 +1520,22 @@ int nfs_commit_inode(struct inode *inode + LIST_HEAD(head); + int may_wait = how & FLUSH_SYNC; + int res = 0; ++ int use_pnfs = 0; + + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out_mark_dirty; + spin_lock(&inode->i_lock); +- res = nfs_scan_commit(inode, &head, 0, 0); ++ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs); + spin_unlock(&inode->i_lock); + if (res) { +- int error = nfs_commit_list(inode, &head, how); ++ int error = nfs_commit_list(inode, &head, how, use_pnfs); + if (error < 0) + return error; +- if (may_wait) ++ if (may_wait) { + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); +- else ++ } else + goto out_mark_dirty; + } else + nfs_commit_clear_lock(NFS_I(inode)); +@@ -1451,7 +1588,18 @@ static int nfs_commit_unstable_pages(str + + int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) + { +- 
return nfs_commit_unstable_pages(inode, wbc); ++ int ret; ++ ret = nfs_commit_unstable_pages(inode, wbc); ++ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) { ++ int err, sync = wbc->sync_mode; ++ ++ if (wbc->nonblocking || wbc->for_background) ++ sync = 0; ++ err = pnfs_layoutcommit_inode(inode, sync); ++ if (err < 0) ++ ret = err; ++ } ++ return ret; + } + + /* +@@ -1459,6 +1607,7 @@ int nfs_write_inode(struct inode *inode, + */ + int nfs_wb_all(struct inode *inode) + { ++ int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, +@@ -1466,7 +1615,8 @@ int nfs_wb_all(struct inode *inode) + .range_end = LLONG_MAX, + }; + +- return sync_inode(inode, &wbc); ++ ret = sync_inode(inode, &wbc); ++ return ret; + } + + int nfs_wb_page_cancel(struct inode *inode, struct page *page) +diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/include/linux/exportfs.h +--- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-31 20:42:05.577222704 -0400 +@@ -2,6 +2,7 @@ + #define LINUX_EXPORTFS_H 1 + + #include ++#include + + struct dentry; + struct inode; +@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_pare + struct fid *fid, int fh_len, int fh_type, + struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); + ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct pnfs_filelayout_device; ++struct pnfs_filelayout_layout; ++ ++extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev); ++extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct list_head; ++ ++extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes); ++ ++extern enum nfsstat4 
blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *layouts); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_PNFSD) ++#include ++ ++struct pnfsd_cb_operations; ++ ++struct pnfsd_cb_ctl { ++ spinlock_t lock; ++ struct module *module; ++ const struct pnfsd_cb_operations *cb_op; ++}; ++ ++/* in expfs.c so that file systems can depend on it */ ++extern struct pnfsd_cb_ctl pnfsd_cb_ctl; ++ ++static inline int ++pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ int ret = -ENOENT; ++ ++ spin_lock(&pnfsd_cb_ctl.lock); ++ if (!pnfsd_cb_ctl.cb_op) ++ goto out; ++ if (!try_module_get(pnfsd_cb_ctl.module)) ++ goto out; ++ ctl->cb_op = pnfsd_cb_ctl.cb_op; ++ ctl->module = pnfsd_cb_ctl.module; ++ ret = 0; ++out: ++ spin_unlock(&pnfsd_cb_ctl.lock); ++ return ret; ++} ++ ++static inline void ++pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ module_put(ctl->module); ++} ++#endif /* CONFIG_PNFSD */ + #endif /* LINUX_EXPORTFS_H */ +diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-31 20:42:05.576053304 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-31 20:42:05.576053304 -0400 +@@ -0,0 +1,141 @@ ++#ifndef _LINUX_EXP_XDR_H ++#define _LINUX_EXP_XDR_H ++ ++#include ++#include ++#include ++ ++struct exp_xdr_stream { ++ __be32 *p; ++ __be32 *end; ++}; ++ ++/** ++ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline size_t ++exp_xdr_qwords(__u32 nbytes) ++{ ++ return DIV_ROUND_UP(nbytes, 4); ++} ++ ++/** ++ * exp_xdr_qbytes - Calculate the number of bytes holding qwords ++ * @qwords: number of quad-words to encode ++ */ ++static inline size_t ++exp_xdr_qbytes(size_t qwords) ++{ ++ return qwords << 2; ++} ++ ++/** ++ * exp_xdr_reserve_space - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nbytes: 
number of bytes to reserve ++ * ++ * Checks that we have enough buffer space to encode 'nbytes' more ++ * bytes of data. If so, update the xdr stream. ++ */ ++static inline __be32 * ++exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes) ++{ ++ __be32 *p = xdr->p; ++ __be32 *q; ++ ++ /* align nbytes on the next 32-bit boundary */ ++ q = p + exp_xdr_qwords(nbytes); ++ if (unlikely(q > xdr->end || q < p)) ++ return NULL; ++ xdr->p = q; ++ return p; ++} ++ ++/** ++ * exp_xdr_reserve_qwords - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @qwords: number of quad words (u32's) to reserve ++ */ ++static inline __be32 * ++exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords) ++{ ++ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords)); ++} ++ ++/** ++ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u32(__be32 *p, __u32 val) ++{ ++ *p = cpu_to_be32(val); ++ return p + 1; ++} ++ ++/** ++ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u64(__be32 *p, __u64 val) ++{ ++ put_unaligned_be64(val, p); ++ return p + 2; ++} ++ ++/** ++ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the array of bytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ if (likely(nbytes != 0)) { ++ unsigned int qwords = exp_xdr_qwords(nbytes); ++ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes; ++ ++ memcpy(p, ptr, nbytes); ++ if (padding != 0) ++ memset((char *)p + nbytes, 0, padding); ++ p += qwords; ++ } ++ return p; ++} ++ ++/** ++ * exp_xdr_encode_opaque - Encode an opaque type onto a 
xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the opaque array ++ * @nbytes: number of bytes to encode ++ * ++ * Encodes the 32-bit opaque size in bytes followed by the opaque value. ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ p = exp_xdr_encode_u32(p, nbytes); ++ return exp_xdr_encode_bytes(p, ptr, nbytes); ++} ++ ++/** ++ * exp_xdr_encode_opaque_len - Encode the opaque length onto a xdr stream ++ * @lenp: pointer to the opaque length destination ++ * @endp: pointer to the end of the opaque array ++ * ++ * Encodes the 32-bit opaque size in bytes given the start and end pointers ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp) ++{ ++ size_t nbytes = (char *)endp - (char *)(lenp + 1); ++ ++ exp_xdr_encode_u32(lenp, nbytes); ++ return lenp + 1 + exp_xdr_qwords(nbytes); ++} ++#endif /* _LINUX_EXP_XDR_H */ +diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-31 20:41:19.120034834 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-31 20:42:05.579212604 -0400 +@@ -387,6 +387,7 @@ struct inodes_stat_t { + #include + + struct export_operations; ++struct pnfs_export_operations; + struct hd_geometry; + struct iovec; + struct nameidata; +@@ -1329,6 +1330,7 @@ struct super_block { + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; ++ const struct pnfs_export_operations *s_pnfs_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; +diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h +--- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-31 20:42:05.581035627 -0400 +@@ -17,7 +17,10 @@ + + #define NFS4_BITMAP_SIZE 2 + #define 
NFS4_VERIFIER_SIZE 8 +-#define NFS4_STATEID_SIZE 16 ++#define NFS4_CLIENTID_SIZE 8 ++#define NFS4_STATEID_SEQID_SIZE 4 ++#define NFS4_STATEID_OTHER_SIZE 12 ++#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE) + #define NFS4_FHSIZE 128 + #define NFS4_MAXPATHLEN PATH_MAX + #define NFS4_MAXNAMLEN NAME_MAX +@@ -119,6 +122,13 @@ + #define EXCHGID4_FLAG_MASK_A 0x40070003 + #define EXCHGID4_FLAG_MASK_R 0x80070003 + ++static inline bool ++is_ds_only_session(u32 exchange_flags) ++{ ++ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS; ++ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS; ++} ++ + #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 +@@ -166,8 +176,25 @@ struct nfs4_acl { + struct nfs4_ace aces[0]; + }; + ++struct nfs4_fsid { ++ u64 major; ++ u64 minor; ++}; ++ + typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; +-typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid; ++typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid; ++ ++struct nfs41_stateid { ++ __be32 seqid; ++ char other[NFS4_STATEID_OTHER_SIZE]; ++} __attribute__ ((packed)); ++ ++typedef struct { ++ union { ++ char data[NFS4_STATEID_SIZE]; ++ struct nfs41_stateid stateid; ++ } u; ++} nfs4_stateid; + + enum nfs_opnum4 { + OP_ACCESS = 3, +@@ -471,6 +498,8 @@ enum lock_type4 { + #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) + #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) + #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) ++#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) ++#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) + + #define NFSPROC4_NULL 0 + #define NFSPROC4_COMPOUND 1 +@@ -523,6 +552,7 @@ enum { + NFSPROC4_CLNT_GETACL, + NFSPROC4_CLNT_SETACL, + NFSPROC4_CLNT_FS_LOCATIONS, ++ NFSPROC4_CLNT_RELEASE_LOCKOWNER, + + /* nfs41 */ + NFSPROC4_CLNT_EXCHANGE_ID, +@@ -531,6 +561,13 @@ enum { + 
NFSPROC4_CLNT_SEQUENCE, + NFSPROC4_CLNT_GET_LEASE_TIME, + NFSPROC4_CLNT_RECLAIM_COMPLETE, ++ NFSPROC4_CLNT_LAYOUTGET, ++ NFSPROC4_CLNT_LAYOUTCOMMIT, ++ NFSPROC4_CLNT_LAYOUTRETURN, ++ NFSPROC4_CLNT_GETDEVICELIST, ++ NFSPROC4_CLNT_GETDEVICEINFO, ++ NFSPROC4_CLNT_PNFS_WRITE, ++ NFSPROC4_CLNT_PNFS_COMMIT, + }; + + /* nfs41 types */ +@@ -549,6 +586,43 @@ enum state_protect_how4 { + SP4_SSV = 2 + }; + ++enum pnfs_layouttype { ++ LAYOUT_NFSV4_1_FILES = 1, ++ LAYOUT_OSD2_OBJECTS = 2, ++ LAYOUT_BLOCK_VOLUME = 3, ++}; ++ ++/* used for both layout return and recall */ ++enum pnfs_layoutreturn_type { ++ RETURN_FILE = 1, ++ RETURN_FSID = 2, ++ RETURN_ALL = 3 ++}; ++ ++enum pnfs_iomode { ++ IOMODE_READ = 1, ++ IOMODE_RW = 2, ++ IOMODE_ANY = 3, ++}; ++ ++enum pnfs_notify_deviceid_type4 { ++ NOTIFY_DEVICEID4_CHANGE = 1 << 1, ++ NOTIFY_DEVICEID4_DELETE = 1 << 2, ++}; ++ ++#define NFL4_UFLG_MASK 0x0000003F ++#define NFL4_UFLG_DENSE 0x00000001 ++#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 ++#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0 ++ ++/* Encoded in the loh_body field of type layouthint4 */ ++enum filelayout_hint_care4 { ++ NFLH4_CARE_DENSE = NFL4_UFLG_DENSE, ++ NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS, ++ NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040, ++ NFLH4_CARE_STRIPE_COUNT = 0x00000080 ++}; ++ + #endif + #endif + +diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-31 20:42:05.583087731 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-31 20:42:05.583087731 -0400 +@@ -0,0 +1,329 @@ ++/* ++ * include/linux/nfs4_pnfs.h ++ * ++ * Common data structures needed by the pnfs client and pnfs layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef LINUX_NFS4_PNFS_H ++#define LINUX_NFS4_PNFS_H ++ ++#include ++ ++enum pnfs_try_status { ++ PNFS_ATTEMPTED = 0, ++ PNFS_NOT_ATTEMPTED = 1, ++}; ++ ++#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 ++ ++/* Per-layout driver specific registration structure */ ++struct pnfs_layoutdriver_type { ++ const u32 id; ++ const char *name; ++ struct layoutdriver_io_operations *ld_io_ops; ++ struct layoutdriver_policy_operations *ld_policy_ops; ++}; ++ ++struct pnfs_fsdata { ++ int bypass_eof; ++ struct pnfs_layout_segment *lseg; ++ void *private; ++}; ++ ++#if defined(CONFIG_NFS_V4_1) ++ ++static inline struct nfs_inode * ++PNFS_NFS_INODE(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_I(lo->inode); ++} ++ ++static inline struct inode * ++PNFS_INODE(struct pnfs_layout_hdr *lo) ++{ ++ return lo->inode; ++} ++ ++static inline struct nfs_server * ++PNFS_NFS_SERVER(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo)); ++} ++ ++static inline struct pnfs_layoutdriver_type * ++PNFS_LD(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo))->pnfs_curr_ld; ++} ++ ++static inline struct layoutdriver_io_operations * ++PNFS_LD_IO_OPS(struct pnfs_layout_hdr *lo) ++{ ++ return PNFS_LD(lo)->ld_io_ops; ++} ++ ++static inline struct layoutdriver_policy_operations * ++PNFS_LD_POLICY_OPS(struct pnfs_layout_hdr *lo) ++{ ++ return PNFS_LD(lo)->ld_policy_ops; ++} ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout != NULL; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return has_layout(nfsi) && ++ test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state); ++} ++ ++extern void put_lseg(struct pnfs_layout_segment *lseg); ++extern void get_lseg(struct pnfs_layout_segment *lseg); ++ ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return false; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ 
return 0; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++struct pnfs_layout_segment { ++ struct list_head fi_list; ++ struct pnfs_layout_range range; ++ struct kref kref; ++ bool valid; ++ struct pnfs_layout_hdr *layout; ++ struct nfs4_deviceid *deviceid; ++ u8 ld_data[]; /* layout driver private data */ ++}; ++ ++static inline void * ++LSEG_LD_DATA(struct pnfs_layout_segment *lseg) ++{ ++ return lseg->ld_data; ++} ++ ++/* Layout driver I/O operations. ++ * Either the pagecache or non-pagecache read/write operations must be implemented ++ */ ++struct layoutdriver_io_operations { ++ /* Functions that use the pagecache. ++ * If use_pagecache == 1, then these functions must be implemented. ++ */ ++ /* read and write pagelist should return just 0 (to indicate that ++ * the layout code has taken control) or 1 (to indicate that the ++ * layout code wishes to fall back to normal nfs.) If 0 is returned, ++ * information can be passed back through nfs_data->res and ++ * nfs_data->task.tk_status, and the appropriate pnfs done function ++ * MUST be called. ++ */ ++ enum pnfs_try_status ++ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages); ++ enum pnfs_try_status ++ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how); ++ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page, ++ loff_t pos, unsigned count, ++ struct pnfs_fsdata *fsdata); ++ int (*write_end)(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ void (*write_end_cleanup)(struct file *filp, ++ struct pnfs_fsdata *fsdata); ++ ++ /* Consistency ops */ ++ /* 2 problems: ++ * 1) the page list contains nfs_pages, NOT pages ++ * 2) currently the NFS code doesn't create a page array (as it does with read/write) ++ */ ++ enum pnfs_try_status ++ (*commit) (struct nfs_write_data *nfs_data, int how); ++ ++ /* Layout information. 
For each inode, alloc_layout is executed once to retrieve an ++ * inode specific layout structure. Each subsequent layoutget operation results in ++ * a set_layout call to set the opaque layout in the layout driver.*/ ++ struct pnfs_layout_hdr * (*alloc_layout) (struct inode *inode); ++ void (*free_layout) (struct pnfs_layout_hdr *); ++ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); ++ void (*free_lseg) (struct pnfs_layout_segment *lseg); ++ ++ int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_args *args); ++ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args); ++ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_args *args, ++ int status); ++ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args); ++ ++ /* Registration information for a new mounted file system ++ */ ++ int (*initialize_mountpoint) (struct nfs_server *, ++ const struct nfs_fh * mntfh); ++ int (*uninitialize_mountpoint) (struct nfs_server *server); ++}; ++ ++enum layoutdriver_policy_flags { ++ /* Should the full nfs rpc cleanup code be used after io */ ++ PNFS_USE_RPC_CODE = 1 << 0, ++ ++ /* Should the NFS req. gather algorithm cross stripe boundaries? */ ++ PNFS_GATHER_ACROSS_STRIPES = 1 << 1, ++ ++ /* Should the pNFS client commit and return the layout upon a setattr */ ++ PNFS_LAYOUTRET_ON_SETATTR = 1 << 3, ++}; ++ ++struct layoutdriver_policy_operations { ++ unsigned flags; ++ ++ /* The stripe size of the file system */ ++ ssize_t (*get_stripesize) (struct pnfs_layout_hdr *layoutid); ++ ++ /* test for nfs page cache coalescing */ ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++ ++ /* Retrieve the block size of the file system. 
++ * If gather_across_stripes == 1, then the file system will gather ++ * requests into the block size. ++ * TODO: Where will the layout driver get this info? It is hard ++ * coded in PVFS2. ++ */ ++ ssize_t (*get_blocksize) (void); ++}; ++ ++/* Should the full nfs rpc cleanup code be used after io */ ++static inline int ++pnfs_ld_use_rpc_code(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_USE_RPC_CODE; ++} ++ ++/* Should the NFS req. gather algorithm cross stripe boundaries? */ ++static inline int ++pnfs_ld_gather_across_stripes(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_GATHER_ACROSS_STRIPES; ++} ++ ++struct pnfs_device { ++ struct pnfs_deviceid dev_id; ++ unsigned int layout_type; ++ unsigned int mincount; ++ struct page **pages; ++ void *area; ++ unsigned int pgbase; ++ unsigned int pglen; ++ unsigned int dev_notify_types; ++}; ++ ++struct pnfs_devicelist { ++ unsigned int eof; ++ unsigned int num_devs; ++ struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; ++}; ++ ++/* ++ * Device ID RCU cache. A device ID is unique per client ID and layout type. 
++ */ ++#define NFS4_DEVICE_ID_HASH_BITS 5 ++#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) ++#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) ++ ++static inline u32 ++nfs4_deviceid_hash(struct pnfs_deviceid *id) ++{ ++ unsigned char *cptr = (unsigned char *)id->data; ++ unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE; ++ u32 x = 0; ++ ++ while (nbytes--) { ++ x *= 37; ++ x += *cptr++; ++ } ++ return x & NFS4_DEVICE_ID_HASH_MASK; ++} ++ ++struct nfs4_deviceid_cache { ++ spinlock_t dc_lock; ++ struct kref dc_kref; ++ void (*dc_free_callback)(struct kref *); ++ struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; ++}; ++ ++/* Device ID cache node */ ++struct nfs4_deviceid { ++ struct hlist_node de_node; ++ struct pnfs_deviceid de_id; ++ struct kref de_kref; ++}; ++ ++extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_put_deviceid_cache(struct nfs_client *); ++extern void nfs4_init_deviceid_node(struct nfs4_deviceid *); ++extern struct nfs4_deviceid *nfs4_find_get_deviceid( ++ struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++extern struct nfs4_deviceid *nfs4_add_get_deviceid(struct nfs4_deviceid_cache *, ++ struct nfs4_deviceid *); ++extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *); ++extern void nfs4_put_unset_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_delete_device(struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++ ++/* pNFS client callback functions. ++ * These operations allow the layout driver to access pNFS client ++ * specific information or call pNFS client->server operations. 
++ * E.g., getdeviceinfo, I/O callbacks, etc ++ */ ++struct pnfs_client_operations { ++ int (*nfs_getdevicelist) (struct nfs_server *, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++ int (*nfs_getdeviceinfo) (struct nfs_server *, ++ struct pnfs_device *dev); ++ ++ /* Post read callback. */ ++ void (*nfs_readlist_complete) (struct nfs_read_data *nfs_data); ++ ++ /* Post write callback. */ ++ void (*nfs_writelist_complete) (struct nfs_write_data *nfs_data); ++ ++ /* Post commit callback. */ ++ void (*nfs_commit_complete) (struct nfs_write_data *nfs_data); ++ void (*nfs_return_layout) (struct inode *); ++}; ++ ++extern struct pnfs_client_operations pnfs_ops; ++ ++extern struct pnfs_client_operations *pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); ++extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); ++ ++#define NFS4_PNFS_MAX_LAYOUTS 4 ++#define NFS4_PNFS_PRIVATE_LAYOUT 0x80000000 ++ ++#endif /* LINUX_NFS4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-31 20:42:05.596098115 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-31 20:42:05.596098115 -0400 +@@ -0,0 +1,101 @@ ++#ifndef NFSD4_BLOCK ++#define NFSD4_BLOCK ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_BLOCK_SUCCESS 1 ++#define PNFS_BLOCK_FAILURE 0 ++ ++#define PNFS_BLOCK_CTL_START 1 ++#define PNFS_BLOCK_CTL_STOP 2 ++#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current ++ * version from kernel via an upcall. 
++ */ ++ ++#define PNFS_UPCALL_MSG_STOP 0 ++#define PNFS_UPCALL_MSG_GETSIG 1 ++#define PNFS_UPCALL_MSG_GETSLICE 2 ++#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume ++#define PNFS_UPCALL_MSG_DMGET 4 ++#define PNFS_UPCALL_MSG_VERS 5 ++ ++#define PNFS_UPCALL_VERS 8 ++ ++typedef struct stripe_dev { ++ int major, ++ minor, ++ offset; ++} stripe_dev_t; ++ ++typedef struct bl_comm_res { ++ int res_status; ++ union { ++ struct { ++ long long start, ++ length; ++ } slice; ++ struct { ++ int num_stripes, ++ stripe_size; ++ stripe_dev_t devs[]; ++ } stripe; ++ struct { ++ long long sector; ++ int offset, ++ len; ++ char sig[]; ++ } sig; ++ int vers, ++ dm_vol; ++ } u; ++} bl_comm_res_t; ++ ++typedef struct bl_comm_msg { ++ int msg_type, ++ msg_status; ++ union { ++ dev_t msg_dev; ++ int msg_vers; ++ } u; ++ bl_comm_res_t *msg_res; ++} bl_comm_msg_t; ++ ++#ifdef __KERNEL__ ++ ++typedef struct bl_comm { ++ /* ---- protects access to this structure ---- */ ++ struct mutex lock; ++ /* ---- protects access to rpc pipe ---- */ ++ struct mutex pipe_lock; ++ struct dentry *pipe_dentry; ++ wait_queue_head_t pipe_wq; ++ bl_comm_msg_t msg; ++} bl_comm_t; ++ ++int pnfs_block_enabled(struct inode *, int); ++int bl_layout_type(struct super_block *sb); ++int bl_getdeviceiter(struct super_block *, u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int bl_layoutcommit(struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++int bl_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len); ++int bl_init_proc(void); ++int bl_upcall(bl_comm_t *, 
bl_comm_msg_t *, bl_comm_res_t **); ++ ++extern bl_comm_t *bl_comm_global; // Ugly... ++#endif /* __KERNEL__ */ ++ ++#endif /* NFSD4_BLOCK */ ++ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-31 20:42:05.597097942 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-31 20:42:05.597097942 -0400 +@@ -0,0 +1,345 @@ ++/* ++ * include/linux/nfsd4_spnfs.h ++ * ++ * spNFS - simple pNFS implementation with userspace daemon ++ * ++ */ ++ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++******************************************************************************/ ++ ++#ifndef NFS_SPNFS_H ++#define NFS_SPNFS_H ++ ++ ++#ifdef __KERNEL__ ++#include "exportfs.h" ++#include "sunrpc/svc.h" ++#include "nfsd/nfsfh.h" ++#else ++#include ++#endif /* __KERNEL__ */ ++ ++#define SPNFS_STATUS_INVALIDMSG 0x01 ++#define SPNFS_STATUS_AGAIN 0x02 ++#define SPNFS_STATUS_FAIL 0x04 ++#define SPNFS_STATUS_SUCCESS 0x08 ++ ++#define SPNFS_TYPE_LAYOUTGET 0x01 ++#define SPNFS_TYPE_LAYOUTCOMMIT 0x02 ++#define SPNFS_TYPE_LAYOUTRETURN 0x03 ++#define SPNFS_TYPE_GETDEVICEITER 0x04 ++#define SPNFS_TYPE_GETDEVICEINFO 0x05 ++#define SPNFS_TYPE_SETATTR 0x06 ++#define SPNFS_TYPE_OPEN 0x07 ++#define SPNFS_TYPE_CLOSE 0x08 ++#define SPNFS_TYPE_CREATE 0x09 ++#define SPNFS_TYPE_REMOVE 0x0a ++#define SPNFS_TYPE_COMMIT 0x0b ++#define SPNFS_TYPE_READ 0x0c ++#define SPNFS_TYPE_WRITE 0x0d ++ ++#define SPNFS_MAX_DEVICES 1 ++#define SPNFS_MAX_DATA_SERVERS 16 ++#define SPNFS_MAX_IO 512 ++ ++/* layout */ ++struct spnfs_msg_layoutget_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_filelayout_list { ++ u_int32_t fh_len; ++ unsigned char fh_val[128]; /* DMXXX fix this const */ ++}; ++ ++struct spnfs_msg_layoutget_res { ++ int status; ++ u_int64_t devid; ++ u_int64_t stripe_size; ++ u_int32_t stripe_type; ++ u_int32_t stripe_count; ++ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++/* layoutcommit */ ++struct spnfs_msg_layoutcommit_args { ++ unsigned long inode; ++ unsigned long generation; ++ u_int64_t file_size; ++}; ++ ++struct spnfs_msg_layoutcommit_res { ++ int status; ++}; ++ ++/* layoutreturn */ ++/* No op for the daemon */ ++/* ++struct spnfs_msg_layoutreturn_args { ++}; ++ ++struct spnfs_msg_layoutreturn_res { ++}; ++*/ ++ ++/* getdeviceiter */ ++struct spnfs_msg_getdeviceiter_args { ++ unsigned long inode; ++ u_int64_t cookie; ++ u_int64_t verf; ++}; ++ ++struct spnfs_msg_getdeviceiter_res { ++ int status; ++ u_int64_t devid; ++ 
u_int64_t cookie; ++ u_int64_t verf; ++ u_int32_t eof; ++}; ++ ++/* getdeviceinfo */ ++struct spnfs_data_server { ++ u_int32_t dsid; ++ char netid[5]; ++ char addr[29]; ++}; ++ ++struct spnfs_device { ++ u_int64_t devid; ++ int dscount; ++ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++struct spnfs_msg_getdeviceinfo_args { ++ u_int64_t devid; ++}; ++ ++struct spnfs_msg_getdeviceinfo_res { ++ int status; ++ struct spnfs_device devinfo; ++}; ++ ++/* setattr */ ++struct spnfs_msg_setattr_args { ++ unsigned long inode; ++ unsigned long generation; ++ int file_size; ++}; ++ ++struct spnfs_msg_setattr_res { ++ int status; ++}; ++ ++/* open */ ++struct spnfs_msg_open_args { ++ unsigned long inode; ++ unsigned long generation; ++ int create; ++ int createmode; ++ int truncate; ++}; ++ ++struct spnfs_msg_open_res { ++ int status; ++}; ++ ++/* close */ ++/* No op for daemon */ ++struct spnfs_msg_close_args { ++ int x; ++}; ++ ++struct spnfs_msg_close_res { ++ int y; ++}; ++ ++/* create */ ++/* ++struct spnfs_msg_create_args { ++ int x; ++}; ++ ++struct spnfs_msg_create_res { ++ int y; ++}; ++*/ ++ ++/* remove */ ++struct spnfs_msg_remove_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_msg_remove_res { ++ int status; ++}; ++ ++/* commit */ ++/* ++struct spnfs_msg_commit_args { ++ int x; ++}; ++ ++struct spnfs_msg_commit_res { ++ int y; ++}; ++*/ ++ ++/* read */ ++struct spnfs_msg_read_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++}; ++ ++struct spnfs_msg_read_res { ++ int status; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++/* write */ ++struct spnfs_msg_write_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++struct spnfs_msg_write_res { ++ int status; ++}; ++ ++/* bundle args and responses */ ++union spnfs_msg_args { ++ struct spnfs_msg_layoutget_args layoutget_args; ++ struct 
spnfs_msg_layoutcommit_args layoutcommit_args; ++/* ++ struct spnfs_msg_layoutreturn_args layoutreturn_args; ++*/ ++ struct spnfs_msg_getdeviceiter_args getdeviceiter_args; ++ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args; ++ struct spnfs_msg_setattr_args setattr_args; ++ struct spnfs_msg_open_args open_args; ++ struct spnfs_msg_close_args close_args; ++/* ++ struct spnfs_msg_create_args create_args; ++*/ ++ struct spnfs_msg_remove_args remove_args; ++/* ++ struct spnfs_msg_commit_args commit_args; ++*/ ++ struct spnfs_msg_read_args read_args; ++ struct spnfs_msg_write_args write_args; ++}; ++ ++union spnfs_msg_res { ++ struct spnfs_msg_layoutget_res layoutget_res; ++ struct spnfs_msg_layoutcommit_res layoutcommit_res; ++/* ++ struct spnfs_msg_layoutreturn_res layoutreturn_res; ++*/ ++ struct spnfs_msg_getdeviceiter_res getdeviceiter_res; ++ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res; ++ struct spnfs_msg_setattr_res setattr_res; ++ struct spnfs_msg_open_res open_res; ++ struct spnfs_msg_close_res close_res; ++/* ++ struct spnfs_msg_create_res create_res; ++*/ ++ struct spnfs_msg_remove_res remove_res; ++/* ++ struct spnfs_msg_commit_res commit_res; ++*/ ++ struct spnfs_msg_read_res read_res; ++ struct spnfs_msg_write_res write_res; ++}; ++ ++/* a spnfs message, args and response */ ++struct spnfs_msg { ++ unsigned char im_type; ++ unsigned char im_status; ++ union spnfs_msg_args im_args; ++ union spnfs_msg_res im_res; ++}; ++ ++/* spnfs configuration info */ ++struct spnfs_config { ++ unsigned char dense_striping; ++ int stripe_size; ++ int num_ds; ++ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */ ++}; ++ ++#if defined(__KERNEL__) && defined(CONFIG_SPNFS) ++ ++#include ++ ++/* pipe mgmt structure. 
messages flow through here */ ++struct spnfs { ++ struct dentry *spnfs_dentry; /* dentry for pipe */ ++ wait_queue_head_t spnfs_wq; ++ struct spnfs_msg spnfs_im; /* spnfs message */ ++ struct mutex spnfs_lock; /* Serializes upcalls */ ++ struct mutex spnfs_plock; ++}; ++ ++struct nfsd4_open; ++ ++int spnfs_layout_type(struct super_block *); ++enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int spnfs_layoutcommit(void); ++int spnfs_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int spnfs_getdeviceiter(struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++int spnfs_setattr(void); ++int spnfs_open(struct inode *, struct nfsd4_open *); ++int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *); ++int spnfs_remove(unsigned long, unsigned long); ++__be32 spnfs_read(struct inode *, loff_t, unsigned long *, ++ int, struct svc_rqst *); ++__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *); ++int spnfs_getfh(int, struct nfs_fh *); ++int spnfs_test_layoutrecall(char *, u64, u64); ++int spnfs_layoutrecall(struct inode *, int, u64, u64); ++ ++int nfsd_spnfs_new(void); ++void nfsd_spnfs_delete(void); ++int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *); ++int spnfs_enabled(void); ++int spnfs_init_proc(void); ++ ++extern struct spnfs_config *spnfs_config; ++ ++#endif /* __KERNEL__ && CONFIG_SPNFS */ ++ ++#endif /* NFS_SPNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h +--- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-31 20:42:05.591097762 
-0400 +@@ -29,6 +29,7 @@ + #ifdef __KERNEL__ + + #include ++#include + + /* + * Largest number of bytes we need to allocate for an NFS +diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h +--- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-31 20:42:05.591097762 -0400 +@@ -32,6 +32,8 @@ + #define NFSDDBG_REPCACHE 0x0080 + #define NFSDDBG_XDR 0x0100 + #define NFSDDBG_LOCKD 0x0200 ++#define NFSDDBG_PNFS 0x0400 ++#define NFSDDBG_FILELAYOUT 0x0800 + #define NFSDDBG_ALL 0x7FFF + #define NFSDDBG_NOCHANGE 0xFFFF + +diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h +--- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-31 20:42:05.592118086 -0400 +@@ -100,6 +100,7 @@ struct svc_export { + uid_t ex_anon_uid; + gid_t ex_anon_gid; + int ex_fsid; ++ int ex_pnfs; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + int ex_nflavors; +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-31 20:42:05.592118086 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-31 20:42:05.592118086 -0400 +@@ -0,0 +1,132 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef NFSD_NFS4LAYOUTXDR_H ++#define NFSD_NFS4LAYOUTXDR_H ++ ++#include ++#include ++ ++/* the nfsd4_pnfs_devlist dev_addr for the file layout type */ ++struct pnfs_filelayout_devaddr { ++ struct xdr_netobj r_netid; ++ struct xdr_netobj r_addr; ++}; ++ ++/* list of multipath servers */ ++struct pnfs_filelayout_multipath { ++ u32 fl_multipath_length; ++ struct pnfs_filelayout_devaddr *fl_multipath_list; ++}; ++ ++struct pnfs_filelayout_device { ++ u32 fl_stripeindices_length; ++ u32 *fl_stripeindices_list; ++ u32 fl_device_length; ++ struct pnfs_filelayout_multipath *fl_device_list; ++}; ++ ++struct pnfs_filelayout_layout { ++ u32 lg_layout_type; /* response */ ++ u32 lg_stripe_type; /* response */ ++ u32 lg_commit_through_mds; /* response */ ++ u64 lg_stripe_unit; /* response */ ++ u64 lg_pattern_offset; /* response */ ++ u32 lg_first_stripe_index; /* response */ ++ struct nfsd4_pnfs_deviceid device_id; /* response */ ++ u32 lg_fh_length; /* response */ ++ struct knfsd_fh *lg_fh_list; /* response */ ++}; ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++enum pnfs_block_extent_state4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, ++ PNFS_BLOCK_NONE_DATA = 3 ++}; ++ ++enum pnfs_block_volume_type4 { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, ++ PNFS_BLOCK_VOLUME_SLICE = 1, ++ PNFS_BLOCK_VOLUME_CONCAT = 2, ++ PNFS_BLOCK_VOLUME_STRIPE = 3, ++}; ++typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4; ++ ++enum bl_cache_state { ++ BLOCK_LAYOUT_NEW = 0, ++ BLOCK_LAYOUT_CACHE = 1, ++ BLOCK_LAYOUT_UPDATE = 2, ++}; ++ ++typedef struct pnfs_blocklayout_layout { ++ struct list_head bll_list; ++ struct nfsd4_pnfs_deviceid bll_vol_id; ++ u64 bll_foff; // file offset ++ u64 bll_len; ++ u64 bll_soff; // storage offset ++ int bll_recalled; ++ enum pnfs_block_extent_state4 bll_es; ++ enum bl_cache_state bll_cache_state; ++} pnfs_blocklayout_layout_t; ++ ++typedef struct 
pnfs_blocklayout_devinfo { ++ struct list_head bld_list; ++ pnfs_block_volume_type4 bld_type; ++ struct nfsd4_pnfs_deviceid bld_devid; ++ int bld_index_loc; ++ union { ++ struct { ++ u64 bld_offset; ++ u32 bld_sig_len, ++ *bld_sig; ++ } simple; ++ struct { ++ u64 bld_start, ++ bld_len; ++ u32 bld_index; /* Index of Simple Volume */ ++ } slice; ++ struct { ++ u32 bld_stripes; ++ u64 bld_chunk_size; ++ u32 *bld_stripe_indexs; ++ } stripe; ++ } u; ++} pnfs_blocklayout_devinfo_t; ++ ++#endif /* NFSD_NFS4LAYOUTXDR_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-31 20:42:05.593020723 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-31 20:42:05.593020723 -0400 +@@ -0,0 +1,54 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++#include ++ ++/* ++ * Length of comma separated pnfs data server IPv4 addresses. Enough room for ++ * 32 addresses. ++ */ ++#define NFSD_DLM_DS_LIST_MAX 512 ++/* ++ * Length of colon separated pnfs dlm device of the form ++ * disk_name:comma separated data server IPv4 address ++ */ ++#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1) ++ ++#ifdef CONFIG_PNFSD ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++extern const struct pnfs_export_operations pnfs_dlm_export_ops; ++ ++int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len); ++ ++void nfsd4_pnfs_dlm_shutdown(void); ++ ++ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen); ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-31 20:42:05.594107962 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-31 20:42:05.594107962 -0400 +@@ -0,0 +1,271 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef _LINUX_NFSD_NFSD4_PNFS_H ++#define _LINUX_NFSD_NFSD4_PNFS_H ++ ++#include ++#include ++#include ++ ++struct nfsd4_pnfs_deviceid { ++ u64 sbid; /* per-superblock unique ID */ ++ u64 devid; /* filesystem-wide unique device ID */ ++}; ++ ++struct nfsd4_pnfs_dev_iter_res { ++ u64 gd_cookie; /* request/response */ ++ u64 gd_verf; /* request/response */ ++ u64 gd_devid; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++/* Arguments for set_device_notify */ ++struct pnfs_devnotify_arg { ++ struct nfsd4_pnfs_deviceid dn_devid; /* request */ ++ u32 dn_layout_type; /* request */ ++ u32 dn_notify_types; /* request/response */ ++}; ++ ++struct nfsd4_layout_seg { ++ u64 clientid; ++ u32 layout_type; ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++/* Used by layout_get to encode layout (loc_body var in spec) ++ * Args: ++ * minlength - min number of accessible bytes given by layout ++ * fsid - Major part of struct pnfs_deviceid. File system uses this ++ * to build the deviceid returned in the layout.
++ * fh - fs can modify the file handle for use on data servers ++ * seg - layout info requested and layout info returned ++ * xdr - xdr info ++ * return_on_close - true if layout to be returned on file close ++ */ ++ ++struct nfsd4_pnfs_layoutget_arg { ++ u64 lg_minlength; ++ u64 lg_sbid; ++ const struct knfsd_fh *lg_fh; ++}; ++ ++struct nfsd4_pnfs_layoutget_res { ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_return_on_close; ++}; ++ ++struct nfsd4_pnfs_layoutcommit_arg { ++ struct nfsd4_layout_seg lc_seg; /* request */ ++ u32 lc_reclaim; /* request */ ++ u32 lc_newoffset; /* request */ ++ u64 lc_last_wr; /* request */ ++ struct nfstime4 lc_mtime; /* request */ ++ u32 lc_up_len; /* layout length */ ++ void *lc_up_layout; /* decoded by callback */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit_res { ++ u32 lc_size_chg; /* boolean for response */ ++ u64 lc_newsize; /* response */ ++}; ++ ++#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */ ++ ++struct nfsd4_pnfs_layoutreturn_arg { ++ u32 lr_return_type; /* request */ ++ struct nfsd4_layout_seg lr_seg; /* request */ ++ u32 lr_reclaim; /* request */ ++ u32 lrf_body_len; /* request */ ++ void *lrf_body; /* request */ ++ void *lr_cookie; /* fs private */ ++}; ++ ++/* pNFS Metadata to Data server state communication */ ++struct pnfs_get_state { ++ u32 dsid; /* request */ ++ u64 ino; /* request */ ++ nfs4_stateid stid; /* request/response */ ++ nfs4_clientid clid; /* response */ ++ u32 access; /* response */ ++ u32 stid_gen; /* response */ ++ u32 verifier[2]; /* response */ ++}; ++ ++/* ++ * pNFS export operations vector. ++ * ++ * The filesystem must implement the following methods: ++ * layout_type ++ * get_device_info ++ * layout_get ++ * ++ * All other methods are optional and can be set to NULL if not implemented. ++ */ ++struct pnfs_export_operations { ++ /* Returns the supported pnfs_layouttype4.
*/ ++ int (*layout_type) (struct super_block *); ++ ++ /* Encode device info onto the xdr stream. */ ++ int (*get_device_info) (struct super_block *, ++ struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++ ++ /* Retrieve all available devices via an iterator. ++ * arg->cookie == 0 indicates the beginning of the list, ++ * otherwise arg->verf is used to verify that the list hasn't changed ++ * while retrieved. ++ * ++ * On output, the filesystem sets the devid based on the current cookie ++ * and sets res->cookie and res->verf corresponding to the next entry. ++ * When the last entry in the list is retrieved, res->eof is set to 1. ++ */ ++ int (*get_device_iter) (struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++ ++ int (*set_device_notify) (struct super_block *, ++ struct pnfs_devnotify_arg *); ++ ++ /* Retrieve and encode a layout for inode onto the xdr stream. ++ * arg->minlength is the minimum number of accessible bytes required ++ * by the client. ++ * The maximum number of bytes to encode the layout is given by ++ * the xdr stream end pointer. ++ * arg->fsid contains the major part of struct pnfs_deviceid. ++ * The file system uses this to build the deviceid returned ++ * in the layout. ++ * res->seg - layout segment requested and layout info returned. 
++ * res->fh - the file handle can be modified for use on data servers ++ * res->return_on_close - true if layout to be returned on file close ++ * ++ * return one of the following nfs errors: ++ * NFS_OK Success ++ * NFS4ERR_ACCESS Permission error ++ * NFS4ERR_BADIOMODE Server does not support requested iomode ++ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules ++ * NFS4ERR_INVAL Parameter other than layout is invalid ++ * NFS4ERR_IO I/O error ++ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later ++ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file ++ * NFS4ERR_LOCKED Lock conflict ++ * NFS4ERR_NOSPC Out-of-space error occurred ++ * NFS4ERR_RECALLCONFLICT Layout currently unavailable due to ++ * a conflicting CB_LAYOUTRECALL ++ * NFS4ERR_SERVERFAULT Server went berserk ++ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout ++ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file) ++ */ ++ enum nfsstat4 (*layout_get) (struct inode *, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++ ++ /* Commit changes to layout */ ++ int (*layout_commit) (struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++ ++ /* Returns the layout */ ++ int (*layout_return) (struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++ ++ /* Can layout segments be merged for this layout type?
*/ ++ int (*can_merge_layouts) (u32 layout_type); ++ ++ /* pNFS Files layout specific operations */ ++ ++ /* Get the write verifier for DS (called on MDS only) */ ++ void (*get_verifier) (struct super_block *, u32 *p); ++ /* Call fs on DS only */ ++ int (*get_state) (struct inode *, struct knfsd_fh *, ++ struct pnfs_get_state *); ++}; ++ ++struct nfsd4_pnfs_cb_layout { ++ u32 cbl_recall_type; /* request */ ++ struct nfsd4_layout_seg cbl_seg; /* request */ ++ u32 cbl_layoutchanged; /* request */ ++ nfs4_stateid cbl_sid; /* request */ ++ struct nfs4_fsid cbl_fsid; ++ void *cbl_cookie; /* fs private */ ++}; ++ ++/* layoutrecall request (from exported filesystem) */ ++struct nfs4_layoutrecall { ++ struct kref clr_ref; ++ struct nfsd4_pnfs_cb_layout cb; /* request */ ++ struct list_head clr_perclnt; /* on cl_layoutrecalls */ ++ struct nfs4_client *clr_client; ++ struct nfs4_file *clr_file; ++ struct timespec clr_time; /* last activity */ ++ struct super_block *clr_sb; /* We might not have a file */ ++ struct nfs4_layoutrecall *parent; /* The initiating recall */ ++ ++ void *clr_args; /* nfsd internal */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_item { ++ u32 cbd_notify_type; /* request */ ++ u32 cbd_layout_type; /* request */ ++ struct nfsd4_pnfs_deviceid cbd_devid; /* request */ ++ u32 cbd_immediate; /* request */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_list { ++ u32 cbd_len; /* request */ ++ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */ ++}; ++ ++/* ++ * callbacks provided by the nfsd ++ */ ++struct pnfsd_cb_operations { ++ /* Generic callbacks */ ++ int (*cb_layout_recall) (struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++ int (*cb_device_notify) (struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++ ++ /* pNFS Files layout specific callbacks */ ++ ++ /* Callback from fs on MDS only */ ++ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *); ++ /* Callback from fs on DS only */ ++ int (*cb_change_state) (struct 
pnfs_get_state *); ++}; ++ ++#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h +--- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-31 20:42:05.594107962 -0400 +@@ -29,6 +29,7 @@ + /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ + #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ + #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ ++#define NFSCTL_FD2FH 9 /* get a fh from a fd */ + + /* SVC */ + struct nfsctl_svc { +@@ -71,6 +72,11 @@ struct nfsctl_fsparm { + int gd_maxlen; + }; + ++/* FD2FH */ ++struct nfsctl_fd2fh { ++ int fd; ++}; ++ + /* + * This is the argument union. + */ +@@ -82,6 +88,7 @@ struct nfsctl_arg { + struct nfsctl_export u_export; + struct nfsctl_fdparm u_getfd; + struct nfsctl_fsparm u_getfs; ++ struct nfsctl_fd2fh u_fd2fh; + /* + * The following dummy member is needed to preserve binary compatibility + * on platforms where alignof(void*)>alignof(int). 
It's needed because +@@ -95,6 +102,7 @@ struct nfsctl_arg { + #define ca_export u.u_export + #define ca_getfd u.u_getfd + #define ca_getfs u.u_getfs ++#define ca_fd2fh u.u_fd2fh + }; + + union nfsctl_res { +diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-31 20:41:19.168160480 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-31 20:42:05.584098019 -0400 +@@ -72,13 +72,20 @@ struct nfs_access_entry { + int mask; + }; + ++struct nfs_lock_context { ++ atomic_t count; ++ struct list_head list; ++ struct nfs_open_context *open_context; ++ fl_owner_t lockowner; ++ pid_t pid; ++}; ++ + struct nfs4_state; + struct nfs_open_context { +- atomic_t count; ++ struct nfs_lock_context lock_context; + struct path path; + struct rpc_cred *cred; + struct nfs4_state *state; +- fl_owner_t lockowner; + fmode_t mode; + + unsigned long flags; +@@ -97,6 +104,27 @@ struct nfs_delegation; + + struct posix_acl; + ++struct pnfs_layout_hdr { ++ int refcount; ++ struct list_head layouts; /* other client layouts */ ++ struct list_head segs; /* layout segments list */ ++ int roc_iomode;/* return on close iomode, 0=none */ ++ seqlock_t seqlock; /* Protects the stateid */ ++ nfs4_stateid stateid; ++ unsigned long state; ++#define NFS_INO_RO_LAYOUT_FAILED 0 /* ro layoutget failed stop trying */ ++#define NFS_INO_RW_LAYOUT_FAILED 1 /* rw layoutget failed stop trying */ ++#define NFS_INO_LAYOUTCOMMIT 2 /* LAYOUTCOMMIT needed */ ++ ++ struct rpc_cred *cred; /* layoutcommit credential */ ++ /* DH: These vars keep track of the maximum write range ++ * so the values can be used for layoutcommit. 
++ */ ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ struct inode *inode; ++}; ++ + /* + * nfs fs inode data in memory + */ +@@ -181,6 +209,13 @@ struct nfs_inode { + struct nfs_delegation *delegation; + fmode_t delegation_state; + struct rw_semaphore rwsem; ++ ++ /* pNFS layout information */ ++#if defined(CONFIG_NFS_V4_1) ++ wait_queue_head_t lo_waitq; ++ struct pnfs_layout_hdr *layout; ++ time_t pnfs_layout_suspend; ++#endif /* CONFIG_NFS_V4_1 */ + #endif /* CONFIG_NFS_V4*/ + #ifdef CONFIG_NFS_FSCACHE + struct fscache_cookie *fscache; +@@ -353,6 +388,8 @@ extern void nfs_setattr_update_inode(str + extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); + extern void put_nfs_open_context(struct nfs_open_context *ctx); + extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); ++extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); ++extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); + extern u64 nfs_compat_user_ino64(u64 fileid); + extern void nfs_fattr_init(struct nfs_fattr *fattr); + +@@ -481,8 +518,12 @@ extern void nfs_unblock_sillyrename(stru + extern int nfs_congestion_kb; + extern int nfs_writepage(struct page *page, struct writeback_control *wbc); + extern int nfs_writepages(struct address_space *, struct writeback_control *); +-extern int nfs_flush_incompatible(struct file *file, struct page *page); +-extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); ++struct pnfs_layout_segment; ++extern int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg); ++extern int nfs_updatepage(struct file *, struct page *, ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata); + extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); + + /* +@@ -604,6 +645,8 @@ extern void * nfs_root_data(void); + 
#define NFSDBG_CLIENT 0x0200 + #define NFSDBG_MOUNT 0x0400 + #define NFSDBG_FSCACHE 0x0800 ++#define NFSDBG_PNFS 0x1000 ++#define NFSDBG_PNFS_LD 0x2000 + #define NFSDBG_ALL 0xFFFF + + #ifdef __KERNEL__ +diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-31 20:41:19.168160480 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-31 20:42:05.586087719 -0400 +@@ -15,6 +15,7 @@ struct nlm_host; + struct nfs4_sequence_args; + struct nfs4_sequence_res; + struct nfs_server; ++struct nfs4_minor_version_ops; + + /* + * The nfs_client identifies our client state to the server. +@@ -70,11 +71,7 @@ struct nfs_client { + */ + char cl_ipaddr[48]; + unsigned char cl_id_uniquifier; +- int (* cl_call_sync)(struct nfs_server *server, +- struct rpc_message *msg, +- struct nfs4_sequence_args *args, +- struct nfs4_sequence_res *res, +- int cache_reply); ++ const struct nfs4_minor_version_ops *cl_mvops; + #endif /* CONFIG_NFS_V4 */ + + #ifdef CONFIG_NFS_V4_1 +@@ -85,6 +82,8 @@ struct nfs_client { + /* The flags used for obtaining the clientid during EXCHANGE_ID */ + u32 cl_exchange_flags; + struct nfs4_session *cl_session; /* sharred session */ ++ struct list_head cl_layouts; ++ struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ + #endif /* CONFIG_NFS_V4_1 */ + + #ifdef CONFIG_NFS_FSCACHE +@@ -92,6 +91,16 @@ struct nfs_client { + #endif + }; + ++static inline bool ++is_ds_only_client(struct nfs_client *clp) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ return is_ds_only_session(clp->cl_exchange_flags); ++#else ++ return false; ++#endif ++} ++ + /* + * NFS client parameters stored in the superblock. 
+ */ +@@ -136,7 +145,7 @@ struct nfs_server { + #endif + + #ifdef CONFIG_NFS_V4 +- u32 attr_bitmask[2];/* V4 bitmask representing the set ++ u32 attr_bitmask[3];/* V4 bitmask representing the set + of attributes supported on this + filesystem */ + u32 cache_consistency_bitmask[2]; +@@ -148,6 +157,15 @@ struct nfs_server { + that are supported on this + filesystem */ + #endif ++ ++#ifdef CONFIG_NFS_V4_1 ++ u32 pnfs_blksize; /* layout_blksize attr */ ++ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ ++ void *pnfs_ld_data; /* Per-mount data */ ++ unsigned int ds_rsize; /* Data server read size */ ++ unsigned int ds_wsize; /* Data server write size */ ++#endif /* CONFIG_NFS_V4_1 */ ++ + void (*destroy)(struct nfs_server *); + + atomic_t active; /* Keep trace of any activity to this server */ +diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h +--- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-31 20:42:05.587097913 -0400 +@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, ++ NFSIOS_PNFS_READ, ++ NFSIOS_PNFS_WRITE, ++ NFSIOS_PNFS_COMMIT, + __NFSIOS_COUNTSMAX, + }; + +diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h +--- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-31 20:42:05.588097898 -0400 +@@ -39,6 +39,7 @@ struct nfs_page { + struct list_head wb_list; /* Defines state of page: */ + struct page *wb_page; /* page to read in/write out */ + struct nfs_open_context *wb_context; /* File state context info */ ++ struct nfs_lock_context *wb_lock_context; /* lock context info */ + atomic_t wb_complete; /* i/os we're waiting for */ + pgoff_t wb_index; /* Offset >> PAGE_CACHE_SHIFT */ + 
unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ +@@ -47,6 +48,7 @@ struct nfs_page { + struct kref wb_kref; /* reference count */ + unsigned long wb_flags; + struct nfs_writeverf wb_verf; /* Commit cookie */ ++ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */ + }; + + struct nfs_pageio_descriptor { +@@ -60,6 +62,12 @@ struct nfs_pageio_descriptor { + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); + int pg_ioflags; + int pg_error; ++ struct pnfs_layout_segment *pg_lseg; ++#ifdef CONFIG_NFS_V4_1 ++ int pg_iswrite; ++ int pg_boundary; ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++#endif /* CONFIG_NFS_V4_1 */ + }; + + #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) +@@ -68,13 +76,15 @@ extern struct nfs_page *nfs_create_reque + struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int count); ++ unsigned int count, ++ struct pnfs_layout_segment *lseg); + extern void nfs_clear_request(struct nfs_page *req); + extern void nfs_release_request(struct nfs_page *req); + + + extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, +- pgoff_t idx_start, unsigned int npages, int tag); ++ pgoff_t idx_start, unsigned int npages, int tag, ++ int *use_pnfs); + extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), +diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-31 20:41:19.169171911 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-31 20:42:05.590087729 -0400 +@@ -3,6 +3,8 @@ + + #include + #include ++#include ++#include + + /* + * To change the maximum rsize and wsize supported by the NFS client, adjust +@@ -10,7 +12,7 @@ + * support a megabyte or more. 
The default is left at 4096 bytes, which is + * reasonable for NFS over UDP. + */ +-#define NFS_MAX_FILE_IO_SIZE (1048576U) ++#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U) + #define NFS_DEF_FILE_IO_SIZE (4096U) + #define NFS_MIN_FILE_IO_SIZE (1024U) + +@@ -113,6 +115,10 @@ struct nfs_fsinfo { + __u32 dtpref; /* pref. readdir transfer size */ + __u64 maxfilesize; + __u32 lease_time; /* in seconds */ ++#if defined(CONFIG_NFS_V4_1) ++ __u32 layouttype; /* supported pnfs layout driver */ ++ __u32 blksize; /* preferred pnfs io block size */ ++#endif + }; + + struct nfs_fsstat { +@@ -185,6 +191,125 @@ struct nfs4_get_lease_time_res { + struct nfs4_sequence_res lr_seq_res; + }; + ++#define PNFS_LAYOUT_MAXSIZE 4096 ++#define NFS4_PNFS_DEVICEID4_SIZE 16 ++ ++struct pnfs_deviceid { ++ char data[NFS4_PNFS_DEVICEID4_SIZE]; ++}; ++ ++struct nfs4_layoutdriver_data { ++ __u32 len; ++ void *buf; ++}; ++ ++struct pnfs_layout_range { ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++struct nfs4_layoutget_args { ++ __u32 type; ++ struct pnfs_layout_range range; ++ __u64 minlength; ++ __u32 maxcount; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutget_res { ++ __u32 return_on_close; ++ struct pnfs_layout_range range; ++ __u32 type; ++ nfs4_stateid stateid; ++ struct nfs4_layoutdriver_data layout; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_layoutget { ++ struct nfs4_layoutget_args args; ++ struct nfs4_layoutget_res res; ++ struct pnfs_layout_segment **lsegpp; ++ int status; ++}; ++ ++struct nfs4_layoutcommit_args { ++ nfs4_stateid stateid; ++ __u64 lastbytewritten; ++ __u32 time_modify_changed; ++ struct timespec time_modify; ++ const u32 *bitmask; ++ struct nfs_fh *fh; ++ struct inode *inode; ++ ++ /* Values set by layout driver */ ++ struct pnfs_layout_range range; ++ __u32 layout_type; ++ void *layoutdriver_data; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutcommit_res { ++ __u32 sizechanged; ++ 
__u64 newsize; ++ struct nfs_fattr *fattr; ++ const struct nfs_server *server; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_layoutcommit_data { ++ struct rpc_task task; ++ struct rpc_cred *cred; ++ struct nfs_fattr fattr; ++ struct nfs4_layoutcommit_args args; ++ struct nfs4_layoutcommit_res res; ++ int status; ++}; ++ ++struct nfs4_layoutreturn_args { ++ __u32 reclaim; ++ __u32 layout_type; ++ __u32 return_type; ++ struct pnfs_layout_range range; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutreturn_res { ++ struct nfs4_sequence_res seq_res; ++ u32 lrs_present; ++ nfs4_stateid stateid; ++}; ++ ++struct nfs4_layoutreturn { ++ struct nfs4_layoutreturn_args args; ++ struct nfs4_layoutreturn_res res; ++ struct rpc_cred *cred; ++ int rpc_status; ++}; ++ ++struct nfs4_getdevicelist_args { ++ const struct nfs_fh *fh; ++ u32 layoutclass; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_getdevicelist_res { ++ struct pnfs_devicelist *devlist; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_getdeviceinfo_args { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_getdeviceinfo_res { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_res seq_res; ++}; ++ + /* + * Arguments to the open call. 
+ */ +@@ -196,8 +321,10 @@ struct nfs_openargs { + __u64 clientid; + __u64 id; + union { +- struct iattr * attrs; /* UNCHECKED, GUARDED */ +- nfs4_verifier verifier; /* EXCLUSIVE */ ++ struct { ++ struct iattr * attrs; /* UNCHECKED, GUARDED */ ++ nfs4_verifier verifier; /* EXCLUSIVE */ ++ }; + nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ + fmode_t delegation_type; /* CLAIM_PREVIOUS */ + } u; +@@ -313,6 +440,10 @@ struct nfs_lockt_res { + struct nfs4_sequence_res seq_res; + }; + ++struct nfs_release_lockowner_args { ++ struct nfs_lowner lock_owner; ++}; ++ + struct nfs4_delegreturnargs { + const struct nfs_fh *fhandle; + const nfs4_stateid *stateid; +@@ -332,6 +463,7 @@ struct nfs4_delegreturnres { + struct nfs_readargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + unsigned int pgbase; +@@ -352,6 +484,7 @@ struct nfs_readres { + struct nfs_writeargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + enum nfs3_stable_how stable; +@@ -846,7 +979,7 @@ struct nfs4_server_caps_arg { + }; + + struct nfs4_server_caps_res { +- u32 attr_bitmask[2]; ++ u32 attr_bitmask[3]; + u32 acl_bitmask; + u32 has_links; + u32 has_symlinks; +@@ -961,6 +1094,27 @@ struct nfs_page; + + #define NFS_PAGEVEC_SIZE (8U) + ++#if defined(CONFIG_NFS_V4_1) ++/* pnfsflag values */ ++#define PNFS_NO_RPC 0x0001 /* non rpc result callback switch */ ++ ++/* pnfs-specific data needed for read, write, and commit calls */ ++struct pnfs_call_data { ++ struct pnfs_layout_segment *lseg; ++ const struct rpc_call_ops *call_ops; ++ u32 orig_count; /* for retry via MDS */ ++ int pnfs_error; ++ u8 pnfsflags; ++ u8 how; /* for FLUSH_STABLE */ ++}; ++ ++/* files layout-type specific data for read, write, and commit */ ++struct pnfs_fl_call_data { ++ struct nfs_client *ds_nfs_client; ++ __u64 orig_offset; ++}; ++#endif /* CONFIG_NFS_V4_1 */ ++ 
+ struct nfs_read_data { + int flags; + struct rpc_task task; +@@ -976,10 +1130,16 @@ struct nfs_read_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + + struct nfs_write_data { ++ struct kref refcount; /* For pnfs commit splitting */ ++ struct nfs_write_data *parent; /* For pnfs commit splitting */ + int flags; + struct rpc_task task; + struct inode *inode; +@@ -995,6 +1155,10 @@ struct nfs_write_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + +@@ -1008,6 +1172,7 @@ struct nfs_rpc_ops { + const struct dentry_operations *dentry_ops; + const struct inode_operations *dir_inode_ops; + const struct inode_operations *file_inode_ops; ++ const struct file_operations *file_ops; + + int (*getroot) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); +@@ -1072,6 +1237,7 @@ struct nfs_rpc_ops { + extern const struct nfs_rpc_ops nfs_v2_clientops; + extern const struct nfs_rpc_ops nfs_v3_clientops; + extern const struct nfs_rpc_ops nfs_v4_clientops; ++extern const struct nfs_rpc_ops pnfs_v4_clientops; + extern struct rpc_version nfs_version2; + extern struct rpc_version nfs_version3; + extern struct rpc_version nfs_version4; +diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-31 20:42:05.598087997 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-31 20:42:05.599087710 -0400 +@@ -0,0 +1,57 @@ ++#ifndef _PANFS_SHIM_API_H ++#define _PANFS_SHIM_API_H ++ ++/* ++ * imported panfs 
functions ++ */ ++struct panfs_export_operations { ++ int (*convert_rc)(pan_status_t rc); ++ ++ int (*sm_sec_t_get_size_otw)( ++ pan_sm_sec_otw_t *var, ++ pan_size_t *core_sizep, ++ pan_size_t *wire_size, ++ void *buf_end); ++ ++ int (*sm_sec_t_unmarshall)( ++ pan_sm_sec_otw_t *in, ++ pan_sm_sec_t *out, ++ void *buf, ++ pan_size_t size, ++ pan_size_t *otw_consumed, ++ pan_size_t *in_core_consumed); ++ ++ int (*ucreds_get)(void **ucreds_pp); ++ ++ void (*ucreds_put)(void *ucreds); ++ ++ int (*sam_read)( ++ pan_sam_access_flags_t flags, ++ pan_sam_read_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_read_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p); ++ ++ int (*sam_write)( ++ pan_sam_access_flags_t flags, ++ pan_sam_write_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_write_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p); ++}; ++ ++extern int ++panfs_shim_register(struct panfs_export_operations *ops); ++ ++extern int ++panfs_shim_unregister(void); ++ ++#endif /* _PANFS_SHIM_API_H */ +diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-31 20:42:05.600025088 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-31 20:42:05.600025088 -0400 +@@ -0,0 +1,439 @@ ++/* ++ * pnfs_osd_xdr.h ++ * ++ * pNFS-osd on-the-wire data structures ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. 
++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#ifndef __PNFS_OSD_XDR_H__ ++#define __PNFS_OSD_XDR_H__ ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_OSD_OSDNAME_MAXSIZE 256 ++ ++/* ++ * START OF "GENERIC" DECODE ROUTINES. ++ * These may look a little ugly since they are imported from a "generic" ++ * set of XDR encode/decode routines which are intended to be shared by ++ * all of our NFSv4 implementations (OpenBSD, MacOS X...). 
++ * ++ * If the pain of reading these is too great, it should be a straightforward ++ * task to translate them into Linux-specific versions which are more ++ * consistent with the style used in NFSv2/v3... ++ */ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (u64)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++ ++/* ++ * draft-ietf-nfsv4-minorversion-22 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* Layout Structure */ ++ ++enum pnfs_osd_raid_algorithm4 { ++ PNFS_OSD_RAID_0 = 1, ++ PNFS_OSD_RAID_4 = 2, ++ PNFS_OSD_RAID_5 = 3, ++ PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ ++}; ++ ++/* struct pnfs_osd_data_map4 { ++ * uint32_t odm_num_comps; ++ * length4 odm_stripe_unit; ++ * uint32_t odm_group_width; ++ * uint32_t odm_group_depth; ++ * uint32_t odm_mirror_cnt; ++ * pnfs_osd_raid_algorithm4 odm_raid_algorithm; ++ * }; ++ */ ++struct pnfs_osd_data_map { ++ u32 odm_num_comps; ++ u64 odm_stripe_unit; ++ u32 odm_group_width; ++ u32 odm_group_depth; ++ u32 odm_mirror_cnt; ++ u32 odm_raid_algorithm; ++}; ++ ++static inline int ++pnfs_osd_data_map_xdr_sz(void) ++{ ++ return 1 + 2 + 1 + 1 + 1 + 1; ++} ++ ++static inline size_t ++pnfs_osd_data_map_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_data_map); ++} ++ ++/* struct pnfs_osd_objid4 { ++ * deviceid4 oid_device_id; ++ * uint64_t oid_partition_id; ++ * uint64_t oid_object_id; ++ * }; ++ */ ++struct pnfs_osd_objid { ++ struct pnfs_deviceid oid_device_id; ++ u64 oid_partition_id; ++ u64 oid_object_id; ++}; ++ ++/* For printout. 
I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */ ++#define _DEVID_LO(oid_device_id) \ ++ (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data) ++ ++#define _DEVID_HI(oid_device_id) \ ++ (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1) ++ ++static inline int ++pnfs_osd_objid_xdr_sz(void) ++{ ++ return (NFS4_PNFS_DEVICEID4_SIZE / 4) + 2 + 2; ++} ++ ++static inline size_t ++pnfs_osd_objid_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_objid); ++} ++ ++enum pnfs_osd_version { ++ PNFS_OSD_MISSING = 0, ++ PNFS_OSD_VERSION_1 = 1, ++ PNFS_OSD_VERSION_2 = 2 ++}; ++ ++struct pnfs_osd_opaque_cred { ++ u32 cred_len; ++ u8 *cred; ++}; ++ ++static inline int ++pnfs_osd_opaque_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ READ32(n); ++ p += XDR_QUADLEN(n); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_opaque_cred_incore_sz(u32 *p) ++{ ++ u32 n; ++ ++ READ32(n); ++ return XDR_QUADLEN(n) * 4; ++} ++ ++enum pnfs_osd_cap_key_sec { ++ PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ PNFS_OSD_CAP_KEY_SEC_SSV = 1, ++}; ++ ++/* struct pnfs_osd_object_cred4 { ++ * pnfs_osd_objid4 oc_object_id; ++ * pnfs_osd_version4 oc_osd_version; ++ * pnfs_osd_cap_key_sec4 oc_cap_key_sec; ++ * opaque oc_capability_key<>; ++ * opaque oc_capability<>; ++ * }; ++ */ ++struct pnfs_osd_object_cred { ++ struct pnfs_osd_objid oc_object_id; ++ u32 oc_osd_version; ++ u32 oc_cap_key_sec; ++ struct pnfs_osd_opaque_cred oc_cap_key; ++ struct pnfs_osd_opaque_cred oc_cap; ++}; ++ ++static inline int ++pnfs_osd_object_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_object_cred_incore_sz(u32 *p) ++{ ++ size_t sz = sizeof(struct pnfs_osd_object_cred); ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ sz += pnfs_osd_opaque_cred_incore_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ sz += 
pnfs_osd_opaque_cred_incore_sz(p); ++ return sz; ++} ++ ++/* struct pnfs_osd_layout4 { ++ * pnfs_osd_data_map4 olo_map; ++ * uint32_t olo_comps_index; ++ * pnfs_osd_object_cred4 olo_components<>; ++ * }; ++ */ ++struct pnfs_osd_layout { ++ struct pnfs_osd_data_map olo_map; ++ u32 olo_comps_index; ++ u32 olo_num_comps; ++ struct pnfs_osd_object_cred *olo_comps; ++}; ++ ++static inline int ++pnfs_osd_layout_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ while ((int)(n--) > 0) ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_layout_incore_sz(u32 *p) ++{ ++ u32 n; ++ size_t sz; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ sz = sizeof(struct pnfs_osd_layout); ++ while ((int)(n--) > 0) { ++ sz += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ return sz; ++} ++ ++/* Device Address */ ++ ++enum pnfs_osd_targetid_type { ++ OBJ_TARGET_ANON = 1, ++ OBJ_TARGET_SCSI_NAME = 2, ++ OBJ_TARGET_SCSI_DEVICE_ID = 3, ++}; ++ ++/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) { ++ * case OBJ_TARGET_SCSI_NAME: ++ * string oti_scsi_name<>; ++ * ++ * case OBJ_TARGET_SCSI_DEVICE_ID: ++ * opaque oti_scsi_device_id<>; ++ * ++ * default: ++ * void; ++ * }; ++ * ++ * union pnfs_osd_targetaddr4 switch (bool ota_available) { ++ * case TRUE: ++ * netaddr4 ota_netaddr; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_deviceaddr4 { ++ * pnfs_osd_targetid4 oda_targetid; ++ * pnfs_osd_targetaddr4 oda_targetaddr; ++ * uint64_t oda_lun; ++ * opaque oda_systemid<>; ++ * pnfs_osd_object_cred4 oda_root_obj_cred; ++ * opaque oda_osdname<>; ++ * }; ++ */ ++struct pnfs_osd_targetid { ++ u32 oti_type; ++ struct nfs4_string oti_scsi_device_id; ++}; ++ ++enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 }; ++ ++/* struct netaddr4 { ++ * // see struct rpcb in RFC1833 ++ * string r_netid<>; // network id 
++ * string r_addr<>; // universal address ++ * }; ++ */ ++struct pnfs_osd_net_addr { ++ struct nfs4_string r_netid; ++ struct nfs4_string r_addr; ++}; ++ ++struct pnfs_osd_targetaddr { ++ u32 ota_available; ++ struct pnfs_osd_net_addr ota_netaddr; ++}; ++ ++enum { ++ NETWORK_ID_MAX = 16 / 4, ++ UNIVERSAL_ADDRESS_MAX = 64 / 4, ++ PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX, ++}; ++ ++struct pnfs_osd_deviceaddr { ++ struct pnfs_osd_targetid oda_targetid; ++ struct pnfs_osd_targetaddr oda_targetaddr; ++ u8 oda_lun[8]; ++ struct nfs4_string oda_systemid; ++ struct pnfs_osd_object_cred oda_root_obj_cred; ++ struct nfs4_string oda_osdname; ++}; ++ ++enum { ++ ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4, ++ PNFS_OSD_DEVICEADDR_MAX = ++ PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX + ++ 2 /*oda_lun*/ + ++ 1 + OSD_SYSTEMID_LEN + ++ 1 + ODA_OSDNAME_MAX, ++}; ++ ++/* LAYOUTCOMMIT: layoutupdate */ ++ ++/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) { ++ * case TRUE: ++ * int64_t dsu_delta; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_layoutupdate4 { ++ * pnfs_osd_deltaspaceused4 olu_delta_space_used; ++ * bool olu_ioerr_flag; ++ * }; ++ */ ++struct pnfs_osd_layoutupdate { ++ u32 dsu_valid; ++ s64 dsu_delta; ++ u32 olu_ioerr_flag; ++}; ++ ++/* LAYOUTRETURN: I/O Error Report */ ++ ++enum pnfs_osd_errno { ++ PNFS_OSD_ERR_EIO = 1, ++ PNFS_OSD_ERR_NOT_FOUND = 2, ++ PNFS_OSD_ERR_NO_SPACE = 3, ++ PNFS_OSD_ERR_BAD_CRED = 4, ++ PNFS_OSD_ERR_NO_ACCESS = 5, ++ PNFS_OSD_ERR_UNREACHABLE = 6, ++ PNFS_OSD_ERR_RESOURCE = 7 ++}; ++ ++/* struct pnfs_osd_ioerr4 { ++ * pnfs_osd_objid4 oer_component; ++ * length4 oer_comp_offset; ++ * length4 oer_comp_length; ++ * bool oer_iswrite; ++ * pnfs_osd_errno4 oer_errno; ++ * }; ++ */ ++struct pnfs_osd_ioerr { ++ struct pnfs_osd_objid oer_component; ++ u64 oer_comp_offset; ++ u64 oer_comp_length; ++ u32 oer_iswrite; ++ u32 oer_errno; ++}; ++ ++static inline unsigned ++pnfs_osd_ioerr_xdr_sz(void)
++{ ++ return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1; ++} ++ ++/* OSD XDR API */ ++ ++/* Layout helpers */ ++extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout( ++ struct pnfs_osd_layout *layout, u32 *p); ++ ++extern int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *layout); ++ ++/* Device Info helpers */ ++ ++/* First pass calculate total size for space needed */ ++extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p); ++ ++/* Note: some strings pointed to inside @deviceaddr might point ++ * to space inside @p. @p should stay valid while @deviceaddr ++ * is in use. ++ * It is assumed that @deviceaddr points to bigger memory of size ++ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz() ++ */ ++extern void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p); ++ ++/* For Servers */ ++extern int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr); ++ ++/* layoutupdate (layout_commit) xdr helpers */ ++extern int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou); ++extern __be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p); ++ ++/* osd_ioerror encoding/decoding (layout_return) */ ++extern int ++pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr); ++extern __be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p); ++ ++#endif /* __PNFS_OSD_XDR_H__ */ +diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h +--- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-31 20:42:05.601087875 -0400 +@@ -8,6 +8,7 @@ + #ifndef __LINUX_POSIX_ACL_H + #define __LINUX_POSIX_ACL_H + ++#include + #include + + #define ACL_UNDEFINED_ID (-1) +diff -up 
linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h +--- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-31 20:42:05.602100892 -0400 +@@ -14,6 +14,8 @@ + /* size of an XDR encoding unit in bytes, i.e. 32bit */ + #define XDR_UNIT (4) + ++#include ++ + /* spec defines authentication flavor as an unsigned 32 bit integer */ + typedef u32 rpc_authflavor_t; + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-31 20:42:05.603108001 -0400 +@@ -3,6 +3,7 @@ + + #ifdef __KERNEL__ + ++#include + #include + + struct rpc_pipe_msg { +@@ -11,6 +12,10 @@ struct rpc_pipe_msg { + size_t len; + size_t copied; + int errno; ++#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */ ++#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */ ++#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA ++ u8 flags; + }; + + struct rpc_pipe_ops { +diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-31 20:42:05.603108001 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-31 20:42:05.603108001 -0400 +@@ -0,0 +1,111 @@ ++/* ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. 
++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#ifndef _SIMPLE_RPC_PIPEFS_H_ ++#define _SIMPLE_RPC_PIPEFS_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#define payload_of(headerp) ((void *)(headerp + 1)) ++ ++/* ++ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs. ++ * Messages may simply be the header itself, although having an optional ++ * data payload follow the header allows much more flexibility. 
++ * ++ * Messages are created using pipefs_alloc_init_msg() and ++ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an ++ * (optional) data payload. ++ * ++ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data ++ * can be accessed using: struct foo *foop = payload_of(msg) ++ */ ++struct pipefs_hdr { ++ u32 msgid; ++ u8 type; ++ u8 flags; ++ u16 totallen; /* length of entire message, including hdr itself */ ++ u32 status; ++}; ++ ++/* ++ * struct pipefs_list -- a type of list used for tracking callers who've made an ++ * upcall and are blocked waiting for a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply(). ++ */ ++struct pipefs_list { ++ struct list_head list; ++ spinlock_t list_lock; ++}; ++ ++ ++/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions. */ ++extern struct dentry *pipefs_mkpipe(const char *name, ++ const struct rpc_pipe_ops *ops, ++ int wait_for_open); ++extern void pipefs_closepipe(struct dentry *pipe); ++extern void pipefs_init_list(struct pipefs_list *list); ++extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen); ++extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, ++ u8 flags, void *data, ++ u16 datalen, u16 padlen); ++extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list ++ *uplist, u8 upflags, ++ u32 timeout); ++extern int pipefs_queue_upcall_noreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, u8 upflags); ++extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist); ++extern struct pipefs_hdr *pipefs_readmsg(struct file *filp, ++ const char __user *src, size_t len); ++extern ssize_t pipefs_generic_upcall(struct file *filp, ++ struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen); ++extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg); 
++ ++#endif /* _SIMPLE_RPC_PIPEFS_H_ */ +diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h +--- linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-31 20:42:05.604049784 -0400 +@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con + + return buf; + } ++ ++/* ++ * Print a network address in a universal format (see rfc1833 and nfsv4.1) ++ */ ++static inline int __svc_print_netaddr(struct sockaddr *addr, ++ struct xdr_netobj *na) ++{ ++ u16 port; ++ ssize_t len; ++ ++ switch (addr->sa_family) { ++ case AF_INET: { ++ struct sockaddr_in *sin = (struct sockaddr_in *)addr; ++ port = ntohs(sin->sin_port); ++ ++ len = snprintf(na->data, na->len, "%pI4.%u.%u", ++ &sin->sin_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ case AF_INET6: { ++ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; ++ port = ntohs(sin6->sin6_port); ++ ++ len = snprintf(na->data, na->len, "%pI6.%u.%u", ++ &sin6->sin6_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ default: ++ snprintf(na->data, na->len, "unknown address type: %d", ++ addr->sa_family); ++ len = -EINVAL; ++ break; ++ } ++ return len; ++} + #endif /* SUNRPC_SVC_XPRT_H */ +diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-31 20:41:19.173118431 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-31 20:42:05.605107904 -0400 +@@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) + return p + 2; + } + ++static inline __be32 * ++xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len) ++{ ++ memcpy(ptr, p, len); ++ return p + XDR_QUADLEN(len); ++} ++ + /* + * Adjust kvec to reflect end of xdr'ed data (RPC client XDR) + */ +@@ -197,6 +204,7 @@ struct xdr_stream { + + extern void xdr_init_encode(struct 
xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); + extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); ++extern __be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q); + extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, + unsigned int base, unsigned int len); + extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs +--- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-31 20:42:05.605107904 -0400 ++++ linux-2.6.34.noarch/localversion-pnfs 2010-08-31 20:42:05.605107904 -0400 +@@ -0,0 +1 @@ ++-pnfs +diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile +--- linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-31 20:42:05.606020148 -0400 +@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt + svc.o svcsock.o svcauth.o svcauth_unix.o \ + addr.o rpcb_clnt.o timer.o xdr.o \ + sunrpc_syms.o cache.o rpc_pipe.o \ +- svc_xprt.o ++ svc_xprt.o simple_rpc_pipefs.o + sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o + sunrpc-$(CONFIG_PROC_FS) += stats.o + sunrpc-$(CONFIG_SYSCTL) += sysctl.o +diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c +--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-31 20:42:05.606020148 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-31 20:42:05.607108065 -0400 +@@ -0,0 +1,424 @@ ++/* ++ * net/sunrpc/simple_rpc_pipefs.c ++ * ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. 
++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs ++ * filesystem. ++ * ++ * If @wait_for_open is non-zero and an upcall is later queued but the userland ++ * end of the pipe has not yet been opened, the upcall will remain queued until ++ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE. 
++ *
++ * Returns the pipe's dentry, or an ERR_PTR() value on failure.  A reference
++ * on the rpc_pipefs mount is held for as long as the pipe exists: it is
++ * released by pipefs_closepipe(), or right here if no pipe could be made.
++ */
++struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops,
++			     int wait_for_open)
++{
++	struct dentry *dir, *pipe;
++	struct vfsmount *mnt;
++
++	mnt = rpc_get_mount();
++	if (IS_ERR(mnt))
++		return ERR_CAST(mnt);	/* no mount reference was taken */
++
++	dir = mnt->mnt_root;
++	if (!dir) {
++		pipe = ERR_PTR(-ENOENT);
++		goto out_put;
++	}
++	pipe = rpc_mkpipe(dir, name, NULL, ops,
++			  wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0);
++out_put:
++	/*
++	 * pipefs_closepipe() cannot be called on an error pointer, so a
++	 * failed creation has to give the mount reference back itself.
++	 */
++	if (IS_ERR(pipe))
++		rpc_put_mount();
++	return pipe;
++}
++EXPORT_SYMBOL(pipefs_mkpipe);
++
++/*
++ * Shutdown a pipe made by pipefs_mkpipe().
++ * XXX: do we need to retain an extra reference on the mount?
++ */
++void pipefs_closepipe(struct dentry *pipe)
++{
++	rpc_unlink(pipe);
++	rpc_put_mount();
++}
++EXPORT_SYMBOL(pipefs_closepipe);
++
++/*
++ * Initialize a struct pipefs_list -- which are a way to keep track of callers
++ * who're blocked having made an upcall and are awaiting a reply.
++ *
++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply() for how
++ * to use them.
++ */
++void pipefs_init_list(struct pipefs_list *list)
++{
++	INIT_LIST_HEAD(&list->list);
++	spin_lock_init(&list->list_lock);
++}
++EXPORT_SYMBOL(pipefs_init_list);
++
++/*
++ * Alloc/init a generic pipefs message header and copy into its message body
++ * an arbitrary data payload.
++ *
++ * struct pipefs_hdr's are meant to serve as generic, general-purpose message
++ * headers for easy rpc_pipefs I/O. When an upcall is made, the
++ * struct pipefs_hdr is assigned to a struct rpc_pipe_msg and delivered
++ * therein. --And yes, the naming can seem a little confusing at first:
++ *
++ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a
++ * struct pipefs_hdr (possibly with an attached message body). A
++ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real"
++ * message is delivered and processed.
++ *
++ * Returns the new message, ERR_PTR(-E2BIG) if header + payload + padding
++ * exceed PAGE_SIZE or cannot be described by the u16 totallen field, or
++ * ERR_PTR(-ENOMEM) if the allocation fails.  @data may be NULL, in which
++ * case the message body is left zeroed.
++ */
++struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags,
++						void *data, u16 datalen, u16 padlen)
++{
++	size_t totallen;
++	struct pipefs_hdr *msg;
++
++	/*
++	 * Sum the lengths in a type wider than u16: two u16 lengths plus the
++	 * header can exceed 65535, and a wrapped total would slip past the
++	 * size check and under-allocate the buffer that memcpy() then fills.
++	 */
++	totallen = sizeof(*msg) + datalen + padlen;
++	if (totallen > PAGE_SIZE || totallen != (u16)totallen)
++		return ERR_PTR(-E2BIG);
++
++	msg = kzalloc(totallen, GFP_KERNEL);
++	if (!msg)
++		return ERR_PTR(-ENOMEM);
++
++	msg->msgid = msgid;
++	msg->type = type;
++	msg->flags = flags;
++	msg->totallen = totallen;
++	/* The payload is optional; kzalloc() already zeroed the body. */
++	if (data && datalen)
++		memcpy(payload_of(msg), data, datalen);
++	return msg;
++}
++EXPORT_SYMBOL(pipefs_alloc_init_msg_padded);
++
++/*
++ * See the description of pipefs_alloc_init_msg_padded().
++ */
++struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags,
++					 void *data, u16 datalen)
++{
++	return pipefs_alloc_init_msg_padded(msgid, type, flags, data,
++					    datalen, 0);
++}
++EXPORT_SYMBOL(pipefs_alloc_init_msg);
++
++
++/* Point a zeroed rpc_pipe_msg at @msg and record the autofree @upflags. */
++static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg,
++			       struct pipefs_hdr *msg, u8 upflags)
++{
++	memset(rpcmsg, 0, sizeof(*rpcmsg));
++	rpcmsg->data = msg;
++	rpcmsg->len = msg->totallen;
++	rpcmsg->flags = upflags;
++}
++
++/* Heap-allocate an rpc_pipe_msg carrying @msg; ERR_PTR(-ENOMEM) on failure. */
++static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg,
++						     u8 upflags)
++{
++	struct rpc_pipe_msg *rpcmsg;
++
++	rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL);
++	if (!rpcmsg)
++		return ERR_PTR(-ENOMEM);
++
++	pipefs_init_rpcmsg(rpcmsg, msg, upflags);
++	return rpcmsg;
++}
++
++
++/* represents an upcall that'll block and wait for a reply */
++struct pipefs_upcall {
++	u32 msgid;
++	struct rpc_pipe_msg rpcmsg;
++	struct list_head list;
++	wait_queue_head_t waitq;
++	struct pipefs_hdr *reply;
++};
++
++
++/* Prepare @upcall to carry @msg and to wait for a reply with the same msgid. */
++static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall,
++					 struct pipefs_hdr *msg, u8 upflags)
++{
++	upcall->reply = NULL;
++	upcall->msgid = msg->msgid;
++	INIT_LIST_HEAD(&upcall->list);
++	init_waitqueue_head(&upcall->waitq);
++	pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags);
++}
++
++/*
++ * Put @upcall on @uplist, queue it on @pipe and sleep until a reply has been
++ * assigned to it or, if @timeout is nonzero, until @timeout jiffies passed.
++ *
++ * Returns 0 once upcall->reply is set, -ETIMEDOUT if the timer expired with
++ * no reply, or the negative error from rpc_queue_upcall().
++ */
++static int __pipefs_queue_upcall_waitreply(struct dentry *pipe,
++					   struct pipefs_upcall *upcall,
++					   struct pipefs_list *uplist,
++					   u32 timeout)
++{
++	int err;
++
++	/* Be findable by msgid before userspace can possibly answer. */
++	spin_lock(&uplist->list_lock);
++	list_add(&upcall->list, &uplist->list);
++	spin_unlock(&uplist->list_lock);
++
++	err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg);
++	if (err < 0)
++		goto out;
++
++	/*
++	 * wait_event*() re-test the condition after the task is on the wait
++	 * queue, so a reply that arrives right after the upcall was queued
++	 * cannot be missed, and a stray wakeup cannot end the wait with no
++	 * reply in hand.
++	 */
++	if (timeout) {
++		if (!wait_event_timeout(upcall->waitq, upcall->reply != NULL,
++					timeout) &&
++		    upcall->reply == NULL)
++			err = -ETIMEDOUT;
++	} else {
++		wait_event(upcall->waitq, upcall->reply != NULL);
++	}
++
++	/*
++	 * NOTE(review): after a timeout upcall->rpcmsg may still sit on the
++	 * pipe's queue while the caller's on-stack upcall goes away -- confirm
++	 * how the message is taken off the pipe in that case.
++	 */
++out:
++	spin_lock(&uplist->list_lock);
++	list_del_init(&upcall->list);
++	spin_unlock(&uplist->list_lock);
++	return err;
++}
++
++/*
++ * Queue a pipefs msg for an upcall to userspace, place the calling thread
++ * on @uplist, and block the thread to wait for a reply. If @timeout is
++ * nonzero, the thread will be blocked for at most @timeout jiffies.
++ *
++ * (To convert time units into jiffies, consider the functions
++ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and
++ * timespec_to_jiffies().)
++ *
++ * Once a reply is received by your downcall handler, call
++ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall,
++ * assign the reply, and wake the waiting thread.
++ *
++ * This function's return value pointer may be an error and should be checked
++ * with IS_ERR() before attempting to access the reply message.
++ *
++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg()
++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG
++ * flag is set in @upflags. See also rpc_pipe_fs.h.
++ */
++struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe,
++						 struct pipefs_hdr *msg,
++						 struct pipefs_list *uplist,
++						 u8 upflags, u32 timeout)
++{
++	int err;
++	struct pipefs_upcall upcall;	/* lives on this stack while we wait */
++
++	pipefs_init_upcall_waitreply(&upcall, msg, upflags);
++	err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout);
++	if (err < 0) {
++		/* a reply may have raced in just as we gave up; drop it */
++		kfree(upcall.reply);
++		upcall.reply = ERR_PTR(err);
++	}
++
++	return upcall.reply;
++}
++EXPORT_SYMBOL(pipefs_queue_upcall_waitreply);
++
++/*
++ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e.,
++ * no reply is expected).
++ *
++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg()
++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG
++ * flag is set in @upflags. See also rpc_pipe_fs.h.
++ *
++ * Returns 0 on success or a negative errno.  On failure @msg is not freed
++ * here and still belongs to the caller.
++ */
++int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg,
++				u8 upflags)
++{
++	int err;
++	struct rpc_pipe_msg *rpcmsg;
++
++	upflags |= PIPEFS_AUTOFREE_RPCMSG;
++	rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags);
++	if (IS_ERR(rpcmsg))
++		return PTR_ERR(rpcmsg);
++
++	err = rpc_queue_upcall(pipe->d_inode, rpcmsg);
++	/*
++	 * A message that was never queued is never handed to ->destroy_msg(),
++	 * so the carrier allocated above has to be released here.
++	 */
++	if (err < 0)
++		kfree(rpcmsg);
++	return err;
++}
++EXPORT_SYMBOL(pipefs_queue_upcall_noreply);
++
++
++/*
++ * Find the waiting upcall with @msgid on @uplist, or NULL if there is none.
++ * The caller must hold @uplist->list_lock, and must keep holding it for as
++ * long as it touches the returned upcall.
++ */
++static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid,
++						      struct pipefs_list *uplist)
++{
++	struct pipefs_upcall *upcall;
++
++	list_for_each_entry(upcall, &uplist->list, list)
++		if (upcall->msgid == msgid)
++			return upcall;
++	return NULL;
++}
++
++/*
++ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall
++ * message and have determined that it is a reply to a waiting upcall,
++ * you can use this function to find the appropriate upcall, assign the result,
++ * and wake the upcall thread.
++ *
++ * The reply message must have the same msgid as the original upcall message's.
++ *
++ * Returns 0 if the reply was handed to a waiter, or -ENOENT if no upcall
++ * with that msgid is waiting (the reply is then still the caller's).
++ *
++ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg().
++ */
++int pipefs_assign_upcall_reply(struct pipefs_hdr *reply,
++			       struct pipefs_list *uplist)
++{
++	struct pipefs_upcall *upcall;
++
++	/*
++	 * The upcall lives on the waiter's stack, and the waiter unlinks it
++	 * under list_lock before returning.  Doing the lookup, the assignment
++	 * and the wakeup inside one critical section guarantees the upcall is
++	 * still alive while we write to it.
++	 */
++	spin_lock(&uplist->list_lock);
++	upcall = pipefs_find_upcall_msgid(reply->msgid, uplist);
++	if (upcall) {
++		upcall->reply = reply;
++		wake_up(&upcall->waitq);
++	}
++	spin_unlock(&uplist->list_lock);
++
++	if (!upcall) {
++		printk(KERN_ERR "%s: ERROR: have reply but no matching upcall "
++				"for msgid %u\n", __func__, reply->msgid);
++		return -ENOENT;
++	}
++	return 0;
++}
++EXPORT_SYMBOL(pipefs_assign_upcall_reply);
++
++/*
++ * Generic method to read-in and return a newly-allocated message which begins
++ * with a struct pipefs_hdr.
++ *
++ * Returns the message (to be kfree()d by the caller) or an ERR_PTR():
++ * -EINVAL if @len is shorter than the header, -ENOMEM, or -EFAULT.
++ *
++ * NOTE(review): the header's totallen field is not compared with @len here;
++ * callers that rely on it presumably have to check it themselves.
++ */
++struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src,
++				  size_t len)
++{
++	int err = 0, hdrsize;
++	struct pipefs_hdr *msg = NULL;
++
++	hdrsize = sizeof(*msg);
++	if (len < hdrsize) {
++		printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n",
++		       __func__, (int) len, hdrsize);
++		err = -EINVAL;
++		goto out;
++	}
++
++	msg = kzalloc(len, GFP_KERNEL);
++	if (!msg) {
++		err = -ENOMEM;
++		goto out;
++	}
++	if (copy_from_user(msg, src, len))
++		err = -EFAULT;
++out:
++	if (err) {
++		kfree(msg);	/* msg is still NULL on the early exits */
++		msg = ERR_PTR(err);
++	}
++	return msg;
++}
++EXPORT_SYMBOL(pipefs_readmsg);
++
++/*
++ * Generic rpc_pipe_ops->upcall() handler implementation.
++ *
++ * Don't call this directly: to make an upcall, use
++ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply().
++ *
++ * Copies as much of the not-yet-delivered part of @rpcmsg as fits in
++ * @buflen to @dst.  Returns the number of bytes copied, or -EFAULT if
++ * nothing at all could be written to @dst.
++ */
++ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg,
++			      char __user *dst, size_t buflen)
++{
++	char *data;
++	size_t len;
++	unsigned long left;
++
++	data = (char *)rpcmsg->data + rpcmsg->copied;
++	len = rpcmsg->len - rpcmsg->copied;
++	if (len > buflen)
++		len = buflen;
++
++	/*
++	 * copy_to_user() returns the number of bytes it could NOT copy and
++	 * never a negative value, so failure shows up as "everything is
++	 * left over", not as left < 0.
++	 */
++	left = copy_to_user(dst, data, len);
++	if (left && left == len) {
++		rpcmsg->errno = -EFAULT;
++		return -EFAULT;
++	}
++
++	/* a partial copy still makes progress; account for what went out */
++	len -= left;
++	rpcmsg->copied += len;
++	rpcmsg->errno = 0;
++	return len;
++}
++EXPORT_SYMBOL(pipefs_generic_upcall);
++
++/*
++ * Generic rpc_pipe_ops->destroy_msg() handler implementation.
++ *
++ * Items are only freed if @rpcmsg->flags has been set appropriately.
++ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h.
++ */
++void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg)
++{
++	/* free the payload first: it is reached through rpcmsg */
++	if (rpcmsg->flags & PIPEFS_AUTOFREE_UPCALL_MSG)
++		kfree(rpcmsg->data);
++	if (rpcmsg->flags & PIPEFS_AUTOFREE_RPCMSG)
++		kfree(rpcmsg);
++}
++EXPORT_SYMBOL(pipefs_generic_destroy_msg);
+diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c
+--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-31 20:41:19.188144022 -0400
++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-31 20:42:05.607108065 -0400
+@@ -395,24 +395,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf,
+ {
+ 	struct kvec *tail;
+ 	size_t copy;
+-	char *p;
+ 	unsigned int pglen = buf->page_len;
++	unsigned int tailbuf_len;
+ 
+ 	tail = buf->tail;
+ 	BUG_ON (len > pglen);
+ 
++	tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
++
+ 	/* Shift the tail first */
+-	if (tail->iov_len != 0) {
+-		p = (char *)tail->iov_base + len;
++	if (tailbuf_len != 0) {
++		unsigned int free_space = tailbuf_len - tail->iov_len;
++
++		if (len < free_space)
++			free_space = len;
++		tail->iov_len += free_space;
++
++		copy = len;
+ 		if (tail->iov_len > len) {
+-			copy = tail->iov_len - len;
+-			memmove(p, tail->iov_base, copy);
++			char *p = (char *)tail->iov_base + len;
++			memmove(p, tail->iov_base,
tail->iov_len - len); + } else +- buf->buflen -= len; +- /* Copy from the inlined pages into the tail */ +- copy = len; +- if (copy > tail->iov_len) + copy = tail->iov_len; ++ /* Copy from the inlined pages into the tail */ + _copy_from_pages((char *)tail->iov_base, + buf->pages, buf->page_base + pglen - len, + copy); +@@ -496,6 +501,27 @@ __be32 * xdr_reserve_space(struct xdr_st + EXPORT_SYMBOL_GPL(xdr_reserve_space); + + /** ++ * xdr_rewind_stream - rewind a stream back to some checkpoint ++ * @xdr: pointer to xdr_stream ++ * @q: some checkpoint at historical place of @xdr ++ * ++ * Restores an xdr stream to some historical point. @q must be ++ * a logical xdr point in the past that was sampled by @q = @xdr->p. ++ */ ++__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q) ++{ ++ size_t nbytes = (xdr->p - q) << 2; ++ ++ BUG_ON(xdr->p < q); ++ BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len); ++ xdr->p = q; ++ xdr->iov->iov_len -= nbytes; ++ xdr->buf->len -= nbytes; ++ return q; ++} ++EXPORT_SYMBOL_GPL(xdr_rewind_stream); ++ ++/** + * xdr_write_pages - Insert a list of pages into an XDR buffer for sending + * @xdr: pointer to xdr_stream + * @pages: list of pages From c368aef481d0efe35e12a274d914417e1beb972e Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Sat, 4 Sep 2010 09:23:12 -0400 Subject: [PATCH 10/20] Removed localversion-nfs file Signed-off-by: Steve Dickson --- pnfs-all-2.6.35-2010-08-24-f13.patch | 393 +++++++++++++-------------- 1 file changed, 194 insertions(+), 199 deletions(-) diff --git a/pnfs-all-2.6.35-2010-08-24-f13.patch b/pnfs-all-2.6.35-2010-08-24-f13.patch index 17d1c844d..7d82d9fa4 100644 --- a/pnfs-all-2.6.35-2010-08-24-f13.patch +++ b/pnfs-all-2.6.35-2010-08-24-f13.patch @@ -1,6 +1,6 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c ---- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-31 20:41:16.924243041 -0400 -+++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c
2010-08-31 20:42:05.486160576 -0400 +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-09-04 09:20:04.110038647 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-09-04 09:21:44.875202803 -0400 @@ -13,6 +13,7 @@ #include #include @@ -11,7 +11,7 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arc #include "os.h" diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c --- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/block/genhd.c 2010-08-31 20:42:05.487160201 -0400 ++++ linux-2.6.34.noarch/block/genhd.c 2010-09-04 09:21:44.875202803 -0400 @@ -1009,6 +1009,7 @@ static void disk_release(struct device * struct class block_class = { .name = "block", @@ -21,8 +21,8 @@ diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd. static char *block_devnode(struct device *dev, mode_t *mode) { diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt ---- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-31 20:42:05.486160576 -0400 -+++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-31 20:42:05.486160576 -0400 +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-09-04 09:21:44.876222743 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-09-04 09:21:44.876222743 -0400 @@ -0,0 +1,211 @@ +(c) 2007 Network Appliance Inc. + @@ -236,8 +236,8 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6. 
+ + diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c ---- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-08-31 20:41:17.063232968 -0400 -+++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-31 20:42:05.488160560 -0400 +--- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-09-04 09:20:04.252180557 -0400 ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-09-04 09:21:44.877242928 -0400 @@ -657,6 +657,12 @@ static int dev_create(struct dm_ioctl *p return r; } @@ -292,7 +292,7 @@ diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/driv int r; diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c --- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-31 20:42:05.489160594 -0400 ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-09-04 09:21:44.879035601 -0400 @@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct put_device(&class_to_shost(dev)->shost_gendev); } @@ -304,7 +304,7 @@ diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drive }; diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h --- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-31 20:42:05.492243039 -0400 ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-09-04 09:21:44.879035601 -0400 @@ -36,13 +36,9 @@ #include #include @@ -360,8 +360,8 @@ diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/ + #endif diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c ---- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-31 20:42:05.493222759 -0400 -+++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-31 20:42:05.493222759 -0400 +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-09-04 09:21:44.880171068 -0400 ++++ 
linux-2.6.34.noarch/fs/exofs/export.c 2010-09-04 09:21:44.880171068 -0400 @@ -0,0 +1,396 @@ +/* + * export.c - Implementation of the pnfs_export_operations @@ -761,7 +761,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs +} diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c --- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-31 20:42:05.494222756 -0400 ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-09-04 09:21:44.881160952 -0400 @@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) if (unlikely(wait_obj_created(oi))) goto fail; @@ -781,7 +781,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/ * Fill in an inode read from the OSD and set it up for use diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild --- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-31 20:42:05.490222933 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-09-04 09:21:44.882160660 -0400 @@ -13,4 +13,5 @@ # @@ -790,7 +790,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/K obj-$(CONFIG_EXOFS_FS) += exofs.o diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig --- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-31 20:42:05.491232880 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-09-04 09:21:44.883039027 -0400 @@ -1,6 +1,7 @@ config EXOFS_FS tristate "exofs: OSD based file system support" @@ -801,7 +801,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/ as its backing storage. 
diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c --- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-31 20:42:05.496073173 -0400 ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-09-04 09:21:44.883039027 -0400 @@ -621,6 +621,7 @@ static int exofs_fill_super(struct super sb->s_fs_info = sbi; sb->s_op = &exofs_sops; @@ -812,7 +812,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/ EXOFS_ERR("ERROR: exofs_iget failed\n"); diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c --- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-31 20:42:05.497212975 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-09-04 09:21:44.884180594 -0400 @@ -16,6 +16,13 @@ #include #include @@ -829,7 +829,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exp diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile --- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-31 20:42:05.496073173 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-09-04 09:21:44.885160697 -0400 @@ -3,4 +3,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs.o @@ -840,8 +840,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/ex +exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o +exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c ---- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-31 20:42:05.497212975 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-31 20:42:05.498113655 -0400 +--- 
linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-09-04 09:21:44.885160697 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-09-04 09:21:44.885160697 -0400 @@ -0,0 +1,158 @@ +/* + * linux/fs/nfsd/nfs4blocklayoutxdr.c @@ -1002,8 +1002,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34. +} +EXPORT_SYMBOL_GPL(blocklayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c ---- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-31 20:42:05.498113655 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-31 20:42:05.498113655 -0400 +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-09-04 09:21:44.886051895 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-09-04 09:21:44.886051895 -0400 @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -1224,8 +1224,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.n +} +EXPORT_SYMBOL(filelayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c ---- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-31 20:42:05.499125509 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-31 20:42:05.499125509 -0400 +--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-09-04 09:21:44.887054758 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-09-04 09:21:44.887054758 -0400 @@ -0,0 +1,289 @@ +/* + * pnfs_osd_xdr_enc.c @@ -1518,7 +1518,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.no +EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c --- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 
-0400 -+++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-31 20:42:05.500123860 -0400 ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-09-04 09:21:44.888035389 -0400 @@ -19,6 +19,7 @@ #include #include @@ -1539,7 +1539,7 @@ diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gf sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig --- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/Kconfig 2010-08-31 20:42:05.490222933 -0400 ++++ linux-2.6.34.noarch/fs/Kconfig 2010-09-04 09:21:44.889035490 -0400 @@ -224,6 +224,31 @@ config LOCKD_V4 config EXPORTFS tristate @@ -1573,8 +1573,8 @@ diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig tristate select FS_POSIX_ACL diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-31 20:42:05.503222878 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-31 20:42:05.503222878 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-09-04 09:21:44.890035431 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-09-04 09:21:44.890035431 -0400 @@ -0,0 +1,66 @@ +#include +#include @@ -1643,8 +1643,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.or + return; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-31 20:42:05.504232855 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-31 20:42:05.504232855 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-09-04 09:21:44.891045310 
-0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-09-04 09:21:44.891045310 -0400 @@ -0,0 +1,1160 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c @@ -2807,8 +2807,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34. +module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-31 20:42:05.506119071 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-31 20:42:05.506119071 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-09-04 09:21:44.892025716 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-09-04 09:21:44.892025716 -0400 @@ -0,0 +1,335 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdev.c @@ -3146,8 +3146,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6. 
+ goto out; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-31 20:42:05.506119071 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-31 20:42:05.506119071 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-09-04 09:21:44.893035500 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-09-04 09:21:44.893035500 -0400 @@ -0,0 +1,120 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c @@ -3270,8 +3270,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.3 + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-31 20:42:05.505169618 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-31 20:42:05.505169618 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-09-04 09:21:44.894045279 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-09-04 09:21:44.894045279 -0400 @@ -0,0 +1,302 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -3576,8 +3576,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34. 
+ +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-31 20:42:05.507113260 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-31 20:42:05.508119925 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-09-04 09:21:44.895035248 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-09-04 09:21:44.895035248 -0400 @@ -0,0 +1,948 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -4528,8 +4528,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noar + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile ---- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-31 20:42:05.502212803 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-31 20:42:05.502212803 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-09-04 09:21:44.895035248 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-09-04 09:21:44.896025369 -0400 @@ -0,0 +1,6 @@ +# +# Makefile for the pNFS block layout driver kernel module @@ -4539,7 +4539,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarc + extents.o block-device-discovery-pipe.o diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h --- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-31 20:42:05.508119925 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-09-04 09:21:44.896025369 -0400 @@ -111,6 +111,13 @@ extern int nfs41_validate_delegation_sta #define RCA4_TYPE_MASK_RDATA_DLG 0 @@ -4596,7 +4596,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/c extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct 
cb_getattrres *res); diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c --- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-31 20:42:05.509093330 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-09-04 09:21:44.897056128 -0400 @@ -8,10 +8,15 @@ #include #include @@ -5079,7 +5079,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/ return status; diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c --- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-31 20:42:05.510143651 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-09-04 09:21:44.898072186 -0400 @@ -22,6 +22,8 @@ #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) @@ -5281,8 +5281,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/n .process_op = (callback_process_op_t)nfs4_callback_sequence, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c ---- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-31 20:41:19.144140225 -0400 -+++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-31 20:42:05.511222861 -0400 +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-09-04 09:20:05.988202702 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-09-04 09:21:44.900025165 -0400 @@ -39,6 +39,7 @@ #include #include @@ -5491,8 +5491,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/cli goto error; diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c ---- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-31 20:42:05.550110844 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-31 20:42:05.550110844 -0400 +--- 
linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-09-04 09:21:44.900025165 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-09-04 09:21:44.901035455 -0400 @@ -0,0 +1,292 @@ +#if defined(CONFIG_SPNFS_BLOCK) + @@ -5787,8 +5787,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/b +} +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c ---- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-31 20:42:05.551222888 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-31 20:42:05.551222888 -0400 +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-09-04 09:21:44.902035254 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-09-04 09:21:44.902035254 -0400 @@ -0,0 +1,1672 @@ +/* + * bl_ops.c @@ -7463,8 +7463,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/b + +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c ---- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-31 20:41:19.144140225 -0400 -+++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-31 20:42:05.512106042 -0400 +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-09-04 09:20:05.988202702 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-09-04 09:21:44.903025737 -0400 @@ -104,7 +104,8 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) @@ -7541,7 +7541,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs rcu_read_unlock(); diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h --- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-31 20:42:05.513114811 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-09-04 09:21:44.904035627 -0400 @@ -34,9 +34,7 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct 
rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -7554,8 +7554,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c ---- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-31 20:41:19.196140434 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-31 20:42:05.553222784 -0400 +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-09-04 09:20:06.039203080 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-09-04 09:21:44.905045348 -0400 @@ -17,11 +17,19 @@ #include #include @@ -7733,7 +7733,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e svcauth_unix_purge(); diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/direct.c --- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-31 20:42:05.514196343 -0400 ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-09-04 09:21:44.906025356 -0400 @@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea .rpc_release = nfs_direct_read_release, }; @@ -7979,7 +7979,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/dir user_addr += bytes; diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig --- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-31 20:42:05.549222922 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-09-04 09:21:44.907035472 -0400 @@ -79,3 +79,52 @@ config NFSD_V4 available from http://linux-nfs.org/. @@ -8035,7 +8035,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kc + If unsure, say N. 
diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile --- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-31 20:42:05.549222922 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-09-04 09:21:44.907035472 -0400 @@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ @@ -8045,8 +8045,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/M +nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o +nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-31 20:41:19.197150385 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-31 20:42:05.554114789 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-09-04 09:20:06.040212867 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-09-04 09:21:44.908055511 -0400 @@ -40,7 +40,6 @@ #define NFSPROC4_CB_NULL 0 @@ -8586,8 +8586,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ +} +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-31 20:42:05.556172071 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-31 20:42:05.556172071 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-09-04 09:21:44.910025108 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-09-04 09:21:44.910025108 -0400 @@ -0,0 +1,1679 @@ +/****************************************************************************** + * @@ -10269,8 +10269,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfs + return status; +} diff -up 
linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-31 20:42:05.557222774 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-31 20:42:05.557222774 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-09-04 09:21:44.911025728 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-09-04 09:21:44.911025728 -0400 @@ -0,0 +1,461 @@ +/****************************************************************************** + * @@ -10734,8 +10734,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/n +}; +EXPORT_SYMBOL(pnfs_dlm_export_ops); diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-31 20:42:05.558141620 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-31 20:42:05.558141620 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-09-04 09:21:44.912035398 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-09-04 09:21:44.912035398 -0400 @@ -0,0 +1,620 @@ +/* +* linux/fs/nfsd/nfs4pnfsds.c @@ -11358,8 +11358,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nf + +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-31 20:41:19.198160463 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-31 20:42:05.559129617 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-09-04 09:20:06.041223204 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-09-04 09:21:44.913035888 -0400 @@ -34,10 +34,14 @@ */ #include @@ -11834,8 +11834,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 
linux-2.6.34.noarch/fs/nfsd/nfs4state.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-31 20:41:19.200150153 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-31 20:42:05.561202607 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-09-04 09:20:06.043212709 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-09-04 09:21:44.916015197 -0400 @@ -42,6 +42,8 @@ #include "xdr4.h" #include "vfs.h" @@ -12351,8 +12351,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-31 20:41:19.202150173 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-31 20:42:05.563232916 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-09-04 09:20:06.045212665 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-09-04 09:21:44.918025318 -0400 @@ -47,9 +47,14 @@ #include #include @@ -12971,8 +12971,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/ [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c ---- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-31 20:41:19.203150982 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-31 20:42:05.565212801 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-09-04 09:20:06.047233081 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-09-04 09:21:44.920025397 -0400 @@ -13,10 +13,15 @@ #include #include @@ -13149,8 +13149,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h ---- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-31 20:41:19.204160960 -0400 -+++ 
linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-31 20:42:05.565212801 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-09-04 09:20:06.047233081 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-09-04 09:21:44.920025397 -0400 @@ -285,11 +285,17 @@ extern time_t nfsd4_grace; #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ NFSD4_SUPPORTED_ATTRS_WORD0 @@ -13172,7 +13172,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs { diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c --- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-31 20:42:05.566222921 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-09-04 09:21:44.921045937 -0400 @@ -10,6 +10,7 @@ #include @@ -13210,7 +13210,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nf __u32 tfh[2]; diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h --- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-31 20:42:05.567233002 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-09-04 09:21:44.922035547 -0400 @@ -14,6 +14,7 @@ enum nfsd_fsid { FSID_UUID8, FSID_UUID16, @@ -13263,8 +13263,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nf + #endif /* _LINUX_NFSD_FH_INT_H */ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c ---- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-31 20:41:17.274232911 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-31 20:42:05.568144414 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-09-04 09:20:04.514160362 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-09-04 09:21:44.923045353 -0400 @@ -115,7 +115,7 @@ struct svc_program nfsd_program = { }; @@ -13275,8 +13275,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 
linux-2.6.34.noarch/fs/nfsd/n int nfsd_vers(int vers, enum vers_op change) { diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h ---- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-31 20:42:05.569090615 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-31 20:42:05.569090615 -0400 +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-09-04 09:21:44.923045353 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-09-04 09:21:44.923045353 -0400 @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2005 The Regents of the University of Michigan. @@ -13422,8 +13422,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pn + +#endif /* LINUX_NFSD_PNFSD_H */ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c ---- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-31 20:42:05.569090615 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-31 20:42:05.569090615 -0400 +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-09-04 09:21:44.924046083 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-09-04 09:21:44.924046083 -0400 @@ -0,0 +1,225 @@ +/* + * linux/fs/nfsd/pnfs_lexp.c @@ -13651,8 +13651,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nf + inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; +} diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c ---- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-31 20:42:05.570119170 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-31 20:42:05.570119170 -0400 +--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-09-04 09:21:44.925035828 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-09-04 09:21:44.925035828 -0400 @@ -0,0 +1,535 @@ +/* + * fs/nfsd/spnfs_com.c @@ -14190,8 +14190,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfs +} +#endif /* CONFIG_PROC_FS */ diff -up 
linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c ---- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-31 20:42:05.571097807 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-31 20:42:05.572091128 -0400 +--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-09-04 09:21:44.926030099 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-09-04 09:21:44.926030099 -0400 @@ -0,0 +1,878 @@ +/* + * fs/nfsd/spnfs_ops.c @@ -15072,8 +15072,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfs + return 0; +} diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h ---- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-31 20:41:19.205016844 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-31 20:42:05.572091128 -0400 +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-09-04 09:20:06.048233523 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-09-04 09:21:44.927025219 -0400 @@ -242,6 +242,12 @@ struct nfs4_client { u32 cl_cb_seq_nr; struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ @@ -15190,8 +15190,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st + #endif /* NFSD4_STATE_H */ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c ---- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-31 20:41:17.275233561 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-31 20:42:05.573121119 -0400 +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-09-04 09:20:04.515160297 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-09-04 09:21:44.929025356 -0400 @@ -37,7 +37,12 @@ #ifdef CONFIG_NFSD_V4 #include @@ -15318,8 +15318,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs. 
out_nfserr: err = nfserrno(host_err); diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h ---- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-31 20:41:19.206170424 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-31 20:42:05.575139084 -0400 +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-09-04 09:20:06.049232898 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-09-04 09:21:44.930035442 -0400 @@ -37,6 +37,8 @@ #ifndef _LINUX_NFSD_XDR4_H #define _LINUX_NFSD_XDR4_H @@ -15396,8 +15396,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr struct nfs4_replay * replay; }; diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c ---- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-31 20:41:19.146161064 -0400 -+++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-31 20:42:05.515139585 -0400 +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-09-04 09:20:05.990223533 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-09-04 09:21:44.930035442 -0400 @@ -36,6 +36,7 @@ #include "internal.h" #include "iostat.h" @@ -15515,8 +15515,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file. 
if (!ret) return VM_FAULT_LOCKED; diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c ---- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-31 20:41:19.149170418 -0400 -+++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-31 20:42:05.516222809 -0400 +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-09-04 09:20:05.993222927 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-09-04 09:21:44.932035441 -0400 @@ -48,6 +48,7 @@ #include "internal.h" #include "fscache.h" @@ -15730,8 +15730,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inod nfs_fs_proc_exit(); nfsiod_stop(); diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h ---- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-31 20:41:19.149170418 -0400 -+++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-31 20:42:05.517099944 -0400 +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-09-04 09:20:05.993222927 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-09-04 09:21:44.933035332 -0400 @@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv struct nfs_fattr *); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); @@ -15792,7 +15792,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/i struct page *, struct page *); diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig --- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-31 20:42:05.500123860 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-09-04 09:21:44.933035332 -0400 @@ -79,10 +79,48 @@ config NFS_V4_1 depends on NFS_V4 && EXPERIMENTAL help @@ -15845,7 +15845,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kcon depends on NFS_FS=y && IP_PNP diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile --- linux-2.6.34.noarch/fs/nfs/Makefile.orig 
2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-31 20:42:05.501268752 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-09-04 09:21:44.934046035 -0400 @@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ @@ -15860,8 +15860,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Mak +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c ---- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-31 20:41:19.152180625 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-31 20:42:05.518232887 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-09-04 09:20:05.996242985 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-09-04 09:21:44.935035426 -0400 @@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, @@ -15871,8 +15871,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/n .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-31 20:42:05.519163219 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-31 20:42:05.520222923 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-09-04 09:21:44.936035595 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-09-04 09:21:44.936035595 -0400 @@ -0,0 +1,768 @@ +/* + * linux/fs/nfs/nfs4filelayout.c @@ -16643,8 +16643,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 
linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-31 20:42:05.521233147 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-31 20:42:05.521233147 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-09-04 09:21:44.937035580 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-09-04 09:21:44.937035580 -0400 @@ -0,0 +1,635 @@ +/* + * linux/fs/nfs/nfs4filelayoutdev.c @@ -17282,8 +17282,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch +} + diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-31 20:42:05.520222923 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-31 20:42:05.520222923 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-09-04 09:21:44.938035519 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-09-04 09:21:44.938035519 -0400 @@ -0,0 +1,96 @@ +/* + * pnfs_nfs4filelayout.h @@ -17382,8 +17382,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h ---- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-31 20:41:19.154160465 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-31 20:42:05.519163219 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-09-04 09:20:05.998222938 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-09-04 09:21:44.939035693 -0400 @@ -45,8 +45,28 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, @@ -17532,8 +17532,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nf /* nfs4xdr.c */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c ---- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-31 
20:41:19.157140145 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-31 20:42:05.524099925 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-09-04 09:20:06.001202714 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-09-04 09:21:44.942015100 -0400 @@ -49,12 +49,14 @@ #include #include @@ -19198,7 +19198,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/n .setattr = nfs4_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c --- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-31 20:42:05.526213255 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-09-04 09:21:44.944045456 -0400 @@ -54,17 +54,17 @@ void nfs4_renew_state(struct work_struct *work) @@ -19221,8 +19221,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c ---- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-31 20:41:19.158078621 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-31 20:42:05.527232994 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-09-04 09:20:06.002213222 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-09-04 09:21:44.945035417 -0400 @@ -48,11 +48,13 @@ #include #include @@ -19545,8 +19545,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/ test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c ---- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-31 20:41:19.160150207 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-31 20:42:05.530092192 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-09-04 
09:20:06.004212730 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-09-04 09:21:44.948015074 -0400 @@ -50,8 +50,10 @@ #include #include @@ -21056,8 +21056,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nf }; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild ---- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-31 20:42:05.532213157 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-31 20:42:05.532213157 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-09-04 09:21:44.950025182 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-09-04 09:21:44.950025182 -0400 @@ -0,0 +1,11 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module @@ -21071,8 +21071,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs +panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o +obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-31 20:42:05.533243491 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-31 20:42:05.534105468 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-09-04 09:21:44.951035482 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-09-04 09:21:44.951035482 -0400 @@ -0,0 +1,1087 @@ +/* + * objio_osd.c @@ -22162,8 +22162,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noar +module_init(objlayout_init); +module_exit(objlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-31 20:42:05.535059115 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-31 20:42:05.535059115 -0400 +--- 
linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-09-04 09:21:44.952035857 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-09-04 09:21:44.952035857 -0400 @@ -0,0 +1,790 @@ +/* + * objlayout.c @@ -22956,8 +22956,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noar + .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, +}; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-31 20:42:05.535059115 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-31 20:42:05.535059115 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-09-04 09:21:44.953025191 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-09-04 09:21:44.953025191 -0400 @@ -0,0 +1,171 @@ +/* + * objlayout.h @@ -23131,8 +23131,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noar + +#endif /* _OBJLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-31 20:42:05.536110535 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-31 20:42:05.536110535 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-09-04 09:21:44.954045432 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-09-04 09:21:44.954045432 -0400 @@ -0,0 +1,734 @@ +/* + * panfs_shim.c @@ -23869,8 +23869,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noa +module_init(panlayout_init); +module_exit(panlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-31 20:42:05.537124598 -0400 -+++ 
linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-31 20:42:05.537124598 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-09-04 09:21:44.955035904 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-09-04 09:21:44.955035904 -0400 @@ -0,0 +1,482 @@ +/* + * panfs_shim.h @@ -24355,8 +24355,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noa + +#endif /* _PANLAYOUT_PANFS_SHIM_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-31 20:42:05.538121971 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-31 20:42:05.538121971 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-09-04 09:21:44.956036011 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-09-04 09:21:44.956036011 -0400 @@ -0,0 +1,435 @@ +/* + * pnfs_osd_xdr.c @@ -24794,8 +24794,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6. 
+ return 0; +} diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c ---- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-31 20:41:19.162150222 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-31 20:42:05.539131687 -0400 +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-09-04 09:20:06.006202442 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-09-04 09:21:44.957035861 -0400 @@ -20,6 +20,7 @@ #include @@ -24918,8 +24918,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/p if (res == INT_MAX) goto out; diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c ---- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-31 20:42:05.541150301 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-31 20:42:05.541150301 -0400 +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-09-04 09:21:44.959025145 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-09-04 09:21:44.959025145 -0400 @@ -0,0 +1,2037 @@ +/* + * linux/fs/nfs/pnfs.c @@ -26959,8 +26959,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs. +} +EXPORT_SYMBOL(nfs4_put_deviceid_cache); diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h ---- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-31 20:42:05.542222767 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-31 20:42:05.542222767 -0400 +--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-09-04 09:21:44.960025819 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-09-04 09:21:44.960025819 -0400 @@ -0,0 +1,354 @@ +/* + * fs/nfs/pnfs.h @@ -27317,8 +27317,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs. 
+ +#endif /* FS_NFS_PNFS_H */ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c ---- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-31 20:41:19.163155499 -0400 -+++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-31 20:42:05.543103394 -0400 +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-09-04 09:20:06.007232858 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-09-04 09:21:44.961035556 -0400 @@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru fattr = nfs_alloc_fattr(); status = -ENOMEM; @@ -27346,8 +27346,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc. .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c ---- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-31 20:41:19.164160482 -0400 -+++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-31 20:42:05.544233042 -0400 +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-09-04 09:20:06.008232903 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-09-04 09:21:44.962035703 -0400 @@ -18,8 +18,12 @@ #include #include @@ -27562,8 +27562,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read. 
nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c ---- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-31 20:41:19.165170508 -0400 -+++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-31 20:42:05.545114737 -0400 +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-09-04 09:20:06.009232934 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-09-04 09:21:44.963035469 -0400 @@ -64,6 +64,7 @@ #include "iostat.h" #include "internal.h" @@ -27611,8 +27611,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/supe #endif diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c ---- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-31 20:41:19.166151095 -0400 -+++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-31 20:42:05.546131839 -0400 +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-09-04 09:20:06.010203248 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-09-04 09:21:44.964036069 -0400 @@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task struct nfs_unlinkdata *data = calldata; struct nfs_server *server = NFS_SERVER(data->dir); @@ -27623,8 +27623,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unl return; rpc_call_start(task); diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c ---- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-31 20:41:17.273213379 -0400 -+++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-31 20:42:05.548212682 -0400 +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-09-04 09:20:04.513160311 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-09-04 09:21:44.966025174 -0400 @@ -20,6 +20,7 @@ #include #include @@ -28313,7 +28313,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/writ int nfs_wb_page_cancel(struct inode *inode, struct page *page) diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig 
linux-2.6.34.noarch/include/linux/exportfs.h --- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-31 20:42:05.577222704 -0400 ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-09-04 09:21:44.967035352 -0400 @@ -2,6 +2,7 @@ #define LINUX_EXPORTFS_H 1 @@ -28386,8 +28386,8 @@ diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/i +#endif /* CONFIG_PNFSD */ #endif /* LINUX_EXPORTFS_H */ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h ---- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-31 20:42:05.576053304 -0400 -+++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-31 20:42:05.576053304 -0400 +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-09-04 09:21:44.969025737 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-09-04 09:21:44.969025737 -0400 @@ -0,0 +1,141 @@ +#ifndef _LINUX_EXP_XDR_H +#define _LINUX_EXP_XDR_H @@ -28531,8 +28531,8 @@ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/in +} +#endif /* _LINUX_EXP_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h ---- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-31 20:41:19.120034834 -0400 -+++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-31 20:42:05.579212604 -0400 +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-09-04 09:20:05.965243003 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-09-04 09:21:44.971015113 -0400 @@ -387,6 +387,7 @@ struct inodes_stat_t { #include @@ -28551,7 +28551,7 @@ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include struct dentry *s_root; diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h --- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ 
linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-31 20:42:05.581035627 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-09-04 09:21:44.973025301 -0400 @@ -17,7 +17,10 @@ #define NFS4_BITMAP_SIZE 2 @@ -28681,8 +28681,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/inclu #endif diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h ---- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-31 20:42:05.583087731 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-31 20:42:05.583087731 -0400 +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-09-04 09:21:44.974035325 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-09-04 09:21:44.974035325 -0400 @@ -0,0 +1,329 @@ +/* + * include/linux/nfs4_pnfs.h @@ -29014,8 +29014,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/ + +#endif /* LINUX_NFS4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h ---- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-31 20:42:05.596098115 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-31 20:42:05.596098115 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-09-04 09:21:44.976025566 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-09-04 09:21:44.976025566 -0400 @@ -0,0 +1,101 @@ +#ifndef NFSD4_BLOCK +#define NFSD4_BLOCK @@ -29119,8 +29119,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarc +#endif /* NFSD4_BLOCK */ + diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h ---- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-31 20:42:05.597097942 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-31 20:42:05.597097942 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 
2010-09-04 09:21:44.977035317 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-09-04 09:21:44.977035317 -0400 @@ -0,0 +1,345 @@ +/* + * include/linux/nfsd4_spnfs.h @@ -29469,7 +29469,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarc +#endif /* NFS_SPNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h --- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-31 20:42:05.591097762 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-09-04 09:21:44.978015841 -0400 @@ -29,6 +29,7 @@ #ifdef __KERNEL__ @@ -29480,7 +29480,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch * Largest number of bytes we need to allocate for an NFS diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h --- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-31 20:42:05.591097762 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-09-04 09:21:44.978015841 -0400 @@ -32,6 +32,8 @@ #define NFSDDBG_REPCACHE 0x0080 #define NFSDDBG_XDR 0x0100 @@ -29492,7 +29492,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h --- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-31 20:42:05.592118086 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-09-04 09:21:44.979055116 -0400 @@ -100,6 +100,7 @@ struct svc_export { uid_t ex_anon_uid; gid_t ex_anon_gid; @@ -29502,8 +29502,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 
linux-2.6.34.noarc struct nfsd4_fs_locations ex_fslocs; int ex_nflavors; diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-31 20:42:05.592118086 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-31 20:42:05.592118086 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-09-04 09:21:44.979055116 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-09-04 09:21:44.980035474 -0400 @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29638,8 +29638,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.3 + +#endif /* NFSD_NFS4LAYOUTXDR_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-31 20:42:05.593020723 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-31 20:42:05.593020723 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-09-04 09:21:44.980035474 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-09-04 09:21:44.980035474 -0400 @@ -0,0 +1,54 @@ +/****************************************************************************** + * @@ -29696,8 +29696,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34. 
+ +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-31 20:42:05.594107962 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-31 20:42:05.594107962 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-09-04 09:21:44.981055721 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-09-04 09:21:44.981055721 -0400 @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29972,7 +29972,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.n +#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h --- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-31 20:42:05.594107962 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-09-04 09:21:44.982035422 -0400 @@ -29,6 +29,7 @@ /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ @@ -30010,8 +30010,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noar union nfsctl_res { diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h ---- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-31 20:41:19.168160480 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-31 20:42:05.584098019 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-09-04 09:20:06.012232950 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-09-04 09:21:44.983045467 -0400 @@ -72,13 +72,20 @@ struct nfs_access_entry { int mask; }; @@ -30111,8 +30111,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 
linux-2.6.34.noarch/inc #ifdef __KERNEL__ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h ---- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-31 20:41:19.168160480 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-31 20:42:05.586087719 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-09-04 09:20:06.012232950 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-09-04 09:21:44.985025570 -0400 @@ -15,6 +15,7 @@ struct nlm_host; struct nfs4_sequence_args; struct nfs4_sequence_res; @@ -30187,7 +30187,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/ atomic_t active; /* Keep trace of any activity to this server */ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h --- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-31 20:42:05.587097913 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-09-04 09:21:44.986035288 -0400 @@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { NFSIOS_SHORTREAD, NFSIOS_SHORTWRITE, @@ -30200,7 +30200,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h --- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-31 20:42:05.588097898 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-09-04 09:21:44.987025532 -0400 @@ -39,6 +39,7 @@ struct nfs_page { struct list_head wb_list; /* Defines state of page: */ struct page *wb_page; /* page to read in/write out */ @@ -30249,8 +30249,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/i struct inode *inode, int (*doio)(struct inode *, 
struct list_head *, unsigned int, size_t, int), diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h ---- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-31 20:41:19.169171911 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-31 20:42:05.590087729 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-09-04 09:20:06.013233555 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-09-04 09:21:44.989035583 -0400 @@ -3,6 +3,8 @@ #include @@ -30528,8 +30528,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/in extern struct rpc_version nfs_version3; extern struct rpc_version nfs_version4; diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h ---- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-31 20:42:05.598087997 -0400 -+++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-31 20:42:05.599087710 -0400 +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-09-04 09:21:44.990025422 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-09-04 09:21:44.991025218 -0400 @@ -0,0 +1,57 @@ +#ifndef _PANFS_SHIM_API_H +#define _PANFS_SHIM_API_H @@ -30589,8 +30589,8 @@ diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.no + +#endif /* _PANFS_SHIM_API_H */ diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h ---- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-31 20:42:05.600025088 -0400 -+++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-31 20:42:05.600025088 -0400 +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-09-04 09:21:44.992035338 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-09-04 09:21:44.992035338 -0400 @@ -0,0 +1,439 @@ +/* + * pnfs_osd_xdr.h @@ -31033,7 +31033,7 @@ diff -up 
linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noar +#endif /* __PNFS_OSD_XDR_H__ */ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h --- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-31 20:42:05.601087875 -0400 ++++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-09-04 09:21:44.993025468 -0400 @@ -8,6 +8,7 @@ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H @@ -31044,7 +31044,7 @@ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/ #define ACL_UNDEFINED_ID (-1) diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h --- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-31 20:42:05.602100892 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-09-04 09:21:44.994025129 -0400 @@ -14,6 +14,8 @@ /* size of an XDR encoding unit in bytes, i.e. 
32bit */ #define XDR_UNIT (4) @@ -31056,7 +31056,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.n diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h --- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-31 20:42:05.603108001 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-09-04 09:21:44.995045529 -0400 @@ -3,6 +3,7 @@ #ifdef __KERNEL__ @@ -31077,8 +31077,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.3 struct rpc_pipe_ops { diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h ---- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-31 20:42:05.603108001 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-31 20:42:05.603108001 -0400 +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-09-04 09:21:44.995045529 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-09-04 09:21:44.995045529 -0400 @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2008 The Regents of the University of Michigan. 
@@ -31193,7 +31193,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux +#endif /* _SIMPLE_RPC_PIPEFS_H_ */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h --- linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-31 20:42:05.604049784 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-09-04 09:21:44.996061803 -0400 @@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con return buf; @@ -31237,8 +31237,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.n +} #endif /* SUNRPC_SVC_XPRT_H */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h ---- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-31 20:41:19.173118431 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-31 20:42:05.605107904 -0400 +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-09-04 09:20:06.017243774 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-09-04 09:21:44.997045653 -0400 @@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) return p + 2; } @@ -31261,14 +31261,9 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); -diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs ---- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-31 20:42:05.605107904 -0400 -+++ linux-2.6.34.noarch/localversion-pnfs 2010-08-31 20:42:05.605107904 -0400 -@@ -0,0 +1 @@ -+-pnfs diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile --- 
linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-31 20:42:05.606020148 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-09-04 09:21:44.998058968 -0400 @@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ @@ -31279,8 +31274,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/su sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c ---- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-31 20:42:05.606020148 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-31 20:42:05.607108065 -0400 +--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-09-04 09:21:44.999045582 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-09-04 09:21:44.999045582 -0400 @@ -0,0 +1,424 @@ +/* + * net/sunrpc/simple_rpc_pipefs.c @@ -31707,8 +31702,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.no +} +EXPORT_SYMBOL(pipefs_generic_destroy_msg); diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c ---- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-31 20:41:19.188144022 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-31 20:42:05.607108065 -0400 +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-09-04 09:20:06.031222775 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-09-04 09:21:45.000045387 -0400 @@ -395,24 +395,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf, { struct kvec *tail; From e13651e22a981ffcd6c27e1c6ab9c30704a69bbc Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Mon, 23 Aug 2010 12:20:57 -0400 Subject: [PATCH 11/20] Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 Signed-off-by: Steve Dickson --- 
config-generic | 12 + kernel.spec | 15 +- linux-2.6-pnfs-compile.patch | 13 + linux-2.6.35-inline.patch | 11 + nfs-35-fc.patch | 7235 ++++++ nfsd-35-fc.patch | 1808 ++ pnfs-all-2.6.35-2010-08-19-f13.patch | 31788 +++++++++++++++++++++++++ 7 files changed, 40880 insertions(+), 2 deletions(-) create mode 100644 linux-2.6-pnfs-compile.patch create mode 100644 linux-2.6.35-inline.patch create mode 100644 nfs-35-fc.patch create mode 100644 nfsd-35-fc.patch create mode 100644 pnfs-all-2.6.35-2010-08-19-f13.patch diff --git a/config-generic b/config-generic index a25e79f7e..898e7a3d7 100644 --- a/config-generic +++ b/config-generic @@ -3322,6 +3322,18 @@ CONFIG_NFSD_V3=y CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFS_FSCACHE=y +# Enable pNFS +CONFIG_PNFS=y +CONFIG_PNFSD=y +CONFIG_PNFSD_LOCAL_EXPORT=y +CONFIG_SPNFS=y +CONFIG_SPNFS_LAYOUTSEGMENTS=y +CONFIG_SPNFS_BLOCK=y +CONFIG_PNFS_OBJLAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_PANLAYOUT=m +CONFIG_PNFS_FILE_LAYOUT=m +# CONFIG_LOCKD=m CONFIG_LOCKD_V4=y CONFIG_EXPORTFS=m diff --git a/kernel.spec b/kernel.spec index e2e43696d..6e4442efc 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) 
# -# % define buildid .local +%define buildid .pnfs_all_2.6.35_2010_08_19 ################################################################### # The buildid can also be specified on the rpmbuild command line @@ -107,7 +107,7 @@ Summary: The Linux kernel # kernel-headers %define with_headers %{?_without_headers: 0} %{?!_without_headers: 1} # kernel-firmware -%define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 0} +%define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 1} # tools/perf %define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 1} # perf noarch subpkg @@ -816,6 +816,12 @@ Patch12570: sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bi Patch12580: xen-handle-events-as-edge-triggered.patch Patch12581: xen-use-percpu-interrupts-for-ipis-and-virqs.patch +Patch30000: nfs-35-fc.patch +Patch30001: nfsd-35-fc.patch +Patch30002: pnfs-all-2.6.35-2010-08-19-f13.patch +Patch30003: linux-2.6-pnfs-compile.patch +Patch30004: linux-2.6.35-inline.patch + %endif BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root @@ -1543,6 +1549,11 @@ ApplyPatch sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit ApplyPatch xen-handle-events-as-edge-triggered.patch ApplyPatch xen-use-percpu-interrupts-for-ipis-and-virqs.patch +ApplyPatch nfs-35-fc.patch +ApplyPatch nfsd-35-fc.patch +ApplyPatch pnfs-all-2.6.35-2010-08-19-f13.patch +ApplyPatch linux-2.6-pnfs-compile.patch +ApplyPatch linux-2.6.35-inline.patch # END OF PATCH APPLICATIONS %endif diff --git a/linux-2.6-pnfs-compile.patch b/linux-2.6-pnfs-compile.patch new file mode 100644 index 000000000..7c8cc4248 --- /dev/null +++ b/linux-2.6-pnfs-compile.patch @@ -0,0 +1,13 @@ +diff -up linux-2.6.32.x86_64/fs/nfs/objlayout/pnfs_osd_xdr.h.orig linux-2.6.32.x86_64/fs/nfs/objlayout/pnfs_osd_xdr.h +diff -up linux-2.6.32.x86_64/include/net/inet_connection_sock.h.orig linux-2.6.32.x86_64/include/net/inet_connection_sock.h +--- linux-2.6.32.x86_64/include/net/inet_connection_sock.h.orig 
2009-12-02 22:51:21.000000000 -0500 ++++ linux-2.6.32.x86_64/include/net/inet_connection_sock.h 2010-04-21 14:26:24.475659551 -0400 +@@ -23,7 +23,7 @@ + #include + #include + +-#define INET_CSK_DEBUG 1 ++//#define INET_CSK_DEBUG 1 + + /* Cancel timers, when they are not required. */ + #undef INET_CSK_CLEAR_TIMERS diff --git a/linux-2.6.35-inline.patch b/linux-2.6.35-inline.patch new file mode 100644 index 000000000..c56d8da5e --- /dev/null +++ b/linux-2.6.35-inline.patch @@ -0,0 +1,11 @@ +diff -up linux-2.6.34.noarch/arch/x86/Makefile.orig linux-2.6.34.noarch/arch/x86/Makefile +--- linux-2.6.34.noarch/arch/x86/Makefile.orig 2010-07-01 13:33:21.859627499 -0400 ++++ linux-2.6.34.noarch/arch/x86/Makefile 2010-07-01 13:36:26.751576450 -0400 +@@ -81,6 +81,7 @@ ifdef CONFIG_CC_STACKPROTECTOR + $(warning stack protector enabled but no compiler support) + endif + endif ++KBUILD_CFLAGS += -fno-inline-functions-called-once + + # Don't unroll struct assignments with kmemcheck enabled + ifeq ($(CONFIG_KMEMCHECK),y) diff --git a/nfs-35-fc.patch b/nfs-35-fc.patch new file mode 100644 index 000000000..c3ad25f65 --- /dev/null +++ b/nfs-35-fc.patch @@ -0,0 +1,7235 @@ +diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-23 11:01:00.352376393 -0400 +@@ -934,7 +934,6 @@ static int nfs_probe_fsinfo(struct nfs_s + } + + fsinfo.fattr = fattr; +- nfs_fattr_init(fattr); + error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); + if (error < 0) + goto out_error; +@@ -1047,13 +1046,18 @@ struct nfs_server *nfs_create_server(con + struct nfs_fh *mntfh) + { + struct nfs_server *server; +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + int error; + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto error; ++ + /* Get a client 
representation */ + error = nfs_init_server(server, data); + if (error < 0) +@@ -1064,7 +1068,7 @@ struct nfs_server *nfs_create_server(con + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* Probe the root fh to retrieve its FSID */ +- error = nfs_probe_fsinfo(server, mntfh, &fattr); ++ error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + if (server->nfs_client->rpc_ops->version == 3) { +@@ -1077,14 +1081,14 @@ struct nfs_server *nfs_create_server(con + server->namelen = NFS2_MAXNAMLEN; + } + +- if (!(fattr.valid & NFS_ATTR_FATTR)) { +- error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); ++ if (!(fattr->valid & NFS_ATTR_FATTR)) { ++ error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_create_server: getattr error = %d\n", -error); + goto error; + } + } +- memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); ++ memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, +@@ -1096,9 +1100,11 @@ struct nfs_server *nfs_create_server(con + spin_unlock(&nfs_client_lock); + + server->mount_time = jiffies; ++ nfs_free_fattr(fattr); + return server; + + error: ++ nfs_free_fattr(fattr); + nfs_free_server(server); + return ERR_PTR(error); + } +@@ -1340,7 +1346,7 @@ error: + struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh) + { +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + struct nfs_server *server; + int error; + +@@ -1350,6 +1356,11 @@ struct nfs_server *nfs4_create_server(co + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto error; ++ + /* set up the general RPC client */ + error = nfs4_init_server(server, data); + if (error < 0) +@@ -1364,7 +1375,7 @@ struct nfs_server *nfs4_create_server(co + goto error; + + /* Probe the root fh to retrieve its 
FSID */ +- error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); ++ error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto error; + +@@ -1375,7 +1386,7 @@ struct nfs_server *nfs4_create_server(co + + nfs4_session_set_rwsize(server); + +- error = nfs_probe_fsinfo(server, mntfh, &fattr); ++ error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + +@@ -1389,9 +1400,11 @@ struct nfs_server *nfs4_create_server(co + + server->mount_time = jiffies; + dprintk("<-- nfs4_create_server() = %p\n", server); ++ nfs_free_fattr(fattr); + return server; + + error: ++ nfs_free_fattr(fattr); + nfs_free_server(server); + dprintk("<-- nfs4_create_server() = error %d\n", error); + return ERR_PTR(error); +@@ -1405,7 +1418,7 @@ struct nfs_server *nfs4_create_referral_ + { + struct nfs_client *parent_client; + struct nfs_server *server, *parent_server; +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + int error; + + dprintk("--> nfs4_create_referral_server()\n"); +@@ -1414,6 +1427,11 @@ struct nfs_server *nfs4_create_referral_ + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto error; ++ + parent_server = NFS_SB(data->sb); + parent_client = parent_server->nfs_client; + +@@ -1443,12 +1461,12 @@ struct nfs_server *nfs4_create_referral_ + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* Probe the root fh to retrieve its FSID and filehandle */ +- error = nfs4_path_walk(server, mntfh, data->mnt_path); ++ error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto error; + + /* probe the filesystem info for this server filesystem */ +- error = nfs_probe_fsinfo(server, mntfh, &fattr); ++ error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + +@@ -1466,10 +1484,12 @@ struct nfs_server *nfs4_create_referral_ + + server->mount_time = jiffies; + ++ nfs_free_fattr(fattr); + dprintk("<-- nfs_create_referral_server() = %p\n", server); + 
return server; + + error: ++ nfs_free_fattr(fattr); + nfs_free_server(server); + dprintk("<-- nfs4_create_referral_server() = error %d\n", error); + return ERR_PTR(error); +@@ -1485,7 +1505,7 @@ struct nfs_server *nfs_clone_server(stru + struct nfs_fattr *fattr) + { + struct nfs_server *server; +- struct nfs_fattr fattr_fsinfo; ++ struct nfs_fattr *fattr_fsinfo; + int error; + + dprintk("--> nfs_clone_server(,%llx:%llx,)\n", +@@ -1496,6 +1516,11 @@ struct nfs_server *nfs_clone_server(stru + if (!server) + return ERR_PTR(-ENOMEM); + ++ error = -ENOMEM; ++ fattr_fsinfo = nfs_alloc_fattr(); ++ if (fattr_fsinfo == NULL) ++ goto out_free_server; ++ + /* Copy data from the source */ + server->nfs_client = source->nfs_client; + atomic_inc(&server->nfs_client->cl_count); +@@ -1512,7 +1537,7 @@ struct nfs_server *nfs_clone_server(stru + nfs_init_server_aclclient(server); + + /* probe the filesystem info for this server filesystem */ +- error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo); ++ error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); + if (error < 0) + goto out_free_server; + +@@ -1534,10 +1559,12 @@ struct nfs_server *nfs_clone_server(stru + + server->mount_time = jiffies; + ++ nfs_free_fattr(fattr_fsinfo); + dprintk("<-- nfs_clone_server() = %p\n", server); + return server; + + out_free_server: ++ nfs_free_fattr(fattr_fsinfo); + nfs_free_server(server); + dprintk("<-- nfs_clone_server() = error %d\n", error); + return ERR_PTR(error); +diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-23 11:01:00.352376393 -0400 +@@ -213,7 +213,7 @@ int nfs_inode_set_delegation(struct inod + struct nfs_delegation *freeme = NULL; + int status = 0; + +- delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); ++ delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return 
-ENOMEM; + memcpy(delegation->stateid.data, res->delegation.data, +diff -up linux-2.6.34.noarch/fs/nfs/dir.c.orig linux-2.6.34.noarch/fs/nfs/dir.c +--- linux-2.6.34.noarch/fs/nfs/dir.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/dir.c 2010-08-23 11:01:00.353376419 -0400 +@@ -530,9 +530,7 @@ static int nfs_readdir(struct file *filp + nfs_readdir_descriptor_t my_desc, + *desc = &my_desc; + struct nfs_entry my_entry; +- struct nfs_fh fh; +- struct nfs_fattr fattr; +- long res; ++ int res = -ENOMEM; + + dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", + dentry->d_parent->d_name.name, dentry->d_name.name, +@@ -554,9 +552,11 @@ static int nfs_readdir(struct file *filp + + my_entry.cookie = my_entry.prev_cookie = 0; + my_entry.eof = 0; +- my_entry.fh = &fh; +- my_entry.fattr = &fattr; +- nfs_fattr_init(&fattr); ++ my_entry.fh = nfs_alloc_fhandle(); ++ my_entry.fattr = nfs_alloc_fattr(); ++ if (my_entry.fh == NULL || my_entry.fattr == NULL) ++ goto out_alloc_failed; ++ + desc->entry = &my_entry; + + nfs_block_sillyrename(dentry); +@@ -598,7 +598,10 @@ out: + nfs_unblock_sillyrename(dentry); + if (res > 0) + res = 0; +- dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", ++out_alloc_failed: ++ nfs_free_fattr(my_entry.fattr); ++ nfs_free_fhandle(my_entry.fh); ++ dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + res); + return res; +@@ -776,9 +779,9 @@ static int nfs_lookup_revalidate(struct + struct inode *dir; + struct inode *inode; + struct dentry *parent; ++ struct nfs_fh *fhandle = NULL; ++ struct nfs_fattr *fattr = NULL; + int error; +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; + + parent = dget_parent(dentry); + dir = parent->d_inode; +@@ -811,14 +814,22 @@ static int nfs_lookup_revalidate(struct + if (NFS_STALE(inode)) + goto out_bad; + +- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); ++ error = -ENOMEM; ++ fhandle = 
nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ if (fhandle == NULL || fattr == NULL) ++ goto out_error; ++ ++ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + if (error) + goto out_bad; +- if (nfs_compare_fh(NFS_FH(inode), &fhandle)) ++ if (nfs_compare_fh(NFS_FH(inode), fhandle)) + goto out_bad; +- if ((error = nfs_refresh_inode(inode, &fattr)) != 0) ++ if ((error = nfs_refresh_inode(inode, fattr)) != 0) + goto out_bad; + ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); + out_set_verifier: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + out_valid: +@@ -842,11 +853,21 @@ out_zap_parent: + shrink_dcache_parent(dentry); + } + d_drop(dentry); ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + return 0; ++out_error: ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); ++ dput(parent); ++ dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", ++ __func__, dentry->d_parent->d_name.name, ++ dentry->d_name.name, error); ++ return error; + } + + /* +@@ -911,9 +932,9 @@ static struct dentry *nfs_lookup(struct + struct dentry *res; + struct dentry *parent; + struct inode *inode = NULL; ++ struct nfs_fh *fhandle = NULL; ++ struct nfs_fattr *fattr = NULL; + int error; +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; + + dfprintk(VFS, "NFS: lookup(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); +@@ -923,7 +944,6 @@ static struct dentry *nfs_lookup(struct + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + goto out; + +- res = ERR_PTR(-ENOMEM); + dentry->d_op = NFS_PROTO(dir)->dentry_ops; + + /* +@@ -936,17 +956,23 @@ static struct dentry *nfs_lookup(struct + goto out; + } + ++ res = ERR_PTR(-ENOMEM); ++ fhandle = nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ if (fhandle == NULL || fattr == NULL) ++ goto out; ++ + parent = dentry->d_parent; 
+ /* Protect against concurrent sillydeletes */ + nfs_block_sillyrename(parent); +- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); ++ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + if (error == -ENOENT) + goto no_entry; + if (error < 0) { + res = ERR_PTR(error); + goto out_unblock_sillyrename; + } +- inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); ++ inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + res = (struct dentry *)inode; + if (IS_ERR(res)) + goto out_unblock_sillyrename; +@@ -962,6 +988,8 @@ no_entry: + out_unblock_sillyrename: + nfs_unblock_sillyrename(parent); + out: ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); + return res; + } + +@@ -1669,28 +1697,33 @@ static void nfs_access_free_entry(struct + smp_mb__after_atomic_dec(); + } + ++static void nfs_access_free_list(struct list_head *head) ++{ ++ struct nfs_access_entry *cache; ++ ++ while (!list_empty(head)) { ++ cache = list_entry(head->next, struct nfs_access_entry, lru); ++ list_del(&cache->lru); ++ nfs_access_free_entry(cache); ++ } ++} ++ + int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) + { + LIST_HEAD(head); + struct nfs_inode *nfsi; + struct nfs_access_entry *cache; + +-restart: ++ if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) ++ return (nr_to_scan == 0) ? 
0 : -1; ++ + spin_lock(&nfs_access_lru_lock); + list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { +- struct rw_semaphore *s_umount; + struct inode *inode; + + if (nr_to_scan-- == 0) + break; +- s_umount = &nfsi->vfs_inode.i_sb->s_umount; +- if (!down_read_trylock(s_umount)) +- continue; +- inode = igrab(&nfsi->vfs_inode); +- if (inode == NULL) { +- up_read(s_umount); +- continue; +- } ++ inode = &nfsi->vfs_inode; + spin_lock(&inode->i_lock); + if (list_empty(&nfsi->access_cache_entry_lru)) + goto remove_lru_entry; +@@ -1704,61 +1737,47 @@ restart: + else { + remove_lru_entry: + list_del_init(&nfsi->access_cache_inode_lru); ++ smp_mb__before_clear_bit(); + clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); ++ smp_mb__after_clear_bit(); + } +- spin_unlock(&inode->i_lock); +- spin_unlock(&nfs_access_lru_lock); +- iput(inode); +- up_read(s_umount); +- goto restart; + } + spin_unlock(&nfs_access_lru_lock); +- while (!list_empty(&head)) { +- cache = list_entry(head.next, struct nfs_access_entry, lru); +- list_del(&cache->lru); +- nfs_access_free_entry(cache); +- } ++ nfs_access_free_list(&head); + return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; + } + +-static void __nfs_access_zap_cache(struct inode *inode) ++static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) + { +- struct nfs_inode *nfsi = NFS_I(inode); + struct rb_root *root_node = &nfsi->access_cache; +- struct rb_node *n, *dispose = NULL; ++ struct rb_node *n; + struct nfs_access_entry *entry; + + /* Unhook entries from the cache */ + while ((n = rb_first(root_node)) != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + rb_erase(n, root_node); +- list_del(&entry->lru); +- n->rb_left = dispose; +- dispose = n; ++ list_move(&entry->lru, head); + } + nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; +- spin_unlock(&inode->i_lock); +- +- /* Now kill them all! 
*/ +- while (dispose != NULL) { +- n = dispose; +- dispose = n->rb_left; +- nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node)); +- } + } + + void nfs_access_zap_cache(struct inode *inode) + { ++ LIST_HEAD(head); ++ ++ if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) ++ return; + /* Remove from global LRU init */ +- if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { +- spin_lock(&nfs_access_lru_lock); ++ spin_lock(&nfs_access_lru_lock); ++ if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_del_init(&NFS_I(inode)->access_cache_inode_lru); +- spin_unlock(&nfs_access_lru_lock); +- } + + spin_lock(&inode->i_lock); +- /* This will release the spinlock */ +- __nfs_access_zap_cache(inode); ++ __nfs_access_zap_cache(NFS_I(inode), &head); ++ spin_unlock(&inode->i_lock); ++ spin_unlock(&nfs_access_lru_lock); ++ nfs_access_free_list(&head); + } + + static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) +@@ -1809,8 +1828,8 @@ out_stale: + nfs_access_free_entry(cache); + return -ENOENT; + out_zap: +- /* This will release the spinlock */ +- __nfs_access_zap_cache(inode); ++ spin_unlock(&inode->i_lock); ++ nfs_access_zap_cache(inode); + return -ENOENT; + } + +@@ -1865,9 +1884,11 @@ static void nfs_access_add_cache(struct + smp_mb__after_atomic_inc(); + + /* Add inode to global LRU list */ +- if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { ++ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { + spin_lock(&nfs_access_lru_lock); +- list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); ++ if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) ++ list_add_tail(&NFS_I(inode)->access_cache_inode_lru, ++ &nfs_access_lru_list); + spin_unlock(&nfs_access_lru_lock); + } + } +diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-23 
11:00:23.790502081 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-23 11:01:00.354376416 -0400 +@@ -162,14 +162,17 @@ static int nfs_revalidate_file_size(stru + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + +- if (server->flags & NFS_MOUNT_NOAC) +- goto force_reval; ++ if (nfs_have_delegated_attributes(inode)) ++ goto out_noreval; ++ + if (filp->f_flags & O_DIRECT) + goto force_reval; +- if (nfsi->npages != 0) +- return 0; +- if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) +- return 0; ++ if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) ++ goto force_reval; ++ if (nfs_attribute_timeout(inode)) ++ goto force_reval; ++out_noreval: ++ return 0; + force_reval: + return __nfs_revalidate_inode(server, inode); + } +diff -up linux-2.6.34.noarch/fs/nfs/fscache.c.orig linux-2.6.34.noarch/fs/nfs/fscache.c +--- linux-2.6.34.noarch/fs/nfs/fscache.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/fscache.c 2010-08-23 11:01:00.355376416 -0400 +@@ -467,7 +467,8 @@ int __nfs_readpages_from_fscache(struct + struct list_head *pages, + unsigned *nr_pages) + { +- int ret, npages = *nr_pages; ++ unsigned npages = *nr_pages; ++ int ret; + + dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", + NFS_I(inode)->fscache, npages, inode); +diff -up linux-2.6.34.noarch/fs/nfs/getroot.c.orig linux-2.6.34.noarch/fs/nfs/getroot.c +--- linux-2.6.34.noarch/fs/nfs/getroot.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/getroot.c 2010-08-23 11:01:00.356376417 -0400 +@@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super + { + struct nfs_server *server = NFS_SB(sb); + struct nfs_fsinfo fsinfo; +- struct nfs_fattr fattr; +- struct dentry *mntroot; ++ struct dentry *ret; + struct inode *inode; + int error; + + /* get the actual root for this mount */ +- fsinfo.fattr = &fattr; ++ fsinfo.fattr = nfs_alloc_fattr(); ++ if (fsinfo.fattr == NULL) 
++ return ERR_PTR(-ENOMEM); + + error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); +- return ERR_PTR(error); ++ ret = ERR_PTR(error); ++ goto out; + } + + inode = nfs_fhget(sb, mntfh, fsinfo.fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); +- return ERR_CAST(inode); ++ ret = ERR_CAST(inode); ++ goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); +- if (error != 0) +- return ERR_PTR(error); ++ if (error != 0) { ++ ret = ERR_PTR(error); ++ goto out; ++ } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ +- mntroot = d_obtain_alias(inode); +- if (IS_ERR(mntroot)) { ++ ret = d_obtain_alias(inode); ++ if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); +- return mntroot; ++ goto out; + } + +- security_d_instantiate(mntroot, inode); +- +- if (!mntroot->d_op) +- mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; ++ security_d_instantiate(ret, inode); + +- return mntroot; ++ if (ret->d_op == NULL) ++ ret->d_op = server->nfs_client->rpc_ops->dentry_ops; ++out: ++ nfs_free_fattr(fsinfo.fattr); ++ return ret; + } + + #ifdef CONFIG_NFS_V4 + +-/* +- * Do a simple pathwalk from the root FH of the server to the nominated target +- * of the mountpoint +- * - give error on symlinks +- * - give error on ".." 
occurring in the path +- * - follow traversals +- */ +-int nfs4_path_walk(struct nfs_server *server, +- struct nfs_fh *mntfh, +- const char *path) ++int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) + { + struct nfs_fsinfo fsinfo; +- struct nfs_fattr fattr; +- struct nfs_fh lastfh; +- struct qstr name; +- int ret; +- +- dprintk("--> nfs4_path_walk(,,%s)\n", path); +- +- fsinfo.fattr = &fattr; +- nfs_fattr_init(&fattr); +- +- /* Eat leading slashes */ +- while (*path == '/') +- path++; ++ int ret = -ENOMEM; ++ ++ dprintk("--> nfs4_get_rootfh()\n"); ++ ++ fsinfo.fattr = nfs_alloc_fattr(); ++ if (fsinfo.fattr == NULL) ++ goto out; + + /* Start by getting the root filehandle from the server */ + ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (ret < 0) { +- dprintk("nfs4_get_root: getroot error = %d\n", -ret); +- return ret; ++ dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); ++ goto out; + } + +- if (!S_ISDIR(fattr.mode)) { +- printk(KERN_ERR "nfs4_get_root:" ++ if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE) ++ || !S_ISDIR(fsinfo.fattr->mode)) { ++ printk(KERN_ERR "nfs4_get_rootfh:" + " getroot encountered non-directory\n"); +- return -ENOTDIR; ++ ret = -ENOTDIR; ++ goto out; + } + +- /* FIXME: It is quite valid for the server to return a referral here */ +- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { +- printk(KERN_ERR "nfs4_get_root:" ++ if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { ++ printk(KERN_ERR "nfs4_get_rootfh:" + " getroot obtained referral\n"); +- return -EREMOTE; ++ ret = -EREMOTE; ++ goto out; + } + +-next_component: +- dprintk("Next: %s\n", path); +- +- /* extract the next bit of the path */ +- if (!*path) +- goto path_walk_complete; +- +- name.name = path; +- while (*path && *path != '/') +- path++; +- name.len = path - (const char *) name.name; +- +- if (name.len > NFS4_MAXNAMLEN) +- return -ENAMETOOLONG; +- +-eat_dot_dir: +- while (*path == '/') +- path++; +- +- if (path[0] == '.' 
&& (path[1] == '/' || !path[1])) { +- path += 2; +- goto eat_dot_dir; +- } +- +- /* FIXME: Why shouldn't the user be able to use ".." in the path? */ +- if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2]) +- ) { +- printk(KERN_ERR "nfs4_get_root:" +- " Mount path contains reference to \"..\"\n"); +- return -EINVAL; +- } +- +- /* lookup the next FH in the sequence */ +- memcpy(&lastfh, mntfh, sizeof(lastfh)); +- +- dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); +- +- ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name, +- mntfh, &fattr); +- if (ret < 0) { +- dprintk("nfs4_get_root: getroot error = %d\n", -ret); +- return ret; +- } +- +- if (!S_ISDIR(fattr.mode)) { +- printk(KERN_ERR "nfs4_get_root:" +- " lookupfh encountered non-directory\n"); +- return -ENOTDIR; +- } +- +- /* FIXME: Referrals are quite valid here too */ +- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { +- printk(KERN_ERR "nfs4_get_root:" +- " lookupfh obtained referral\n"); +- return -EREMOTE; +- } +- +- goto next_component; +- +-path_walk_complete: +- memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); +- dprintk("<-- nfs4_path_walk() = 0\n"); +- return 0; ++ memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); ++out: ++ nfs_free_fattr(fsinfo.fattr); ++ dprintk("<-- nfs4_get_rootfh() = %d\n", ret); ++ return ret; + } + + /* +@@ -239,8 +174,8 @@ path_walk_complete: + struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) + { + struct nfs_server *server = NFS_SB(sb); +- struct nfs_fattr fattr; +- struct dentry *mntroot; ++ struct nfs_fattr *fattr = NULL; ++ struct dentry *ret; + struct inode *inode; + int error; + +@@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct supe + return ERR_PTR(error); + } + ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ return ERR_PTR(-ENOMEM);; ++ + /* get the actual root for this mount */ +- error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 
++ error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); +- return ERR_PTR(error); ++ ret = ERR_PTR(error); ++ goto out; + } + +- inode = nfs_fhget(sb, mntfh, &fattr); ++ inode = nfs_fhget(sb, mntfh, fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); +- return ERR_CAST(inode); ++ ret = ERR_CAST(inode); ++ goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); +- if (error != 0) +- return ERR_PTR(error); ++ if (error != 0) { ++ ret = ERR_PTR(error); ++ goto out; ++ } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ +- mntroot = d_obtain_alias(inode); +- if (IS_ERR(mntroot)) { ++ ret = d_obtain_alias(inode); ++ if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); +- return mntroot; ++ goto out; + } + +- security_d_instantiate(mntroot, inode); ++ security_d_instantiate(ret, inode); + +- if (!mntroot->d_op) +- mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; ++ if (ret->d_op == NULL) ++ ret->d_op = server->nfs_client->rpc_ops->dentry_ops; + ++out: ++ nfs_free_fattr(fattr); + dprintk("<-- nfs4_get_root()\n"); +- return mntroot; ++ return ret; + } + + #endif /* CONFIG_NFS_V4 */ +diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-23 11:01:00.357376378 -0400 +@@ -393,8 +393,8 @@ int + nfs_setattr(struct dentry *dentry, struct iattr *attr) + { + struct inode *inode = dentry->d_inode; +- struct nfs_fattr fattr; +- int error; ++ struct nfs_fattr *fattr; ++ int error = -ENOMEM; + + nfs_inc_stats(inode, NFSIOS_VFSSETATTR); + +@@ -417,14 +417,20 @@ nfs_setattr(struct dentry *dentry, struc + 
filemap_write_and_wait(inode->i_mapping); + nfs_wb_all(inode); + } ++ ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out; + /* + * Return any delegations if we're going to change ACLs + */ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + nfs_inode_return_delegation(inode); +- error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); ++ error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); + if (error == 0) +- nfs_refresh_inode(inode, &fattr); ++ nfs_refresh_inode(inode, fattr); ++ nfs_free_fattr(fattr); ++out: + return error; + } + +@@ -682,7 +688,7 @@ int + __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) + { + int status = -ESTALE; +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr = NULL; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", +@@ -693,8 +699,13 @@ __nfs_revalidate_inode(struct nfs_server + if (NFS_STALE(inode)) + goto out; + ++ status = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out; ++ + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); +- status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); ++ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); + if (status != 0) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + inode->i_sb->s_id, +@@ -707,7 +718,7 @@ __nfs_revalidate_inode(struct nfs_server + goto out; + } + +- status = nfs_refresh_inode(inode, &fattr); ++ status = nfs_refresh_inode(inode, fattr); + if (status) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", + inode->i_sb->s_id, +@@ -723,6 +734,7 @@ __nfs_revalidate_inode(struct nfs_server + (long long)NFS_FILEID(inode)); + + out: ++ nfs_free_fattr(fattr); + return status; + } + +@@ -730,9 +742,14 @@ int nfs_attribute_timeout(struct inode * + { + struct nfs_inode *nfsi = NFS_I(inode); + ++ return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + 
nfsi->attrtimeo); ++} ++ ++static int nfs_attribute_cache_expired(struct inode *inode) ++{ + if (nfs_have_delegated_attributes(inode)) + return 0; +- return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); ++ return nfs_attribute_timeout(inode); + } + + /** +@@ -745,7 +762,7 @@ int nfs_attribute_timeout(struct inode * + int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) + { + if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) +- && !nfs_attribute_timeout(inode)) ++ && !nfs_attribute_cache_expired(inode)) + return NFS_STALE(inode) ? -ESTALE : 0; + return __nfs_revalidate_inode(server, inode); + } +@@ -782,7 +799,8 @@ int nfs_revalidate_mapping(struct inode + int ret = 0; + + if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) +- || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { ++ || nfs_attribute_cache_expired(inode) ++ || NFS_STALE(inode)) { + ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret < 0) + goto out; +@@ -916,6 +934,26 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->gencount = nfs_inc_attr_generation_counter(); + } + ++struct nfs_fattr *nfs_alloc_fattr(void) ++{ ++ struct nfs_fattr *fattr; ++ ++ fattr = kmalloc(sizeof(*fattr), GFP_NOFS); ++ if (fattr != NULL) ++ nfs_fattr_init(fattr); ++ return fattr; ++} ++ ++struct nfs_fh *nfs_alloc_fhandle(void) ++{ ++ struct nfs_fh *fh; ++ ++ fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); ++ if (fh != NULL) ++ fh->size = 0; ++ return fh; ++} ++ + /** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode +diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-23 11:01:00.358564151 -0400 +@@ -244,9 +244,7 @@ extern struct dentry *nfs_get_root(struc + #ifdef CONFIG_NFS_V4 + extern struct dentry 
*nfs4_get_root(struct super_block *, struct nfs_fh *); + +-extern int nfs4_path_walk(struct nfs_server *server, +- struct nfs_fh *mntfh, +- const char *path); ++extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); + #endif + + /* read.c */ +diff -up linux-2.6.34.noarch/fs/nfs/iostat.h.orig linux-2.6.34.noarch/fs/nfs/iostat.h +--- linux-2.6.34.noarch/fs/nfs/iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/iostat.h 2010-08-23 11:01:00.358564151 -0400 +@@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const s + + static inline void nfs_add_server_stats(const struct nfs_server *server, + enum nfs_stat_bytecounters stat, +- unsigned long addend) ++ long addend) + { + this_cpu_add(server->io_stats->bytes[stat], addend); + } + + static inline void nfs_add_stats(const struct inode *inode, + enum nfs_stat_bytecounters stat, +- unsigned long addend) ++ long addend) + { + nfs_add_server_stats(NFS_SERVER(inode), stat, addend); + } +@@ -51,7 +51,7 @@ static inline void nfs_add_stats(const s + #ifdef CONFIG_NFS_FSCACHE + static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, +- unsigned long addend) ++ long addend) + { + this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); + } +diff -up linux-2.6.34.noarch/fs/nfs/namespace.c.orig linux-2.6.34.noarch/fs/nfs/namespace.c +--- linux-2.6.34.noarch/fs/nfs/namespace.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/namespace.c 2010-08-23 11:01:00.359420147 -0400 +@@ -105,8 +105,8 @@ static void * nfs_follow_mountpoint(stru + struct vfsmount *mnt; + struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct dentry *parent; +- struct nfs_fh fh; +- struct nfs_fattr fattr; ++ struct nfs_fh *fh = NULL; ++ struct nfs_fattr *fattr = NULL; + int err; + + dprintk("--> nfs_follow_mountpoint()\n"); +@@ -115,6 +115,12 @@ static void * nfs_follow_mountpoint(stru + if (IS_ROOT(dentry)) + goto out_err; 
+ ++ err = -ENOMEM; ++ fh = nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ if (fh == NULL || fattr == NULL) ++ goto out_err; ++ + dprintk("%s: enter\n", __func__); + dput(nd->path.dentry); + nd->path.dentry = dget(dentry); +@@ -123,16 +129,16 @@ static void * nfs_follow_mountpoint(stru + parent = dget_parent(nd->path.dentry); + err = server->nfs_client->rpc_ops->lookup(parent->d_inode, + &nd->path.dentry->d_name, +- &fh, &fattr); ++ fh, fattr); + dput(parent); + if (err != 0) + goto out_err; + +- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) ++ if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); + else +- mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh, +- &fattr); ++ mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, ++ fattr); + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) + goto out_err; +@@ -151,6 +157,8 @@ static void * nfs_follow_mountpoint(stru + nd->path.dentry = dget(mnt->mnt_root); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + out: ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fh); + dprintk("%s: done, returned %d\n", __func__, err); + + dprintk("<-- nfs_follow_mountpoint() = %d\n", err); +diff -up linux-2.6.34.noarch/fs/nfs/nfs3acl.c.orig linux-2.6.34.noarch/fs/nfs/nfs3acl.c +--- linux-2.6.34.noarch/fs/nfs/nfs3acl.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3acl.c 2010-08-23 11:01:00.359420147 -0400 +@@ -185,7 +185,6 @@ static void nfs3_cache_acls(struct inode + struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) + { + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs_fattr fattr; + struct page *pages[NFSACL_MAXPAGES] = { }; + struct nfs3_getaclargs args = { + .fh = NFS_FH(inode), +@@ -193,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struc + .pages = pages, + }; + struct nfs3_getaclres res = { +- .fattr = &fattr, ++ 0 + }; + struct rpc_message msg = { + .rpc_argp = &args, +@@ -228,7 +227,10 @@ 
struct posix_acl *nfs3_proc_getacl(struc + + dprintk("NFS call getacl\n"); + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; +- nfs_fattr_init(&fattr); ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ return ERR_PTR(-ENOMEM); ++ + status = rpc_call_sync(server->client_acl, &msg, 0); + dprintk("NFS reply getacl: %d\n", status); + +@@ -238,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struc + + switch (status) { + case 0: +- status = nfs_refresh_inode(inode, &fattr); ++ status = nfs_refresh_inode(inode, res.fattr); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: +@@ -278,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struc + getout: + posix_acl_release(res.acl_access); + posix_acl_release(res.acl_default); ++ nfs_free_fattr(res.fattr); + + if (status != 0) { + posix_acl_release(acl); +@@ -290,7 +293,7 @@ static int nfs3_proc_setacls(struct inod + struct posix_acl *dfacl) + { + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + struct page *pages[NFSACL_MAXPAGES]; + struct nfs3_setaclargs args = { + .inode = inode, +@@ -335,8 +338,13 @@ static int nfs3_proc_setacls(struct inod + } + + dprintk("NFS call setacl\n"); ++ status = -ENOMEM; ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out_freepages; ++ + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; +- nfs_fattr_init(&fattr); ++ msg.rpc_resp = fattr; + status = rpc_call_sync(server->client_acl, &msg, 0); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); +@@ -344,7 +352,7 @@ static int nfs3_proc_setacls(struct inod + + switch (status) { + case 0: +- status = nfs_refresh_inode(inode, &fattr); ++ status = nfs_refresh_inode(inode, fattr); + nfs3_cache_acls(inode, acl, dfacl); + break; + case -EPFNOSUPPORT: +@@ -355,6 +363,7 @@ static int nfs3_proc_setacls(struct inod + case -ENOTSUPP: + status = -EOPNOTSUPP; + } ++ nfs_free_fattr(fattr); + out_freepages: + while (args.npages != 0) { + 
args.npages--; +diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-23 11:01:00.360574301 -0400 +@@ -144,14 +144,12 @@ static int + nfs3_proc_lookup(struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) + { +- struct nfs_fattr dir_attr; + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct nfs3_diropres res = { +- .dir_attr = &dir_attr, + .fh = fhandle, + .fattr = fattr + }; +@@ -163,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, stru + int status; + + dprintk("NFS call lookup %s\n", name->name); +- nfs_fattr_init(&dir_attr); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ return -ENOMEM; ++ + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); +- nfs_refresh_inode(dir, &dir_attr); ++ nfs_refresh_inode(dir, res.dir_attr); + if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_argp = fhandle; + msg.rpc_resp = fattr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } ++ nfs_free_fattr(res.dir_attr); + dprintk("NFS reply lookup: %d\n", status); + return status; + } + + static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) + { +- struct nfs_fattr fattr; + struct nfs3_accessargs arg = { + .fh = NFS_FH(inode), + }; +- struct nfs3_accessres res = { +- .fattr = &fattr, +- }; ++ struct nfs3_accessres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], + .rpc_argp = &arg, +@@ -193,7 +192,7 @@ static int nfs3_proc_access(struct inode + .rpc_cred = entry->cred, + }; + int mode = entry->mask; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call access\n"); + +@@ -210,9 +209,13 @@ static int nfs3_proc_access(struct inode + if (mode & 
MAY_EXEC) + arg.access |= NFS3_ACCESS_EXECUTE; + } +- nfs_fattr_init(&fattr); ++ ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); +- nfs_refresh_inode(inode, &fattr); ++ nfs_refresh_inode(inode, res.fattr); + if (status == 0) { + entry->mask = 0; + if (res.access & NFS3_ACCESS_READ) +@@ -222,6 +225,8 @@ static int nfs3_proc_access(struct inode + if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; + } ++ nfs_free_fattr(res.fattr); ++out: + dprintk("NFS reply access: %d\n", status); + return status; + } +@@ -229,7 +234,7 @@ static int nfs3_proc_access(struct inode + static int nfs3_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) + { +- struct nfs_fattr fattr; ++ struct nfs_fattr *fattr; + struct nfs3_readlinkargs args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, +@@ -239,14 +244,19 @@ static int nfs3_proc_readlink(struct ino + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], + .rpc_argp = &args, +- .rpc_resp = &fattr, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call readlink\n"); +- nfs_fattr_init(&fattr); ++ fattr = nfs_alloc_fattr(); ++ if (fattr == NULL) ++ goto out; ++ msg.rpc_resp = fattr; ++ + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); +- nfs_refresh_inode(inode, &fattr); ++ nfs_refresh_inode(inode, fattr); ++ nfs_free_fattr(fattr); ++out: + dprintk("NFS reply readlink: %d\n", status); + return status; + } +@@ -396,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, stru + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call remove %s\n", name->name); +- nfs_fattr_init(&res.dir_attr); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); +- nfs_post_op_update_inode(dir, &res.dir_attr); ++ 
nfs_post_op_update_inode(dir, res.dir_attr); ++ nfs_free_fattr(res.dir_attr); ++out: + dprintk("NFS reply remove: %d\n", status); + return status; + } +@@ -419,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *t + if (nfs3_async_handle_jukebox(task, dir)) + return 0; + res = task->tk_msg.rpc_resp; +- nfs_post_op_update_inode(dir, &res->dir_attr); ++ nfs_post_op_update_inode(dir, res->dir_attr); + return 1; + } + +@@ -427,7 +442,6 @@ static int + nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) + { +- struct nfs_fattr old_dir_attr, new_dir_attr; + struct nfs3_renameargs arg = { + .fromfh = NFS_FH(old_dir), + .fromname = old_name->name, +@@ -436,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, + .toname = new_name->name, + .tolen = new_name->len + }; +- struct nfs3_renameres res = { +- .fromattr = &old_dir_attr, +- .toattr = &new_dir_attr +- }; ++ struct nfs3_renameres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); +- nfs_fattr_init(&old_dir_attr); +- nfs_fattr_init(&new_dir_attr); ++ ++ res.fromattr = nfs_alloc_fattr(); ++ res.toattr = nfs_alloc_fattr(); ++ if (res.fromattr == NULL || res.toattr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); +- nfs_post_op_update_inode(old_dir, &old_dir_attr); +- nfs_post_op_update_inode(new_dir, &new_dir_attr); ++ nfs_post_op_update_inode(old_dir, res.fromattr); ++ nfs_post_op_update_inode(new_dir, res.toattr); ++out: ++ nfs_free_fattr(res.toattr); ++ nfs_free_fattr(res.fromattr); + dprintk("NFS reply rename: %d\n", status); + return status; + } +@@ -460,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, + static int + nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) + { +- struct nfs_fattr dir_attr, fattr; + 
struct nfs3_linkargs arg = { + .fromfh = NFS_FH(inode), + .tofh = NFS_FH(dir), + .toname = name->name, + .tolen = name->len + }; +- struct nfs3_linkres res = { +- .dir_attr = &dir_attr, +- .fattr = &fattr +- }; ++ struct nfs3_linkres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call link %s\n", name->name); +- nfs_fattr_init(&dir_attr); +- nfs_fattr_init(&fattr); ++ res.fattr = nfs_alloc_fattr(); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.fattr == NULL || res.dir_attr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); +- nfs_post_op_update_inode(dir, &dir_attr); +- nfs_post_op_update_inode(inode, &fattr); ++ nfs_post_op_update_inode(dir, res.dir_attr); ++ nfs_post_op_update_inode(inode, res.fattr); ++out: ++ nfs_free_fattr(res.dir_attr); ++ nfs_free_fattr(res.fattr); + dprintk("NFS reply link: %d\n", status); + return status; + } +@@ -554,7 +574,7 @@ out: + static int + nfs3_proc_rmdir(struct inode *dir, struct qstr *name) + { +- struct nfs_fattr dir_attr; ++ struct nfs_fattr *dir_attr; + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, +@@ -563,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struc + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], + .rpc_argp = &arg, +- .rpc_resp = &dir_attr, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call rmdir %s\n", name->name); +- nfs_fattr_init(&dir_attr); ++ dir_attr = nfs_alloc_fattr(); ++ if (dir_attr == NULL) ++ goto out; ++ ++ msg.rpc_resp = dir_attr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); +- nfs_post_op_update_inode(dir, &dir_attr); ++ nfs_post_op_update_inode(dir, dir_attr); ++ nfs_free_fattr(dir_attr); ++out: + dprintk("NFS reply rmdir: %d\n", status); + return status; + } +@@ -589,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, + u64 cookie, struct page *page, 
unsigned int count, int plus) + { + struct inode *dir = dentry->d_inode; +- struct nfs_fattr dir_attr; + __be32 *verf = NFS_COOKIEVERF(dir); + struct nfs3_readdirargs arg = { + .fh = NFS_FH(dir), +@@ -600,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, + .pages = &page + }; + struct nfs3_readdirres res = { +- .dir_attr = &dir_attr, + .verf = verf, + .plus = plus + }; +@@ -610,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, + .rpc_resp = &res, + .rpc_cred = cred + }; +- int status; ++ int status = -ENOMEM; + + if (plus) + msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; +@@ -618,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, + dprintk("NFS call readdir%s %d\n", + plus? "plus" : "", (unsigned int) cookie); + +- nfs_fattr_init(&dir_attr); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ goto out; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + + nfs_invalidate_atime(dir); ++ nfs_refresh_inode(dir, res.dir_attr); + +- nfs_refresh_inode(dir, &dir_attr); ++ nfs_free_fattr(res.dir_attr); ++out: + dprintk("NFS reply readdir: %d\n", status); + return status; + } +diff -up linux-2.6.34.noarch/fs/nfs/nfs3xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs3xdr.c +--- linux-2.6.34.noarch/fs/nfs/nfs3xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3xdr.c 2010-08-23 11:01:00.361593802 -0400 +@@ -762,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, _ + static int + nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) + { +- return nfs3_xdr_wccstat(req, p, &res->dir_attr); ++ return nfs3_xdr_wccstat(req, p, res->dir_attr); + } + + /* +diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-23 11:01:00.362574935 -0400 +@@ -206,14 +206,14 @@ extern ssize_t nfs4_listxattr(struct den + + + /* nfs4proc.c */ +-extern int 
nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); +-extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); ++extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); ++extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); + extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); + extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); + extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); + extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); +-extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); ++extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); + extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); + extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); + extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); +@@ -286,7 +286,7 @@ extern void nfs4_put_lock_state(struct n + extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); + extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); + +-extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); ++extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); + extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); + extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); + extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); +diff -up linux-2.6.34.noarch/fs/nfs/nfs4namespace.c.orig linux-2.6.34.noarch/fs/nfs/nfs4namespace.c +--- 
linux-2.6.34.noarch/fs/nfs/nfs4namespace.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4namespace.c 2010-08-23 11:01:00.363574219 -0400 +@@ -115,6 +115,7 @@ static struct vfsmount *try_location(str + char *page, char *page2, + const struct nfs4_fs_location *location) + { ++ const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + unsigned int maxbuflen; +@@ -126,9 +127,12 @@ static struct vfsmount *try_location(str + mountdata->mnt_path = mnt_path; + maxbuflen = mnt_path - 1 - page2; + ++ mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL); ++ if (mountdata->addr == NULL) ++ return ERR_PTR(-ENOMEM); ++ + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; +- struct sockaddr_storage addr; + + if (buf->len <= 0 || buf->len >= maxbuflen) + continue; +@@ -137,11 +141,10 @@ static struct vfsmount *try_location(str + continue; + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, +- (struct sockaddr *)&addr, sizeof(addr)); ++ mountdata->addr, addr_bufsize); + if (mountdata->addrlen == 0) + continue; + +- mountdata->addr = (struct sockaddr *)&addr; + rpc_set_port(mountdata->addr, NFS_PORT); + + memcpy(page2, buf->data, buf->len); +@@ -156,6 +159,7 @@ static struct vfsmount *try_location(str + if (!IS_ERR(mnt)) + break; + } ++ kfree(mountdata->addr); + return mnt; + } + +@@ -221,8 +225,8 @@ out: + + /* + * nfs_do_refmount - handle crossing a referral on server ++ * @mnt_parent - mountpoint of referral + * @dentry - dentry of referral +- * @nd - nameidata info + * + */ + struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) +diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-23 11:01:00.365544029 -0400 +@@ 
-70,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_ser + static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); ++static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, ++ struct nfs_fattr *fattr, struct iattr *sattr, ++ struct nfs4_state *state); + + /* Prevent leaks of NFSv4 errors into userland */ + static int nfs4_map_errors(int err) +@@ -714,17 +717,18 @@ static void nfs4_init_opendata_res(struc + + static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, + struct nfs4_state_owner *sp, fmode_t fmode, int flags, +- const struct iattr *attrs) ++ const struct iattr *attrs, ++ gfp_t gfp_mask) + { + struct dentry *parent = dget_parent(path->dentry); + struct inode *dir = parent->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *p; + +- p = kzalloc(sizeof(*p), GFP_KERNEL); ++ p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + goto err; +- p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); ++ p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); + if (p->o_arg.seqid == NULL) + goto err_free; + path_get(path); +@@ -1060,7 +1064,7 @@ static struct nfs4_opendata *nfs4_open_r + { + struct nfs4_opendata *opendata; + +- opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL); ++ opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS); + if (opendata == NULL) + return ERR_PTR(-ENOMEM); + opendata->state = state; +@@ -1648,7 +1652,7 @@ static int _nfs4_do_open(struct inode *d + if (path->dentry->d_inode != NULL) + nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); + status = -ENOMEM; +- opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr); ++ opendata = 
nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL); + if (opendata == NULL) + goto err_put_state_owner; + +@@ -1659,15 +1663,24 @@ static int _nfs4_do_open(struct inode *d + if (status != 0) + goto err_opendata_put; + +- if (opendata->o_arg.open_flags & O_EXCL) +- nfs4_exclusive_attrset(opendata, sattr); +- + state = nfs4_opendata_to_nfs4_state(opendata); + status = PTR_ERR(state); + if (IS_ERR(state)) + goto err_opendata_put; + if (server->caps & NFS_CAP_POSIX_LOCK) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); ++ ++ if (opendata->o_arg.open_flags & O_EXCL) { ++ nfs4_exclusive_attrset(opendata, sattr); ++ ++ nfs_fattr_init(opendata->o_res.f_attr); ++ status = nfs4_do_setattr(state->inode, cred, ++ opendata->o_res.f_attr, sattr, ++ state); ++ if (status == 0) ++ nfs_setattr_update_inode(state->inode, sattr); ++ nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); ++ } + nfs4_opendata_put(opendata); + nfs4_put_state_owner(sp); + *res = state; +@@ -1914,7 +1927,7 @@ static const struct rpc_call_ops nfs4_cl + * + * NOTE: Caller must be holding the sp->so_owner semaphore! 
+ */ +-int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) ++int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) + { + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_closedata *calldata; +@@ -1933,7 +1946,7 @@ int nfs4_do_close(struct path *path, str + }; + int status = -ENOMEM; + +- calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); ++ calldata = kzalloc(sizeof(*calldata), gfp_mask); + if (calldata == NULL) + goto out; + calldata->inode = state->inode; +@@ -1941,7 +1954,7 @@ int nfs4_do_close(struct path *path, str + calldata->arg.fh = NFS_FH(state->inode); + calldata->arg.stateid = &state->open_stateid; + /* Serialization for the sequence id */ +- calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); ++ calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); + if (calldata->arg.seqid == NULL) + goto out_free_calldata; + calldata->arg.fmode = 0; +@@ -2404,14 +2417,12 @@ static int nfs4_proc_lookup(struct inode + static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) + { + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs_fattr fattr; + struct nfs4_accessargs args = { + .fh = NFS_FH(inode), + .bitmask = server->attr_bitmask, + }; + struct nfs4_accessres res = { + .server = server, +- .fattr = &fattr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], +@@ -2438,7 +2449,11 @@ static int _nfs4_proc_access(struct inod + if (mode & MAY_EXEC) + args.access |= NFS4_ACCESS_EXECUTE; + } +- nfs_fattr_init(&fattr); ++ ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ return -ENOMEM; ++ + status = nfs4_call_sync(server, &msg, &args, &res, 0); + if (!status) { + entry->mask = 0; +@@ -2448,8 +2463,9 @@ static int _nfs4_proc_access(struct inod + entry->mask |= MAY_WRITE; + if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; +- nfs_refresh_inode(inode, 
&fattr); ++ nfs_refresh_inode(inode, res.fattr); + } ++ nfs_free_fattr(res.fattr); + return status; + } + +@@ -2562,13 +2578,6 @@ nfs4_proc_create(struct inode *dir, stru + } + d_add(dentry, igrab(state->inode)); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +- if (flags & O_EXCL) { +- struct nfs_fattr fattr; +- status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state); +- if (status == 0) +- nfs_setattr_update_inode(state->inode, sattr); +- nfs_post_op_update_inode(state->inode, &fattr); +- } + if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) + status = nfs4_intent_set_file(nd, &path, state, fmode); + else +@@ -2596,14 +2605,19 @@ static int _nfs4_proc_remove(struct inod + .rpc_argp = &args, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; ++ ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.dir_attr == NULL) ++ goto out; + +- nfs_fattr_init(&res.dir_attr); + status = nfs4_call_sync(server, &msg, &args, &res, 1); + if (status == 0) { + update_changeattr(dir, &res.cinfo); +- nfs_post_op_update_inode(dir, &res.dir_attr); ++ nfs_post_op_update_inode(dir, res.dir_attr); + } ++ nfs_free_fattr(res.dir_attr); ++out: + return status; + } + +@@ -2638,7 +2652,7 @@ static int nfs4_proc_unlink_done(struct + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); +- nfs_post_op_update_inode(dir, &res->dir_attr); ++ nfs_post_op_update_inode(dir, res->dir_attr); + return 1; + } + +@@ -2653,29 +2667,31 @@ static int _nfs4_proc_rename(struct inod + .new_name = new_name, + .bitmask = server->attr_bitmask, + }; +- struct nfs_fattr old_fattr, new_fattr; + struct nfs4_rename_res res = { + .server = server, +- .old_fattr = &old_fattr, +- .new_fattr = &new_fattr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + +- nfs_fattr_init(res.old_fattr); +- 
nfs_fattr_init(res.new_fattr); +- status = nfs4_call_sync(server, &msg, &arg, &res, 1); ++ res.old_fattr = nfs_alloc_fattr(); ++ res.new_fattr = nfs_alloc_fattr(); ++ if (res.old_fattr == NULL || res.new_fattr == NULL) ++ goto out; + ++ status = nfs4_call_sync(server, &msg, &arg, &res, 1); + if (!status) { + update_changeattr(old_dir, &res.old_cinfo); + nfs_post_op_update_inode(old_dir, res.old_fattr); + update_changeattr(new_dir, &res.new_cinfo); + nfs_post_op_update_inode(new_dir, res.new_fattr); + } ++out: ++ nfs_free_fattr(res.new_fattr); ++ nfs_free_fattr(res.old_fattr); + return status; + } + +@@ -2702,28 +2718,30 @@ static int _nfs4_proc_link(struct inode + .name = name, + .bitmask = server->attr_bitmask, + }; +- struct nfs_fattr fattr, dir_attr; + struct nfs4_link_res res = { + .server = server, +- .fattr = &fattr, +- .dir_attr = &dir_attr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; ++ ++ res.fattr = nfs_alloc_fattr(); ++ res.dir_attr = nfs_alloc_fattr(); ++ if (res.fattr == NULL || res.dir_attr == NULL) ++ goto out; + +- nfs_fattr_init(res.fattr); +- nfs_fattr_init(res.dir_attr); + status = nfs4_call_sync(server, &msg, &arg, &res, 1); + if (!status) { + update_changeattr(dir, &res.cinfo); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_post_op_update_inode(inode, res.fattr); + } +- ++out: ++ nfs_free_fattr(res.dir_attr); ++ nfs_free_fattr(res.fattr); + return status; + } + +@@ -3146,23 +3164,31 @@ static void nfs4_proc_commit_setup(struc + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + } + ++struct nfs4_renewdata { ++ struct nfs_client *client; ++ unsigned long timestamp; ++}; ++ + /* + * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special + * standalone procedure for queueing an asynchronous RENEW. 
+ */ +-static void nfs4_renew_release(void *data) ++static void nfs4_renew_release(void *calldata) + { +- struct nfs_client *clp = data; ++ struct nfs4_renewdata *data = calldata; ++ struct nfs_client *clp = data->client; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); ++ kfree(data); + } + +-static void nfs4_renew_done(struct rpc_task *task, void *data) ++static void nfs4_renew_done(struct rpc_task *task, void *calldata) + { +- struct nfs_client *clp = data; +- unsigned long timestamp = task->tk_start; ++ struct nfs4_renewdata *data = calldata; ++ struct nfs_client *clp = data->client; ++ unsigned long timestamp = data->timestamp; + + if (task->tk_status < 0) { + /* Unless we're shutting down, schedule state recovery! */ +@@ -3188,11 +3214,17 @@ int nfs4_proc_async_renew(struct nfs_cli + .rpc_argp = clp, + .rpc_cred = cred, + }; ++ struct nfs4_renewdata *data; + + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ if (data == NULL) ++ return -ENOMEM; ++ data->client = clp; ++ data->timestamp = jiffies; + return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, +- &nfs4_renew_ops, clp); ++ &nfs4_renew_ops, data); + } + + int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) +@@ -3494,7 +3526,9 @@ nfs4_async_handle_error(struct rpc_task + return _nfs4_async_handle_error(task, server, server->nfs_client, state); + } + +-int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) ++int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, ++ unsigned short port, struct rpc_cred *cred, ++ struct nfs4_setclientid_res *res) + { + nfs4_verifier sc_verifier; + struct nfs4_setclientid setclientid = { +@@ -3504,7 +3538,7 @@ int nfs4_proc_setclientid(struct nfs_cli + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], + .rpc_argp = &setclientid, +- .rpc_resp = clp, ++ 
.rpc_resp = res, + .rpc_cred = cred, + }; + __be32 *p; +@@ -3547,12 +3581,14 @@ int nfs4_proc_setclientid(struct nfs_cli + return status; + } + +-static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) ++static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, ++ struct nfs4_setclientid_res *arg, ++ struct rpc_cred *cred) + { + struct nfs_fsinfo fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], +- .rpc_argp = clp, ++ .rpc_argp = arg, + .rpc_resp = &fsinfo, + .rpc_cred = cred, + }; +@@ -3570,12 +3606,14 @@ static int _nfs4_proc_setclientid_confir + return status; + } + +-int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) ++int nfs4_proc_setclientid_confirm(struct nfs_client *clp, ++ struct nfs4_setclientid_res *arg, ++ struct rpc_cred *cred) + { + long timeout = 0; + int err; + do { +- err = _nfs4_proc_setclientid_confirm(clp, cred); ++ err = _nfs4_proc_setclientid_confirm(clp, arg, cred); + switch (err) { + case 0: + return err; +@@ -3667,7 +3705,7 @@ static int _nfs4_proc_delegreturn(struct + }; + int status = 0; + +- data = kzalloc(sizeof(*data), GFP_KERNEL); ++ data = kzalloc(sizeof(*data), GFP_NOFS); + if (data == NULL) + return -ENOMEM; + data->args.fhandle = &data->fh; +@@ -3823,7 +3861,7 @@ static struct nfs4_unlockdata *nfs4_allo + struct nfs4_unlockdata *p; + struct inode *inode = lsp->ls_state->inode; + +- p = kzalloc(sizeof(*p), GFP_KERNEL); ++ p = kzalloc(sizeof(*p), GFP_NOFS); + if (p == NULL) + return NULL; + p->arg.fh = NFS_FH(inode); +@@ -3961,7 +3999,7 @@ static int nfs4_proc_unlck(struct nfs4_s + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) + goto out; + lsp = request->fl_u.nfs4_fl.owner; +- seqid = nfs_alloc_seqid(&lsp->ls_seqid); ++ seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); + status = -ENOMEM; + if (seqid == NULL) + goto out; +@@ -3989,22 +4027,23 @@ struct nfs4_lockdata { + }; + + static struct 
nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, +- struct nfs_open_context *ctx, struct nfs4_lock_state *lsp) ++ struct nfs_open_context *ctx, struct nfs4_lock_state *lsp, ++ gfp_t gfp_mask) + { + struct nfs4_lockdata *p; + struct inode *inode = lsp->ls_state->inode; + struct nfs_server *server = NFS_SERVER(inode); + +- p = kzalloc(sizeof(*p), GFP_KERNEL); ++ p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + return NULL; + + p->arg.fh = NFS_FH(inode); + p->arg.fl = &p->fl; +- p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid); ++ p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); + if (p->arg.open_seqid == NULL) + goto out_free; +- p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); ++ p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); + if (p->arg.lock_seqid == NULL) + goto out_free_seqid; + p->arg.lock_stateid = &lsp->ls_stateid; +@@ -4158,7 +4197,8 @@ static int _nfs4_do_setlk(struct nfs4_st + + dprintk("%s: begin!\n", __func__); + data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), +- fl->fl_u.nfs4_fl.owner); ++ fl->fl_u.nfs4_fl.owner, ++ recovery_type == NFS_LOCK_NEW ? 
GFP_KERNEL : GFP_NOFS); + if (data == NULL) + return -ENOMEM; + if (IS_SETLKW(cmd)) +@@ -4647,7 +4687,7 @@ static int nfs4_reset_slot_table(struct + if (max_reqs != tbl->max_slots) { + ret = -ENOMEM; + new = kmalloc(max_reqs * sizeof(struct nfs4_slot), +- GFP_KERNEL); ++ GFP_NOFS); + if (!new) + goto out; + ret = 0; +@@ -4712,7 +4752,7 @@ static int nfs4_init_slot_table(struct n + + dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); + +- slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); ++ slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS); + if (!slot) + goto out; + ret = 0; +@@ -4761,7 +4801,7 @@ struct nfs4_session *nfs4_alloc_session( + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + +- session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL); ++ session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (!session) + return NULL; + +@@ -5105,8 +5145,8 @@ static int nfs41_proc_async_sequence(str + + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; +- args = kzalloc(sizeof(*args), GFP_KERNEL); +- res = kzalloc(sizeof(*res), GFP_KERNEL); ++ args = kzalloc(sizeof(*args), GFP_NOFS); ++ res = kzalloc(sizeof(*res), GFP_NOFS); + if (!args || !res) { + kfree(args); + kfree(res); +@@ -5207,7 +5247,7 @@ static int nfs41_proc_reclaim_complete(s + int status = -ENOMEM; + + dprintk("--> %s\n", __func__); +- calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); ++ calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) + goto out; + calldata->clp = clp; +diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-23 11:01:00.367574218 -0400 +@@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list); + + int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) + { ++ struct nfs4_setclientid_res clid; + unsigned short port; + int 
status; + +@@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client + if (clp->cl_addr.ss_family == AF_INET6) + port = nfs_callback_tcpport6; + +- status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); +- if (status == 0) +- status = nfs4_proc_setclientid_confirm(clp, cred); +- if (status == 0) +- nfs4_schedule_state_renewal(clp); ++ status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); ++ if (status != 0) ++ goto out; ++ status = nfs4_proc_setclientid_confirm(clp, &clid, cred); ++ if (status != 0) ++ goto out; ++ clp->cl_clientid = clid.clientid; ++ nfs4_schedule_state_renewal(clp); ++out: + return status; + } + +@@ -361,7 +366,7 @@ nfs4_alloc_state_owner(void) + { + struct nfs4_state_owner *sp; + +- sp = kzalloc(sizeof(*sp),GFP_KERNEL); ++ sp = kzalloc(sizeof(*sp),GFP_NOFS); + if (!sp) + return NULL; + spin_lock_init(&sp->so_lock); +@@ -435,7 +440,7 @@ nfs4_alloc_open_state(void) + { + struct nfs4_state *state; + +- state = kzalloc(sizeof(*state), GFP_KERNEL); ++ state = kzalloc(sizeof(*state), GFP_NOFS); + if (!state) + return NULL; + atomic_set(&state->count, 1); +@@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_sta + /* + * Close the current file. 
+ */ +-static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait) ++static void __nfs4_close(struct path *path, struct nfs4_state *state, ++ fmode_t fmode, gfp_t gfp_mask, int wait) + { + struct nfs4_state_owner *owner = state->owner; + int call_close = 0; +@@ -578,17 +584,17 @@ static void __nfs4_close(struct path *pa + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); + } else +- nfs4_do_close(path, state, wait); ++ nfs4_do_close(path, state, gfp_mask, wait); + } + + void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) + { +- __nfs4_close(path, state, fmode, 0); ++ __nfs4_close(path, state, fmode, GFP_NOFS, 0); + } + + void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) + { +- __nfs4_close(path, state, fmode, 1); ++ __nfs4_close(path, state, fmode, GFP_KERNEL, 1); + } + + /* +@@ -618,7 +624,7 @@ static struct nfs4_lock_state *nfs4_allo + struct nfs4_lock_state *lsp; + struct nfs_client *clp = state->owner->so_client; + +- lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); ++ lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) + return NULL; + rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); +@@ -754,11 +760,11 @@ void nfs4_copy_stateid(nfs4_stateid *dst + nfs4_put_lock_state(lsp); + } + +-struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) ++struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) + { + struct nfs_seqid *new; + +- new = kmalloc(sizeof(*new), GFP_KERNEL); ++ new = kmalloc(sizeof(*new), gfp_mask); + if (new != NULL) { + new->sequence = counter; + INIT_LIST_HEAD(&new->list); +@@ -1347,7 +1353,7 @@ static int nfs4_recall_slot(struct nfs_c + + nfs4_begin_drain_session(clp); + new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), +- GFP_KERNEL); ++ GFP_NOFS); + if (!new) + return -ENOMEM; + +diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 
+--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-23 11:00:23.792491380 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-23 11:01:00.369544055 -0400 +@@ -1504,14 +1504,14 @@ static void encode_setclientid(struct xd + hdr->replen += decode_setclientid_maxsz; + } + +-static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr) ++static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) + { + __be32 *p; + + p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); +- p = xdr_encode_hyper(p, client_state->cl_clientid); +- xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); ++ p = xdr_encode_hyper(p, arg->clientid); ++ xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE); + hdr->nops++; + hdr->replen += decode_setclientid_confirm_maxsz; + } +@@ -2324,7 +2324,7 @@ static int nfs4_xdr_enc_setclientid(stru + /* + * a SETCLIENTID_CONFIRM request + */ +-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) ++static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg) + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +@@ -2334,7 +2334,7 @@ static int nfs4_xdr_enc_setclientid_conf + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +- encode_setclientid_confirm(&xdr, clp, &hdr); ++ encode_setclientid_confirm(&xdr, arg, &hdr); + encode_putrootfh(&xdr, &hdr); + encode_fsinfo(&xdr, lease_bitmap, &hdr); + encode_nops(&hdr); +@@ -4397,7 +4397,7 @@ out_overflow: + return -EIO; + } + +-static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) ++static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res) + { + __be32 *p; + uint32_t opnum; +@@ -4417,8 +4417,8 @@ static int 
decode_setclientid(struct xdr + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; +- p = xdr_decode_hyper(p, &clp->cl_clientid); +- memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); ++ p = xdr_decode_hyper(p, &res->clientid); ++ memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); + } else if (nfserr == NFSERR_CLID_INUSE) { + uint32_t len; + +@@ -4815,7 +4815,7 @@ static int nfs4_xdr_dec_remove(struct rp + goto out; + if ((status = decode_remove(&xdr, &res->cinfo)) != 0) + goto out; +- decode_getfattr(&xdr, &res->dir_attr, res->server, ++ decode_getfattr(&xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); + out: + return status; +@@ -5498,7 +5498,7 @@ static int nfs4_xdr_dec_renew(struct rpc + * Decode SETCLIENTID response + */ + static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, +- struct nfs_client *clp) ++ struct nfs4_setclientid_res *res) + { + struct xdr_stream xdr; + struct compound_hdr hdr; +@@ -5507,7 +5507,7 @@ static int nfs4_xdr_dec_setclientid(stru + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) +- status = decode_setclientid(&xdr, clp); ++ status = decode_setclientid(&xdr, res); + return status; + } + +diff -up linux-2.6.34.noarch/fs/nfs/nfsroot.c.orig linux-2.6.34.noarch/fs/nfs/nfsroot.c +--- linux-2.6.34.noarch/fs/nfs/nfsroot.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfsroot.c 2010-08-23 11:01:00.371574358 -0400 +@@ -488,7 +488,6 @@ static int __init root_nfs_ports(void) + */ + static int __init root_nfs_get_handle(void) + { +- struct nfs_fh fh; + struct sockaddr_in sin; + unsigned int auth_flav_len = 0; + struct nfs_mount_request request = { +@@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(vo + NFS_MNT3_VERSION : NFS_MNT_VERSION, + .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? 
+ XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP, +- .fh = &fh, + .auth_flav_len = &auth_flav_len, + }; +- int status; ++ int status = -ENOMEM; + ++ request.fh = nfs_alloc_fhandle(); ++ if (!request.fh) ++ goto out; + set_sockaddr(&sin, servaddr, htons(mount_port)); + status = nfs_mount(&request); + if (status < 0) + printk(KERN_ERR "Root-NFS: Server returned error %d " + "while mounting %s\n", status, nfs_export_path); + else { +- nfs_data.root.size = fh.size; +- memcpy(nfs_data.root.data, fh.data, fh.size); ++ nfs_data.root.size = request.fh->size; ++ memcpy(&nfs_data.root.data, request.fh->data, request.fh->size); + } +- ++ nfs_free_fhandle(request.fh); ++out: + return status; + } + +diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-23 11:01:00.371574358 -0400 +@@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_conte + { + struct nfs_page *req; + +- for (;;) { +- /* try to allocate the request struct */ +- req = nfs_page_alloc(); +- if (req != NULL) +- break; +- +- if (fatal_signal_pending(current)) +- return ERR_PTR(-ERESTARTSYS); +- yield(); +- } ++ /* try to allocate the request struct */ ++ req = nfs_page_alloc(); ++ if (req == NULL) ++ return ERR_PTR(-ENOMEM); + + /* Initialize the request struct. Initially, we assume a + * long write-back delay. 
This will be adjusted in +diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-23 11:01:00.372574292 -0400 +@@ -224,35 +224,60 @@ static int nfs_proc_readlink(struct inod + return status; + } + ++struct nfs_createdata { ++ struct nfs_createargs arg; ++ struct nfs_diropok res; ++ struct nfs_fh fhandle; ++ struct nfs_fattr fattr; ++}; ++ ++static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir, ++ struct dentry *dentry, struct iattr *sattr) ++{ ++ struct nfs_createdata *data; ++ ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ ++ if (data != NULL) { ++ data->arg.fh = NFS_FH(dir); ++ data->arg.name = dentry->d_name.name; ++ data->arg.len = dentry->d_name.len; ++ data->arg.sattr = sattr; ++ nfs_fattr_init(&data->fattr); ++ data->fhandle.size = 0; ++ data->res.fh = &data->fhandle; ++ data->res.fattr = &data->fattr; ++ } ++ return data; ++}; ++ ++static void nfs_free_createdata(const struct nfs_createdata *data) ++{ ++ kfree(data); ++} ++ + static int + nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags, struct nameidata *nd) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; +- struct nfs_createargs arg = { +- .fh = NFS_FH(dir), +- .name = dentry->d_name.name, +- .len = dentry->d_name.len, +- .sattr = sattr +- }; +- struct nfs_diropok res = { +- .fh = &fhandle, +- .fattr = &fattr +- }; ++ struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], +- .rpc_argp = &arg, +- .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + +- nfs_fattr_init(&fattr); + dprintk("NFS call create %s\n", dentry->d_name.name); ++ data = nfs_alloc_createdata(dir, dentry, sattr); ++ if (data == NULL) ++ goto out; ++ msg.rpc_argp = &data->arg; ++ msg.rpc_resp = &data->res; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + 
nfs_mark_for_revalidate(dir); + if (status == 0) +- status = nfs_instantiate(dentry, &fhandle, &fattr); ++ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); ++ nfs_free_createdata(data); ++out: + dprintk("NFS reply create: %d\n", status); + return status; + } +@@ -264,24 +289,12 @@ static int + nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + dev_t rdev) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; +- struct nfs_createargs arg = { +- .fh = NFS_FH(dir), +- .name = dentry->d_name.name, +- .len = dentry->d_name.len, +- .sattr = sattr +- }; +- struct nfs_diropok res = { +- .fh = &fhandle, +- .fattr = &fattr +- }; ++ struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], +- .rpc_argp = &arg, +- .rpc_resp = &res, + }; +- int status, mode; ++ umode_t mode; ++ int status = -ENOMEM; + + dprintk("NFS call mknod %s\n", dentry->d_name.name); + +@@ -294,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct + sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ + } + +- nfs_fattr_init(&fattr); ++ data = nfs_alloc_createdata(dir, dentry, sattr); ++ if (data == NULL) ++ goto out; ++ msg.rpc_argp = &data->arg; ++ msg.rpc_resp = &data->res; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + if (status == -EINVAL && S_ISFIFO(mode)) { + sattr->ia_mode = mode; +- nfs_fattr_init(&fattr); ++ nfs_fattr_init(data->res.fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } + if (status == 0) +- status = nfs_instantiate(dentry, &fhandle, &fattr); ++ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); ++ nfs_free_createdata(data); ++out: + dprintk("NFS reply mknod: %d\n", status); + return status; + } +@@ -398,8 +418,8 @@ static int + nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, + unsigned int len, struct iattr *sattr) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; ++ 
struct nfs_fh *fh; ++ struct nfs_fattr *fattr; + struct nfs_symlinkargs arg = { + .fromfh = NFS_FH(dir), + .fromname = dentry->d_name.name, +@@ -412,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, stru + .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK], + .rpc_argp = &arg, + }; +- int status; ++ int status = -ENAMETOOLONG; ++ ++ dprintk("NFS call symlink %s\n", dentry->d_name.name); + + if (len > NFS2_MAXPATHLEN) +- return -ENAMETOOLONG; ++ goto out; + +- dprintk("NFS call symlink %s\n", dentry->d_name.name); ++ fh = nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ status = -ENOMEM; ++ if (fh == NULL || fattr == NULL) ++ goto out; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); +@@ -427,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, stru + * filehandle size to zero indicates to nfs_instantiate that it + * should fill in the data with a LOOKUP call on the wire. + */ +- if (status == 0) { +- nfs_fattr_init(&fattr); +- fhandle.size = 0; +- status = nfs_instantiate(dentry, &fhandle, &fattr); +- } ++ if (status == 0) ++ status = nfs_instantiate(dentry, fh, fattr); + ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fh); ++out: + dprintk("NFS reply symlink: %d\n", status); + return status; + } +@@ -440,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, stru + static int + nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) + { +- struct nfs_fh fhandle; +- struct nfs_fattr fattr; +- struct nfs_createargs arg = { +- .fh = NFS_FH(dir), +- .name = dentry->d_name.name, +- .len = dentry->d_name.len, +- .sattr = sattr +- }; +- struct nfs_diropok res = { +- .fh = &fhandle, +- .fattr = &fattr +- }; ++ struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], +- .rpc_argp = &arg, +- .rpc_resp = &res, + }; +- int status; ++ int status = -ENOMEM; + + dprintk("NFS call mkdir %s\n", dentry->d_name.name); +- nfs_fattr_init(&fattr); ++ data = nfs_alloc_createdata(dir, dentry, 
sattr); ++ if (data == NULL) ++ goto out; ++ msg.rpc_argp = &data->arg; ++ msg.rpc_resp = &data->res; ++ + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + if (status == 0) +- status = nfs_instantiate(dentry, &fhandle, &fattr); ++ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); ++ nfs_free_createdata(data); ++out: + dprintk("NFS reply mkdir: %d\n", status); + return status; + } +diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-23 11:01:00.373574317 -0400 +@@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool; + + struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) + { +- struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS); ++ struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); + + if (p) { + memset(p, 0, sizeof(*p)); +@@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; + else { +- p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); ++ p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + if (!p->pagevec) { + mempool_free(p, nfs_rdata_mempool); + p = NULL; +diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-23 11:00:23.794511661 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-23 11:01:00.374564179 -0400 +@@ -141,7 +141,6 @@ static const match_table_t nfs_mount_opt + { Opt_resvport, "resvport" }, + { Opt_noresvport, "noresvport" }, + { Opt_fscache, "fsc" }, +- { Opt_fscache_uniq, "fsc=%s" }, + { Opt_nofscache, "nofsc" }, + + { Opt_port, "port=%s" }, +@@ -171,6 +170,7 @@ static const match_table_t nfs_mount_opt + { Opt_mountaddr, "mountaddr=%s" }, + + { Opt_lookupcache, "lookupcache=%s" }, ++ { 
Opt_fscache_uniq, "fsc=%s" }, + + { Opt_err, NULL } + }; +@@ -423,15 +423,19 @@ static int nfs_statfs(struct dentry *den + unsigned char blockbits; + unsigned long blockres; + struct nfs_fh *fh = NFS_FH(dentry->d_inode); +- struct nfs_fattr fattr; +- struct nfs_fsstat res = { +- .fattr = &fattr, +- }; +- int error; ++ struct nfs_fsstat res; ++ int error = -ENOMEM; ++ ++ res.fattr = nfs_alloc_fattr(); ++ if (res.fattr == NULL) ++ goto out_err; + + error = server->nfs_client->rpc_ops->statfs(server, fh, &res); ++ ++ nfs_free_fattr(res.fattr); + if (error < 0) + goto out_err; ++ + buf->f_type = NFS_SUPER_MAGIC; + + /* +@@ -1060,14 +1064,6 @@ static int nfs_parse_mount_options(char + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; +- case Opt_fscache_uniq: +- string = match_strdup(args); +- if (!string) +- goto out_nomem; +- kfree(mnt->fscache_uniq); +- mnt->fscache_uniq = string; +- mnt->options |= NFS_OPTION_FSCACHE; +- break; + + /* + * options that take numeric values +@@ -1398,6 +1394,14 @@ static int nfs_parse_mount_options(char + return 0; + }; + break; ++ case Opt_fscache_uniq: ++ string = match_strdup(args); ++ if (string == NULL) ++ goto out_nomem; ++ kfree(mnt->fscache_uniq); ++ mnt->fscache_uniq = string; ++ mnt->options |= NFS_OPTION_FSCACHE; ++ break; + + /* + * Special options +@@ -2186,7 +2190,7 @@ static int nfs_get_sb(struct file_system + int error = -ENOMEM; + + data = nfs_alloc_parsed_mount_data(3); +- mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); ++ mntfh = nfs_alloc_fhandle(); + if (data == NULL || mntfh == NULL) + goto out_free_fh; + +@@ -2261,7 +2265,7 @@ out: + kfree(data->fscache_uniq); + security_free_mnt_opts(&data->lsm_opts); + out_free_fh: +- kfree(mntfh); ++ nfs_free_fhandle(mntfh); + kfree(data); + return error; + +@@ -2570,7 +2574,7 @@ static int nfs4_remote_get_sb(struct fil + }; + int error = -ENOMEM; + +- mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); ++ mntfh = nfs_alloc_fhandle(); + if (data == NULL || mntfh == NULL) + 
goto out_free_fh; + +@@ -2628,7 +2632,7 @@ static int nfs4_remote_get_sb(struct fil + out: + security_free_mnt_opts(&data->lsm_opts); + out_free_fh: +- kfree(mntfh); ++ nfs_free_fhandle(mntfh); + return error; + + out_free: +@@ -2683,41 +2687,120 @@ out_freepage: + free_page((unsigned long)page); + } + ++struct nfs_referral_count { ++ struct list_head list; ++ const struct task_struct *task; ++ unsigned int referral_count; ++}; ++ ++static LIST_HEAD(nfs_referral_count_list); ++static DEFINE_SPINLOCK(nfs_referral_count_list_lock); ++ ++static struct nfs_referral_count *nfs_find_referral_count(void) ++{ ++ struct nfs_referral_count *p; ++ ++ list_for_each_entry(p, &nfs_referral_count_list, list) { ++ if (p->task == current) ++ return p; ++ } ++ return NULL; ++} ++ ++#define NFS_MAX_NESTED_REFERRALS 2 ++ ++static int nfs_referral_loop_protect(void) ++{ ++ struct nfs_referral_count *p, *new; ++ int ret = -ENOMEM; ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ goto out; ++ new->task = current; ++ new->referral_count = 1; ++ ++ ret = 0; ++ spin_lock(&nfs_referral_count_list_lock); ++ p = nfs_find_referral_count(); ++ if (p != NULL) { ++ if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) ++ ret = -ELOOP; ++ else ++ p->referral_count++; ++ } else { ++ list_add(&new->list, &nfs_referral_count_list); ++ new = NULL; ++ } ++ spin_unlock(&nfs_referral_count_list_lock); ++ kfree(new); ++out: ++ return ret; ++} ++ ++static void nfs_referral_loop_unprotect(void) ++{ ++ struct nfs_referral_count *p; ++ ++ spin_lock(&nfs_referral_count_list_lock); ++ p = nfs_find_referral_count(); ++ p->referral_count--; ++ if (p->referral_count == 0) ++ list_del(&p->list); ++ else ++ p = NULL; ++ spin_unlock(&nfs_referral_count_list_lock); ++ kfree(p); ++} ++ + static int nfs_follow_remote_path(struct vfsmount *root_mnt, + const char *export_path, struct vfsmount *mnt_target) + { ++ struct nameidata *nd = NULL; + struct mnt_namespace *ns_private; +- struct nameidata nd; + struct 
super_block *s; + int ret; + ++ nd = kmalloc(sizeof(*nd), GFP_KERNEL); ++ if (nd == NULL) ++ return -ENOMEM; ++ + ns_private = create_mnt_ns(root_mnt); + ret = PTR_ERR(ns_private); + if (IS_ERR(ns_private)) + goto out_mntput; + ++ ret = nfs_referral_loop_protect(); ++ if (ret != 0) ++ goto out_put_mnt_ns; ++ + ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, +- export_path, LOOKUP_FOLLOW, &nd); ++ export_path, LOOKUP_FOLLOW, nd); + ++ nfs_referral_loop_unprotect(); + put_mnt_ns(ns_private); + + if (ret != 0) + goto out_err; + +- s = nd.path.mnt->mnt_sb; ++ s = nd->path.mnt->mnt_sb; + atomic_inc(&s->s_active); + mnt_target->mnt_sb = s; +- mnt_target->mnt_root = dget(nd.path.dentry); ++ mnt_target->mnt_root = dget(nd->path.dentry); + + /* Correct the device pathname */ +- nfs_fix_devname(&nd.path, mnt_target); ++ nfs_fix_devname(&nd->path, mnt_target); + +- path_put(&nd.path); ++ path_put(&nd->path); ++ kfree(nd); + down_write(&s->s_umount); + return 0; ++out_put_mnt_ns: ++ put_mnt_ns(ns_private); + out_mntput: + mntput(root_mnt); + out_err: ++ kfree(nd); + return ret; + } + +@@ -2888,17 +2971,21 @@ static int nfs4_remote_referral_get_sb(s + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; +- struct nfs_fh mntfh; ++ struct nfs_fh *mntfh; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; +- int error; ++ int error = -ENOMEM; + + dprintk("--> nfs4_referral_get_sb()\n"); + ++ mntfh = nfs_alloc_fhandle(); ++ if (mntfh == NULL) ++ goto out_err_nofh; ++ + /* create a new volume representation */ +- server = nfs4_create_referral_server(data, &mntfh); ++ server = nfs4_create_referral_server(data, mntfh); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; +@@ -2930,7 +3017,7 @@ static int nfs4_remote_referral_get_sb(s + nfs_fscache_get_super_cookie(s, NULL, data); + } + +- mntroot = nfs4_get_root(s, &mntfh); ++ mntroot = 
nfs4_get_root(s, mntfh); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; +@@ -2947,12 +3034,15 @@ static int nfs4_remote_referral_get_sb(s + + security_sb_clone_mnt_opts(data->sb, s); + ++ nfs_free_fhandle(mntfh); + dprintk("<-- nfs4_referral_get_sb() = 0\n"); + return 0; + + out_err_nosb: + nfs_free_server(server); + out_err_noserver: ++ nfs_free_fhandle(mntfh); ++out_err_nofh: + dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); + return error; + +@@ -2961,6 +3051,7 @@ error_splat_super: + bdi_unregister(&server->backing_dev_info); + error_splat_bdi: + deactivate_locked_super(s); ++ nfs_free_fhandle(mntfh); + dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); + return error; + } +diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-23 11:01:00.375554592 -0400 +@@ -23,6 +23,7 @@ struct nfs_unlinkdata { + struct nfs_removeres res; + struct inode *dir; + struct rpc_cred *cred; ++ struct nfs_fattr dir_attr; + }; + + /** +@@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct den + } + nfs_sb_active(dir->i_sb); + data->args.fh = NFS_FH(dir); +- nfs_fattr_init(&data->res.dir_attr); ++ nfs_fattr_init(data->res.dir_attr); + + NFS_PROTO(dir)->unlink_setup(&msg, dir); + +@@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, stru + goto out_free; + } + data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ data->res.dir_attr = &data->dir_attr; + + status = -EBUSY; + spin_lock(&dentry->d_lock); +diff -up linux-2.6.34.noarch/include/linux/ktime.h.orig linux-2.6.34.noarch/include/linux/ktime.h +--- linux-2.6.34.noarch/include/linux/ktime.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/ktime.h 2010-08-23 11:01:00.377554285 -0400 +@@ -130,7 +130,7 @@ static inline ktime_t timeval_to_ktime(s + /* Convert ktime_t to nanoseconds - NOP 
in the scalar storage format: */ + #define ktime_to_ns(kt) ((kt).tv64) + +-#else ++#else /* !((BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)) */ + + /* + * Helper macros/inlines to get the ktime_t math right in the timespec +@@ -275,7 +275,7 @@ static inline s64 ktime_to_ns(const ktim + return (s64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec; + } + +-#endif ++#endif /* !((BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)) */ + + /** + * ktime_equal - Compares two ktime_t variables to see if they are equal +@@ -295,6 +295,12 @@ static inline s64 ktime_to_us(const ktim + return (s64) tv.tv_sec * USEC_PER_SEC + tv.tv_usec; + } + ++static inline s64 ktime_to_ms(const ktime_t kt) ++{ ++ struct timeval tv = ktime_to_timeval(kt); ++ return (s64) tv.tv_sec * MSEC_PER_SEC + tv.tv_usec / USEC_PER_MSEC; ++} ++ + static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier) + { + return ktime_to_us(ktime_sub(later, earlier)); +diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-23 11:00:23.822502111 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-23 11:01:00.378563926 -0400 +@@ -356,6 +356,20 @@ extern struct nfs_open_context *nfs_find + extern u64 nfs_compat_user_ino64(u64 fileid); + extern void nfs_fattr_init(struct nfs_fattr *fattr); + ++extern struct nfs_fattr *nfs_alloc_fattr(void); ++ ++static inline void nfs_free_fattr(const struct nfs_fattr *fattr) ++{ ++ kfree(fattr); ++} ++ ++extern struct nfs_fh *nfs_alloc_fhandle(void); ++ ++static inline void nfs_free_fhandle(const struct nfs_fh *fh) ++{ ++ kfree(fh); ++} ++ + /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. 
*/ + extern __be32 root_nfs_parse_addr(char *name); /*__init*/ + extern unsigned long nfs_inc_attr_generation_counter(void); +diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-23 11:01:00.380553887 -0400 +@@ -44,7 +44,6 @@ struct nfs_client { + + #ifdef CONFIG_NFS_V4 + u64 cl_clientid; /* constant */ +- nfs4_verifier cl_confirm; + unsigned long cl_state; + + struct rb_root cl_openowner_id; +diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-23 11:01:00.381564072 -0400 +@@ -386,8 +386,8 @@ struct nfs_removeargs { + + struct nfs_removeres { + const struct nfs_server *server; ++ struct nfs_fattr *dir_attr; + struct nfs4_change_info cinfo; +- struct nfs_fattr dir_attr; + struct nfs4_sequence_res seq_res; + }; + +@@ -824,6 +824,11 @@ struct nfs4_setclientid { + u32 sc_cb_ident; + }; + ++struct nfs4_setclientid_res { ++ u64 clientid; ++ nfs4_verifier confirm; ++}; ++ + struct nfs4_statfs_arg { + const struct nfs_fh * fh; + const u32 * bitmask; +diff -up linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h.orig linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h +--- linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/auth_gss.h 2010-08-23 11:01:00.382564026 -0400 +@@ -82,6 +82,7 @@ struct gss_cred { + enum rpc_gss_svc gc_service; + struct gss_cl_ctx *gc_ctx; + struct gss_upcall_msg *gc_upcall; ++ unsigned long gc_upcall_timestamp; + unsigned char gc_machine_cred : 1; + }; + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/auth.h.orig linux-2.6.34.noarch/include/linux/sunrpc/auth.h +--- 
linux-2.6.34.noarch/include/linux/sunrpc/auth.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/auth.h 2010-08-23 11:01:00.382564026 -0400 +@@ -54,6 +54,7 @@ struct rpc_cred { + #define RPCAUTH_CRED_NEW 0 + #define RPCAUTH_CRED_UPTODATE 1 + #define RPCAUTH_CRED_HASHED 2 ++#define RPCAUTH_CRED_NEGATIVE 3 + + #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h.orig linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h +--- linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/gss_api.h 2010-08-23 11:01:00.383574314 -0400 +@@ -35,7 +35,8 @@ int gss_import_sec_context( + const void* input_token, + size_t bufsize, + struct gss_api_mech *mech, +- struct gss_ctx **ctx_id); ++ struct gss_ctx **ctx_id, ++ gfp_t gfp_mask); + u32 gss_get_mic( + struct gss_ctx *ctx_id, + struct xdr_buf *message, +@@ -80,6 +81,8 @@ struct gss_api_mech { + /* pseudoflavors supported by this mechanism: */ + int gm_pf_num; + struct pf_desc * gm_pfs; ++ /* Should the following be a callback operation instead? 
*/ ++ const char *gm_upcall_enctypes; + }; + + /* and must provide the following operations: */ +@@ -87,7 +90,8 @@ struct gss_api_ops { + int (*gss_import_sec_context)( + const void *input_token, + size_t bufsize, +- struct gss_ctx *ctx_id); ++ struct gss_ctx *ctx_id, ++ gfp_t gfp_mask); + u32 (*gss_get_mic)( + struct gss_ctx *ctx_id, + struct xdr_buf *message, +diff -up linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h.orig linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h +--- linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/gss_krb5.h 2010-08-23 11:01:00.383574314 -0400 +@@ -4,7 +4,7 @@ + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@umich.edu> +@@ -36,17 +36,86 @@ + * + */ + ++#include <linux/crypto.h> + #include <linux/sunrpc/auth_gss.h> + #include <linux/sunrpc/gss_err.h> + #include <linux/sunrpc/gss_asn1.h> + ++/* Length of constant used in key derivation */ ++#define GSS_KRB5_K5CLENGTH (5) ++ ++/* Maximum key length (in bytes) for the supported crypto algorithms*/ ++#define GSS_KRB5_MAX_KEYLEN (32) ++ ++/* Maximum checksum function output for the supported crypto algorithms */ ++#define GSS_KRB5_MAX_CKSUM_LEN (20) ++ ++/* Maximum blocksize for the supported crypto algorithms */ ++#define GSS_KRB5_MAX_BLOCKSIZE (16) ++ ++struct krb5_ctx; ++ ++struct gss_krb5_enctype { ++ const u32 etype; /* encryption (key) type */ ++ const u32 ctype; /* checksum type */ ++ const char *name; /* "friendly" name */ ++ const char *encrypt_name; /* crypto encrypt name */ ++ const char *cksum_name; /* crypto checksum name */ ++ const u16 signalg; /* signing algorithm */ ++ const u16 sealalg; /* sealing algorithm */ ++ const u32 blocksize; /* encryption blocksize */ ++ const u32 conflen; /* confounder length ++ (normally the same as ++ the blocksize)
*/ ++ const u32 cksumlength; /* checksum length */ ++ const u32 keyed_cksum; /* is it a keyed cksum? */ ++ const u32 keybytes; /* raw key len, in bytes */ ++ const u32 keylength; /* final key len, in bytes */ ++ u32 (*encrypt) (struct crypto_blkcipher *tfm, ++ void *iv, void *in, void *out, ++ int length); /* encryption function */ ++ u32 (*decrypt) (struct crypto_blkcipher *tfm, ++ void *iv, void *in, void *out, ++ int length); /* decryption function */ ++ u32 (*mk_key) (const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *in, ++ struct xdr_netobj *out); /* complete key generation */ ++ u32 (*encrypt_v2) (struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, int ec, ++ struct page **pages); /* v2 encryption function */ ++ u32 (*decrypt_v2) (struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, u32 *headskip, ++ u32 *tailskip); /* v2 decryption function */ ++}; ++ ++/* krb5_ctx flags definitions */ ++#define KRB5_CTX_FLAG_INITIATOR 0x00000001 ++#define KRB5_CTX_FLAG_CFX 0x00000002 ++#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY 0x00000004 ++ + struct krb5_ctx { + int initiate; /* 1 = initiating, 0 = accepting */ ++ u32 enctype; ++ u32 flags; ++ const struct gss_krb5_enctype *gk5e; /* enctype-specific info */ + struct crypto_blkcipher *enc; + struct crypto_blkcipher *seq; ++ struct crypto_blkcipher *acceptor_enc; ++ struct crypto_blkcipher *initiator_enc; ++ struct crypto_blkcipher *acceptor_enc_aux; ++ struct crypto_blkcipher *initiator_enc_aux; ++ u8 Ksess[GSS_KRB5_MAX_KEYLEN]; /* session key */ ++ u8 cksum[GSS_KRB5_MAX_KEYLEN]; + s32 endtime; + u32 seq_send; ++ u64 seq_send64; + struct xdr_netobj mech_used; ++ u8 initiator_sign[GSS_KRB5_MAX_KEYLEN]; ++ u8 acceptor_sign[GSS_KRB5_MAX_KEYLEN]; ++ u8 initiator_seal[GSS_KRB5_MAX_KEYLEN]; ++ u8 acceptor_seal[GSS_KRB5_MAX_KEYLEN]; ++ u8 initiator_integ[GSS_KRB5_MAX_KEYLEN]; ++ u8 acceptor_integ[GSS_KRB5_MAX_KEYLEN]; + }; + + extern spinlock_t krb5_seq_lock; +@@ -57,6 +126,18 @@ extern spinlock_t krb5_seq_lock; + 
#define KG_TOK_MIC_MSG 0x0101 + #define KG_TOK_WRAP_MSG 0x0201 + ++#define KG2_TOK_INITIAL 0x0101 ++#define KG2_TOK_RESPONSE 0x0202 ++#define KG2_TOK_MIC 0x0404 ++#define KG2_TOK_WRAP 0x0504 ++ ++#define KG2_TOKEN_FLAG_SENTBYACCEPTOR 0x01 ++#define KG2_TOKEN_FLAG_SEALED 0x02 ++#define KG2_TOKEN_FLAG_ACCEPTORSUBKEY 0x04 ++ ++#define KG2_RESP_FLAG_ERROR 0x0001 ++#define KG2_RESP_FLAG_DELEG_OK 0x0002 ++ + enum sgn_alg { + SGN_ALG_DES_MAC_MD5 = 0x0000, + SGN_ALG_MD2_5 = 0x0001, +@@ -81,6 +162,9 @@ enum seal_alg { + #define CKSUMTYPE_RSA_MD5_DES 0x0008 + #define CKSUMTYPE_NIST_SHA 0x0009 + #define CKSUMTYPE_HMAC_SHA1_DES3 0x000c ++#define CKSUMTYPE_HMAC_SHA1_96_AES128 0x000f ++#define CKSUMTYPE_HMAC_SHA1_96_AES256 0x0010 ++#define CKSUMTYPE_HMAC_MD5_ARCFOUR -138 /* Microsoft md5 hmac cksumtype */ + + /* from gssapi_err_krb5.h */ + #define KG_CCACHE_NOMATCH (39756032L) +@@ -111,11 +195,56 @@ enum seal_alg { + #define ENCTYPE_DES3_CBC_RAW 0x0006 /* DES-3 cbc mode raw */ + #define ENCTYPE_DES_HMAC_SHA1 0x0008 + #define ENCTYPE_DES3_CBC_SHA1 0x0010 ++#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011 ++#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012 ++#define ENCTYPE_ARCFOUR_HMAC 0x0017 ++#define ENCTYPE_ARCFOUR_HMAC_EXP 0x0018 + #define ENCTYPE_UNKNOWN 0x01ff + +-s32 +-make_checksum(char *, char *header, int hdrlen, struct xdr_buf *body, +- int body_offset, struct xdr_netobj *cksum); ++/* ++ * Constants used for key derivation ++ */ ++/* for 3DES */ ++#define KG_USAGE_SEAL (22) ++#define KG_USAGE_SIGN (23) ++#define KG_USAGE_SEQ (24) ++ ++/* from rfc3961 */ ++#define KEY_USAGE_SEED_CHECKSUM (0x99) ++#define KEY_USAGE_SEED_ENCRYPTION (0xAA) ++#define KEY_USAGE_SEED_INTEGRITY (0x55) ++ ++/* from rfc4121 */ ++#define KG_USAGE_ACCEPTOR_SEAL (22) ++#define KG_USAGE_ACCEPTOR_SIGN (23) ++#define KG_USAGE_INITIATOR_SEAL (24) ++#define KG_USAGE_INITIATOR_SIGN (25) ++ ++/* ++ * This compile-time check verifies that we will not exceed the ++ * slack space allotted by the client and server 
auth_gss code ++ * before they call gss_wrap(). ++ */ ++#define GSS_KRB5_MAX_SLACK_NEEDED \ ++ (GSS_KRB5_TOK_HDR_LEN /* gss token header */ \ ++ + GSS_KRB5_MAX_CKSUM_LEN /* gss token checksum */ \ ++ + GSS_KRB5_MAX_BLOCKSIZE /* confounder */ \ ++ + GSS_KRB5_MAX_BLOCKSIZE /* possible padding */ \ ++ + GSS_KRB5_TOK_HDR_LEN /* encrypted hdr in v2 token */\ ++ + GSS_KRB5_MAX_CKSUM_LEN /* encryption hmac */ \ ++ + 4 + 4 /* RPC verifier */ \ ++ + GSS_KRB5_TOK_HDR_LEN \ ++ + GSS_KRB5_MAX_CKSUM_LEN) ++ ++u32 ++make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout); ++ ++u32 ++make_checksum_v2(struct krb5_ctx *, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *key, ++ unsigned int usage, struct xdr_netobj *cksum); + + u32 gss_get_mic_kerberos(struct gss_ctx *, struct xdr_buf *, + struct xdr_netobj *); +@@ -149,11 +278,54 @@ gss_decrypt_xdr_buf(struct crypto_blkcip + int offset); + + s32 +-krb5_make_seq_num(struct crypto_blkcipher *key, ++krb5_make_seq_num(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *key, + int direction, + u32 seqnum, unsigned char *cksum, unsigned char *buf); + + s32 +-krb5_get_seq_num(struct crypto_blkcipher *key, ++krb5_get_seq_num(struct krb5_ctx *kctx, + unsigned char *cksum, + unsigned char *buf, int *direction, u32 *seqnum); ++ ++int ++xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen); ++ ++u32 ++krb5_derive_key(const struct gss_krb5_enctype *gk5e, ++ const struct xdr_netobj *inkey, ++ struct xdr_netobj *outkey, ++ const struct xdr_netobj *in_constant, ++ gfp_t gfp_mask); ++ ++u32 ++gss_krb5_des3_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key); ++ ++u32 ++gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key); ++ ++u32 ++gss_krb5_aes_encrypt(struct krb5_ctx 
*kctx, u32 offset, ++ struct xdr_buf *buf, int ec, ++ struct page **pages); ++ ++u32 ++gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, u32 *plainoffset, ++ u32 *plainlen); ++ ++int ++krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *cipher, ++ unsigned char *cksum); ++ ++int ++krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *cipher, ++ s32 seqnum); ++void ++gss_krb5_make_confounder(char *p, u32 conflen); +diff -up linux-2.6.34.noarch/include/linux/sunrpc/metrics.h.orig linux-2.6.34.noarch/include/linux/sunrpc/metrics.h +--- linux-2.6.34.noarch/include/linux/sunrpc/metrics.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/metrics.h 2010-08-23 11:01:00.384611889 -0400 +@@ -26,6 +26,7 @@ + #define _LINUX_SUNRPC_METRICS_H + + #include ++#include + + #define RPC_IOSTATS_VERS "1.0" + +@@ -58,9 +59,9 @@ struct rpc_iostats { + * and the total time the request spent from init to release + * are measured. 
+ */ +- unsigned long long om_queue, /* jiffies queued for xmit */ +- om_rtt, /* jiffies for RPC RTT */ +- om_execute; /* jiffies for RPC execution */ ++ ktime_t om_queue, /* queued for xmit */ ++ om_rtt, /* RPC RTT */ ++ om_execute; /* RPC execution */ + } ____cacheline_aligned; + + struct rpc_task; +diff -up linux-2.6.34.noarch/include/linux/sunrpc/sched.h.orig linux-2.6.34.noarch/include/linux/sunrpc/sched.h +--- linux-2.6.34.noarch/include/linux/sunrpc/sched.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/sched.h 2010-08-23 11:01:00.385361873 -0400 +@@ -10,6 +10,7 @@ + #define _LINUX_SUNRPC_SCHED_H_ + + #include ++#include + #include + #include + #include +@@ -40,21 +41,15 @@ struct rpc_wait { + * This is the RPC task struct + */ + struct rpc_task { +-#ifdef RPC_DEBUG +- unsigned long tk_magic; /* 0xf00baa */ +-#endif + atomic_t tk_count; /* Reference count */ + struct list_head tk_task; /* global list of tasks */ + struct rpc_clnt * tk_client; /* RPC client */ + struct rpc_rqst * tk_rqstp; /* RPC request */ +- int tk_status; /* result of last operation */ + + /* + * RPC call state + */ + struct rpc_message tk_msg; /* RPC call info */ +- __u8 tk_garb_retry; +- __u8 tk_cred_retry; + + /* + * callback to be executed after waking up +@@ -67,7 +62,6 @@ struct rpc_task { + void * tk_calldata; + + unsigned long tk_timeout; /* timeout for rpc_sleep() */ +- unsigned short tk_flags; /* misc flags */ + unsigned long tk_runstate; /* Task run status */ + struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could + * be any workqueue +@@ -78,17 +72,19 @@ struct rpc_task { + struct rpc_wait tk_wait; /* RPC wait */ + } u; + +- unsigned short tk_timeouts; /* maj timeouts */ +- size_t tk_bytes_sent; /* total bytes sent */ +- unsigned long tk_start; /* RPC task init timestamp */ +- long tk_rtt; /* round-trip time (jiffies) */ ++ ktime_t tk_start; /* RPC task init timestamp */ + + pid_t tk_owner; /* Process id for batching tasks 
*/ +- unsigned char tk_priority : 2;/* Task priority */ ++ int tk_status; /* result of last operation */ ++ unsigned short tk_flags; /* misc flags */ ++ unsigned short tk_timeouts; /* maj timeouts */ + + #ifdef RPC_DEBUG + unsigned short tk_pid; /* debugging aid */ + #endif ++ unsigned char tk_priority : 2,/* Task priority */ ++ tk_garb_retry : 2, ++ tk_cred_retry : 2; + }; + #define tk_xprt tk_client->cl_xprt + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-23 11:01:00.385361873 -0400 +@@ -1,7 +1,10 @@ + /* +- * include/linux/sunrpc/xdr.h ++ * XDR standard data types and function declarations + * + * Copyright (C) 1995-1997 Olaf Kirch ++ * ++ * Based on: ++ * RFC 4506 "XDR: External Data Representation Standard", May 2006 + */ + + #ifndef _SUNRPC_XDR_H_ +@@ -62,7 +65,6 @@ struct xdr_buf { + + unsigned int buflen, /* Total length of storage buffer */ + len; /* Length of XDR encoded message */ +- + }; + + /* +@@ -178,7 +180,7 @@ struct xdr_array2_desc { + }; + + extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base, +- struct xdr_array2_desc *desc); ++ struct xdr_array2_desc *desc); + extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc); + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xprt.h +--- linux-2.6.34.noarch/include/linux/sunrpc/xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xprt.h 2010-08-23 11:01:00.386574704 -0400 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -65,8 +66,6 @@ struct rpc_rqst { + struct rpc_task * rq_task; /* RPC task data */ + __be32 rq_xid; /* request XID */ + int rq_cong; /* has incremented xprt->cong */ +- int 
rq_reply_bytes_recvd; /* number of reply */ +- /* bytes received */ + u32 rq_seqno; /* gss seq no. used on req. */ + int rq_enc_pages_num; + struct page **rq_enc_pages; /* scratch pages for use by +@@ -77,12 +76,16 @@ struct rpc_rqst { + __u32 * rq_buffer; /* XDR encode buffer */ + size_t rq_callsize, + rq_rcvsize; ++ size_t rq_xmit_bytes_sent; /* total bytes sent */ ++ size_t rq_reply_bytes_recvd; /* total reply bytes */ ++ /* received */ + + struct xdr_buf rq_private_buf; /* The receive buffer + * used in the softirq. + */ + unsigned long rq_majortimeo; /* major timeout alarm */ + unsigned long rq_timeout; /* Current timeout value */ ++ ktime_t rq_rtt; /* round-trip time */ + unsigned int rq_retries; /* # of retries */ + unsigned int rq_connect_cookie; + /* A cookie used to track the +@@ -94,7 +97,7 @@ struct rpc_rqst { + */ + u32 rq_bytes_sent; /* Bytes we have sent */ + +- unsigned long rq_xtime; /* when transmitted */ ++ ktime_t rq_xtime; /* transmit time stamp */ + int rq_ntrans; + + #if defined(CONFIG_NFS_V4_1) +@@ -174,8 +177,7 @@ struct rpc_xprt { + /* + * Connection of transports + */ +- unsigned long connect_timeout, +- bind_timeout, ++ unsigned long bind_timeout, + reestablish_timeout; + unsigned int connect_cookie; /* A cookie that gets bumped + every time the transport +@@ -294,7 +296,6 @@ void xprt_set_retrans_timeout_rtt(stru + void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); + void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action); + void xprt_write_space(struct rpc_xprt *xprt); +-void xprt_update_rtt(struct rpc_task *task); + void xprt_adjust_cwnd(struct rpc_task *task, int result); + struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid); + void xprt_complete_rqst(struct rpc_task *task, int copied); +diff -up linux-2.6.34.noarch/net/sunrpc/auth.c.orig linux-2.6.34.noarch/net/sunrpc/auth.c +--- linux-2.6.34.noarch/net/sunrpc/auth.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ 
linux-2.6.34.noarch/net/sunrpc/auth.c 2010-08-23 11:01:00.387574079 -0400 +@@ -236,10 +236,15 @@ rpcauth_prune_expired(struct list_head * + + list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) { + +- /* Enforce a 60 second garbage collection moratorium */ ++ if (nr_to_scan-- == 0) ++ break; ++ /* ++ * Enforce a 60 second garbage collection moratorium ++ * Note that the cred_unused list must be time-ordered. ++ */ + if (time_in_range(cred->cr_expire, expired, jiffies) && + test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) +- continue; ++ return 0; + + list_del_init(&cred->cr_lru); + number_cred_unused--; +@@ -252,13 +257,10 @@ rpcauth_prune_expired(struct list_head * + get_rpccred(cred); + list_add_tail(&cred->cr_lru, free); + rpcauth_unhash_cred_locked(cred); +- nr_to_scan--; + } + spin_unlock(cache_lock); +- if (nr_to_scan == 0) +- break; + } +- return nr_to_scan; ++ return (number_cred_unused / 100) * sysctl_vfs_cache_pressure; + } + + /* +@@ -270,11 +272,12 @@ rpcauth_cache_shrinker(int nr_to_scan, g + LIST_HEAD(free); + int res; + ++ if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) ++ return (nr_to_scan == 0) ? 
0 : -1; + if (list_empty(&cred_unused)) + return 0; + spin_lock(&rpc_credcache_lock); +- nr_to_scan = rpcauth_prune_expired(&free, nr_to_scan); +- res = (number_cred_unused / 100) * sysctl_vfs_cache_pressure; ++ res = rpcauth_prune_expired(&free, nr_to_scan); + spin_unlock(&rpc_credcache_lock); + rpcauth_destroy_credlist(&free); + return res; +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/auth_gss.c 2010-08-23 11:01:00.388574680 -0400 +@@ -57,11 +57,14 @@ static const struct rpc_authops authgss_ + static const struct rpc_credops gss_credops; + static const struct rpc_credops gss_nullops; + ++#define GSS_RETRY_EXPIRED 5 ++static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED; ++ + #ifdef RPC_DEBUG + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + +-#define GSS_CRED_SLACK 1024 ++#define GSS_CRED_SLACK (RPC_MAX_AUTH_SIZE * 2) + /* length of a krb5 verifier (48), plus data added before arguments when + * using integrity (two 4-byte integers): */ + #define GSS_VERF_SLACK 100 +@@ -229,7 +232,7 @@ gss_fill_context(const void *p, const vo + p = ERR_PTR(-EFAULT); + goto err; + } +- ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx); ++ ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, GFP_NOFS); + if (ret < 0) { + p = ERR_PTR(ret); + goto err; +@@ -350,6 +353,24 @@ gss_unhash_msg(struct gss_upcall_msg *gs + } + + static void ++gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss_msg) ++{ ++ switch (gss_msg->msg.errno) { ++ case 0: ++ if (gss_msg->ctx == NULL) ++ break; ++ clear_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); ++ gss_cred_set_ctx(&gss_cred->gc_base, gss_msg->ctx); ++ break; ++ case -EKEYEXPIRED: ++ set_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); ++ } ++ 
gss_cred->gc_upcall_timestamp = jiffies; ++ gss_cred->gc_upcall = NULL; ++ rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); ++} ++ ++static void + gss_upcall_callback(struct rpc_task *task) + { + struct gss_cred *gss_cred = container_of(task->tk_msg.rpc_cred, +@@ -358,13 +379,9 @@ gss_upcall_callback(struct rpc_task *tas + struct inode *inode = &gss_msg->inode->vfs_inode; + + spin_lock(&inode->i_lock); +- if (gss_msg->ctx) +- gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_msg->ctx); +- else +- task->tk_status = gss_msg->msg.errno; +- gss_cred->gc_upcall = NULL; +- rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); ++ gss_handle_downcall_result(gss_cred, gss_msg); + spin_unlock(&inode->i_lock); ++ task->tk_status = gss_msg->msg.errno; + gss_release_msg(gss_msg); + } + +@@ -377,11 +394,12 @@ static void gss_encode_v0_msg(struct gss + static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, + struct rpc_clnt *clnt, int machine_cred) + { ++ struct gss_api_mech *mech = gss_msg->auth->mech; + char *p = gss_msg->databuf; + int len = 0; + + gss_msg->msg.len = sprintf(gss_msg->databuf, "mech=%s uid=%d ", +- gss_msg->auth->mech->gm_name, ++ mech->gm_name, + gss_msg->uid); + p += gss_msg->msg.len; + if (clnt->cl_principal) { +@@ -398,6 +416,11 @@ static void gss_encode_v1_msg(struct gss + p += len; + gss_msg->msg.len += len; + } ++ if (mech->gm_upcall_enctypes) { ++ len = sprintf(p, "%s", mech->gm_upcall_enctypes); ++ p += len; ++ gss_msg->msg.len += len; ++ } + len = sprintf(p, "\n"); + gss_msg->msg.len += len; + +@@ -507,18 +530,16 @@ gss_refresh_upcall(struct rpc_task *task + spin_lock(&inode->i_lock); + if (gss_cred->gc_upcall != NULL) + rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); +- else if (gss_msg->ctx != NULL) { +- gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_msg->ctx); +- gss_cred->gc_upcall = NULL; +- rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); +- } else if (gss_msg->msg.errno >= 0) { ++ else if 
(gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { + task->tk_timeout = 0; + gss_cred->gc_upcall = gss_msg; + /* gss_upcall_callback will release the reference to gss_upcall_msg */ + atomic_inc(&gss_msg->count); + rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback); +- } else ++ } else { ++ gss_handle_downcall_result(gss_cred, gss_msg); + err = gss_msg->msg.errno; ++ } + spin_unlock(&inode->i_lock); + gss_release_msg(gss_msg); + out: +@@ -1117,6 +1138,23 @@ static int gss_renew_cred(struct rpc_tas + return 0; + } + ++static int gss_cred_is_negative_entry(struct rpc_cred *cred) ++{ ++ if (test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) { ++ unsigned long now = jiffies; ++ unsigned long begin, expire; ++ struct gss_cred *gss_cred; ++ ++ gss_cred = container_of(cred, struct gss_cred, gc_base); ++ begin = gss_cred->gc_upcall_timestamp; ++ expire = begin + gss_expired_cred_retry_delay * HZ; ++ ++ if (time_in_range_open(now, begin, expire)) ++ return 1; ++ } ++ return 0; ++} ++ + /* + * Refresh credentials. XXX - finish + */ +@@ -1126,6 +1164,9 @@ gss_refresh(struct rpc_task *task) + struct rpc_cred *cred = task->tk_msg.rpc_cred; + int ret = 0; + ++ if (gss_cred_is_negative_entry(cred)) ++ return -EKEYEXPIRED; ++ + if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && + !test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) { + ret = gss_renew_cred(task); +@@ -1316,15 +1357,21 @@ gss_wrap_req_priv(struct rpc_cred *cred, + inpages = snd_buf->pages + first; + snd_buf->pages = rqstp->rq_enc_pages; + snd_buf->page_base -= first << PAGE_CACHE_SHIFT; +- /* Give the tail its own page, in case we need extra space in the +- * head when wrapping: */ ++ /* ++ * Give the tail its own page, in case we need extra space in the ++ * head when wrapping: ++ * ++ * call_allocate() allocates twice the slack space required ++ * by the authentication flavor to rq_callsize. ++ * For GSS, slack is GSS_CRED_SLACK. 
++ */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) { + tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); + memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); + snd_buf->tail[0].iov_base = tmp; + } + maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); +- /* RPC_SLACK_SPACE should prevent this ever happening: */ ++ /* slack space should prevent this ever happening: */ + BUG_ON(snd_buf->len > snd_buf->buflen); + status = -EIO; + /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was +@@ -1573,5 +1620,11 @@ static void __exit exit_rpcsec_gss(void) + } + + MODULE_LICENSE("GPL"); ++module_param_named(expired_cred_retry_delay, ++ gss_expired_cred_retry_delay, ++ uint, 0644); ++MODULE_PARM_DESC(expired_cred_retry_delay, "Timeout (in seconds) until " ++ "the RPC engine retries an expired credential"); ++ + module_init(init_rpcsec_gss) + module_exit(exit_rpcsec_gss) +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_crypto.c 2010-08-23 11:01:00.390553891 -0400 +@@ -1,7 +1,7 @@ + /* + * linux/net/sunrpc/gss_krb5_crypto.c + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson +@@ -41,6 +41,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -58,13 +59,13 @@ krb5_encrypt( + { + u32 ret = -EINVAL; + struct scatterlist sg[1]; +- u8 local_iv[16] = {0}; ++ u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; + struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv }; + + if (length % crypto_blkcipher_blocksize(tfm) != 0) + goto out; + +- if (crypto_blkcipher_ivsize(tfm) > 16) { ++ if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n", + crypto_blkcipher_ivsize(tfm)); + goto out; +@@ -92,13 +93,13 @@ krb5_decrypt( + { + u32 ret = -EINVAL; + struct scatterlist sg[1]; +- u8 local_iv[16] = {0}; ++ u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; + struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv }; + + if (length % crypto_blkcipher_blocksize(tfm) != 0) + goto out; + +- if (crypto_blkcipher_ivsize(tfm) > 16) { ++ if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n", + crypto_blkcipher_ivsize(tfm)); + goto out; +@@ -123,21 +124,155 @@ checksummer(struct scatterlist *sg, void + return crypto_hash_update(desc, sg, sg->length); + } + +-/* checksum the plaintext data and hdrlen bytes of the token header */ +-s32 +-make_checksum(char *cksumname, char *header, int hdrlen, struct xdr_buf *body, +- int body_offset, struct xdr_netobj *cksum) ++static int ++arcfour_hmac_md5_usage_to_salt(unsigned int usage, u8 salt[4]) ++{ ++ unsigned int ms_usage; ++ ++ switch (usage) { ++ case KG_USAGE_SIGN: ++ ms_usage = 15; ++ break; ++ case KG_USAGE_SEAL: ++ ms_usage = 13; ++ break; ++ default: ++ return -EINVAL; ++ } ++ salt[0] = (ms_usage >> 0) & 0xff; ++ salt[1] = (ms_usage >> 8) & 0xff; ++ salt[2] = (ms_usage >> 16) & 0xff; ++ salt[3] = (ms_usage >> 24) & 0xff; ++ ++ return 0; ++} ++ ++static u32 ++make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf 
*body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout) + { +- struct hash_desc desc; /* XXX add to ctx? */ ++ struct hash_desc desc; + struct scatterlist sg[1]; + int err; ++ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ u8 rc4salt[4]; ++ struct crypto_hash *md5; ++ struct crypto_hash *hmac_md5; ++ ++ if (cksumkey == NULL) ++ return GSS_S_FAILURE; ++ ++ if (cksumout->len < kctx->gk5e->cksumlength) { ++ dprintk("%s: checksum buffer length, %u, too small for %s\n", ++ __func__, cksumout->len, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } ++ ++ if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) { ++ dprintk("%s: invalid usage value %u\n", __func__, usage); ++ return GSS_S_FAILURE; ++ } ++ ++ md5 = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(md5)) ++ return GSS_S_FAILURE; ++ ++ hmac_md5 = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac_md5)) { ++ crypto_free_hash(md5); ++ return GSS_S_FAILURE; ++ } ++ ++ desc.tfm = md5; ++ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out; ++ sg_init_one(sg, rc4salt, 4); ++ err = crypto_hash_update(&desc, sg, 4); ++ if (err) ++ goto out; ++ ++ sg_init_one(sg, header, hdrlen); ++ err = crypto_hash_update(&desc, sg, hdrlen); ++ if (err) ++ goto out; ++ err = xdr_process_buf(body, body_offset, body->len - body_offset, ++ checksummer, &desc); ++ if (err) ++ goto out; ++ err = crypto_hash_final(&desc, checksumdata); ++ if (err) ++ goto out; ++ ++ desc.tfm = hmac_md5; ++ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out; ++ err = crypto_hash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength); ++ if (err) ++ goto out; ++ ++ sg_init_one(sg, checksumdata, crypto_hash_digestsize(md5)); ++ err = crypto_hash_digest(&desc, sg, crypto_hash_digestsize(md5), ++ checksumdata); ++ if (err) ++ goto out; ++ ++ memcpy(cksumout->data, checksumdata, 
kctx->gk5e->cksumlength); ++ cksumout->len = kctx->gk5e->cksumlength; ++out: ++ crypto_free_hash(md5); ++ crypto_free_hash(hmac_md5); ++ return err ? GSS_S_FAILURE : 0; ++} ++ ++/* ++ * checksum the plaintext data and hdrlen bytes of the token header ++ * The checksum is performed over the first 8 bytes of the ++ * gss token header and then over the data body ++ */ ++u32 ++make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout) ++{ ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ int err; ++ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ unsigned int checksumlen; ++ ++ if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR) ++ return make_checksum_hmac_md5(kctx, header, hdrlen, ++ body, body_offset, ++ cksumkey, usage, cksumout); ++ ++ if (cksumout->len < kctx->gk5e->cksumlength) { ++ dprintk("%s: checksum buffer length, %u, too small for %s\n", ++ __func__, cksumout->len, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } + +- desc.tfm = crypto_alloc_hash(cksumname, 0, CRYPTO_ALG_ASYNC); ++ desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(desc.tfm)) + return GSS_S_FAILURE; +- cksum->len = crypto_hash_digestsize(desc.tfm); + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + ++ checksumlen = crypto_hash_digestsize(desc.tfm); ++ ++ if (cksumkey != NULL) { ++ err = crypto_hash_setkey(desc.tfm, cksumkey, ++ kctx->gk5e->keylength); ++ if (err) ++ goto out; ++ } ++ + err = crypto_hash_init(&desc); + if (err) + goto out; +@@ -149,15 +284,109 @@ make_checksum(char *cksumname, char *hea + checksummer, &desc); + if (err) + goto out; +- err = crypto_hash_final(&desc, cksum->data); ++ err = crypto_hash_final(&desc, checksumdata); ++ if (err) ++ goto out; + ++ switch (kctx->gk5e->ctype) { ++ case CKSUMTYPE_RSA_MD5: ++ err = kctx->gk5e->encrypt(kctx->seq, NULL, checksumdata, ++ checksumdata, checksumlen); ++ if (err) ++ goto out; ++ 
memcpy(cksumout->data, ++ checksumdata + checksumlen - kctx->gk5e->cksumlength, ++ kctx->gk5e->cksumlength); ++ break; ++ case CKSUMTYPE_HMAC_SHA1_DES3: ++ memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); ++ break; ++ default: ++ BUG(); ++ break; ++ } ++ cksumout->len = kctx->gk5e->cksumlength; ++out: ++ crypto_free_hash(desc.tfm); ++ return err ? GSS_S_FAILURE : 0; ++} ++ ++/* ++ * checksum the plaintext data and hdrlen bytes of the token header ++ * Per rfc4121, sec. 4.2.4, the checksum is performed over the data ++ * body then over the first 16 octets of the MIC token ++ * Inclusion of the header data in the calculation of the ++ * checksum is optional. ++ */ ++u32 ++make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, ++ struct xdr_buf *body, int body_offset, u8 *cksumkey, ++ unsigned int usage, struct xdr_netobj *cksumout) ++{ ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ int err; ++ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ unsigned int checksumlen; ++ ++ if (kctx->gk5e->keyed_cksum == 0) { ++ dprintk("%s: expected keyed hash for %s\n", ++ __func__, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } ++ if (cksumkey == NULL) { ++ dprintk("%s: no key supplied for %s\n", ++ __func__, kctx->gk5e->name); ++ return GSS_S_FAILURE; ++ } ++ ++ desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(desc.tfm)) ++ return GSS_S_FAILURE; ++ checksumlen = crypto_hash_digestsize(desc.tfm); ++ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; ++ ++ err = crypto_hash_setkey(desc.tfm, cksumkey, kctx->gk5e->keylength); ++ if (err) ++ goto out; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out; ++ err = xdr_process_buf(body, body_offset, body->len - body_offset, ++ checksummer, &desc); ++ if (err) ++ goto out; ++ if (header != NULL) { ++ sg_init_one(sg, header, hdrlen); ++ err = crypto_hash_update(&desc, sg, hdrlen); ++ if (err) ++ goto out; ++ } ++ err = crypto_hash_final(&desc, checksumdata); ++ if 
(err) ++ goto out; ++ ++ cksumout->len = kctx->gk5e->cksumlength; ++ ++ switch (kctx->gk5e->ctype) { ++ case CKSUMTYPE_HMAC_SHA1_96_AES128: ++ case CKSUMTYPE_HMAC_SHA1_96_AES256: ++ /* note that this truncates the hash */ ++ memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); ++ break; ++ default: ++ BUG(); ++ break; ++ } + out: + crypto_free_hash(desc.tfm); + return err ? GSS_S_FAILURE : 0; + } + + struct encryptor_desc { +- u8 iv[8]; /* XXX hard-coded blocksize */ ++ u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; + struct blkcipher_desc desc; + int pos; + struct xdr_buf *outbuf; +@@ -198,7 +427,7 @@ encryptor(struct scatterlist *sg, void * + desc->fraglen += sg->length; + desc->pos += sg->length; + +- fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1); + thislen -= fraglen; + + if (thislen == 0) +@@ -256,7 +485,7 @@ gss_encrypt_xdr_buf(struct crypto_blkcip + } + + struct decryptor_desc { +- u8 iv[8]; /* XXX hard-coded blocksize */ ++ u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; + struct blkcipher_desc desc; + struct scatterlist frags[4]; + int fragno; +@@ -278,7 +507,7 @@ decryptor(struct scatterlist *sg, void * + desc->fragno++; + desc->fraglen += sg->length; + +- fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1); + thislen -= fraglen; + + if (thislen == 0) +@@ -325,3 +554,437 @@ gss_decrypt_xdr_buf(struct crypto_blkcip + + return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); + } ++ ++/* ++ * This function makes the assumption that it was ultimately called ++ * from gss_wrap(). ++ * ++ * The client auth_gss code moves any existing tail data into a ++ * separate page before calling gss_wrap. ++ * The server svcauth_gss code ensures that both the head and the ++ * tail have slack space of RPC_MAX_AUTH_SIZE before calling gss_wrap. 
++ * ++ * Even with that guarantee, this function may be called more than ++ * once in the processing of gss_wrap(). The best we can do is ++ * verify at compile-time (see GSS_KRB5_MAX_SLACK_NEEDED) that the ++ * largest expected shift will fit within RPC_MAX_AUTH_SIZE. ++ * At run-time we can verify that a single invocation of this ++ * function doesn't attempt to use more than RPC_MAX_AUTH_SIZE. ++ */ ++ ++int ++xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen) ++{ ++ u8 *p; ++ ++ if (shiftlen == 0) ++ return 0; ++ ++ BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE); ++ BUG_ON(shiftlen > RPC_MAX_AUTH_SIZE); ++ ++ p = buf->head[0].iov_base + base; ++ ++ memmove(p + shiftlen, p, buf->head[0].iov_len - base); ++ ++ buf->head[0].iov_len += shiftlen; ++ buf->len += shiftlen; ++ ++ return 0; ++} ++ ++static u32 ++gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf, ++ u32 offset, u8 *iv, struct page **pages, int encrypt) ++{ ++ u32 ret; ++ struct scatterlist sg[1]; ++ struct blkcipher_desc desc = { .tfm = cipher, .info = iv }; ++ u8 data[crypto_blkcipher_blocksize(cipher) * 2]; ++ struct page **save_pages; ++ u32 len = buf->len - offset; ++ ++ BUG_ON(len > crypto_blkcipher_blocksize(cipher) * 2); ++ ++ /* ++ * For encryption, we want to read from the cleartext ++ * page cache pages, and write the encrypted data to ++ * the supplied xdr_buf pages. 
++ */ ++ save_pages = buf->pages; ++ if (encrypt) ++ buf->pages = pages; ++ ++ ret = read_bytes_from_xdr_buf(buf, offset, data, len); ++ buf->pages = save_pages; ++ if (ret) ++ goto out; ++ ++ sg_init_one(sg, data, len); ++ ++ if (encrypt) ++ ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len); ++ else ++ ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, len); ++ ++ if (ret) ++ goto out; ++ ++ ret = write_bytes_to_xdr_buf(buf, offset, data, len); ++ ++out: ++ return ret; ++} ++ ++u32 ++gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, int ec, struct page **pages) ++{ ++ u32 err; ++ struct xdr_netobj hmac; ++ u8 *cksumkey; ++ u8 *ecptr; ++ struct crypto_blkcipher *cipher, *aux_cipher; ++ int blocksize; ++ struct page **save_pages; ++ int nblocks, nbytes; ++ struct encryptor_desc desc; ++ u32 cbcbytes; ++ unsigned int usage; ++ ++ if (kctx->initiate) { ++ cipher = kctx->initiator_enc; ++ aux_cipher = kctx->initiator_enc_aux; ++ cksumkey = kctx->initiator_integ; ++ usage = KG_USAGE_INITIATOR_SEAL; ++ } else { ++ cipher = kctx->acceptor_enc; ++ aux_cipher = kctx->acceptor_enc_aux; ++ cksumkey = kctx->acceptor_integ; ++ usage = KG_USAGE_ACCEPTOR_SEAL; ++ } ++ blocksize = crypto_blkcipher_blocksize(cipher); ++ ++ /* hide the gss token header and insert the confounder */ ++ offset += GSS_KRB5_TOK_HDR_LEN; ++ if (xdr_extend_head(buf, offset, kctx->gk5e->conflen)) ++ return GSS_S_FAILURE; ++ gss_krb5_make_confounder(buf->head[0].iov_base + offset, kctx->gk5e->conflen); ++ offset -= GSS_KRB5_TOK_HDR_LEN; ++ ++ if (buf->tail[0].iov_base != NULL) { ++ ecptr = buf->tail[0].iov_base + buf->tail[0].iov_len; ++ } else { ++ buf->tail[0].iov_base = buf->head[0].iov_base ++ + buf->head[0].iov_len; ++ buf->tail[0].iov_len = 0; ++ ecptr = buf->tail[0].iov_base; ++ } ++ ++ memset(ecptr, 'X', ec); ++ buf->tail[0].iov_len += ec; ++ buf->len += ec; ++ ++ /* copy plaintext gss token header after filler (if any) */ ++ memcpy(ecptr + ec, buf->head[0].iov_base + 
offset, ++ GSS_KRB5_TOK_HDR_LEN); ++ buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN; ++ buf->len += GSS_KRB5_TOK_HDR_LEN; ++ ++ /* Do the HMAC */ ++ hmac.len = GSS_KRB5_MAX_CKSUM_LEN; ++ hmac.data = buf->tail[0].iov_base + buf->tail[0].iov_len; ++ ++ /* ++ * When we are called, pages points to the real page cache ++ * data -- which we can't go and encrypt! buf->pages points ++ * to scratch pages which we are going to send off to the ++ * client/server. Swap in the plaintext pages to calculate ++ * the hmac. ++ */ ++ save_pages = buf->pages; ++ buf->pages = pages; ++ ++ err = make_checksum_v2(kctx, NULL, 0, buf, ++ offset + GSS_KRB5_TOK_HDR_LEN, ++ cksumkey, usage, &hmac); ++ buf->pages = save_pages; ++ if (err) ++ return GSS_S_FAILURE; ++ ++ nbytes = buf->len - offset - GSS_KRB5_TOK_HDR_LEN; ++ nblocks = (nbytes + blocksize - 1) / blocksize; ++ cbcbytes = 0; ++ if (nblocks > 2) ++ cbcbytes = (nblocks - 2) * blocksize; ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ ++ if (cbcbytes) { ++ desc.pos = offset + GSS_KRB5_TOK_HDR_LEN; ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ desc.pages = pages; ++ desc.outbuf = buf; ++ desc.desc.info = desc.iv; ++ desc.desc.flags = 0; ++ desc.desc.tfm = aux_cipher; ++ ++ sg_init_table(desc.infrags, 4); ++ sg_init_table(desc.outfrags, 4); ++ ++ err = xdr_process_buf(buf, offset + GSS_KRB5_TOK_HDR_LEN, ++ cbcbytes, encryptor, &desc); ++ if (err) ++ goto out_err; ++ } ++ ++ /* Make sure IV carries forward from any CBC results. 
*/ ++ err = gss_krb5_cts_crypt(cipher, buf, ++ offset + GSS_KRB5_TOK_HDR_LEN + cbcbytes, ++ desc.iv, pages, 1); ++ if (err) { ++ err = GSS_S_FAILURE; ++ goto out_err; ++ } ++ ++ /* Now update buf to account for HMAC */ ++ buf->tail[0].iov_len += kctx->gk5e->cksumlength; ++ buf->len += kctx->gk5e->cksumlength; ++ ++out_err: ++ if (err) ++ err = GSS_S_FAILURE; ++ return err; ++} ++ ++u32 ++gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, ++ u32 *headskip, u32 *tailskip) ++{ ++ struct xdr_buf subbuf; ++ u32 ret = 0; ++ u8 *cksum_key; ++ struct crypto_blkcipher *cipher, *aux_cipher; ++ struct xdr_netobj our_hmac_obj; ++ u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN]; ++ u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN]; ++ int nblocks, blocksize, cbcbytes; ++ struct decryptor_desc desc; ++ unsigned int usage; ++ ++ if (kctx->initiate) { ++ cipher = kctx->acceptor_enc; ++ aux_cipher = kctx->acceptor_enc_aux; ++ cksum_key = kctx->acceptor_integ; ++ usage = KG_USAGE_ACCEPTOR_SEAL; ++ } else { ++ cipher = kctx->initiator_enc; ++ aux_cipher = kctx->initiator_enc_aux; ++ cksum_key = kctx->initiator_integ; ++ usage = KG_USAGE_INITIATOR_SEAL; ++ } ++ blocksize = crypto_blkcipher_blocksize(cipher); ++ ++ ++ /* create a segment skipping the header and leaving out the checksum */ ++ xdr_buf_subsegment(buf, &subbuf, offset + GSS_KRB5_TOK_HDR_LEN, ++ (buf->len - offset - GSS_KRB5_TOK_HDR_LEN - ++ kctx->gk5e->cksumlength)); ++ ++ nblocks = (subbuf.len + blocksize - 1) / blocksize; ++ ++ cbcbytes = 0; ++ if (nblocks > 2) ++ cbcbytes = (nblocks - 2) * blocksize; ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ ++ if (cbcbytes) { ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ desc.desc.info = desc.iv; ++ desc.desc.flags = 0; ++ desc.desc.tfm = aux_cipher; ++ ++ sg_init_table(desc.frags, 4); ++ ++ ret = xdr_process_buf(&subbuf, 0, cbcbytes, decryptor, &desc); ++ if (ret) ++ goto out_err; ++ } ++ ++ /* Make sure IV carries forward from any CBC results. 
*/ ++ ret = gss_krb5_cts_crypt(cipher, &subbuf, cbcbytes, desc.iv, NULL, 0); ++ if (ret) ++ goto out_err; ++ ++ ++ /* Calculate our hmac over the plaintext data */ ++ our_hmac_obj.len = sizeof(our_hmac); ++ our_hmac_obj.data = our_hmac; ++ ++ ret = make_checksum_v2(kctx, NULL, 0, &subbuf, 0, ++ cksum_key, usage, &our_hmac_obj); ++ if (ret) ++ goto out_err; ++ ++ /* Get the packet's hmac value */ ++ ret = read_bytes_from_xdr_buf(buf, buf->len - kctx->gk5e->cksumlength, ++ pkt_hmac, kctx->gk5e->cksumlength); ++ if (ret) ++ goto out_err; ++ ++ if (memcmp(pkt_hmac, our_hmac, kctx->gk5e->cksumlength) != 0) { ++ ret = GSS_S_BAD_SIG; ++ goto out_err; ++ } ++ *headskip = kctx->gk5e->conflen; ++ *tailskip = kctx->gk5e->cksumlength; ++out_err: ++ if (ret && ret != GSS_S_BAD_SIG) ++ ret = GSS_S_FAILURE; ++ return ret; ++} ++ ++/* ++ * Compute Kseq given the initial session key and the checksum. ++ * Set the key of the given cipher. ++ */ ++int ++krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, ++ unsigned char *cksum) ++{ ++ struct crypto_hash *hmac; ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ u8 Kseq[GSS_KRB5_MAX_KEYLEN]; ++ u32 zeroconstant = 0; ++ int err; ++ ++ dprintk("%s: entered\n", __func__); ++ ++ hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac)) { ++ dprintk("%s: error %ld, allocating hash '%s'\n", ++ __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); ++ return PTR_ERR(hmac); ++ } ++ ++ desc.tfm = hmac; ++ desc.flags = 0; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out_err; ++ ++ /* Compute intermediate Kseq from session key */ ++ err = crypto_hash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ sg_init_table(sg, 1); ++ sg_set_buf(sg, &zeroconstant, 4); ++ ++ err = crypto_hash_digest(&desc, sg, 4, Kseq); ++ if (err) ++ goto out_err; ++ ++ /* Compute final Kseq from the checksum and intermediate Kseq */ ++ err = 
crypto_hash_setkey(hmac, Kseq, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ sg_set_buf(sg, cksum, 8); ++ ++ err = crypto_hash_digest(&desc, sg, 8, Kseq); ++ if (err) ++ goto out_err; ++ ++ err = crypto_blkcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ err = 0; ++ ++out_err: ++ crypto_free_hash(hmac); ++ dprintk("%s: returning %d\n", __func__, err); ++ return err; ++} ++ ++/* ++ * Compute Kcrypt given the initial session key and the plaintext seqnum. ++ * Set the key of cipher kctx->enc. ++ */ ++int ++krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, ++ s32 seqnum) ++{ ++ struct crypto_hash *hmac; ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ u8 Kcrypt[GSS_KRB5_MAX_KEYLEN]; ++ u8 zeroconstant[4] = {0}; ++ u8 seqnumarray[4]; ++ int err, i; ++ ++ dprintk("%s: entered, seqnum %u\n", __func__, seqnum); ++ ++ hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac)) { ++ dprintk("%s: error %ld, allocating hash '%s'\n", ++ __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); ++ return PTR_ERR(hmac); ++ } ++ ++ desc.tfm = hmac; ++ desc.flags = 0; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out_err; ++ ++ /* Compute intermediate Kcrypt from session key */ ++ for (i = 0; i < kctx->gk5e->keylength; i++) ++ Kcrypt[i] = kctx->Ksess[i] ^ 0xf0; ++ ++ err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ sg_init_table(sg, 1); ++ sg_set_buf(sg, zeroconstant, 4); ++ ++ err = crypto_hash_digest(&desc, sg, 4, Kcrypt); ++ if (err) ++ goto out_err; ++ ++ /* Compute final Kcrypt from the seqnum and intermediate Kcrypt */ ++ err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ seqnumarray[0] = (unsigned char) ((seqnum >> 24) & 0xff); ++ seqnumarray[1] = (unsigned char) ((seqnum >> 16) & 0xff); ++ seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff); ++ 
seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff); ++ ++ sg_set_buf(sg, seqnumarray, 4); ++ ++ err = crypto_hash_digest(&desc, sg, 4, Kcrypt); ++ if (err) ++ goto out_err; ++ ++ err = crypto_blkcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength); ++ if (err) ++ goto out_err; ++ ++ err = 0; ++ ++out_err: ++ crypto_free_hash(hmac); ++ dprintk("%s: returning %d\n", __func__, err); ++ return err; ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c.orig 2010-08-23 11:01:00.390553891 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_keys.c 2010-08-23 11:01:00.391564137 -0400 +@@ -0,0 +1,336 @@ ++/* ++ * COPYRIGHT (c) 2008 ++ * The Regents of the University of Michigan ++ * ALL RIGHTS RESERVED ++ * ++ * Permission is granted to use, copy, create derivative works ++ * and redistribute this software and such derivative works ++ * for any purpose, so long as the name of The University of ++ * Michigan is not used in any advertising or publicity ++ * pertaining to the use of distribution of this software ++ * without specific, written prior authorization. If the ++ * above copyright notice or any other identification of the ++ * University of Michigan is included in any copy of any ++ * portion of this software, then the disclaimer below must ++ * also be included. ++ * ++ * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION ++ * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY ++ * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF ++ * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ++ * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
THE ++ * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE ++ * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR ++ * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING ++ * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN ++ * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGES. ++ */ ++ ++/* ++ * Copyright (C) 1998 by the FundsXpress, INC. ++ * ++ * All rights reserved. ++ * ++ * Export of this software from the United States of America may require ++ * a specific license from the United States Government. It is the ++ * responsibility of any person or organization contemplating export to ++ * obtain such a license before exporting. ++ * ++ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and ++ * distribute this software and its documentation for any purpose and ++ * without fee is hereby granted, provided that the above copyright ++ * notice appear in all copies and that both that copyright notice and ++ * this permission notice appear in supporting documentation, and that ++ * the name of FundsXpress. not be used in advertising or publicity pertaining ++ * to distribution of the software without specific, written prior ++ * permission. FundsXpress makes no representations about the suitability of ++ * this software for any purpose. It is provided "as is" without express ++ * or implied warranty. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED ++ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef RPC_DEBUG ++# define RPCDBG_FACILITY RPCDBG_AUTH ++#endif ++ ++/* ++ * This is the n-fold function as described in rfc3961, sec 5.1 ++ * Taken from MIT Kerberos and modified. 
++ */ ++ ++static void krb5_nfold(u32 inbits, const u8 *in, ++ u32 outbits, u8 *out) ++{ ++ int a, b, c, lcm; ++ int byte, i, msbit; ++ ++ /* the code below is more readable if I make these bytes ++ instead of bits */ ++ ++ inbits >>= 3; ++ outbits >>= 3; ++ ++ /* first compute lcm(n,k) */ ++ ++ a = outbits; ++ b = inbits; ++ ++ while (b != 0) { ++ c = b; ++ b = a%b; ++ a = c; ++ } ++ ++ lcm = outbits*inbits/a; ++ ++ /* now do the real work */ ++ ++ memset(out, 0, outbits); ++ byte = 0; ++ ++ /* this will end up cycling through k lcm(k,n)/k times, which ++ is correct */ ++ for (i = lcm-1; i >= 0; i--) { ++ /* compute the msbit in k which gets added into this byte */ ++ msbit = ( ++ /* first, start with the msbit in the first, ++ * unrotated byte */ ++ ((inbits << 3) - 1) ++ /* then, for each byte, shift to the right ++ * for each repetition */ ++ + (((inbits << 3) + 13) * (i/inbits)) ++ /* last, pick out the correct byte within ++ * that shifted repetition */ ++ + ((inbits - (i % inbits)) << 3) ++ ) % (inbits << 3); ++ ++ /* pull out the byte value itself */ ++ byte += (((in[((inbits - 1) - (msbit >> 3)) % inbits] << 8)| ++ (in[((inbits) - (msbit >> 3)) % inbits])) ++ >> ((msbit & 7) + 1)) & 0xff; ++ ++ /* do the addition */ ++ byte += out[i % outbits]; ++ out[i % outbits] = byte & 0xff; ++ ++ /* keep around the carry bit, if any */ ++ byte >>= 8; ++ ++ } ++ ++ /* if there's a carry bit left over, add it back in */ ++ if (byte) { ++ for (i = outbits - 1; i >= 0; i--) { ++ /* do the addition */ ++ byte += out[i]; ++ out[i] = byte & 0xff; ++ ++ /* keep around the carry bit, if any */ ++ byte >>= 8; ++ } ++ } ++} ++ ++/* ++ * This is the DK (derive_key) function as described in rfc3961, sec 5.1 ++ * Taken from MIT Kerberos and modified. 
++ */ ++ ++u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, ++ const struct xdr_netobj *inkey, ++ struct xdr_netobj *outkey, ++ const struct xdr_netobj *in_constant, ++ gfp_t gfp_mask) ++{ ++ size_t blocksize, keybytes, keylength, n; ++ unsigned char *inblockdata, *outblockdata, *rawkey; ++ struct xdr_netobj inblock, outblock; ++ struct crypto_blkcipher *cipher; ++ u32 ret = EINVAL; ++ ++ blocksize = gk5e->blocksize; ++ keybytes = gk5e->keybytes; ++ keylength = gk5e->keylength; ++ ++ if ((inkey->len != keylength) || (outkey->len != keylength)) ++ goto err_return; ++ ++ cipher = crypto_alloc_blkcipher(gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ goto err_return; ++ if (crypto_blkcipher_setkey(cipher, inkey->data, inkey->len)) ++ goto err_return; ++ ++ /* allocate and set up buffers */ ++ ++ ret = ENOMEM; ++ inblockdata = kmalloc(blocksize, gfp_mask); ++ if (inblockdata == NULL) ++ goto err_free_cipher; ++ ++ outblockdata = kmalloc(blocksize, gfp_mask); ++ if (outblockdata == NULL) ++ goto err_free_in; ++ ++ rawkey = kmalloc(keybytes, gfp_mask); ++ if (rawkey == NULL) ++ goto err_free_out; ++ ++ inblock.data = (char *) inblockdata; ++ inblock.len = blocksize; ++ ++ outblock.data = (char *) outblockdata; ++ outblock.len = blocksize; ++ ++ /* initialize the input block */ ++ ++ if (in_constant->len == inblock.len) { ++ memcpy(inblock.data, in_constant->data, inblock.len); ++ } else { ++ krb5_nfold(in_constant->len * 8, in_constant->data, ++ inblock.len * 8, inblock.data); ++ } ++ ++ /* loop encrypting the blocks until enough key bytes are generated */ ++ ++ n = 0; ++ while (n < keybytes) { ++ (*(gk5e->encrypt))(cipher, NULL, inblock.data, ++ outblock.data, inblock.len); ++ ++ if ((keybytes - n) <= outblock.len) { ++ memcpy(rawkey + n, outblock.data, (keybytes - n)); ++ break; ++ } ++ ++ memcpy(rawkey + n, outblock.data, outblock.len); ++ memcpy(inblock.data, outblock.data, outblock.len); ++ n += outblock.len; ++ } ++ ++ /* postprocess 
the key */ ++ ++ inblock.data = (char *) rawkey; ++ inblock.len = keybytes; ++ ++ BUG_ON(gk5e->mk_key == NULL); ++ ret = (*(gk5e->mk_key))(gk5e, &inblock, outkey); ++ if (ret) { ++ dprintk("%s: got %d from mk_key function for '%s'\n", ++ __func__, ret, gk5e->encrypt_name); ++ goto err_free_raw; ++ } ++ ++ /* clean memory, free resources and exit */ ++ ++ ret = 0; ++ ++err_free_raw: ++ memset(rawkey, 0, keybytes); ++ kfree(rawkey); ++err_free_out: ++ memset(outblockdata, 0, blocksize); ++ kfree(outblockdata); ++err_free_in: ++ memset(inblockdata, 0, blocksize); ++ kfree(inblockdata); ++err_free_cipher: ++ crypto_free_blkcipher(cipher); ++err_return: ++ return ret; ++} ++ ++#define smask(step) ((1<>step)&smask(step))) ++#define parity_char(x) pstep(pstep(pstep((x), 4), 2), 1) ++ ++static void mit_des_fixup_key_parity(u8 key[8]) ++{ ++ int i; ++ for (i = 0; i < 8; i++) { ++ key[i] &= 0xfe; ++ key[i] |= 1^parity_char(key[i]); ++ } ++} ++ ++/* ++ * This is the des3 key derivation postprocess function ++ */ ++u32 gss_krb5_des3_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key) ++{ ++ int i; ++ u32 ret = EINVAL; ++ ++ if (key->len != 24) { ++ dprintk("%s: key->len is %d\n", __func__, key->len); ++ goto err_out; ++ } ++ if (randombits->len != 21) { ++ dprintk("%s: randombits->len is %d\n", ++ __func__, randombits->len); ++ goto err_out; ++ } ++ ++ /* take the seven bytes, move them around into the top 7 bits of the ++ 8 key bytes, then compute the parity bits. Do this three times. 
*/ ++ ++ for (i = 0; i < 3; i++) { ++ memcpy(key->data + i*8, randombits->data + i*7, 7); ++ key->data[i*8+7] = (((key->data[i*8]&1)<<1) | ++ ((key->data[i*8+1]&1)<<2) | ++ ((key->data[i*8+2]&1)<<3) | ++ ((key->data[i*8+3]&1)<<4) | ++ ((key->data[i*8+4]&1)<<5) | ++ ((key->data[i*8+5]&1)<<6) | ++ ((key->data[i*8+6]&1)<<7)); ++ ++ mit_des_fixup_key_parity(key->data + i*8); ++ } ++ ret = 0; ++err_out: ++ return ret; ++} ++ ++/* ++ * This is the aes key derivation postprocess function ++ */ ++u32 gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e, ++ struct xdr_netobj *randombits, ++ struct xdr_netobj *key) ++{ ++ u32 ret = EINVAL; ++ ++ if (key->len != 16 && key->len != 32) { ++ dprintk("%s: key->len is %d\n", __func__, key->len); ++ goto err_out; ++ } ++ if (randombits->len != 16 && randombits->len != 32) { ++ dprintk("%s: randombits->len is %d\n", ++ __func__, randombits->len); ++ goto err_out; ++ } ++ if (randombits->len != key->len) { ++ dprintk("%s: randombits->len is %d, key->len is %d\n", ++ __func__, randombits->len, key->len); ++ goto err_out; ++ } ++ memcpy(key->data, randombits->data, key->len); ++ ret = 0; ++err_out: ++ return ret; ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_mech.c 2010-08-23 11:01:00.392564136 -0400 +@@ -1,7 +1,7 @@ + /* + * linux/net/sunrpc/gss_krb5_mech.c + * +- * Copyright (c) 2001 The Regents of the University of Michigan. ++ * Copyright (c) 2001-2008 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson +@@ -48,6 +48,143 @@ + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + ++static struct gss_api_mech gss_kerberos_mech; /* forward declaration */ ++ ++static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { ++ /* ++ * DES (All DES enctypes are mapped to the same gss functionality) ++ */ ++ { ++ .etype = ENCTYPE_DES_CBC_RAW, ++ .ctype = CKSUMTYPE_RSA_MD5, ++ .name = "des-cbc-crc", ++ .encrypt_name = "cbc(des)", ++ .cksum_name = "md5", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = NULL, ++ .signalg = SGN_ALG_DES_MAC_MD5, ++ .sealalg = SEAL_ALG_DES, ++ .keybytes = 7, ++ .keylength = 8, ++ .blocksize = 8, ++ .conflen = 8, ++ .cksumlength = 8, ++ .keyed_cksum = 0, ++ }, ++ /* ++ * RC4-HMAC ++ */ ++ { ++ .etype = ENCTYPE_ARCFOUR_HMAC, ++ .ctype = CKSUMTYPE_HMAC_MD5_ARCFOUR, ++ .name = "rc4-hmac", ++ .encrypt_name = "ecb(arc4)", ++ .cksum_name = "hmac(md5)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = NULL, ++ .signalg = SGN_ALG_HMAC_MD5, ++ .sealalg = SEAL_ALG_MICROSOFT_RC4, ++ .keybytes = 16, ++ .keylength = 16, ++ .blocksize = 1, ++ .conflen = 8, ++ .cksumlength = 8, ++ .keyed_cksum = 1, ++ }, ++ /* ++ * 3DES ++ */ ++ { ++ .etype = ENCTYPE_DES3_CBC_RAW, ++ .ctype = CKSUMTYPE_HMAC_SHA1_DES3, ++ .name = "des3-hmac-sha1", ++ .encrypt_name = "cbc(des3_ede)", ++ .cksum_name = "hmac(sha1)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = gss_krb5_des3_make_key, ++ .signalg = SGN_ALG_HMAC_SHA1_DES3_KD, ++ .sealalg = SEAL_ALG_DES3KD, ++ .keybytes = 21, ++ .keylength = 24, ++ .blocksize = 8, ++ .conflen = 8, ++ .cksumlength = 20, ++ .keyed_cksum = 1, ++ }, ++ /* ++ * AES128 ++ */ ++ { ++ .etype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, ++ .ctype = CKSUMTYPE_HMAC_SHA1_96_AES128, ++ .name = "aes128-cts", ++ .encrypt_name = "cts(cbc(aes))", ++ .cksum_name = "hmac(sha1)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = gss_krb5_aes_make_key, ++ .encrypt_v2 = 
gss_krb5_aes_encrypt, ++ .decrypt_v2 = gss_krb5_aes_decrypt, ++ .signalg = -1, ++ .sealalg = -1, ++ .keybytes = 16, ++ .keylength = 16, ++ .blocksize = 16, ++ .conflen = 16, ++ .cksumlength = 12, ++ .keyed_cksum = 1, ++ }, ++ /* ++ * AES256 ++ */ ++ { ++ .etype = ENCTYPE_AES256_CTS_HMAC_SHA1_96, ++ .ctype = CKSUMTYPE_HMAC_SHA1_96_AES256, ++ .name = "aes256-cts", ++ .encrypt_name = "cts(cbc(aes))", ++ .cksum_name = "hmac(sha1)", ++ .encrypt = krb5_encrypt, ++ .decrypt = krb5_decrypt, ++ .mk_key = gss_krb5_aes_make_key, ++ .encrypt_v2 = gss_krb5_aes_encrypt, ++ .decrypt_v2 = gss_krb5_aes_decrypt, ++ .signalg = -1, ++ .sealalg = -1, ++ .keybytes = 32, ++ .keylength = 32, ++ .blocksize = 16, ++ .conflen = 16, ++ .cksumlength = 12, ++ .keyed_cksum = 1, ++ }, ++}; ++ ++static const int num_supported_enctypes = ++ ARRAY_SIZE(supported_gss_krb5_enctypes); ++ ++static int ++supported_gss_krb5_enctype(int etype) ++{ ++ int i; ++ for (i = 0; i < num_supported_enctypes; i++) ++ if (supported_gss_krb5_enctypes[i].etype == etype) ++ return 1; ++ return 0; ++} ++ ++static const struct gss_krb5_enctype * ++get_gss_krb5_enctype(int etype) ++{ ++ int i; ++ for (i = 0; i < num_supported_enctypes; i++) ++ if (supported_gss_krb5_enctypes[i].etype == etype) ++ return &supported_gss_krb5_enctypes[i]; ++ return NULL; ++} ++ + static const void * + simple_get_bytes(const void *p, const void *end, void *res, int len) + { +@@ -78,35 +215,45 @@ simple_get_netobj(const void *p, const v + } + + static inline const void * +-get_key(const void *p, const void *end, struct crypto_blkcipher **res) ++get_key(const void *p, const void *end, ++ struct krb5_ctx *ctx, struct crypto_blkcipher **res) + { + struct xdr_netobj key; + int alg; +- char *alg_name; + + p = simple_get_bytes(p, end, &alg, sizeof(alg)); + if (IS_ERR(p)) + goto out_err; ++ ++ switch (alg) { ++ case ENCTYPE_DES_CBC_CRC: ++ case ENCTYPE_DES_CBC_MD4: ++ case ENCTYPE_DES_CBC_MD5: ++ /* Map all these key types to ENCTYPE_DES_CBC_RAW */ ++ 
alg = ENCTYPE_DES_CBC_RAW; ++ break; ++ } ++ ++ if (!supported_gss_krb5_enctype(alg)) { ++ printk(KERN_WARNING "gss_kerberos_mech: unsupported " ++ "encryption key algorithm %d\n", alg); ++ goto out_err; ++ } + p = simple_get_netobj(p, end, &key); + if (IS_ERR(p)) + goto out_err; + +- switch (alg) { +- case ENCTYPE_DES_CBC_RAW: +- alg_name = "cbc(des)"; +- break; +- default: +- printk("gss_kerberos_mech: unsupported algorithm %d\n", alg); +- goto out_err_free_key; +- } +- *res = crypto_alloc_blkcipher(alg_name, 0, CRYPTO_ALG_ASYNC); ++ *res = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); + if (IS_ERR(*res)) { +- printk("gss_kerberos_mech: unable to initialize crypto algorithm %s\n", alg_name); ++ printk(KERN_WARNING "gss_kerberos_mech: unable to initialize " ++ "crypto algorithm %s\n", ctx->gk5e->encrypt_name); + *res = NULL; + goto out_err_free_key; + } + if (crypto_blkcipher_setkey(*res, key.data, key.len)) { +- printk("gss_kerberos_mech: error setting key for crypto algorithm %s\n", alg_name); ++ printk(KERN_WARNING "gss_kerberos_mech: error setting key for " ++ "crypto algorithm %s\n", ctx->gk5e->encrypt_name); + goto out_err_free_tfm; + } + +@@ -123,56 +270,55 @@ out_err: + } + + static int +-gss_import_sec_context_kerberos(const void *p, +- size_t len, +- struct gss_ctx *ctx_id) ++gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) + { +- const void *end = (const void *)((const char *)p + len); +- struct krb5_ctx *ctx; + int tmp; + +- if (!(ctx = kzalloc(sizeof(*ctx), GFP_NOFS))) { +- p = ERR_PTR(-ENOMEM); +- goto out_err; +- } +- + p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; ++ ++ /* Old format supports only DES! 
Any other enctype uses new format */ ++ ctx->enctype = ENCTYPE_DES_CBC_RAW; ++ ++ ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); ++ if (ctx->gk5e == NULL) ++ goto out_err; ++ + /* The downcall format was designed before we completely understood + * the uses of the context fields; so it includes some stuff we + * just give some minimal sanity-checking, and some we ignore + * completely (like the next twenty bytes): */ + if (unlikely(p + 20 > end || p + 20 < p)) +- goto out_err_free_ctx; ++ goto out_err; + p += 20; + p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + if (tmp != SGN_ALG_DES_MAC_MD5) { + p = ERR_PTR(-ENOSYS); +- goto out_err_free_ctx; ++ goto out_err; + } + p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + if (tmp != SEAL_ALG_DES) { + p = ERR_PTR(-ENOSYS); +- goto out_err_free_ctx; ++ goto out_err; + } + p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + p = simple_get_bytes(p, end, &ctx->seq_send, sizeof(ctx->seq_send)); + if (IS_ERR(p)) +- goto out_err_free_ctx; ++ goto out_err; + p = simple_get_netobj(p, end, &ctx->mech_used); + if (IS_ERR(p)) +- goto out_err_free_ctx; +- p = get_key(p, end, &ctx->enc); ++ goto out_err; ++ p = get_key(p, end, ctx, &ctx->enc); + if (IS_ERR(p)) + goto out_err_free_mech; +- p = get_key(p, end, &ctx->seq); ++ p = get_key(p, end, ctx, &ctx->seq); + if (IS_ERR(p)) + goto out_err_free_key1; + if (p != end) { +@@ -180,9 +326,6 @@ gss_import_sec_context_kerberos(const vo + goto out_err_free_key2; + } + +- ctx_id->internal_ctx_id = ctx; +- +- dprintk("RPC: Successfully imported new context.\n"); + return 0; + + out_err_free_key2: +@@ -191,18 +334,378 @@ out_err_free_key1: + crypto_free_blkcipher(ctx->enc); + out_err_free_mech: + kfree(ctx->mech_used.data); +-out_err_free_ctx: +- kfree(ctx); + out_err: + return PTR_ERR(p); + } + 
++struct crypto_blkcipher * ++context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key) ++{ ++ struct crypto_blkcipher *cp; ++ ++ cp = crypto_alloc_blkcipher(cname, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cp)) { ++ dprintk("gss_kerberos_mech: unable to initialize " ++ "crypto algorithm %s\n", cname); ++ return NULL; ++ } ++ if (crypto_blkcipher_setkey(cp, key, ctx->gk5e->keylength)) { ++ dprintk("gss_kerberos_mech: error setting key for " ++ "crypto algorithm %s\n", cname); ++ crypto_free_blkcipher(cp); ++ return NULL; ++ } ++ return cp; ++} ++ ++static inline void ++set_cdata(u8 cdata[GSS_KRB5_K5CLENGTH], u32 usage, u8 seed) ++{ ++ cdata[0] = (usage>>24)&0xff; ++ cdata[1] = (usage>>16)&0xff; ++ cdata[2] = (usage>>8)&0xff; ++ cdata[3] = usage&0xff; ++ cdata[4] = seed; ++} ++ ++static int ++context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask) ++{ ++ struct xdr_netobj c, keyin, keyout; ++ u8 cdata[GSS_KRB5_K5CLENGTH]; ++ u32 err; ++ ++ c.len = GSS_KRB5_K5CLENGTH; ++ c.data = cdata; ++ ++ keyin.data = ctx->Ksess; ++ keyin.len = ctx->gk5e->keylength; ++ keyout.len = ctx->gk5e->keylength; ++ ++ /* seq uses the raw key */ ++ ctx->seq = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name, ++ ctx->Ksess); ++ if (ctx->seq == NULL) ++ goto out_err; ++ ++ ctx->enc = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name, ++ ctx->Ksess); ++ if (ctx->enc == NULL) ++ goto out_free_seq; ++ ++ /* derive cksum */ ++ set_cdata(cdata, KG_USAGE_SIGN, KEY_USAGE_SEED_CHECKSUM); ++ keyout.data = ctx->cksum; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving cksum key\n", ++ __func__, err); ++ goto out_free_enc; ++ } ++ ++ return 0; ++ ++out_free_enc: ++ crypto_free_blkcipher(ctx->enc); ++out_free_seq: ++ crypto_free_blkcipher(ctx->seq); ++out_err: ++ return -EINVAL; ++} ++ ++/* ++ * Note that RC4 depends on deriving keys using the sequence ++ * number or the checksum of a token. 
Therefore, the final keys ++ * cannot be calculated until the token is being constructed! ++ */ ++static int ++context_derive_keys_rc4(struct krb5_ctx *ctx) ++{ ++ struct crypto_hash *hmac; ++ char sigkeyconstant[] = "signaturekey"; ++ int slen = strlen(sigkeyconstant) + 1; /* include null terminator */ ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ int err; ++ ++ dprintk("RPC: %s: entered\n", __func__); ++ /* ++ * derive cksum (aka Ksign) key ++ */ ++ hmac = crypto_alloc_hash(ctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); ++ if (IS_ERR(hmac)) { ++ dprintk("%s: error %ld allocating hash '%s'\n", ++ __func__, PTR_ERR(hmac), ctx->gk5e->cksum_name); ++ err = PTR_ERR(hmac); ++ goto out_err; ++ } ++ ++ err = crypto_hash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength); ++ if (err) ++ goto out_err_free_hmac; ++ ++ sg_init_table(sg, 1); ++ sg_set_buf(sg, sigkeyconstant, slen); ++ ++ desc.tfm = hmac; ++ desc.flags = 0; ++ ++ err = crypto_hash_init(&desc); ++ if (err) ++ goto out_err_free_hmac; ++ ++ err = crypto_hash_digest(&desc, sg, slen, ctx->cksum); ++ if (err) ++ goto out_err_free_hmac; ++ /* ++ * allocate hash, and blkciphers for data and seqnum encryption ++ */ ++ ctx->enc = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(ctx->enc)) { ++ err = PTR_ERR(ctx->enc); ++ goto out_err_free_hmac; ++ } ++ ++ ctx->seq = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(ctx->seq)) { ++ crypto_free_blkcipher(ctx->enc); ++ err = PTR_ERR(ctx->seq); ++ goto out_err_free_hmac; ++ } ++ ++ dprintk("RPC: %s: returning success\n", __func__); ++ ++ err = 0; ++ ++out_err_free_hmac: ++ crypto_free_hash(hmac); ++out_err: ++ dprintk("RPC: %s: returning %d\n", __func__, err); ++ return err; ++} ++ ++static int ++context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) ++{ ++ struct xdr_netobj c, keyin, keyout; ++ u8 cdata[GSS_KRB5_K5CLENGTH]; ++ u32 err; ++ ++ c.len = GSS_KRB5_K5CLENGTH; ++ c.data = 
cdata; ++ ++ keyin.data = ctx->Ksess; ++ keyin.len = ctx->gk5e->keylength; ++ keyout.len = ctx->gk5e->keylength; ++ ++ /* initiator seal encryption */ ++ set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_ENCRYPTION); ++ keyout.data = ctx->initiator_seal; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving initiator_seal key\n", ++ __func__, err); ++ goto out_err; ++ } ++ ctx->initiator_enc = context_v2_alloc_cipher(ctx, ++ ctx->gk5e->encrypt_name, ++ ctx->initiator_seal); ++ if (ctx->initiator_enc == NULL) ++ goto out_err; ++ ++ /* acceptor seal encryption */ ++ set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, KEY_USAGE_SEED_ENCRYPTION); ++ keyout.data = ctx->acceptor_seal; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving acceptor_seal key\n", ++ __func__, err); ++ goto out_free_initiator_enc; ++ } ++ ctx->acceptor_enc = context_v2_alloc_cipher(ctx, ++ ctx->gk5e->encrypt_name, ++ ctx->acceptor_seal); ++ if (ctx->acceptor_enc == NULL) ++ goto out_free_initiator_enc; ++ ++ /* initiator sign checksum */ ++ set_cdata(cdata, KG_USAGE_INITIATOR_SIGN, KEY_USAGE_SEED_CHECKSUM); ++ keyout.data = ctx->initiator_sign; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving initiator_sign key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ /* acceptor sign checksum */ ++ set_cdata(cdata, KG_USAGE_ACCEPTOR_SIGN, KEY_USAGE_SEED_CHECKSUM); ++ keyout.data = ctx->acceptor_sign; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving acceptor_sign key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ /* initiator seal integrity */ ++ set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_INTEGRITY); ++ keyout.data = ctx->initiator_integ; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); 
++ if (err) { ++ dprintk("%s: Error %d deriving initiator_integ key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ /* acceptor seal integrity */ ++ set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, KEY_USAGE_SEED_INTEGRITY); ++ keyout.data = ctx->acceptor_integ; ++ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); ++ if (err) { ++ dprintk("%s: Error %d deriving acceptor_integ key\n", ++ __func__, err); ++ goto out_free_acceptor_enc; ++ } ++ ++ switch (ctx->enctype) { ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ ctx->initiator_enc_aux = ++ context_v2_alloc_cipher(ctx, "cbc(aes)", ++ ctx->initiator_seal); ++ if (ctx->initiator_enc_aux == NULL) ++ goto out_free_acceptor_enc; ++ ctx->acceptor_enc_aux = ++ context_v2_alloc_cipher(ctx, "cbc(aes)", ++ ctx->acceptor_seal); ++ if (ctx->acceptor_enc_aux == NULL) { ++ crypto_free_blkcipher(ctx->initiator_enc_aux); ++ goto out_free_acceptor_enc; ++ } ++ } ++ ++ return 0; ++ ++out_free_acceptor_enc: ++ crypto_free_blkcipher(ctx->acceptor_enc); ++out_free_initiator_enc: ++ crypto_free_blkcipher(ctx->initiator_enc); ++out_err: ++ return -EINVAL; ++} ++ ++static int ++gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, ++ gfp_t gfp_mask) ++{ ++ int keylen; ++ ++ p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags)); ++ if (IS_ERR(p)) ++ goto out_err; ++ ctx->initiate = ctx->flags & KRB5_CTX_FLAG_INITIATOR; ++ ++ p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); ++ if (IS_ERR(p)) ++ goto out_err; ++ p = simple_get_bytes(p, end, &ctx->seq_send64, sizeof(ctx->seq_send64)); ++ if (IS_ERR(p)) ++ goto out_err; ++ /* set seq_send for use by "older" enctypes; reject if truncated */ ++ ctx->seq_send = ctx->seq_send64; ++ if (ctx->seq_send64 != ctx->seq_send) { ++ dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__, ++ (long unsigned)ctx->seq_send64, ctx->seq_send); ++ return -EINVAL; /* p holds no error here */ ++ } ++ p = simple_get_bytes(p, end, &ctx->enctype, 
sizeof(ctx->enctype)); ++ if (IS_ERR(p)) ++ goto out_err; ++ /* Map ENCTYPE_DES3_CBC_SHA1 to ENCTYPE_DES3_CBC_RAW */ ++ if (ctx->enctype == ENCTYPE_DES3_CBC_SHA1) ++ ctx->enctype = ENCTYPE_DES3_CBC_RAW; ++ ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); ++ if (ctx->gk5e == NULL) { ++ dprintk("gss_kerberos_mech: unsupported krb5 enctype %u\n", ++ ctx->enctype); ++ p = ERR_PTR(-EINVAL); ++ goto out_err; ++ } ++ keylen = ctx->gk5e->keylength; ++ ++ p = simple_get_bytes(p, end, ctx->Ksess, keylen); ++ if (IS_ERR(p)) ++ goto out_err; ++ ++ if (p != end) { ++ p = ERR_PTR(-EINVAL); ++ goto out_err; ++ } ++ ++ ctx->mech_used.data = kmemdup(gss_kerberos_mech.gm_oid.data, ++ gss_kerberos_mech.gm_oid.len, gfp_mask); ++ if (unlikely(ctx->mech_used.data == NULL)) { ++ p = ERR_PTR(-ENOMEM); ++ goto out_err; ++ } ++ ctx->mech_used.len = gss_kerberos_mech.gm_oid.len; ++ ++ switch (ctx->enctype) { ++ case ENCTYPE_DES3_CBC_RAW: ++ return context_derive_keys_des3(ctx, gfp_mask); ++ case ENCTYPE_ARCFOUR_HMAC: ++ return context_derive_keys_rc4(ctx); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return context_derive_keys_new(ctx, gfp_mask); ++ default: ++ return -EINVAL; ++ } ++ ++out_err: ++ return PTR_ERR(p); ++} ++ ++static int ++gss_import_sec_context_kerberos(const void *p, size_t len, ++ struct gss_ctx *ctx_id, ++ gfp_t gfp_mask) ++{ ++ const void *end = (const void *)((const char *)p + len); ++ struct krb5_ctx *ctx; ++ int ret; ++ ++ ctx = kzalloc(sizeof(*ctx), gfp_mask); ++ if (ctx == NULL) ++ return -ENOMEM; ++ ++ if (len == 85) ++ ret = gss_import_v1_context(p, end, ctx); ++ else ++ ret = gss_import_v2_context(p, end, ctx, gfp_mask); ++ ++ if (ret == 0) ++ ctx_id->internal_ctx_id = ctx; ++ else ++ kfree(ctx); ++ ++ dprintk("RPC: %s: returning %d\n", __func__, ret); ++ return ret; ++} ++ + static void + gss_delete_sec_context_kerberos(void *internal_ctx) { + struct krb5_ctx *kctx = internal_ctx; + + crypto_free_blkcipher(kctx->seq); + 
crypto_free_blkcipher(kctx->enc); ++ crypto_free_blkcipher(kctx->acceptor_enc); ++ crypto_free_blkcipher(kctx->initiator_enc); ++ crypto_free_blkcipher(kctx->acceptor_enc_aux); ++ crypto_free_blkcipher(kctx->initiator_enc_aux); + kfree(kctx->mech_used.data); + kfree(kctx); + } +@@ -241,6 +744,7 @@ static struct gss_api_mech gss_kerberos_ + .gm_ops = &gss_kerberos_ops, + .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), + .gm_pfs = gss_kerberos_pfs, ++ .gm_upcall_enctypes = "enctypes=18,17,16,23,3,1,2 ", + }; + + static int __init init_kerberos_module(void) +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seal.c 2010-08-23 11:01:00.392564136 -0400 +@@ -3,7 +3,7 @@ + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5seal.c + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson +@@ -70,53 +70,154 @@ + + DEFINE_SPINLOCK(krb5_seq_lock); + +-u32 +-gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, ++static char * ++setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) ++{ ++ __be16 *ptr, *krb5_hdr; ++ int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; ++ ++ token->len = g_token_size(&ctx->mech_used, body_size); ++ ++ ptr = (__be16 *)token->data; ++ g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr); ++ ++ /* ptr now at start of header described in rfc 1964, section 1.2.1: */ ++ krb5_hdr = ptr; ++ *ptr++ = KG_TOK_MIC_MSG; ++ *ptr++ = cpu_to_le16(ctx->gk5e->signalg); ++ *ptr++ = SEAL_ALG_NONE; ++ *ptr++ = 0xffff; ++ ++ return (char *)krb5_hdr; ++} ++ ++static void * ++setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) ++{ ++ __be16 *ptr, *krb5_hdr; ++ u8 *p, flags = 0x00; ++ ++ if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0) ++ flags |= 0x01; ++ if (ctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) ++ flags |= 0x04; ++ ++ /* Per rfc 4121, sec 4.2.6.1, there is no header, ++ * just start the token */ ++ krb5_hdr = ptr = (__be16 *)token->data; ++ ++ *ptr++ = KG2_TOK_MIC; ++ p = (u8 *)ptr; ++ *p++ = flags; ++ *p++ = 0xff; ++ ptr = (__be16 *)p; ++ *ptr++ = 0xffff; ++ *ptr++ = 0xffff; ++ ++ token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; ++ return krb5_hdr; ++} ++ ++static u32 ++gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, + struct xdr_netobj *token) + { +- struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; +- unsigned char *ptr, *msg_start; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; ++ void *ptr; + s32 now; + u32 seq_send; ++ u8 *cksumkey; + +- dprintk("RPC: gss_krb5_seal\n"); ++ dprintk("RPC: %s\n", __func__); + BUG_ON(ctx == NULL); + + now = get_seconds(); + +- token->len = 
g_token_size(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8); ++ ptr = setup_token(ctx, token); + +- ptr = token->data; +- g_make_token_header(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8, &ptr); ++ if (ctx->gk5e->keyed_cksum) ++ cksumkey = ctx->cksum; ++ else ++ cksumkey = NULL; + +- /* ptr now at header described in rfc 1964, section 1.2.1: */ +- ptr[0] = (unsigned char) ((KG_TOK_MIC_MSG >> 8) & 0xff); +- ptr[1] = (unsigned char) (KG_TOK_MIC_MSG & 0xff); ++ if (make_checksum(ctx, ptr, 8, text, 0, cksumkey, ++ KG_USAGE_SIGN, &md5cksum)) ++ return GSS_S_FAILURE; + +- msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8; ++ memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); + +- *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); +- memset(ptr + 4, 0xff, 4); ++ spin_lock(&krb5_seq_lock); ++ seq_send = ctx->seq_send++; ++ spin_unlock(&krb5_seq_lock); + +- if (make_checksum("md5", ptr, 8, text, 0, &md5cksum)) ++ if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, ++ seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) + return GSS_S_FAILURE; + +- if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, +- md5cksum.data, md5cksum.len)) +- return GSS_S_FAILURE; ++ return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; ++} ++ ++u32 ++gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, ++ struct xdr_netobj *token) ++{ ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj cksumobj = { .len = sizeof(cksumdata), ++ .data = cksumdata}; ++ void *krb5_hdr; ++ s32 now; ++ u64 seq_send; ++ u8 *cksumkey; ++ unsigned int cksum_usage; ++ ++ dprintk("RPC: %s\n", __func__); + +- memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); ++ krb5_hdr = setup_token_v2(ctx, token); + ++ /* Set up the sequence number. 
Now 64-bits in clear ++ * text and w/o direction indicator */ + spin_lock(&krb5_seq_lock); +- seq_send = ctx->seq_send++; ++ seq_send = ctx->seq_send64++; + spin_unlock(&krb5_seq_lock); ++ *((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send); + +- if (krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, +- seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, +- ptr + 8)) ++ if (ctx->initiate) { ++ cksumkey = ctx->initiator_sign; ++ cksum_usage = KG_USAGE_INITIATOR_SIGN; ++ } else { ++ cksumkey = ctx->acceptor_sign; ++ cksum_usage = KG_USAGE_ACCEPTOR_SIGN; ++ } ++ ++ if (make_checksum_v2(ctx, krb5_hdr, GSS_KRB5_TOK_HDR_LEN, ++ text, 0, cksumkey, cksum_usage, &cksumobj)) + return GSS_S_FAILURE; + ++ memcpy(krb5_hdr + GSS_KRB5_TOK_HDR_LEN, cksumobj.data, cksumobj.len); ++ ++ now = get_seconds(); ++ + return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; + } ++ ++u32 ++gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, ++ struct xdr_netobj *token) ++{ ++ struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; ++ ++ switch (ctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_get_mic_v1(ctx, text, token); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_get_mic_v2(ctx, text, token); ++ } ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_seqnum.c 2010-08-23 11:01:00.393496180 -0400 +@@ -39,14 +39,51 @@ + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + ++static s32 ++krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, ++ unsigned char *cksum, unsigned char *buf) ++{ ++ struct crypto_blkcipher *cipher; ++ unsigned char plain[8]; ++ s32 code; ++ ++ dprintk("RPC: %s:\n", 
__func__); ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return PTR_ERR(cipher); ++ ++ plain[0] = (unsigned char) ((seqnum >> 24) & 0xff); ++ plain[1] = (unsigned char) ((seqnum >> 16) & 0xff); ++ plain[2] = (unsigned char) ((seqnum >> 8) & 0xff); ++ plain[3] = (unsigned char) ((seqnum >> 0) & 0xff); ++ plain[4] = direction; ++ plain[5] = direction; ++ plain[6] = direction; ++ plain[7] = direction; ++ ++ code = krb5_rc4_setup_seq_key(kctx, cipher, cksum); ++ if (code) ++ goto out; ++ ++ code = krb5_encrypt(cipher, cksum, plain, buf, 8); ++out: ++ crypto_free_blkcipher(cipher); ++ return code; ++} + s32 +-krb5_make_seq_num(struct crypto_blkcipher *key, ++krb5_make_seq_num(struct krb5_ctx *kctx, ++ struct crypto_blkcipher *key, + int direction, + u32 seqnum, + unsigned char *cksum, unsigned char *buf) + { + unsigned char plain[8]; + ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) ++ return krb5_make_rc4_seq_num(kctx, direction, seqnum, ++ cksum, buf); ++ + plain[0] = (unsigned char) (seqnum & 0xff); + plain[1] = (unsigned char) ((seqnum >> 8) & 0xff); + plain[2] = (unsigned char) ((seqnum >> 16) & 0xff); +@@ -60,17 +97,59 @@ krb5_make_seq_num(struct crypto_blkciphe + return krb5_encrypt(key, cksum, plain, buf, 8); + } + ++static s32 ++krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, ++ unsigned char *buf, int *direction, s32 *seqnum) ++{ ++ struct crypto_blkcipher *cipher; ++ unsigned char plain[8]; ++ s32 code; ++ ++ dprintk("RPC: %s:\n", __func__); ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return PTR_ERR(cipher); ++ ++ code = krb5_rc4_setup_seq_key(kctx, cipher, cksum); ++ if (code) ++ goto out; ++ ++ code = krb5_decrypt(cipher, cksum, buf, plain, 8); ++ if (code) ++ goto out; ++ ++ if ((plain[4] != plain[5]) || (plain[4] != plain[6]) ++ || (plain[4] != plain[7])) { ++ code = (s32)KG_BAD_SEQ; ++ goto out; ++ } ++ ++ 
*direction = plain[4]; ++ ++ *seqnum = ((plain[0] << 24) | (plain[1] << 16) | ++ (plain[2] << 8) | (plain[3])); ++out: ++ crypto_free_blkcipher(cipher); ++ return code; ++} ++ + s32 +-krb5_get_seq_num(struct crypto_blkcipher *key, ++krb5_get_seq_num(struct krb5_ctx *kctx, + unsigned char *cksum, + unsigned char *buf, + int *direction, u32 *seqnum) + { + s32 code; + unsigned char plain[8]; ++ struct crypto_blkcipher *key = kctx->seq; + + dprintk("RPC: krb5_get_seq_num:\n"); + ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) ++ return krb5_get_rc4_seq_num(kctx, cksum, buf, ++ direction, seqnum); ++ + if ((code = krb5_decrypt(key, cksum, buf, plain, 8))) + return code; + +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_unseal.c 2010-08-23 11:01:00.393496180 -0400 +@@ -3,7 +3,7 @@ + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5unseal.c + * +- * Copyright (c) 2000 The Regents of the University of Michigan. ++ * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson +@@ -70,20 +70,21 @@ + /* read_token is a mic token, and message_buffer is the data that the mic was + * supposedly taken over. 
*/ + +-u32 +-gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, ++static u32 ++gss_verify_mic_v1(struct krb5_ctx *ctx, + struct xdr_buf *message_buffer, struct xdr_netobj *read_token) + { +- struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; + int signalg; + int sealalg; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; + s32 now; + int direction; + u32 seqnum; + unsigned char *ptr = (unsigned char *)read_token->data; + int bodysize; ++ u8 *cksumkey; + + dprintk("RPC: krb5_read_token\n"); + +@@ -98,7 +99,7 @@ gss_verify_mic_kerberos(struct gss_ctx * + /* XXX sanity-check bodysize?? */ + + signalg = ptr[2] + (ptr[3] << 8); +- if (signalg != SGN_ALG_DES_MAC_MD5) ++ if (signalg != ctx->gk5e->signalg) + return GSS_S_DEFECTIVE_TOKEN; + + sealalg = ptr[4] + (ptr[5] << 8); +@@ -108,13 +109,17 @@ gss_verify_mic_kerberos(struct gss_ctx * + if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) + return GSS_S_DEFECTIVE_TOKEN; + +- if (make_checksum("md5", ptr, 8, message_buffer, 0, &md5cksum)) +- return GSS_S_FAILURE; ++ if (ctx->gk5e->keyed_cksum) ++ cksumkey = ctx->cksum; ++ else ++ cksumkey = NULL; + +- if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, 16)) ++ if (make_checksum(ctx, ptr, 8, message_buffer, 0, ++ cksumkey, KG_USAGE_SIGN, &md5cksum)) + return GSS_S_FAILURE; + +- if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) ++ if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, ++ ctx->gk5e->cksumlength)) + return GSS_S_BAD_SIG; + + /* it got through unscathed. 
Make sure the context is unexpired */ +@@ -126,7 +131,8 @@ gss_verify_mic_kerberos(struct gss_ctx * + + /* do sequencing checks */ + +- if (krb5_get_seq_num(ctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, &direction, &seqnum)) ++ if (krb5_get_seq_num(ctx, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, ++ &direction, &seqnum)) + return GSS_S_FAILURE; + + if ((ctx->initiate && direction != 0xff) || +@@ -135,3 +141,86 @@ gss_verify_mic_kerberos(struct gss_ctx * + + return GSS_S_COMPLETE; + } ++ ++static u32 ++gss_verify_mic_v2(struct krb5_ctx *ctx, ++ struct xdr_buf *message_buffer, struct xdr_netobj *read_token) ++{ ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj cksumobj = {.len = sizeof(cksumdata), ++ .data = cksumdata}; ++ s32 now; ++ u64 seqnum; ++ u8 *ptr = read_token->data; ++ u8 *cksumkey; ++ u8 flags; ++ int i; ++ unsigned int cksum_usage; ++ ++ dprintk("RPC: %s\n", __func__); ++ ++ if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_MIC) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ flags = ptr[2]; ++ if ((!ctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) || ++ (ctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR))) ++ return GSS_S_BAD_SIG; ++ ++ if (flags & KG2_TOKEN_FLAG_SEALED) { ++ dprintk("%s: token has unexpected sealed flag\n", __func__); ++ return GSS_S_FAILURE; ++ } ++ ++ for (i = 3; i < 8; i++) ++ if (ptr[i] != 0xff) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ if (ctx->initiate) { ++ cksumkey = ctx->acceptor_sign; ++ cksum_usage = KG_USAGE_ACCEPTOR_SIGN; ++ } else { ++ cksumkey = ctx->initiator_sign; ++ cksum_usage = KG_USAGE_INITIATOR_SIGN; ++ } ++ ++ if (make_checksum_v2(ctx, ptr, GSS_KRB5_TOK_HDR_LEN, message_buffer, 0, ++ cksumkey, cksum_usage, &cksumobj)) ++ return GSS_S_FAILURE; ++ ++ if (memcmp(cksumobj.data, ptr + GSS_KRB5_TOK_HDR_LEN, ++ ctx->gk5e->cksumlength)) ++ return GSS_S_BAD_SIG; ++ ++ /* it got through unscathed. 
Make sure the context is unexpired */ ++ now = get_seconds(); ++ if (now > ctx->endtime) ++ return GSS_S_CONTEXT_EXPIRED; ++ ++ /* do sequencing checks (none yet); seqnum is token octets 8..15 */ ++ ++ seqnum = be64_to_cpup((__be64 *)(ptr + 8)); ++ ++ return GSS_S_COMPLETE; ++} ++ ++u32 ++gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, ++ struct xdr_buf *message_buffer, ++ struct xdr_netobj *read_token) ++{ ++ struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; ++ ++ switch (ctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_verify_mic_v1(ctx, message_buffer, read_token); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_verify_mic_v2(ctx, message_buffer, read_token); ++ } ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_krb5_wrap.c 2010-08-23 11:01:00.394576083 -0400 +@@ -1,3 +1,33 @@ ++/* ++ * COPYRIGHT (c) 2008 ++ * The Regents of the University of Michigan ++ * ALL RIGHTS RESERVED ++ * ++ * Permission is granted to use, copy, create derivative works ++ * and redistribute this software and such derivative works ++ * for any purpose, so long as the name of The University of ++ * Michigan is not used in any advertising or publicity ++ * pertaining to the use of distribution of this software ++ * without specific, written prior authorization. If the ++ * above copyright notice or any other identification of the ++ * University of Michigan is included in any copy of any ++ * portion of this software, then the disclaimer below must ++ * also be included. 
++ * ++ * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION ++ * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY ++ * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF ++ * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ++ * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ++ * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE ++ * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR ++ * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING ++ * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN ++ * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGES. ++ */ ++ + #include + #include + #include +@@ -12,10 +42,7 @@ + static inline int + gss_krb5_padding(int blocksize, int length) + { +- /* Most of the code is block-size independent but currently we +- * use only 8: */ +- BUG_ON(blocksize != 8); +- return 8 - (length & 7); ++ return blocksize - (length % blocksize); + } + + static inline void +@@ -86,8 +113,8 @@ out: + return 0; + } + +-static void +-make_confounder(char *p, u32 conflen) ++void ++gss_krb5_make_confounder(char *p, u32 conflen) + { + static u64 i = 0; + u64 *q = (u64 *)p; +@@ -127,69 +154,73 @@ make_confounder(char *p, u32 conflen) + + /* XXX factor out common code with seal/unseal. 
*/ + +-u32 +-gss_wrap_kerberos(struct gss_ctx *ctx, int offset, ++static u32 ++gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, + struct xdr_buf *buf, struct page **pages) + { +- struct krb5_ctx *kctx = ctx->internal_ctx_id; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; + int blocksize = 0, plainlen; + unsigned char *ptr, *msg_start; + s32 now; + int headlen; + struct page **tmp_pages; + u32 seq_send; ++ u8 *cksumkey; ++ u32 conflen = kctx->gk5e->conflen; + +- dprintk("RPC: gss_wrap_kerberos\n"); ++ dprintk("RPC: %s\n", __func__); + + now = get_seconds(); + + blocksize = crypto_blkcipher_blocksize(kctx->enc); + gss_krb5_add_padding(buf, offset, blocksize); + BUG_ON((buf->len - offset) % blocksize); +- plainlen = blocksize + buf->len - offset; ++ plainlen = conflen + buf->len - offset; + +- headlen = g_token_size(&kctx->mech_used, 24 + plainlen) - +- (buf->len - offset); ++ headlen = g_token_size(&kctx->mech_used, ++ GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength + plainlen) - ++ (buf->len - offset); + + ptr = buf->head[0].iov_base + offset; + /* shift data to make room for header. */ ++ xdr_extend_head(buf, offset, headlen); ++ + /* XXX Would be cleverer to encrypt while copying. */ +- /* XXX bounds checking, slack, etc. 
*/ +- memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset); +- buf->head[0].iov_len += headlen; +- buf->len += headlen; + BUG_ON((buf->len - offset - headlen) % blocksize); + + g_make_token_header(&kctx->mech_used, +- GSS_KRB5_TOK_HDR_LEN + 8 + plainlen, &ptr); ++ GSS_KRB5_TOK_HDR_LEN + ++ kctx->gk5e->cksumlength + plainlen, &ptr); + + + /* ptr now at header described in rfc 1964, section 1.2.1: */ + ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff); + ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff); + +- msg_start = ptr + 24; ++ msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength; + +- *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5); ++ *(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); + memset(ptr + 4, 0xff, 4); +- *(__be16 *)(ptr + 4) = htons(SEAL_ALG_DES); ++ *(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); + +- make_confounder(msg_start, blocksize); ++ gss_krb5_make_confounder(msg_start, conflen); ++ ++ if (kctx->gk5e->keyed_cksum) ++ cksumkey = kctx->cksum; ++ else ++ cksumkey = NULL; + + /* XXXJBF: UGH!: */ + tmp_pages = buf->pages; + buf->pages = pages; +- if (make_checksum("md5", ptr, 8, buf, +- offset + headlen - blocksize, &md5cksum)) ++ if (make_checksum(kctx, ptr, 8, buf, offset + headlen - conflen, ++ cksumkey, KG_USAGE_SEAL, &md5cksum)) + return GSS_S_FAILURE; + buf->pages = tmp_pages; + +- if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, +- md5cksum.data, md5cksum.len)) +- return GSS_S_FAILURE; +- memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8); ++ memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); + + spin_lock(&krb5_seq_lock); + seq_send = kctx->seq_send++; +@@ -197,25 +228,42 @@ gss_wrap_kerberos(struct gss_ctx *ctx, i + + /* XXX would probably be more efficient to compute checksum + * and encrypt at the same time: */ +- if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, ++ if ((krb5_make_seq_num(kctx, kctx->seq, kctx->initiate ? 
0 : 0xff, + seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))) + return GSS_S_FAILURE; + +- if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, +- pages)) +- return GSS_S_FAILURE; ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { ++ struct crypto_blkcipher *cipher; ++ int err; ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return GSS_S_FAILURE; ++ ++ krb5_rc4_setup_enc_key(kctx, cipher, seq_send); ++ ++ err = gss_encrypt_xdr_buf(cipher, buf, ++ offset + headlen - conflen, pages); ++ crypto_free_blkcipher(cipher); ++ if (err) ++ return GSS_S_FAILURE; ++ } else { ++ if (gss_encrypt_xdr_buf(kctx->enc, buf, ++ offset + headlen - conflen, pages)) ++ return GSS_S_FAILURE; ++ } + + return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; + } + +-u32 +-gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) ++static u32 ++gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) + { +- struct krb5_ctx *kctx = ctx->internal_ctx_id; + int signalg; + int sealalg; +- char cksumdata[16]; +- struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; ++ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; ++ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), ++ .data = cksumdata}; + s32 now; + int direction; + s32 seqnum; +@@ -224,6 +272,9 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + void *data_start, *orig_start; + int data_len; + int blocksize; ++ u32 conflen = kctx->gk5e->conflen; ++ int crypt_offset; ++ u8 *cksumkey; + + dprintk("RPC: gss_unwrap_kerberos\n"); + +@@ -241,29 +292,65 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + /* get the sign and seal algorithms */ + + signalg = ptr[2] + (ptr[3] << 8); +- if (signalg != SGN_ALG_DES_MAC_MD5) ++ if (signalg != kctx->gk5e->signalg) + return GSS_S_DEFECTIVE_TOKEN; + + sealalg = ptr[4] + (ptr[5] << 8); +- if (sealalg != SEAL_ALG_DES) ++ if (sealalg != kctx->gk5e->sealalg) + return GSS_S_DEFECTIVE_TOKEN; + + 
if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) + return GSS_S_DEFECTIVE_TOKEN; + +- if (gss_decrypt_xdr_buf(kctx->enc, buf, +- ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base)) +- return GSS_S_DEFECTIVE_TOKEN; ++ /* ++ * Data starts after token header and checksum. ptr points ++ * to the beginning of the token header ++ */ ++ crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) - ++ (unsigned char *)buf->head[0].iov_base; ++ ++ /* ++ * Need plaintext seqnum to derive encryption key for arcfour-hmac ++ */ ++ if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN, ++ ptr + 8, &direction, &seqnum)) ++ return GSS_S_BAD_SIG; + +- if (make_checksum("md5", ptr, 8, buf, +- ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base, &md5cksum)) +- return GSS_S_FAILURE; ++ if ((kctx->initiate && direction != 0xff) || ++ (!kctx->initiate && direction != 0)) ++ return GSS_S_BAD_SIG; ++ ++ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { ++ struct crypto_blkcipher *cipher; ++ int err; ++ ++ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, ++ CRYPTO_ALG_ASYNC); ++ if (IS_ERR(cipher)) ++ return GSS_S_FAILURE; ++ ++ krb5_rc4_setup_enc_key(kctx, cipher, seqnum); ++ ++ err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset); ++ crypto_free_blkcipher(cipher); ++ if (err) ++ return GSS_S_DEFECTIVE_TOKEN; ++ } else { ++ if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset)) ++ return GSS_S_DEFECTIVE_TOKEN; ++ } + +- if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, +- md5cksum.data, md5cksum.len)) ++ if (kctx->gk5e->keyed_cksum) ++ cksumkey = kctx->cksum; ++ else ++ cksumkey = NULL; ++ ++ if (make_checksum(kctx, ptr, 8, buf, crypt_offset, ++ cksumkey, KG_USAGE_SEAL, &md5cksum)) + return GSS_S_FAILURE; + +- if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8)) ++ if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, ++ kctx->gk5e->cksumlength)) + return GSS_S_BAD_SIG; + + /* it got through unscathed. 
Make sure the context is unexpired */ +@@ -275,19 +362,12 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + + /* do sequencing checks */ + +- if (krb5_get_seq_num(kctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, +- &direction, &seqnum)) +- return GSS_S_BAD_SIG; +- +- if ((kctx->initiate && direction != 0xff) || +- (!kctx->initiate && direction != 0)) +- return GSS_S_BAD_SIG; +- + /* Copy the data back to the right position. XXX: Would probably be + * better to copy and encrypt at the same time. */ + + blocksize = crypto_blkcipher_blocksize(kctx->enc); +- data_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8 + blocksize; ++ data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) + ++ conflen; + orig_start = buf->head[0].iov_base + offset; + data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; + memmove(orig_start, data_start, data_len); +@@ -299,3 +379,209 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, + + return GSS_S_COMPLETE; + } ++ ++/* ++ * We cannot currently handle tokens with rotated data. We need a ++ * generalized routine to rotate the data in place. It is anticipated ++ * that we won't encounter rotated data in the general case. 
++ */ ++static u32 ++rotate_left(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, u16 rrc) ++{ ++ unsigned int bodylen = buf->len - offset - GSS_KRB5_TOK_HDR_LEN; ++ unsigned int realrrc = bodylen ? rrc % bodylen : rrc; /* avoid % 0 */ ++ ++ if (realrrc == 0) ++ return 0; ++ dprintk("%s: cannot process token with rotated data: " ++ "rrc %u, realrrc %u\n", __func__, rrc, realrrc); ++ return 1; ++} ++ ++static u32 ++gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, ++ struct xdr_buf *buf, struct page **pages) ++{ ++ int blocksize; ++ u8 *ptr, *plainhdr; ++ s32 now; ++ u8 flags = 0x00; ++ __be16 *be16ptr, ec = 0; ++ __be64 *be64ptr; ++ u32 err; ++ ++ dprintk("RPC: %s\n", __func__); ++ ++ if (kctx->gk5e->encrypt_v2 == NULL) ++ return GSS_S_FAILURE; ++ ++ /* make room for gss token header */ ++ if (xdr_extend_head(buf, offset, GSS_KRB5_TOK_HDR_LEN)) ++ return GSS_S_FAILURE; ++ ++ /* construct gss token header */ ++ ptr = plainhdr = buf->head[0].iov_base + offset; ++ *ptr++ = (unsigned char) ((KG2_TOK_WRAP>>8) & 0xff); ++ *ptr++ = (unsigned char) (KG2_TOK_WRAP & 0xff); ++ ++ if ((kctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0) ++ flags |= KG2_TOKEN_FLAG_SENTBYACCEPTOR; ++ if ((kctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) != 0) ++ flags |= KG2_TOKEN_FLAG_ACCEPTORSUBKEY; ++ /* We always do confidentiality in wrap tokens */ ++ flags |= KG2_TOKEN_FLAG_SEALED; ++ ++ *ptr++ = flags; ++ *ptr++ = 0xff; ++ be16ptr = (__be16 *)ptr; ++ ++ blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc); ++ *be16ptr++ = cpu_to_be16(ec); ++ /* "inner" token header always uses 0 for RRC */ ++ *be16ptr++ = cpu_to_be16(0); ++ ++ be64ptr = (__be64 *)be16ptr; ++ spin_lock(&krb5_seq_lock); ++ *be64ptr = cpu_to_be64(kctx->seq_send64++); ++ spin_unlock(&krb5_seq_lock); ++ ++ err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages); ++ if (err) ++ return err; ++ ++ now = get_seconds(); ++ return (kctx->endtime < now) ? 
GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; ++} ++ ++static u32 ++gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) ++{ ++ s32 now; ++ u64 seqnum; ++ u8 *ptr; ++ u8 flags = 0x00; ++ u16 ec, rrc; ++ int err; ++ u32 headskip, tailskip; ++ u8 decrypted_hdr[GSS_KRB5_TOK_HDR_LEN]; ++ unsigned int movelen; ++ ++ ++ dprintk("RPC: %s\n", __func__); ++ ++ if (kctx->gk5e->decrypt_v2 == NULL) ++ return GSS_S_FAILURE; ++ ++ ptr = buf->head[0].iov_base + offset; ++ ++ if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_WRAP) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ flags = ptr[2]; ++ if ((!kctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) || ++ (kctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR))) ++ return GSS_S_BAD_SIG; ++ ++ if ((flags & KG2_TOKEN_FLAG_SEALED) == 0) { ++ dprintk("%s: token missing expected sealed flag\n", __func__); ++ return GSS_S_DEFECTIVE_TOKEN; ++ } ++ ++ if (ptr[3] != 0xff) ++ return GSS_S_DEFECTIVE_TOKEN; ++ ++ ec = be16_to_cpup((__be16 *)(ptr + 4)); ++ rrc = be16_to_cpup((__be16 *)(ptr + 6)); ++ ++ seqnum = be64_to_cpup((__be64 *)(ptr + 8)); ++ ++ if (rrc != 0) { ++ err = rotate_left(kctx, offset, buf, rrc); ++ if (err) ++ return GSS_S_FAILURE; ++ } ++ ++ err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf, ++ &headskip, &tailskip); ++ if (err) ++ return GSS_S_FAILURE; ++ ++ /* ++ * Retrieve the decrypted gss token header and verify ++ * it against the original ++ */ ++ err = read_bytes_from_xdr_buf(buf, ++ buf->len - GSS_KRB5_TOK_HDR_LEN - tailskip, ++ decrypted_hdr, GSS_KRB5_TOK_HDR_LEN); ++ if (err) { ++ dprintk("%s: error %u getting decrypted_hdr\n", __func__, err); ++ return GSS_S_FAILURE; ++ } ++ if (memcmp(ptr, decrypted_hdr, 6) ++ || memcmp(ptr + 8, decrypted_hdr + 8, 8)) { ++ dprintk("%s: token hdr, plaintext hdr mismatch!\n", __func__); ++ return GSS_S_FAILURE; ++ } ++ ++ /* do sequencing checks */ ++ ++ /* it got through unscathed. 
Make sure the context is unexpired */ ++ now = get_seconds(); ++ if (now > kctx->endtime) ++ return GSS_S_CONTEXT_EXPIRED; ++ ++ /* ++ * Move the head data back to the right position in xdr_buf. ++ * We ignore any "ec" data since it might be in the head or ++ * the tail, and we really don't need to deal with it. ++ * Note that buf->head[0].iov_len may indicate the available ++ * head buffer space rather than that actually occupied. ++ */ ++ movelen = min_t(unsigned int, buf->head[0].iov_len, buf->len); ++ movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip; ++ BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen > ++ buf->head[0].iov_len); ++ memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen); ++ buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip; ++ buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip; ++ ++ return GSS_S_COMPLETE; ++} ++ ++u32 ++gss_wrap_kerberos(struct gss_ctx *gctx, int offset, ++ struct xdr_buf *buf, struct page **pages) ++{ ++ struct krb5_ctx *kctx = gctx->internal_ctx_id; ++ ++ switch (kctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_wrap_kerberos_v1(kctx, offset, buf, pages); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_wrap_kerberos_v2(kctx, offset, buf, pages); ++ } ++} ++ ++u32 ++gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf) ++{ ++ struct krb5_ctx *kctx = gctx->internal_ctx_id; ++ ++ switch (kctx->enctype) { ++ default: ++ BUG(); ++ case ENCTYPE_DES_CBC_RAW: ++ case ENCTYPE_DES3_CBC_RAW: ++ case ENCTYPE_ARCFOUR_HMAC: ++ return gss_unwrap_kerberos_v1(kctx, offset, buf); ++ case ENCTYPE_AES128_CTS_HMAC_SHA1_96: ++ case ENCTYPE_AES256_CTS_HMAC_SHA1_96: ++ return gss_unwrap_kerberos_v2(kctx, offset, buf); ++ } ++} ++ +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c +--- 
linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_mech_switch.c 2010-08-23 11:01:00.395574706 -0400 +@@ -249,14 +249,15 @@ EXPORT_SYMBOL_GPL(gss_mech_put); + int + gss_import_sec_context(const void *input_token, size_t bufsize, + struct gss_api_mech *mech, +- struct gss_ctx **ctx_id) ++ struct gss_ctx **ctx_id, ++ gfp_t gfp_mask) + { +- if (!(*ctx_id = kzalloc(sizeof(**ctx_id), GFP_KERNEL))) ++ if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask))) + return -ENOMEM; + (*ctx_id)->mech_type = gss_mech_get(mech); + + return mech->gm_ops +- ->gss_import_sec_context(input_token, bufsize, *ctx_id); ++ ->gss_import_sec_context(input_token, bufsize, *ctx_id, gfp_mask); + } + + /* gss_get_mic: compute a mic over message and return mic_token. */ +@@ -285,6 +286,20 @@ gss_verify_mic(struct gss_ctx *context_ + mic_token); + } + ++/* ++ * This function is called from both the client and server code. ++ * Each makes guarantees about how much "slack" space is available ++ * for the underlying function in "buf"'s head and tail while ++ * performing the wrap. ++ * ++ * The client and server code allocate RPC_MAX_AUTH_SIZE extra ++ * space in both the head and tail which is available for use by ++ * the wrap function. ++ * ++ * Underlying functions should verify they do not use more than ++ * RPC_MAX_AUTH_SIZE of extra space in either the head or tail ++ * when performing the wrap. 
++ */ + u32 + gss_wrap(struct gss_ctx *ctx_id, + int offset, +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/gss_spkm3_mech.c 2010-08-23 11:01:00.396574085 -0400 +@@ -84,13 +84,14 @@ simple_get_netobj(const void *p, const v + + static int + gss_import_sec_context_spkm3(const void *p, size_t len, +- struct gss_ctx *ctx_id) ++ struct gss_ctx *ctx_id, ++ gfp_t gfp_mask) + { + const void *end = (const void *)((const char *)p + len); + struct spkm3_ctx *ctx; + int version; + +- if (!(ctx = kzalloc(sizeof(*ctx), GFP_NOFS))) ++ if (!(ctx = kzalloc(sizeof(*ctx), gfp_mask))) + goto out_err; + + p = simple_get_bytes(p, end, &version, sizeof(version)); +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/Makefile 2010-08-23 11:01:00.387574079 -0400 +@@ -10,7 +10,7 @@ auth_rpcgss-objs := auth_gss.o gss_gener + obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o + + rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ +- gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o ++ gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o + + obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o + +diff -up linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c.orig linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c +--- linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/auth_gss/svcauth_gss.c 2010-08-23 11:01:00.396574085 -0400 +@@ -494,7 +494,7 @@ static int rsc_parse(struct cache_detail + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; 
+- status = gss_import_sec_context(buf, len, gm, &rsci.mechctx); ++ status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, GFP_KERNEL); + if (status) + goto out; + +@@ -1315,6 +1315,14 @@ svcauth_gss_wrap_resp_priv(struct svc_rq + inpages = resbuf->pages; + /* XXX: Would be better to write some xdr helper functions for + * nfs{2,3,4}xdr.c that place the data right, instead of copying: */ ++ ++ /* ++ * If there is currently tail data, make sure there is ++ * room for the head, tail, and 2 * RPC_MAX_AUTH_SIZE in ++ * the page, and move the current tail data such that ++ * there is RPC_MAX_AUTH_SIZE slack space available in ++ * both the head and tail. ++ */ + if (resbuf->tail[0].iov_base) { + BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base + + PAGE_SIZE); +@@ -1327,6 +1335,13 @@ svcauth_gss_wrap_resp_priv(struct svc_rq + resbuf->tail[0].iov_len); + resbuf->tail[0].iov_base += RPC_MAX_AUTH_SIZE; + } ++ /* ++ * If there is no current tail data, make sure there is ++ * room for the head data, and 2 * RPC_MAX_AUTH_SIZE in the ++ * allotted page, and set up tail information such that there ++ * is RPC_MAX_AUTH_SIZE slack space available in both the ++ * head and tail. 
++ */ + if (resbuf->tail[0].iov_base == NULL) { + if (resbuf->head[0].iov_len + 2*RPC_MAX_AUTH_SIZE > PAGE_SIZE) + return -ENOMEM; +diff -up linux-2.6.34.noarch/net/sunrpc/clnt.c.orig linux-2.6.34.noarch/net/sunrpc/clnt.c +--- linux-2.6.34.noarch/net/sunrpc/clnt.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/clnt.c 2010-08-23 11:01:00.397622347 -0400 +@@ -556,26 +556,16 @@ static const struct rpc_call_ops rpc_def + */ + struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) + { +- struct rpc_task *task, *ret; ++ struct rpc_task *task; + + task = rpc_new_task(task_setup_data); +- if (task == NULL) { +- rpc_release_calldata(task_setup_data->callback_ops, +- task_setup_data->callback_data); +- ret = ERR_PTR(-ENOMEM); ++ if (IS_ERR(task)) + goto out; +- } + +- if (task->tk_status != 0) { +- ret = ERR_PTR(task->tk_status); +- rpc_put_task(task); +- goto out; +- } + atomic_inc(&task->tk_count); + rpc_execute(task); +- ret = task; + out: +- return ret; ++ return task; + } + EXPORT_SYMBOL_GPL(rpc_run_task); + +@@ -657,9 +647,8 @@ struct rpc_task *rpc_run_bc_task(struct + * Create an rpc_task to send the data + */ + task = rpc_new_task(&task_setup_data); +- if (!task) { ++ if (IS_ERR(task)) { + xprt_free_bc_request(req); +- task = ERR_PTR(-ENOMEM); + goto out; + } + task->tk_rqstp = req; +diff -up linux-2.6.34.noarch/net/sunrpc/sched.c.orig linux-2.6.34.noarch/net/sunrpc/sched.c +--- linux-2.6.34.noarch/net/sunrpc/sched.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/sched.c 2010-08-23 11:01:00.398564598 -0400 +@@ -25,7 +25,6 @@ + + #ifdef RPC_DEBUG + #define RPCDBG_FACILITY RPCDBG_SCHED +-#define RPC_TASK_MAGIC_ID 0xf00baa + #endif + + /* +@@ -237,7 +236,6 @@ static void rpc_task_set_debuginfo(struc + { + static atomic_t rpc_pid; + +- task->tk_magic = RPC_TASK_MAGIC_ID; + task->tk_pid = atomic_inc_return(&rpc_pid); + } + #else +@@ -360,9 +358,6 @@ static void __rpc_do_wake_up_task(struct + 
dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", + task->tk_pid, jiffies); + +-#ifdef RPC_DEBUG +- BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); +-#endif + /* Has the task been executed yet? If not, we cannot wake it up! */ + if (!RPC_IS_ACTIVATED(task)) { + printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); +@@ -834,7 +829,7 @@ static void rpc_init_task(struct rpc_tas + } + + /* starting timestamp */ +- task->tk_start = jiffies; ++ task->tk_start = ktime_get(); + + dprintk("RPC: new task initialized, procpid %u\n", + task_pid_nr(current)); +@@ -856,16 +851,23 @@ struct rpc_task *rpc_new_task(const stru + + if (task == NULL) { + task = rpc_alloc_task(); +- if (task == NULL) +- goto out; ++ if (task == NULL) { ++ rpc_release_calldata(setup_data->callback_ops, ++ setup_data->callback_data); ++ return ERR_PTR(-ENOMEM); ++ } + flags = RPC_TASK_DYNAMIC; + } + + rpc_init_task(task, setup_data); ++ if (task->tk_status < 0) { ++ int err = task->tk_status; ++ rpc_put_task(task); ++ return ERR_PTR(err); ++ } + + task->tk_flags |= flags; + dprintk("RPC: allocated task %p\n", task); +-out: + return task; + } + +@@ -909,9 +911,6 @@ EXPORT_SYMBOL_GPL(rpc_put_task); + + static void rpc_release_task(struct rpc_task *task) + { +-#ifdef RPC_DEBUG +- BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); +-#endif + dprintk("RPC: %5u release task\n", task->tk_pid); + + if (!list_empty(&task->tk_task)) { +@@ -923,9 +922,6 @@ static void rpc_release_task(struct rpc_ + } + BUG_ON (RPC_IS_QUEUED(task)); + +-#ifdef RPC_DEBUG +- task->tk_magic = 0; +-#endif + /* Wake up anyone who is waiting for task completion */ + rpc_mark_complete_task(task); + +diff -up linux-2.6.34.noarch/net/sunrpc/stats.c.orig linux-2.6.34.noarch/net/sunrpc/stats.c +--- linux-2.6.34.noarch/net/sunrpc/stats.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/stats.c 2010-08-23 11:01:00.399574225 -0400 +@@ -144,7 +144,7 @@ void rpc_count_iostats(struct rpc_task * + struct rpc_rqst 
*req = task->tk_rqstp; + struct rpc_iostats *stats; + struct rpc_iostats *op_metrics; +- long rtt, execute, queue; ++ ktime_t delta; + + if (!task->tk_client || !task->tk_client->cl_metrics || !req) + return; +@@ -156,23 +156,16 @@ void rpc_count_iostats(struct rpc_task * + op_metrics->om_ntrans += req->rq_ntrans; + op_metrics->om_timeouts += task->tk_timeouts; + +- op_metrics->om_bytes_sent += task->tk_bytes_sent; ++ op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; + op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; + +- queue = (long)req->rq_xtime - task->tk_start; +- if (queue < 0) +- queue = -queue; +- op_metrics->om_queue += queue; +- +- rtt = task->tk_rtt; +- if (rtt < 0) +- rtt = -rtt; +- op_metrics->om_rtt += rtt; +- +- execute = (long)jiffies - task->tk_start; +- if (execute < 0) +- execute = -execute; +- op_metrics->om_execute += execute; ++ delta = ktime_sub(req->rq_xtime, task->tk_start); ++ op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta); ++ ++ op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); ++ ++ delta = ktime_sub(ktime_get(), task->tk_start); ++ op_metrics->om_execute = ktime_add(op_metrics->om_execute, delta); + } + + static void _print_name(struct seq_file *seq, unsigned int op, +@@ -186,8 +179,6 @@ static void _print_name(struct seq_file + seq_printf(seq, "\t%12u: ", op); + } + +-#define MILLISECS_PER_JIFFY (1000 / HZ) +- + void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) + { + struct rpc_iostats *stats = clnt->cl_metrics; +@@ -214,9 +205,9 @@ void rpc_print_iostats(struct seq_file * + metrics->om_timeouts, + metrics->om_bytes_sent, + metrics->om_bytes_recv, +- metrics->om_queue * MILLISECS_PER_JIFFY, +- metrics->om_rtt * MILLISECS_PER_JIFFY, +- metrics->om_execute * MILLISECS_PER_JIFFY); ++ ktime_to_ms(metrics->om_queue), ++ ktime_to_ms(metrics->om_rtt), ++ ktime_to_ms(metrics->om_execute)); + } + } + EXPORT_SYMBOL_GPL(rpc_print_iostats); +diff -up 
linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-23 11:01:00.400574086 -0400 +@@ -762,6 +762,7 @@ int write_bytes_to_xdr_buf(struct xdr_bu + __write_bytes_to_xdr_buf(&subbuf, obj, len); + return 0; + } ++EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); + + int + xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +diff -up linux-2.6.34.noarch/net/sunrpc/xprt.c.orig linux-2.6.34.noarch/net/sunrpc/xprt.c +--- linux-2.6.34.noarch/net/sunrpc/xprt.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprt.c 2010-08-23 11:01:00.401372963 -0400 +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -62,7 +63,6 @@ + * Local functions + */ + static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); +-static inline void do_xprt_reserve(struct rpc_task *); + static void xprt_connect_status(struct rpc_task *task); + static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); + +@@ -711,12 +711,16 @@ void xprt_connect(struct rpc_task *task) + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; + +- task->tk_timeout = xprt->connect_timeout; ++ task->tk_timeout = task->tk_rqstp->rq_timeout; + rpc_sleep_on(&xprt->pending, task, xprt_connect_status); ++ ++ if (test_bit(XPRT_CLOSING, &xprt->state)) ++ return; ++ if (xprt_test_and_set_connecting(xprt)) ++ return; + xprt->stat.connect_start = jiffies; + xprt->ops->connect(task); + } +- return; + } + + static void xprt_connect_status(struct rpc_task *task) +@@ -771,25 +775,19 @@ struct rpc_rqst *xprt_lookup_rqst(struct + } + EXPORT_SYMBOL_GPL(xprt_lookup_rqst); + +-/** +- * xprt_update_rtt - update an RPC client's RTT state after receiving a reply +- * @task: RPC request that recently completed +- * +- */ +-void xprt_update_rtt(struct rpc_task *task) ++static void xprt_update_rtt(struct 
rpc_task *task) + { + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + unsigned timer = task->tk_msg.rpc_proc->p_timer; ++ long m = usecs_to_jiffies(ktime_to_us(req->rq_rtt)); + + if (timer) { + if (req->rq_ntrans == 1) +- rpc_update_rtt(rtt, timer, +- (long)jiffies - req->rq_xtime); ++ rpc_update_rtt(rtt, timer, m); + rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); + } + } +-EXPORT_SYMBOL_GPL(xprt_update_rtt); + + /** + * xprt_complete_rqst - called when reply processing is complete +@@ -807,7 +805,9 @@ void xprt_complete_rqst(struct rpc_task + task->tk_pid, ntohl(req->rq_xid), copied); + + xprt->stat.recvs++; +- task->tk_rtt = (long)jiffies - req->rq_xtime; ++ req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime); ++ if (xprt->ops->timer != NULL) ++ xprt_update_rtt(task); + + list_del_init(&req->rq_list); + req->rq_private_buf.len = copied; +@@ -906,7 +906,7 @@ void xprt_transmit(struct rpc_task *task + return; + + req->rq_connect_cookie = xprt->connect_cookie; +- req->rq_xtime = jiffies; ++ req->rq_xtime = ktime_get(); + status = xprt->ops->send_request(task); + if (status != 0) { + task->tk_status = status; +@@ -935,7 +935,7 @@ void xprt_transmit(struct rpc_task *task + spin_unlock_bh(&xprt->transport_lock); + } + +-static inline void do_xprt_reserve(struct rpc_task *task) ++static void xprt_alloc_slot(struct rpc_task *task) + { + struct rpc_xprt *xprt = task->tk_xprt; + +@@ -955,6 +955,16 @@ static inline void do_xprt_reserve(struc + rpc_sleep_on(&xprt->backlog, task, NULL); + } + ++static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) ++{ ++ memset(req, 0, sizeof(*req)); /* mark unused */ ++ ++ spin_lock(&xprt->reserve_lock); ++ list_add(&req->rq_list, &xprt->free); ++ rpc_wake_up_next(&xprt->backlog); ++ spin_unlock(&xprt->reserve_lock); ++} ++ + /** + * xprt_reserve - allocate an RPC request slot + * @task: RPC task requesting a slot allocation +@@ -968,7 +978,7 @@ void xprt_reserve(struct rpc_task 
*task) + + task->tk_status = -EIO; + spin_lock(&xprt->reserve_lock); +- do_xprt_reserve(task); ++ xprt_alloc_slot(task); + spin_unlock(&xprt->reserve_lock); + } + +@@ -1006,14 +1016,10 @@ void xprt_release(struct rpc_task *task) + { + struct rpc_xprt *xprt; + struct rpc_rqst *req; +- int is_bc_request; + + if (!(req = task->tk_rqstp)) + return; + +- /* Preallocated backchannel request? */ +- is_bc_request = bc_prealloc(req); +- + xprt = req->rq_xprt; + rpc_count_iostats(task); + spin_lock_bh(&xprt->transport_lock); +@@ -1027,21 +1033,16 @@ void xprt_release(struct rpc_task *task) + mod_timer(&xprt->timer, + xprt->last_used + xprt->idle_timeout); + spin_unlock_bh(&xprt->transport_lock); +- if (!bc_prealloc(req)) ++ if (req->rq_buffer) + xprt->ops->buf_free(req->rq_buffer); + task->tk_rqstp = NULL; + if (req->rq_release_snd_buf) + req->rq_release_snd_buf(req); + + dprintk("RPC: %5u release request %p\n", task->tk_pid, req); +- if (likely(!is_bc_request)) { +- memset(req, 0, sizeof(*req)); /* mark unused */ +- +- spin_lock(&xprt->reserve_lock); +- list_add(&req->rq_list, &xprt->free); +- rpc_wake_up_next(&xprt->backlog); +- spin_unlock(&xprt->reserve_lock); +- } else ++ if (likely(!bc_prealloc(req))) ++ xprt_free_slot(xprt, req); ++ else + xprt_free_bc_request(req); + } + +diff -up linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c.orig linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c +--- linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/transport.c 2010-08-23 11:01:00.402563985 -0400 +@@ -305,7 +305,6 @@ xprt_setup_rdma(struct xprt_create *args + /* 60 second timeout, no retries */ + xprt->timeout = &xprt_rdma_default_timeout; + xprt->bind_timeout = (60U * HZ); +- xprt->connect_timeout = (60U * HZ); + xprt->reestablish_timeout = (5U * HZ); + xprt->idle_timeout = (5U * 60 * HZ); + +@@ -449,21 +448,19 @@ xprt_rdma_connect(struct rpc_task *task) + struct rpc_xprt *xprt = (struct 
rpc_xprt *)task->tk_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + +- if (!xprt_test_and_set_connecting(xprt)) { +- if (r_xprt->rx_ep.rep_connected != 0) { +- /* Reconnect */ +- schedule_delayed_work(&r_xprt->rdma_connect, +- xprt->reestablish_timeout); +- xprt->reestablish_timeout <<= 1; +- if (xprt->reestablish_timeout > (30 * HZ)) +- xprt->reestablish_timeout = (30 * HZ); +- else if (xprt->reestablish_timeout < (5 * HZ)) +- xprt->reestablish_timeout = (5 * HZ); +- } else { +- schedule_delayed_work(&r_xprt->rdma_connect, 0); +- if (!RPC_IS_ASYNC(task)) +- flush_scheduled_work(); +- } ++ if (r_xprt->rx_ep.rep_connected != 0) { ++ /* Reconnect */ ++ schedule_delayed_work(&r_xprt->rdma_connect, ++ xprt->reestablish_timeout); ++ xprt->reestablish_timeout <<= 1; ++ if (xprt->reestablish_timeout > (30 * HZ)) ++ xprt->reestablish_timeout = (30 * HZ); ++ else if (xprt->reestablish_timeout < (5 * HZ)) ++ xprt->reestablish_timeout = (5 * HZ); ++ } else { ++ schedule_delayed_work(&r_xprt->rdma_connect, 0); ++ if (!RPC_IS_ASYNC(task)) ++ flush_scheduled_work(); + } + } + +@@ -677,7 +674,7 @@ xprt_rdma_send_request(struct rpc_task * + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + +- task->tk_bytes_sent += rqst->rq_snd_buf.len; ++ rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; + rqst->rq_bytes_sent = 0; + return 0; + +diff -up linux-2.6.34.noarch/net/sunrpc/xprtsock.c.orig linux-2.6.34.noarch/net/sunrpc/xprtsock.c +--- linux-2.6.34.noarch/net/sunrpc/xprtsock.c.orig 2010-08-23 11:00:23.890501549 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtsock.c 2010-08-23 11:01:00.403564023 -0400 +@@ -138,20 +138,6 @@ static ctl_table sunrpc_table[] = { + #endif + + /* +- * Time out for an RPC UDP socket connect. UDP socket connects are +- * synchronous, but we set a timeout anyway in case of resource +- * exhaustion on the local host. 
+- */ +-#define XS_UDP_CONN_TO (5U * HZ) +- +-/* +- * Wait duration for an RPC TCP connection to be established. Solaris +- * NFS over TCP uses 60 seconds, for example, which is in line with how +- * long a server takes to reboot. +- */ +-#define XS_TCP_CONN_TO (60U * HZ) +- +-/* + * Wait duration for a reply from the RPC portmapper. + */ + #define XS_BIND_TO (60U * HZ) +@@ -543,7 +529,7 @@ static int xs_udp_send_request(struct rp + xdr->len - req->rq_bytes_sent, status); + + if (status >= 0) { +- task->tk_bytes_sent += status; ++ req->rq_xmit_bytes_sent += status; + if (status >= req->rq_slen) + return 0; + /* Still some bytes left; set up for a retry later. */ +@@ -639,7 +625,7 @@ static int xs_tcp_send_request(struct rp + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. */ + req->rq_bytes_sent += status; +- task->tk_bytes_sent += status; ++ req->rq_xmit_bytes_sent += status; + if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_bytes_sent = 0; + return 0; +@@ -859,7 +845,6 @@ static void xs_udp_data_ready(struct soc + dst_confirm(skb_dst(skb)); + + xprt_adjust_cwnd(task, copied); +- xprt_update_rtt(task); + xprt_complete_rqst(task, copied); + + out_unlock: +@@ -2022,9 +2007,6 @@ static void xs_connect(struct rpc_task * + struct rpc_xprt *xprt = task->tk_xprt; + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + +- if (xprt_test_and_set_connecting(xprt)) +- return; +- + if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { + dprintk("RPC: xs_connect delayed xprt %p for %lu " + "seconds\n", +@@ -2044,16 +2026,6 @@ static void xs_connect(struct rpc_task * + } + } + +-static void xs_tcp_connect(struct rpc_task *task) +-{ +- struct rpc_xprt *xprt = task->tk_xprt; +- +- /* Exit if we need to wait for socket shutdown to complete */ +- if (test_bit(XPRT_CLOSING, &xprt->state)) +- return; +- xs_connect(task); +-} +- + /** + * xs_udp_print_stats - display UDP socket-specifc stats + * @xprt: rpc_xprt 
struct containing statistics +@@ -2252,7 +2224,7 @@ static struct rpc_xprt_ops xs_tcp_ops = + .release_xprt = xs_tcp_release_xprt, + .rpcbind = rpcb_getport_async, + .set_port = xs_set_port, +- .connect = xs_tcp_connect, ++ .connect = xs_connect, + .buf_alloc = rpc_malloc, + .buf_free = rpc_free, + .send_request = xs_tcp_send_request, +@@ -2343,7 +2315,6 @@ static struct rpc_xprt *xs_setup_udp(str + xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + + xprt->bind_timeout = XS_BIND_TO; +- xprt->connect_timeout = XS_UDP_CONN_TO; + xprt->reestablish_timeout = XS_UDP_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + +@@ -2418,7 +2389,6 @@ static struct rpc_xprt *xs_setup_tcp(str + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; + + xprt->bind_timeout = XS_BIND_TO; +- xprt->connect_timeout = XS_TCP_CONN_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + +@@ -2478,9 +2448,6 @@ static struct rpc_xprt *xs_setup_bc_tcp( + struct sock_xprt *transport; + struct svc_sock *bc_sock; + +- if (!args->bc_xprt) +- ERR_PTR(-EINVAL); +- + xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries); + if (IS_ERR(xprt)) + return xprt; +@@ -2494,7 +2461,6 @@ static struct rpc_xprt *xs_setup_bc_tcp( + /* backchannel */ + xprt_set_bound(xprt); + xprt->bind_timeout = 0; +- xprt->connect_timeout = 0; + xprt->reestablish_timeout = 0; + xprt->idle_timeout = 0; + diff --git a/nfsd-35-fc.patch b/nfsd-35-fc.patch new file mode 100644 index 000000000..ef99b4995 --- /dev/null +++ b/nfsd-35-fc.patch @@ -0,0 +1,1808 @@ +diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt +--- linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 09:57:18.233564439 -0400 +@@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | + | READ | REQ | | Section 18.22 | 
+ | READDIR | REQ | | Section 18.23 | + | READLINK | OPT | | Section 18.24 | +-NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | ++ | RECLAIM_COMPLETE | REQ | | Section 18.51 | + | RELEASE_LOCKOWNER | MNI | | N/A | + | REMOVE | REQ | | Section 18.25 | + | RENAME | REQ | | Section 18.26 | +diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 09:57:18.234564075 -0400 +@@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_ca + .alloc = expkey_alloc, + }; + +-static struct svc_expkey * +-svc_expkey_lookup(struct svc_expkey *item) ++static int ++svc_expkey_hash(struct svc_expkey *item) + { +- struct cache_head *ch; + int hash = item->ek_fsidtype; + char * cp = (char*)item->ek_fsid; + int len = key_len(item->ek_fsidtype); +@@ -270,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *ite + hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); + hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); + hash &= EXPKEY_HASHMASK; ++ return hash; ++} ++ ++static struct svc_expkey * ++svc_expkey_lookup(struct svc_expkey *item) ++{ ++ struct cache_head *ch; ++ int hash = svc_expkey_hash(item); + + ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h, + hash); +@@ -283,13 +290,7 @@ static struct svc_expkey * + svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) + { + struct cache_head *ch; +- int hash = new->ek_fsidtype; +- char * cp = (char*)new->ek_fsid; +- int len = key_len(new->ek_fsidtype); +- +- hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); +- hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS); +- hash &= EXPKEY_HASHMASK; ++ int hash = svc_expkey_hash(new); + + ch = sunrpc_cache_update(&svc_expkey_cache, &new->h, + &old->h, hash); +@@ -738,14 +739,22 @@ struct cache_detail svc_export_cache = { + .alloc = svc_export_alloc, + }; + +-static struct svc_export * +-svc_export_lookup(struct svc_export *exp) ++static 
int ++svc_export_hash(struct svc_export *exp) + { +- struct cache_head *ch; + int hash; ++ + hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); ++ return hash; ++} ++ ++static struct svc_export * ++svc_export_lookup(struct svc_export *exp) ++{ ++ struct cache_head *ch; ++ int hash = svc_export_hash(exp); + + ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h, + hash); +@@ -759,10 +768,7 @@ static struct svc_export * + svc_export_update(struct svc_export *new, struct svc_export *old) + { + struct cache_head *ch; +- int hash; +- hash = hash_ptr(old->ex_client, EXPORT_HASHBITS); +- hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS); +- hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS); ++ int hash = svc_export_hash(old); + + ch = sunrpc_cache_update(&svc_export_cache, &new->h, + &old->h, +@@ -1071,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp) + err = 0; + finish: + kfree(new.ex_pathname); +- if (exp) ++ if (!IS_ERR_OR_NULL(exp)) + exp_put(exp); +- if (fsid_key && !IS_ERR(fsid_key)) ++ if (!IS_ERR_OR_NULL(fsid_key)) + cache_put(&fsid_key->h, &svc_expkey_cache); + path_put(&path); + out_put_clp: +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 10:00:37.257414684 -0400 +@@ -79,11 +79,6 @@ enum nfs_cb_opnum4 { + cb_sequence_dec_sz + \ + op_dec_sz) + +-struct nfs4_rpc_args { +- void *args_op; +- struct nfsd4_cb_sequence args_seq; +-}; +- + /* + * Generic encode routines from fs/nfs/nfs4xdr.c + */ +@@ -428,13 +423,19 @@ static struct rpc_procinfo nfs4_cb_p + }; + + static struct rpc_version nfs_cb_version4 = { ++/* ++ * Note on the callback rpc program version number: despite language in rfc ++ * 5661 section 18.36.3 requiring servers to use 4 in this 
field, the ++ * official xdr descriptions for both 4.0 and 4.1 specify version 1, and ++ * in practice that appears to be what implementations use. The section ++ * 18.36.3 language is expected to be fixed in an erratum. ++ */ + .number = 1, + .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), + .procs = nfs4_cb_procedures + }; + + static struct rpc_version * nfs_cb_version[] = { +- NULL, + &nfs_cb_version4, + }; + +@@ -456,15 +457,14 @@ static struct rpc_program cb_program = { + + static int max_cb_time(void) + { +- return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ; ++ return max(nfsd4_lease/10, (time_t)1) * HZ; + } + + /* Reference counting, callback cleanup, etc., all look racy as heck. +- * And why is cb_set an atomic? */ ++ * And why is cl_cb_set an atomic? */ + +-int setup_callback_client(struct nfs4_client *clp) ++int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb) + { +- struct nfs4_cb_conn *cb = &clp->cl_cb_conn; + struct rpc_timeout timeparms = { + .to_initval = max_cb_time(), + .to_retries = 0, +@@ -476,7 +476,7 @@ int setup_callback_client(struct nfs4_cl + .timeout = &timeparms, + .program = &cb_program, + .prognumber = cb->cb_prog, +- .version = nfs_cb_version[1]->number, ++ .version = 0, + .authflavor = clp->cl_flavor, + .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), + .client_name = clp->cl_principal, +@@ -486,7 +486,7 @@ int setup_callback_client(struct nfs4_cl + if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) + return -EINVAL; + if (cb->cb_minorversion) { +- args.bc_xprt = clp->cl_cb_xprt; ++ args.bc_xprt = cb->cb_xprt; + args.protocol = XPRT_TRANSPORT_BC_TCP; + } + /* Create RPC client */ +@@ -496,7 +496,7 @@ int setup_callback_client(struct nfs4_cl + PTR_ERR(client)); + return PTR_ERR(client); + } +- cb->cb_client = client; ++ nfsd4_set_callback_client(clp, client); + return 0; + + } +@@ -514,8 +514,7 @@ static void nfsd4_cb_probe_done(struct r + if (task->tk_status) + warn_no_callback_path(clp, 
task->tk_status); + else +- atomic_set(&clp->cl_cb_conn.cb_set, 1); +- put_nfs4_client(clp); ++ atomic_set(&clp->cl_cb_set, 1); + } + + static const struct rpc_call_ops nfsd4_cb_probe_ops = { +@@ -537,7 +536,6 @@ int set_callback_cred(void) + + void do_probe_callback(struct nfs4_client *clp) + { +- struct nfs4_cb_conn *cb = &clp->cl_cb_conn; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], + .rpc_argp = clp, +@@ -545,34 +543,28 @@ void do_probe_callback(struct nfs4_clien + }; + int status; + +- status = rpc_call_async(cb->cb_client, &msg, ++ status = rpc_call_async(cb->cl_cb_client, &msg, + RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + &nfsd4_cb_probe_ops, (void *)clp); +- if (status) { ++ if (status) + warn_no_callback_path(clp, status); +- put_nfs4_client(clp); +- } + } + + /* + * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... + */ +-void +-nfsd4_probe_callback(struct nfs4_client *clp) ++void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb) + { + int status; + +- BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set)); ++ BUG_ON(atomic_read(&clp->cl_cb_set)); + +- status = setup_callback_client(clp); ++ status = setup_callback_client(clp, cb); + if (status) { + warn_no_callback_path(clp, status); + return; + } + +- /* the task holds a reference to the nfs4_client struct */ +- atomic_inc(&clp->cl_count); +- + do_probe_callback(clp); + } + +@@ -658,18 +650,32 @@ static void nfsd4_cb_done(struct rpc_tas + } + } + ++ + static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) + { + struct nfs4_delegation *dp = calldata; + struct nfs4_client *clp = dp->dl_client; ++ struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + + nfsd4_cb_done(task, calldata); + ++ if (current_rpc_client == NULL) { ++ /* We're shutting down; give up. */ ++ /* XXX: err, or is it ok just to fall through ++ * and rpc_restart_call? */ ++ return; ++ } ++ + switch (task->tk_status) { + case -EIO: + /* Network partition? 
*/ +- atomic_set(&clp->cl_cb_conn.cb_set, 0); ++ atomic_set(&clp->cl_cb_set, 0); + warn_no_callback_path(clp, task->tk_status); ++ if (current_rpc_client != task->tk_client) { ++ /* queue a callback on the new connection: */ ++ nfsd4_cb_recall(dp); ++ return; ++ } + case -EBADHANDLE: + case -NFS4ERR_BAD_STATEID: + /* Race: client probably got cb_recall +@@ -677,7 +683,7 @@ static void nfsd4_cb_recall_done(struct + break; + default: + /* success, or error we can't handle */ +- goto done; ++ return; + } + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); +@@ -685,20 +691,16 @@ static void nfsd4_cb_recall_done(struct + rpc_restart_call(task); + return; + } else { +- atomic_set(&clp->cl_cb_conn.cb_set, 0); ++ atomic_set(&clp->cl_cb_set, 0); + warn_no_callback_path(clp, task->tk_status); + } +-done: +- kfree(task->tk_msg.rpc_argp); + } + + static void nfsd4_cb_recall_release(void *calldata) + { + struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; + + nfs4_put_delegation(dp); +- put_nfs4_client(clp); + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +@@ -707,33 +709,75 @@ static const struct rpc_call_ops nfsd4_c + .rpc_release = nfsd4_cb_recall_release, + }; + ++static struct workqueue_struct *callback_wq; ++ ++int nfsd4_create_callback_queue(void) ++{ ++ callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); ++ if (!callback_wq) ++ return -ENOMEM; ++ return 0; ++} ++ ++void nfsd4_destroy_callback_queue(void) ++{ ++ destroy_workqueue(callback_wq); ++} ++ ++/* must be called under the state lock */ ++void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new) ++{ ++ struct rpc_clnt *old = clp->cl_cb_client; ++ ++ clp->cl_cb_client = new; ++ /* ++ * After this, any work that saw the old value of cl_cb_client will ++ * be gone: ++ */ ++ flush_workqueue(callback_wq); ++ /* So we can safely shut it down: */ ++ if (old) ++ rpc_shutdown_client(old); ++} ++ + /* + * called with dp->dl_count inc'ed. 
+ */ +-void +-nfsd4_cb_recall(struct nfs4_delegation *dp) ++static void _nfsd4_cb_recall(struct nfs4_delegation *dp) + { + struct nfs4_client *clp = dp->dl_client; +- struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; +- struct nfs4_rpc_args *args; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args = &dp->dl_recall.cb_args; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], + .rpc_cred = callback_cred + }; +- int status = -ENOMEM; ++ int status; ++ ++ if (clnt == NULL) ++ return; /* Client is shutting down; give up. */ + +- args = kzalloc(sizeof(*args), GFP_KERNEL); +- if (!args) +- goto out; + args->args_op = dp; + msg.rpc_argp = args; + dp->dl_retries = 1; + status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, + &nfsd4_cb_recall_ops, dp); +-out: +- if (status) { +- kfree(args); +- put_nfs4_client(clp); ++ if (status) + nfs4_put_delegation(dp); +- } ++} ++ ++void nfsd4_do_callback_rpc(struct work_struct *w) ++{ ++ /* XXX: for now, just send off delegation recall. */ ++ /* In future, generalize to handle any sort of callback. */ ++ struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work); ++ struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall); ++ ++ _nfsd4_cb_recall(dp); ++} ++ ++ ++void nfsd4_cb_recall(struct nfs4_delegation *dp) ++{ ++ queue_work(callback_wq, &dp->dl_recall.cb_work); + } +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 09:57:18.237376763 -0400 +@@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[ + static const char *nfsd4_op_name(unsigned opnum); + + /* +- * Enforce NFSv4.1 COMPOUND ordering rules. 
++ * Enforce NFSv4.1 COMPOUND ordering rules: + * +- * TODO: +- * - enforce NFS4ERR_NOT_ONLY_OP, +- * - DESTROY_SESSION MUST be the final operation in the COMPOUND request. ++ * Also note, enforced elsewhere: ++ * - SEQUENCE other than as first op results in ++ * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) ++ * - BIND_CONN_TO_SESSION must be the only op in its compound ++ * (Will be enforced in nfsd4_bind_conn_to_session().) ++ * - DESTROY_SESSION must be the final operation in a compound, if ++ * sessionid's in SEQUENCE and DESTROY_SESSION are the same. ++ * (Enforced in nfsd4_destroy_session().) + */ +-static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args) ++static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) + { +- if (args->minorversion && args->opcnt > 0) { +- struct nfsd4_op *op = &args->ops[0]; +- return (op->status == nfserr_op_illegal) || +- (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP); +- } +- return true; ++ struct nfsd4_op *op = &args->ops[0]; ++ ++ /* These ordering requirements don't apply to NFSv4.0: */ ++ if (args->minorversion == 0) ++ return nfs_ok; ++ /* This is weird, but OK, not our problem: */ ++ if (args->opcnt == 0) ++ return nfs_ok; ++ if (op->status == nfserr_op_illegal) ++ return nfs_ok; ++ if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP)) ++ return nfserr_op_not_in_session; ++ if (op->opnum == OP_SEQUENCE) ++ return nfs_ok; ++ if (args->opcnt != 1) ++ return nfserr_not_only_op; ++ return nfs_ok; + } + + /* +@@ -1012,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqs + resp->rqstp = rqstp; + resp->cstate.minorversion = args->minorversion; + resp->cstate.replay_owner = NULL; ++ resp->cstate.session = NULL; + fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); + fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + /* Use the deferral mechanism only for NFSv4.0 compounds */ +@@ -1024,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqs + if (args->minorversion > 
nfsd_supported_minorversion) + goto out; + +- if (!nfs41_op_ordering_ok(args)) { ++ status = nfs41_check_op_ordering(args); ++ if (status) { + op = &args->ops[0]; +- op->status = nfserr_sequence_pos; ++ op->status = status; + goto encode_op; + } + +- status = nfs_ok; + while (!status && resp->opcnt < args->opcnt) { + op = &args->ops[resp->opcnt++]; + +@@ -1295,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[ + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_SEQUENCE", + }, ++ [OP_RECLAIM_COMPLETE] = { ++ .op_func = (nfsd4op_func)nfsd4_reclaim_complete, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_RECLAIM_COMPLETE", ++ }, + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 09:57:18.240356512 -0400 +@@ -45,8 +45,8 @@ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +-static time_t lease_time = 90; /* default lease time */ +-static time_t user_lease_time = 90; ++time_t nfsd4_lease = 90; /* default lease time */ ++time_t nfsd4_grace = 90; + static time_t boot_time; + static u32 current_ownerid = 1; + static u32 current_fileid = 1; +@@ -190,7 +190,7 @@ alloc_init_deleg(struct nfs4_client *clp + dp->dl_vfs_file = stp->st_vfs_file; + dp->dl_type = type; + dp->dl_ident = cb->cb_ident; +- dp->dl_stateid.si_boot = get_seconds(); ++ dp->dl_stateid.si_boot = boot_time; + dp->dl_stateid.si_stateownerid = current_delegid++; + dp->dl_stateid.si_fileid = 0; + dp->dl_stateid.si_generation = 0; +@@ -199,6 +199,7 @@ alloc_init_deleg(struct nfs4_client *clp + atomic_set(&dp->dl_count, 1); + list_add(&dp->dl_perfile, &fp->fi_delegations); + list_add(&dp->dl_perclnt, &clp->cl_delegations); ++ INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); + return dp; + } + +@@ -249,6 +250,9 @@ 
unhash_delegation(struct nfs4_delegation + * SETCLIENTID state + */ + ++/* client_lock protects the client lru list and session hash table */ ++static DEFINE_SPINLOCK(client_lock); ++ + /* Hash tables for nfs4_clientid state */ + #define CLIENT_HASH_BITS 4 + #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) +@@ -367,7 +371,6 @@ static void release_openowner(struct nfs + nfs4_put_stateowner(sop); + } + +-static DEFINE_SPINLOCK(sessionid_lock); + #define SESSION_HASH_SIZE 512 + static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; + +@@ -565,10 +568,10 @@ alloc_init_session(struct svc_rqst *rqst + + new->se_flags = cses->flags; + kref_init(&new->se_ref); +- spin_lock(&sessionid_lock); ++ spin_lock(&client_lock); + list_add(&new->se_hash, &sessionid_hashtbl[idx]); + list_add(&new->se_perclnt, &clp->cl_sessions); +- spin_unlock(&sessionid_lock); ++ spin_unlock(&client_lock); + + status = nfs_ok; + out: +@@ -579,7 +582,7 @@ out_free: + goto out; + } + +-/* caller must hold sessionid_lock */ ++/* caller must hold client_lock */ + static struct nfsd4_session * + find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) + { +@@ -602,7 +605,7 @@ find_in_sessionid_hashtbl(struct nfs4_se + return NULL; + } + +-/* caller must hold sessionid_lock */ ++/* caller must hold client_lock */ + static void + unhash_session(struct nfsd4_session *ses) + { +@@ -610,15 +613,6 @@ unhash_session(struct nfsd4_session *ses + list_del(&ses->se_perclnt); + } + +-static void +-release_session(struct nfsd4_session *ses) +-{ +- spin_lock(&sessionid_lock); +- unhash_session(ses); +- spin_unlock(&sessionid_lock); +- nfsd4_put_session(ses); +-} +- + void + free_session(struct kref *kref) + { +@@ -634,9 +628,18 @@ free_session(struct kref *kref) + kfree(ses); + } + ++/* must be called under the client_lock */ + static inline void +-renew_client(struct nfs4_client *clp) ++renew_client_locked(struct nfs4_client *clp) + { ++ if (is_client_expired(clp)) { ++ dprintk("%s: client (clientid 
%08x/%08x) already expired\n", ++ __func__, ++ clp->cl_clientid.cl_boot, ++ clp->cl_clientid.cl_id); ++ return; ++ } ++ + /* + * Move client to the end to the LRU list. + */ +@@ -647,6 +650,14 @@ renew_client(struct nfs4_client *clp) + clp->cl_time = get_seconds(); + } + ++static inline void ++renew_client(struct nfs4_client *clp) ++{ ++ spin_lock(&client_lock); ++ renew_client_locked(clp); ++ spin_unlock(&client_lock); ++} ++ + /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ + static int + STALE_CLIENTID(clientid_t *clid) +@@ -680,27 +691,9 @@ static struct nfs4_client *alloc_client( + return clp; + } + +-static void +-shutdown_callback_client(struct nfs4_client *clp) +-{ +- struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; +- +- if (clnt) { +- /* +- * Callback threads take a reference on the client, so there +- * should be no outstanding callbacks at this point. +- */ +- clp->cl_cb_conn.cb_client = NULL; +- rpc_shutdown_client(clnt); +- } +-} +- + static inline void + free_client(struct nfs4_client *clp) + { +- shutdown_callback_client(clp); +- if (clp->cl_cb_xprt) +- svc_xprt_put(clp->cl_cb_xprt); + if (clp->cl_cred.cr_group_info) + put_group_info(clp->cl_cred.cr_group_info); + kfree(clp->cl_principal); +@@ -709,10 +702,34 @@ free_client(struct nfs4_client *clp) + } + + void +-put_nfs4_client(struct nfs4_client *clp) ++release_session_client(struct nfsd4_session *session) + { +- if (atomic_dec_and_test(&clp->cl_count)) ++ struct nfs4_client *clp = session->se_client; ++ ++ if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock)) ++ return; ++ if (is_client_expired(clp)) { + free_client(clp); ++ session->se_client = NULL; ++ } else ++ renew_client_locked(clp); ++ spin_unlock(&client_lock); ++ nfsd4_put_session(session); ++} ++ ++/* must be called under the client_lock */ ++static inline void ++unhash_client_locked(struct nfs4_client *clp) ++{ ++ mark_client_expired(clp); ++ list_del(&clp->cl_lru); ++ while (!list_empty(&clp->cl_sessions)) { ++ 
struct nfsd4_session *ses; ++ ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, ++ se_perclnt); ++ unhash_session(ses); ++ nfsd4_put_session(ses); ++ } + } + + static void +@@ -722,9 +739,6 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + +- dprintk("NFSD: expire_client cl_count %d\n", +- atomic_read(&clp->cl_count)); +- + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -740,20 +754,20 @@ expire_client(struct nfs4_client *clp) + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } +- list_del(&clp->cl_idhash); +- list_del(&clp->cl_strhash); +- list_del(&clp->cl_lru); + while (!list_empty(&clp->cl_openowners)) { + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } +- while (!list_empty(&clp->cl_sessions)) { +- struct nfsd4_session *ses; +- ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, +- se_perclnt); +- release_session(ses); +- } +- put_nfs4_client(clp); ++ nfsd4_set_callback_client(clp, NULL); ++ if (clp->cl_cb_conn.cb_xprt) ++ svc_xprt_put(clp->cl_cb_conn.cb_xprt); ++ list_del(&clp->cl_idhash); ++ list_del(&clp->cl_strhash); ++ spin_lock(&client_lock); ++ unhash_client_locked(clp); ++ if (atomic_read(&clp->cl_refcount) == 0) ++ free_client(clp); ++ spin_unlock(&client_lock); + } + + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) +@@ -839,14 +853,15 @@ static struct nfs4_client *create_client + } + + memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); +- atomic_set(&clp->cl_count, 1); +- atomic_set(&clp->cl_cb_conn.cb_set, 0); ++ atomic_set(&clp->cl_refcount, 0); ++ atomic_set(&clp->cl_cb_set, 0); + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_sessions); + INIT_LIST_HEAD(&clp->cl_lru); ++ clp->cl_time = 
get_seconds(); + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); + copy_verf(clp, verf); +@@ -877,8 +892,7 @@ add_to_unconfirmed(struct nfs4_client *c + list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]); + idhashval = clientid_hashval(clp->cl_clientid.cl_id); + list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]); +- list_add_tail(&clp->cl_lru, &client_lru); +- clp->cl_time = get_seconds(); ++ renew_client(clp); + } + + static void +@@ -888,10 +902,9 @@ move_to_confirmed(struct nfs4_client *cl + unsigned int strhashval; + + dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); +- list_del_init(&clp->cl_strhash); + list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); + strhashval = clientstr_hashval(clp->cl_recdir); +- list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); ++ list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); + renew_client(clp); + } + +@@ -1327,15 +1340,9 @@ nfsd4_create_session(struct svc_rqst *rq + cs_slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + +- /* +- * We do not support RDMA or persistent sessions +- */ +- cr_ses->flags &= ~SESSION4_PERSIST; +- cr_ses->flags &= ~SESSION4_RDMA; +- + if (cr_ses->flags & SESSION4_BACK_CHAN) { +- unconf->cl_cb_xprt = rqstp->rq_xprt; +- svc_xprt_get(unconf->cl_cb_xprt); ++ unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; ++ svc_xprt_get(rqstp->rq_xprt); + rpc_copy_addr( + (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, + sa); +@@ -1344,7 +1351,7 @@ nfsd4_create_session(struct svc_rqst *rq + cstate->minorversion; + unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; + unconf->cl_cb_seq_nr = 1; +- nfsd4_probe_callback(unconf); ++ nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); + } + conf = unconf; + } else { +@@ -1352,6 +1359,12 @@ nfsd4_create_session(struct svc_rqst *rq + goto out; + } + ++ /* ++ * We do not support RDMA or persistent sessions ++ */ ++ cr_ses->flags &= ~SESSION4_PERSIST; ++ cr_ses->flags 
&= ~SESSION4_RDMA; ++ + status = alloc_init_session(rqstp, conf, cr_ses); + if (status) + goto out; +@@ -1369,6 +1382,21 @@ out: + return status; + } + ++static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) ++{ ++ struct nfsd4_compoundres *resp = rqstp->rq_resp; ++ struct nfsd4_compoundargs *argp = rqstp->rq_argp; ++ ++ return argp->opcnt == resp->opcnt; ++} ++ ++static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) ++{ ++ if (!session) ++ return 0; ++ return !memcmp(sid, &session->se_sessionid, sizeof(*sid)); ++} ++ + __be32 + nfsd4_destroy_session(struct svc_rqst *r, + struct nfsd4_compound_state *cstate, +@@ -1384,19 +1412,25 @@ nfsd4_destroy_session(struct svc_rqst *r + * - Do we need to clear any callback info from previous session? + */ + ++ if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { ++ if (!nfsd4_last_compound_op(r)) ++ return nfserr_not_only_op; ++ } + dump_sessionid(__func__, &sessionid->sessionid); +- spin_lock(&sessionid_lock); ++ spin_lock(&client_lock); + ses = find_in_sessionid_hashtbl(&sessionid->sessionid); + if (!ses) { +- spin_unlock(&sessionid_lock); ++ spin_unlock(&client_lock); + goto out; + } + + unhash_session(ses); +- spin_unlock(&sessionid_lock); ++ spin_unlock(&client_lock); + ++ nfs4_lock_state(); + /* wait for callbacks */ +- shutdown_callback_client(ses->se_client); ++ nfsd4_set_callback_client(ses->se_client, NULL); ++ nfs4_unlock_state(); + nfsd4_put_session(ses); + status = nfs_ok; + out: +@@ -1417,7 +1451,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, + if (resp->opcnt != 1) + return nfserr_sequence_pos; + +- spin_lock(&sessionid_lock); ++ spin_lock(&client_lock); + status = nfserr_badsession; + session = find_in_sessionid_hashtbl(&seq->sessionid); + if (!session) +@@ -1456,23 +1490,47 @@ nfsd4_sequence(struct svc_rqst *rqstp, + cstate->slot = slot; + cstate->session = session; + +- /* Hold a session reference until done processing the compound: +- * 
nfsd4_put_session called only if the cstate slot is set. +- */ +- nfsd4_get_session(session); + out: +- spin_unlock(&sessionid_lock); +- /* Renew the clientid on success and on replay */ ++ /* Hold a session reference until done processing the compound. */ + if (cstate->session) { +- nfs4_lock_state(); +- renew_client(session->se_client); +- nfs4_unlock_state(); ++ nfsd4_get_session(cstate->session); ++ atomic_inc(&session->se_client->cl_refcount); + } ++ spin_unlock(&client_lock); + dprintk("%s: return %d\n", __func__, ntohl(status)); + return status; + } + + __be32 ++nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) ++{ ++ if (rc->rca_one_fs) { ++ if (!cstate->current_fh.fh_dentry) ++ return nfserr_nofilehandle; ++ /* ++ * We don't take advantage of the rca_one_fs case. ++ * That's OK, it's optional, we can safely ignore it. ++ */ ++ return nfs_ok; ++ } ++ nfs4_lock_state(); ++ if (is_client_expired(cstate->session->se_client)) { ++ nfs4_unlock_state(); ++ /* ++ * The following error isn't really legal. ++ * But we only get here if the client just explicitly ++ * destroyed the client. Surely it no longer cares what ++ * error it gets back on an operation for the dead ++ * client. ++ */ ++ return nfserr_stale_clientid; ++ } ++ nfsd4_create_clid_dir(cstate->session->se_client); ++ nfs4_unlock_state(); ++ return nfs_ok; ++} ++ ++__be32 + nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_setclientid *setclid) + { +@@ -1631,9 +1689,8 @@ nfsd4_setclientid_confirm(struct svc_rqs + if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) + status = nfserr_clid_inuse; + else { +- /* XXX: We just turn off callbacks until we can handle +- * change request correctly. 
*/ +- atomic_set(&conf->cl_cb_conn.cb_set, 0); ++ atomic_set(&conf->cl_cb_set, 0); ++ nfsd4_probe_callback(conf, &unconf->cl_cb_conn); + expire_client(unconf); + status = nfs_ok; + +@@ -1667,7 +1724,7 @@ nfsd4_setclientid_confirm(struct svc_rqs + } + move_to_confirmed(unconf); + conf = unconf; +- nfsd4_probe_callback(conf); ++ nfsd4_probe_callback(conf, &conf->cl_cb_conn); + status = nfs_ok; + } + } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) +@@ -1700,12 +1757,12 @@ alloc_init_file(struct inode *ino) + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); +- spin_lock(&recall_lock); +- list_add(&fp->fi_hash, &file_hashtbl[hashval]); +- spin_unlock(&recall_lock); + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; ++ spin_lock(&recall_lock); ++ list_add(&fp->fi_hash, &file_hashtbl[hashval]); ++ spin_unlock(&recall_lock); + return fp; + } + return NULL; +@@ -1827,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, s + stp->st_stateowner = sop; + get_nfs4_file(fp); + stp->st_file = fp; +- stp->st_stateid.si_boot = get_seconds(); ++ stp->st_stateid.si_boot = boot_time; + stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stateid.si_generation = 0; +@@ -2028,7 +2085,6 @@ void nfsd_break_deleg_cb(struct file_loc + * lock) we know the server hasn't removed the lease yet, we know + * it's safe to take a reference: */ + atomic_inc(&dp->dl_count); +- atomic_inc(&dp->dl_client->cl_count); + + spin_lock(&recall_lock); + list_add_tail(&dp->dl_recall_lru, &del_recall_lru); +@@ -2347,7 +2403,7 @@ nfs4_open_delegation(struct svc_fh *fh, + { + struct nfs4_delegation *dp; + struct nfs4_stateowner *sop = stp->st_stateowner; +- struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn; ++ int cb_up = atomic_read(&sop->so_client->cl_cb_set); + struct file_lock fl, *flp = &fl; + int status, flag = 0; + +@@ -2355,7 +2411,7 @@ 
nfs4_open_delegation(struct svc_fh *fh, + open->op_recall = 0; + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_PREVIOUS: +- if (!atomic_read(&cb->cb_set)) ++ if (!cb_up) + open->op_recall = 1; + flag = open->op_delegate_type; + if (flag == NFS4_OPEN_DELEGATE_NONE) +@@ -2366,7 +2422,7 @@ nfs4_open_delegation(struct svc_fh *fh, + * had the chance to reclaim theirs.... */ + if (locks_in_grace()) + goto out; +- if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) ++ if (!cb_up || !sop->so_confirmed) + goto out; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + flag = NFS4_OPEN_DELEGATE_WRITE; +@@ -2483,10 +2539,8 @@ nfsd4_process_open2(struct svc_rqst *rqs + } + memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + +- if (nfsd4_has_session(&resp->cstate)) { ++ if (nfsd4_has_session(&resp->cstate)) + open->op_stateowner->so_confirmed = 1; +- nfsd4_create_clid_dir(open->op_stateowner->so_client); +- } + + /* + * Attempt to hand out a delegation. No error return, because the +@@ -2537,7 +2591,7 @@ nfsd4_renew(struct svc_rqst *rqstp, stru + renew_client(clp); + status = nfserr_cb_path_down; + if (!list_empty(&clp->cl_delegations) +- && !atomic_read(&clp->cl_cb_conn.cb_set)) ++ && !atomic_read(&clp->cl_cb_set)) + goto out; + status = nfs_ok; + out: +@@ -2554,6 +2608,12 @@ nfsd4_end_grace(void) + dprintk("NFSD: end of grace period\n"); + nfsd4_recdir_purge_old(); + locks_end_grace(&nfsd4_manager); ++ /* ++ * Now that every NFSv4 client has had the chance to recover and ++ * to see the (possibly new, possibly shorter) lease time, we ++ * can safely set the next grace time to the current lease time: ++ */ ++ nfsd4_grace = nfsd4_lease; + } + + static time_t +@@ -2563,15 +2623,17 @@ nfs4_laundromat(void) + struct nfs4_stateowner *sop; + struct nfs4_delegation *dp; + struct list_head *pos, *next, reaplist; +- time_t cutoff = get_seconds() - NFSD_LEASE_TIME; +- time_t t, clientid_val = NFSD_LEASE_TIME; +- time_t u, test_val = NFSD_LEASE_TIME; ++ time_t 
cutoff = get_seconds() - nfsd4_lease; ++ time_t t, clientid_val = nfsd4_lease; ++ time_t u, test_val = nfsd4_lease; + + nfs4_lock_state(); + + dprintk("NFSD: laundromat service - starting\n"); + if (locks_in_grace()) + nfsd4_end_grace(); ++ INIT_LIST_HEAD(&reaplist); ++ spin_lock(&client_lock); + list_for_each_safe(pos, next, &client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { +@@ -2580,12 +2642,22 @@ nfs4_laundromat(void) + clientid_val = t; + break; + } ++ if (atomic_read(&clp->cl_refcount)) { ++ dprintk("NFSD: client in use (clientid %08x)\n", ++ clp->cl_clientid.cl_id); ++ continue; ++ } ++ unhash_client_locked(clp); ++ list_add(&clp->cl_lru, &reaplist); ++ } ++ spin_unlock(&client_lock); ++ list_for_each_safe(pos, next, &reaplist) { ++ clp = list_entry(pos, struct nfs4_client, cl_lru); + dprintk("NFSD: purging unused client (clientid %08x)\n", + clp->cl_clientid.cl_id); + nfsd4_remove_clid_dir(clp); + expire_client(clp); + } +- INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + list_for_each_safe(pos, next, &del_recall_lru) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); +@@ -2605,7 +2677,7 @@ nfs4_laundromat(void) + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } +- test_val = NFSD_LEASE_TIME; ++ test_val = nfsd4_lease; + list_for_each_safe(pos, next, &close_lru) { + sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); + if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { +@@ -2661,39 +2733,11 @@ nfs4_check_fh(struct svc_fh *fhp, struct + static int + STALE_STATEID(stateid_t *stateid) + { +- if (time_after((unsigned long)boot_time, +- (unsigned long)stateid->si_boot)) { +- dprintk("NFSD: stale stateid " STATEID_FMT "!\n", +- STATEID_VAL(stateid)); +- return 1; +- } +- return 0; +-} +- +-static int +-EXPIRED_STATEID(stateid_t *stateid) +-{ +- if (time_before((unsigned long)boot_time, +- ((unsigned 
long)stateid->si_boot)) && +- time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) { +- dprintk("NFSD: expired stateid " STATEID_FMT "!\n", +- STATEID_VAL(stateid)); +- return 1; +- } +- return 0; +-} +- +-static __be32 +-stateid_error_map(stateid_t *stateid) +-{ +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; +- if (EXPIRED_STATEID(stateid)) +- return nfserr_expired; +- +- dprintk("NFSD: bad stateid " STATEID_FMT "!\n", ++ if (stateid->si_boot == boot_time) ++ return 0; ++ dprintk("NFSD: stale stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); +- return nfserr_bad_stateid; ++ return 1; + } + + static inline int +@@ -2817,10 +2861,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + status = nfserr_bad_stateid; + if (is_delegation_stateid(stateid)) { + dp = find_delegation_stateid(ino, stateid); +- if (!dp) { +- status = stateid_error_map(stateid); ++ if (!dp) + goto out; +- } + status = check_stateid_generation(stateid, &dp->dl_stateid, + flags); + if (status) +@@ -2833,10 +2875,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + *filpp = dp->dl_vfs_file; + } else { /* open or lock stateid */ + stp = find_stateid(stateid, flags); +- if (!stp) { +- status = stateid_error_map(stateid); ++ if (!stp) + goto out; +- } + if (nfs4_check_fh(current_fh, stp)) + goto out; + if (!stp->st_stateowner->so_confirmed) +@@ -2908,7 +2948,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + */ + sop = search_close_lru(stateid->si_stateownerid, flags); + if (sop == NULL) +- return stateid_error_map(stateid); ++ return nfserr_bad_stateid; + *sopp = sop; + goto check_replay; + } +@@ -3175,10 +3215,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (!is_delegation_stateid(stateid)) + goto out; + dp = find_delegation_stateid(inode, stateid); +- if (!dp) { +- status = stateid_error_map(stateid); ++ if (!dp) + goto out; +- } + status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + if (status) + goto out; +@@ -3404,7 +3442,7 @@ 
alloc_init_lock_stateid(struct nfs4_stat + stp->st_stateowner = sop; + get_nfs4_file(fp); + stp->st_file = fp; +- stp->st_stateid.si_boot = get_seconds(); ++ stp->st_stateid.si_boot = boot_time; + stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stateid.si_generation = 0; +@@ -3976,12 +4014,6 @@ nfsd4_load_reboot_recovery_data(void) + printk("NFSD: Failure reading reboot recovery data\n"); + } + +-unsigned long +-get_nfs4_grace_period(void) +-{ +- return max(user_lease_time, lease_time) * HZ; +-} +- + /* + * Since the lifetime of a delegation isn't limited to that of an open, a + * client may quite reasonably hang on to a delegation as long as it has +@@ -4008,20 +4040,27 @@ set_max_delegations(void) + static int + __nfs4_state_start(void) + { +- unsigned long grace_time; ++ int ret; + + boot_time = get_seconds(); +- grace_time = get_nfs4_grace_period(); +- lease_time = user_lease_time; + locks_start_grace(&nfsd4_manager); + printk(KERN_INFO "NFSD: starting %ld-second grace period\n", +- grace_time/HZ); ++ nfsd4_grace); ++ ret = set_callback_cred(); ++ if (ret) ++ return -ENOMEM; + laundry_wq = create_singlethread_workqueue("nfsd4"); + if (laundry_wq == NULL) + return -ENOMEM; +- queue_delayed_work(laundry_wq, &laundromat_work, grace_time); ++ ret = nfsd4_create_callback_queue(); ++ if (ret) ++ goto out_free_laundry; ++ queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ); + set_max_delegations(); +- return set_callback_cred(); ++ return 0; ++out_free_laundry: ++ destroy_workqueue(laundry_wq); ++ return ret; + } + + int +@@ -4039,12 +4078,6 @@ nfs4_state_start(void) + return 0; + } + +-time_t +-nfs4_lease_time(void) +-{ +- return lease_time; +-} +- + static void + __nfs4_state_shutdown(void) + { +@@ -4089,6 +4122,7 @@ nfs4_state_shutdown(void) + nfs4_lock_state(); + nfs4_release_reclaim(); + __nfs4_state_shutdown(); ++ nfsd4_destroy_callback_queue(); + nfs4_unlock_state(); + } + +@@ -4128,21 +4162,3 @@ 
nfs4_recoverydir(void) + { + return user_recovery_dirname; + } +- +-/* +- * Called when leasetime is changed. +- * +- * The only way the protocol gives us to handle on-the-fly lease changes is to +- * simulate a reboot. Instead of doing that, we just wait till the next time +- * we start to register any changes in lease time. If the administrator +- * really wants to change the lease time *now*, they can go ahead and bring +- * nfsd down and then back up again after changing the lease time. +- * +- * user_lease_time is protected by nfsd_mutex since it's only really accessed +- * when nfsd is starting +- */ +-void +-nfs4_reset_lease(time_t leasetime) +-{ +- user_lease_time = leasetime; +-} +diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 09:57:20.629370282 -0400 +@@ -46,6 +46,7 @@ enum { + */ + #ifdef CONFIG_NFSD_V4 + NFSD_Leasetime, ++ NFSD_Gracetime, + NFSD_RecoveryDir, + #endif + }; +@@ -70,6 +71,7 @@ static ssize_t write_ports(struct file * + static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); + #ifdef CONFIG_NFSD_V4 + static ssize_t write_leasetime(struct file *file, char *buf, size_t size); ++static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif + +@@ -91,6 +93,7 @@ static ssize_t (*write_op[])(struct file + [NFSD_MaxBlkSize] = write_maxblksize, + #ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = write_leasetime, ++ [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif + }; +@@ -1204,29 +1207,45 @@ static ssize_t write_maxblksize(struct f + } + + #ifdef CONFIG_NFSD_V4 +-extern time_t nfs4_leasetime(void); +- +-static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) ++static ssize_t __nfsd4_write_time(struct file 
*file, char *buf, size_t size, time_t *time) + { +- /* if size > 10 seconds, call +- * nfs4_reset_lease() then write out the new lease (seconds) as reply +- */ + char *mesg = buf; +- int rv, lease; ++ int rv, i; + + if (size > 0) { + if (nfsd_serv) + return -EBUSY; +- rv = get_int(&mesg, &lease); ++ rv = get_int(&mesg, &i); + if (rv) + return rv; +- if (lease < 10 || lease > 3600) ++ /* ++ * Some sanity checking. We don't have a reason for ++ * these particular numbers, but problems with the ++ * extremes are: ++ * - Too short: the briefest network outage may ++ * cause clients to lose all their locks. Also, ++ * the frequent polling may be wasteful. ++ * - Too long: do you really want reboot recovery ++ * to take more than an hour? Or to make other ++ * clients wait an hour before being able to ++ * revoke a dead client's locks? ++ */ ++ if (i < 10 || i > 3600) + return -EINVAL; +- nfs4_reset_lease(lease); ++ *time = i; + } + +- return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", +- nfs4_lease_time()); ++ return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time); ++} ++ ++static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __nfsd4_write_time(file, buf, size, time); ++ mutex_unlock(&nfsd_mutex); ++ return rv; + } + + /** +@@ -1252,12 +1271,22 @@ static ssize_t __write_leasetime(struct + */ + static ssize_t write_leasetime(struct file *file, char *buf, size_t size) + { +- ssize_t rv; ++ return nfsd4_write_time(file, buf, size, &nfsd4_lease); ++} + +- mutex_lock(&nfsd_mutex); +- rv = __write_leasetime(file, buf, size); +- mutex_unlock(&nfsd_mutex); +- return rv; ++/** ++ * write_gracetime - Set or report current NFSv4 grace period time ++ * ++ * As above, but sets the time of the NFSv4 grace period. ++ * ++ * Note this should never be set to less than the *previous* ++ * lease-period time, but we don't try to enforce this. 
(In the common ++ * case (a new boot), we don't know what the previous lease time was ++ * anyway.) ++ */ ++static ssize_t write_gracetime(struct file *file, char *buf, size_t size) ++{ ++ return nfsd4_write_time(file, buf, size, &nfsd4_grace); + } + + extern char *nfs4_recoverydir(void); +@@ -1351,6 +1380,7 @@ static int nfsd_fill_super(struct super_ + [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, + #ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, ++ [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + #endif + /* last one */ {""} +diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 09:57:20.629370282 -0400 +@@ -82,7 +82,6 @@ int nfs4_state_init(void); + void nfsd4_free_slabs(void); + int nfs4_state_start(void); + void nfs4_state_shutdown(void); +-time_t nfs4_lease_time(void); + void nfs4_reset_lease(time_t leasetime); + int nfs4_reset_recoverydir(char *recdir); + #else +@@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) + static inline void nfsd4_free_slabs(void) { } + static inline int nfs4_state_start(void) { return 0; } + static inline void nfs4_state_shutdown(void) { } +-static inline time_t nfs4_lease_time(void) { return 0; } + static inline void nfs4_reset_lease(time_t leasetime) { } + static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } + #endif +@@ -229,6 +227,9 @@ extern struct timeval nfssvc_boot; + + #ifdef CONFIG_NFSD_V4 + ++extern time_t nfsd4_lease; ++extern time_t nfsd4_grace; ++ + /* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. 
otherwise, + * we might process an operation with side effects, and be unable to +@@ -247,7 +248,6 @@ extern struct timeval nfssvc_boot; + #define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ + #define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ + +-#define NFSD_LEASE_TIME (nfs4_lease_time()) + #define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ + + /* +diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 09:57:21.807501619 -0400 +@@ -70,6 +70,16 @@ struct nfsd4_cb_sequence { + struct nfs4_client *cbs_clp; + }; + ++struct nfs4_rpc_args { ++ void *args_op; ++ struct nfsd4_cb_sequence args_seq; ++}; ++ ++struct nfsd4_callback { ++ struct nfs4_rpc_args cb_args; ++ struct work_struct cb_work; ++}; ++ + struct nfs4_delegation { + struct list_head dl_perfile; + struct list_head dl_perclnt; +@@ -86,6 +96,7 @@ struct nfs4_delegation { + stateid_t dl_stateid; + struct knfsd_fh dl_fh; + int dl_retries; ++ struct nfsd4_callback dl_recall; + }; + + /* client delegation callback info */ +@@ -96,9 +107,7 @@ struct nfs4_cb_conn { + u32 cb_prog; + u32 cb_minorversion; + u32 cb_ident; /* minorversion 0 only */ +- /* RPC client info */ +- atomic_t cb_set; /* successful CB_NULL call */ +- struct rpc_clnt * cb_client; ++ struct svc_xprt *cb_xprt; /* minorversion 1 only */ + }; + + /* Maximum number of slots per session. 
160 is useful for long haul TCP */ +@@ -157,7 +166,7 @@ struct nfsd4_session { + struct list_head se_hash; /* hash by sessionid */ + struct list_head se_perclnt; + u32 se_flags; +- struct nfs4_client *se_client; /* for expire_client */ ++ struct nfs4_client *se_client; + struct nfs4_sessionid se_sessionid; + struct nfsd4_channel_attrs se_fchannel; + struct nfsd4_channel_attrs se_bchannel; +@@ -212,25 +221,41 @@ struct nfs4_client { + struct svc_cred cl_cred; /* setclientid principal */ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ +- struct nfs4_cb_conn cl_cb_conn; /* callback info */ +- atomic_t cl_count; /* ref count */ + u32 cl_firststate; /* recovery dir creation */ + ++ /* for v4.0 and v4.1 callbacks: */ ++ struct nfs4_cb_conn cl_cb_conn; ++ struct rpc_clnt *cl_cb_client; ++ atomic_t cl_cb_set; ++ + /* for nfs41 */ + struct list_head cl_sessions; + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ + u32 cl_exchange_flags; + struct nfs4_sessionid cl_sessionid; ++ /* number of rpc's in progress over an associated session: */ ++ atomic_t cl_refcount; + + /* for nfs41 callbacks */ + /* We currently support a single back channel with a single slot */ + unsigned long cl_cb_slot_busy; + u32 cl_cb_seq_nr; +- struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */ + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ + }; + ++static inline void ++mark_client_expired(struct nfs4_client *clp) ++{ ++ clp->cl_time = 0; ++} ++ ++static inline bool ++is_client_expired(struct nfs4_client *clp) ++{ ++ return clp->cl_time == 0; ++} ++ + /* struct nfs4_client_reset + * one per old client. Populates reset_str_hashtbl. 
Filled from conf_id_hashtbl + * upon lease reset, or from upcall to state_daemon (to read in state +@@ -377,11 +402,14 @@ extern void nfs4_lock_state(void); + extern void nfs4_unlock_state(void); + extern int nfs4_in_grace(void); + extern __be32 nfs4_check_open_reclaim(clientid_t *clid); +-extern void put_nfs4_client(struct nfs4_client *clp); + extern void nfs4_free_stateowner(struct kref *kref); + extern int set_callback_cred(void); +-extern void nfsd4_probe_callback(struct nfs4_client *clp); ++extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); ++extern void nfsd4_do_callback_rpc(struct work_struct *); + extern void nfsd4_cb_recall(struct nfs4_delegation *dp); ++extern int nfsd4_create_callback_queue(void); ++extern void nfsd4_destroy_callback_queue(void); ++extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *); + extern void nfs4_put_delegation(struct nfs4_delegation *dp); + extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); + extern void nfsd4_init_recdir(char *recdir_name); +@@ -392,6 +420,7 @@ extern int nfs4_has_reclaimed_state(cons + extern void nfsd4_recdir_purge_old(void); + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); ++extern void release_session_client(struct nfsd4_session *); + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 09:57:23.994379831 -0400 +@@ -381,6 +381,10 @@ struct nfsd4_destroy_session { + struct nfs4_sessionid sessionid; + }; + ++struct nfsd4_reclaim_complete { ++ u32 rca_one_fs; ++}; ++ + struct nfsd4_op { + int opnum; + __be32 status; +@@ -421,6 +425,7 @@ struct nfsd4_op { + struct nfsd4_create_session create_session; + struct 
nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; ++ struct nfsd4_reclaim_complete reclaim_complete; + } u; + struct nfs4_replay * replay; + }; +@@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(stru + extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq); + extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, +- struct nfsd4_compound_state *, +-struct nfsd4_exchange_id *); +- extern __be32 nfsd4_create_session(struct svc_rqst *, ++ struct nfsd4_compound_state *, struct nfsd4_exchange_id *); ++extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_create_session *); + extern __be32 nfsd4_sequence(struct svc_rqst *, +@@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_ + extern __be32 nfsd4_destroy_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_destroy_session *); ++__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); + extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, + struct nfsd4_open *open); + extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 09:57:23.994379831 -0400 +@@ -40,12 +40,12 @@ struct nfs_fhbase_old { + * This is the new flexible, extensible style NFSv2/v3 file handle. + * by Neil Brown - March 2000 + * +- * The file handle is seens as a list of 4byte words. +- * The first word contains a version number (1) and four descriptor bytes ++ * The file handle starts with a sequence of four-byte words. ++ * The first word contains a version number (1) and three descriptor bytes + * that tell how the remaining 3 variable length fields should be handled. 
+ * These three bytes are auth_type, fsid_type and fileid_type. + * +- * All 4byte values are in host-byte-order. ++ * All four-byte values are in host-byte-order. + * + * The auth_type field specifies how the filehandle can be authenticated + * This might allow a file to be confirmed to be in a writable part of a +diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sunrpc/cache.c +--- linux-2.6.34.noarch/net/sunrpc/cache.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 09:57:23.995376793 -0400 +@@ -49,11 +49,17 @@ static void cache_init(struct cache_head + h->last_refresh = now; + } + ++static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h) ++{ ++ return (h->expiry_time < get_seconds()) || ++ (detail->flush_time > h->last_refresh); ++} ++ + struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, + struct cache_head *key, int hash) + { + struct cache_head **head, **hp; +- struct cache_head *new = NULL; ++ struct cache_head *new = NULL, *freeme = NULL; + + head = &detail->hash_table[hash]; + +@@ -62,6 +68,9 @@ struct cache_head *sunrpc_cache_lookup(s + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { ++ if (cache_is_expired(detail, tmp)) ++ /* This entry is expired, we will discard it. 
*/ ++ break; + cache_get(tmp); + read_unlock(&detail->hash_lock); + return tmp; +@@ -86,6 +95,13 @@ struct cache_head *sunrpc_cache_lookup(s + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { ++ if (cache_is_expired(detail, tmp)) { ++ *hp = tmp->next; ++ tmp->next = NULL; ++ detail->entries --; ++ freeme = tmp; ++ break; ++ } + cache_get(tmp); + write_unlock(&detail->hash_lock); + cache_put(new, detail); +@@ -98,6 +114,8 @@ struct cache_head *sunrpc_cache_lookup(s + cache_get(new); + write_unlock(&detail->hash_lock); + ++ if (freeme) ++ cache_put(freeme, detail); + return new; + } + EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); +@@ -183,10 +201,7 @@ static int cache_make_upcall(struct cach + + static inline int cache_is_valid(struct cache_detail *detail, struct cache_head *h) + { +- if (!test_bit(CACHE_VALID, &h->flags) || +- h->expiry_time < get_seconds()) +- return -EAGAIN; +- else if (detail->flush_time > h->last_refresh) ++ if (!test_bit(CACHE_VALID, &h->flags)) + return -EAGAIN; + else { + /* entry is valid */ +diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/sunrpc/svcsock.c +--- linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 09:57:23.997368707 -0400 +@@ -547,7 +547,6 @@ static int svc_udp_recvfrom(struct svc_r + dprintk("svc: recvfrom returned error %d\n", -err); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + } +- svc_xprt_received(&svsk->sk_xprt); + return -EAGAIN; + } + len = svc_addr_len(svc_addr(rqstp)); +@@ -562,11 +561,6 @@ static int svc_udp_recvfrom(struct svc_r + svsk->sk_sk->sk_stamp = skb->tstamp; + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ + +- /* +- * Maybe more packets - kick another thread ASAP. 
+- */ +- svc_xprt_received(&svsk->sk_xprt); +- + len = skb->len - sizeof(struct udphdr); + rqstp->rq_arg.len = len; + +@@ -917,7 +911,6 @@ static int svc_tcp_recv_record(struct sv + if (len < want) { + dprintk("svc: short recvfrom while reading record " + "length (%d of %d)\n", len, want); +- svc_xprt_received(&svsk->sk_xprt); + goto err_again; /* record header not complete */ + } + +@@ -953,7 +946,6 @@ static int svc_tcp_recv_record(struct sv + if (len < svsk->sk_reclen) { + dprintk("svc: incomplete TCP record (%d of %d)\n", + len, svsk->sk_reclen); +- svc_xprt_received(&svsk->sk_xprt); + goto err_again; /* record not complete */ + } + len = svsk->sk_reclen; +@@ -961,10 +953,8 @@ static int svc_tcp_recv_record(struct sv + + return len; + error: +- if (len == -EAGAIN) { ++ if (len == -EAGAIN) + dprintk("RPC: TCP recv_record got EAGAIN\n"); +- svc_xprt_received(&svsk->sk_xprt); +- } + return len; + err_delete: + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); +@@ -1110,7 +1100,6 @@ out: + svsk->sk_tcplen = 0; + + svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt); +- svc_xprt_received(&svsk->sk_xprt); + if (serv->sv_stats) + serv->sv_stats->nettcpcnt++; + +@@ -1119,7 +1108,6 @@ out: + err_again: + if (len == -EAGAIN) { + dprintk("RPC: TCP recvfrom got EAGAIN\n"); +- svc_xprt_received(&svsk->sk_xprt); + return len; + } + error: +diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/sunrpc/svc_xprt.c +--- linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 09:57:23.996377209 -0400 +@@ -744,8 +744,10 @@ int svc_recv(struct svc_rqst *rqstp, lon + if (rqstp->rq_deferred) { + svc_xprt_received(xprt); + len = svc_deferred_recv(rqstp); +- } else ++ } else { + len = xprt->xpt_ops->xpo_recvfrom(rqstp); ++ svc_xprt_received(xprt); ++ } + dprintk("svc: got len=%d\n", len); + } + +@@ -893,12 +895,12 @@ void svc_delete_xprt(struct svc_xprt *xp + */ + if (test_bit(XPT_TEMP, 
&xprt->xpt_flags)) + serv->sv_tmpcnt--; ++ spin_unlock_bh(&serv->sv_lock); + + while ((dr = svc_deferred_dequeue(xprt)) != NULL) + kfree(dr); + + svc_xprt_put(xprt); +- spin_unlock_bh(&serv->sv_lock); + } + + void svc_close_xprt(struct svc_xprt *xprt) +diff -up linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +--- linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 09:57:23.998377481 -0400 +@@ -566,7 +566,6 @@ static int rdma_read_complete(struct svc + ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, + rqstp->rq_arg.head[0].iov_len); + +- svc_xprt_received(rqstp->rq_xprt); + return ret; + } + +@@ -665,7 +664,6 @@ int svc_rdma_recvfrom(struct svc_rqst *r + rqstp->rq_arg.head[0].iov_len); + rqstp->rq_prot = IPPROTO_MAX; + svc_xprt_copy_addrs(rqstp, xprt); +- svc_xprt_received(xprt); + return ret; + + close_out: +@@ -678,6 +676,5 @@ int svc_rdma_recvfrom(struct svc_rqst *r + */ + set_bit(XPT_CLOSE, &xprt->xpt_flags); + defer: +- svc_xprt_received(xprt); + return 0; + } diff --git a/pnfs-all-2.6.35-2010-08-19-f13.patch b/pnfs-all-2.6.35-2010-08-19-f13.patch new file mode 100644 index 000000000..a9d78ba0e --- /dev/null +++ b/pnfs-all-2.6.35-2010-08-19-f13.patch @@ -0,0 +1,31788 @@ +diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-23 12:08:27.310584826 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-23 12:09:03.273553977 -0400 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include "init.h" + #include "kern_constants.h" + #include "os.h" +diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c +--- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ 
linux-2.6.34.noarch/block/genhd.c 2010-08-23 12:09:03.273553977 -0400 +@@ -1009,6 +1009,7 @@ static void disk_release(struct device * + struct class block_class = { + .name = "block", + }; ++EXPORT_SYMBOL(block_class); + + static char *block_devnode(struct device *dev, mode_t *mode) + { +diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-23 12:09:03.274563927 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-23 12:09:03.274563927 -0400 +@@ -0,0 +1,211 @@ ++(c) 2007 Network Appliance Inc. ++ ++spNFS ++----- ++ ++An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS). ++ ++A file system is mounted by the clients from the MDS, and all file data ++is striped across the DSs. ++ ++Identify the machines that will be filling each of these roles. ++ ++The spnfs kernel will be installed on all machines: clients, the MDS and DSs. ++ ++ ++Building and installing the spNFS kernel ++---------------------------------------- ++ ++Get the spNFS kernel from: ++ ++ git://linux-nfs.org/~bhalevy/linux-pnfs.git ++ ++Use the pnfs-all-latest branch and add these options to your .config file ++ ++ CONFIG_NETWORK_FILESYSTEMS=y ++ CONFIG_NFS_FS=m ++ CONFIG_NFS_V4=y ++ CONFIG_NFS_V4_1=y ++ CONFIG_PNFS=y ++ CONFIG_NFSD=m ++ CONFIG_PNFSD=y ++ # CONFIG_PNFSD_LOCAL_EXPORT is not set ++ CONFIG_SPNFS=y ++ ++By default, spNFS uses whole-file layouts. Layout segments can be enabled ++by adding: ++ ++ CONFIG_SPNFS_LAYOUTSEGMENTS=y ++ ++to your .config file. ++ ++Building and installation of kernel+modules is as usual. ++This kernel should be installed and booted on the client, MDS and DSs. ++ ++Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it ++takes over the pnfs export interface. 
++ ++Building nfs-utils ++------------------ ++ ++Get the nfs-utils package containing spnfsd from: ++ ++ git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git ++ ++Follow the standard instructions for building nfs-utils. ++ ++After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd ++daemon will only be needed on the MDS. ++ ++ ++Installation ++------------ ++ ++The nfs-utils package contains a default spnfsd.conf file in ++utils/spnfsd/spnfsd.conf. Copy this file to /etc/spnfsd.conf. ++ ++By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under ++this directory, mount points must be created for each DS to ++be used for pNFS data stripes. These mount points are named by the IP address ++of the corresponding DS. In the sample spnfsd.conf, there are two ++DSs defined (172.16.28.134 and 172.16.28.141). ++ ++Following the sample spnfsd.conf, ++ ++ mkdir /spnfs ++ ++on the MDS (corresponding to DS-Mount-Directory). Then ++ ++ mkdir /spnfs/172.16.28.134 ++ mkdir /spnfs/172.16.28.141 ++ ++to create the mount points for the DSs. ++ ++On the DSs, choose a directory where data stripes will be created by the MDS. ++For the sample file, this directory is /pnfs, so on each DS execute: ++ ++ mkdir /pnfs ++ ++This directory is specified in the spnfsd.conf file by the DS*_ROOT option ++(where * is replaced by the DS number). DS_ROOT is specified relative to ++the directory being exported by the DSs. In our example, our DSs are exporting ++the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have ++the following entry in /etc/exports: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check) ++ ++N.B. If we had created a /exports directory and a /pnfs directory under ++/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs ++(not /exports/pnfs). ++ ++It may be useful to add entries to /etc/fstab on the MDS to automatically ++mount the DS_ROOT file systems.
For this example, our MDS fstab would ++contain: ++ ++ 172.16.28.134:/pnfs /spnfs/172.16.28.134 nfs defaults 1 2 ++ 172.16.28.141:/pnfs /spnfs/172.16.28.141 nfs defaults 1 2 ++ ++The DS mounts must be performed manually or via fstab at this time (automatic ++mounting, directory creation, etc. are on the todo list). To perform I/O ++through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction ++will eventually be removed). ++ ++ ++On the MDS, choose a file system to use with spNFS and export it, e.g.: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs) ++ ++Make sure nfsd and all supporting processes are running on the MDS and DSs. ++ ++ ++Running ++------- ++ ++If rpc_pipefs is not already mounted (if you're running idmapd it probably is), ++you may want to add the following line to /etc/fstab: ++ ++ rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0 ++ ++to automatically mount rpc_pipefs. ++ ++With spnfsd.conf configured for your environment and the mounts mounted as ++described above, spnfsd can now be started. ++ ++On the MDS, execute spnfsd: ++ ++ spnfsd ++ ++The executable is located in the directory where it was built, and ++may also have been installed elsewhere depending on how you built nfs-utils. ++It will run in the foreground by default, and in fact will do so despite ++any options suggesting the contrary (it's still a debugging build). ++ ++On the client, make sure the nfslayoutdriver module is loaded: ++ ++ modprobe nfslayoutdriver ++ ++Then mount the file system from the MDS: ++ ++ mount -t nfs4 -o minorversion=1 mds:/ /mnt ++ ++I/O through the MDS is now supported. To use it, do not load the ++nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1 ++(NFSv2 and v3 are not yet supported). ++ ++You may now use spNFS by performing file system activities in /mnt. ++If you create files in /mnt, you should see stripe files corresponding to ++new files being created on the DSs.
The current implementation names the ++stripe files based on the inode number of the file on the MDS. For example, ++if you create a file foo in /mnt and do an 'ls -li /mnt/foo': ++ ++ # ls -li foo ++ 1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo ++ ++You should see stripe files on each DS under /pnfs (per the sample) named ++1233. The file /pnfs/1233 on DS1 will contain the first stripe-size bytes ++of data written to foo, DS2 will contain the next stripe-size bytes, etc. ++Removing /mnt/foo will remove the corresponding stripe files on the DSs. ++Other file system operations should behave (mostly :-) as expected. ++ ++ ++Layout Segments ++--------------- ++ ++If the kernel is compiled to support layout segments, there will ++be two files created under /proc/fs/spnfs for controlling layout ++segment functionality. ++ ++To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.: ++ ++ echo 1 > /proc/fs/spnfs/layoutseg ++ ++Layout segments can be disabled (returning to whole-file layouts) by ++writing a '0' to /proc/fs/spnfs/layoutseg: ++ ++ echo 0 > /proc/fs/spnfs/layoutseg ++ ++When layout segments are enabled, the size of the layouts returned can ++be specified by writing a decimal number (ASCII representation) to ++/proc/fs/spnfs/layoutsegsize: ++ ++ echo 1024 > /proc/fs/spnfs/layoutsegsize ++ ++The value '0' has a special meaning--it causes the server to return a ++layout that is exactly the size requested by the client: ++ ++ echo 0 > /proc/fs/spnfs/layoutsegsize ++ ++ ++Troubleshooting ++--------------- ++ ++If you see data being written to the files on the MDS rather than ++the stripe files, make sure the nfslayoutdriver is loaded on the client ++(see above). ++ ++If you get a "permission denied" error, make sure mountd is running on the MDS ++(it occasionally fails to start).
++ ++Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com ++ ++ +diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c +--- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-23 12:09:03.275584050 -0400 +@@ -651,6 +651,12 @@ static int dev_create(struct dm_ioctl *p + return r; + } + ++int dm_dev_create(struct dm_ioctl *param) ++{ ++ return dev_create(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_create); ++ + /* + * Always use UUID for lookups if it's present, otherwise use name or dev. + */ +@@ -745,6 +751,12 @@ static int dev_remove(struct dm_ioctl *p + return 0; + } + ++int dm_dev_remove(struct dm_ioctl *param) ++{ ++ return dev_remove(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_remove); ++ + /* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. +@@ -917,6 +929,12 @@ static int do_resume(struct dm_ioctl *pa + return r; + } + ++int dm_do_resume(struct dm_ioctl *param) ++{ ++ return do_resume(param); ++} ++EXPORT_SYMBOL(dm_do_resume); ++ + /* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. 
+@@ -1194,6 +1212,12 @@ out: + return r; + } + ++int dm_table_load(struct dm_ioctl *param, size_t param_size) ++{ ++ return table_load(param, param_size); ++} ++EXPORT_SYMBOL(dm_table_load); ++ + static int table_clear(struct dm_ioctl *param, size_t param_size) + { + int r; +diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c +--- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-23 12:09:03.276563906 -0400 +@@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct + put_device(&class_to_shost(dev)->shost_gendev); + } + +-static struct class shost_class = { ++struct class shost_class = { + .name = "scsi_host", + .dev_release = scsi_host_cls_release, + }; +diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h +--- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-23 12:09:03.277563890 -0400 +@@ -36,13 +36,9 @@ + #include + #include + #include ++#include + #include "common.h" + +-/* FIXME: Remove once pnfs hits mainline +- * #include +- */ +-#include "pnfs.h" +- + #define EXOFS_ERR(fmt, a...) 
printk(KERN_ERR "exofs: " fmt, ##a) + + #ifdef CONFIG_EXOFS_DEBUG +@@ -103,6 +99,7 @@ struct exofs_sb_info { + struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ ++ spinlock_t i_layout_lock; /* lock for layout/return/recall */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ +@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si + */ + #define OBJ_2BCREATED 0 /* object will be created soon*/ + #define OBJ_CREATED 1 /* object has been created on the osd*/ ++/* Below are not used atomic but reuse the same i_flags */ ++#define OBJ_LAYOUT_IS_GIVEN 2 /* inode has given layouts to clients*/ ++#define OBJ_IN_LAYOUT_RECALL 3 /* inode is in the middle of a layout recall*/ + + static inline int obj_2bcreated(struct exofs_i_info *oi) + { +@@ -304,4 +304,20 @@ extern const struct inode_operations exo + extern const struct inode_operations exofs_symlink_inode_operations; + extern const struct inode_operations exofs_fast_symlink_inode_operations; + ++/* export.c */ ++typedef int (exofs_recall_fn)(struct inode *inode); ++#ifdef CONFIG_PNFSD ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo); ++void exofs_init_export(struct super_block *sb); ++#else ++static inline int exofs_inode_recall_layout(struct inode *inode, ++ enum pnfs_iomode iomode, exofs_recall_fn todo) ++{ ++ return todo(inode); ++} ++ ++static inline void exofs_init_export(struct super_block *sb) {} ++#endif ++ + #endif +diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-23 12:09:03.278386746 -0400 ++++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-23 12:09:03.278386746 -0400 +@@ -0,0 +1,396 @@ ++/* ++ * export.c - Implementation of the 
pnfs_export_operations ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Boaz Harrosh ++ * ++ * This file is part of exofs. ++ * ++ * exofs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation. Since it is based on ext2, and the only ++ * valid version of GPL for the Linux kernel is version 2, the only valid ++ * version of GPL for exofs is version 2. ++ * ++ * exofs is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with exofs; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include "exofs.h" ++ ++static int exofs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_OSD2_OBJECTS; ++} ++ ++static void set_dev_id(struct pnfs_deviceid *pnfs_devid, u64 sbid, u64 devid) ++{ ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)pnfs_devid; ++ ++ dev_id->sbid = sbid; ++ dev_id->devid = devid; ++} ++ ++static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode, ++ u64 offset, u64 length, void *cookie) ++{ ++ struct nfsd4_pnfs_cb_layout cbl; ++ struct pnfsd_cb_ctl cb_ctl; ++ int status; ++ ++ memset(&cb_ctl, 0, sizeof(cb_ctl)); ++ status = pnfsd_get_cb_op(&cb_ctl); ++ if (unlikely(status)) { ++ EXOFS_ERR("%s: nfsd unloaded!! 
inode (0x%lx) status=%d\n", ++ __func__, inode->i_ino, status); ++ goto err; ++ } ++ ++ memset(&cbl, 0, sizeof(cbl)); ++ cbl.cbl_recall_type = RETURN_FILE; ++ cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS; ++ cbl.cbl_seg.iomode = iomode; ++ cbl.cbl_seg.offset = offset; ++ cbl.cbl_seg.length = length; ++ cbl.cbl_cookie = cookie; ++ ++ status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl); ++ pnfsd_put_cb_op(&cb_ctl); ++ ++err: ++ return status; ++} ++ ++static enum nfsstat4 exofs_layout_get( ++ struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; ++ struct exofs_layout *el = &sbi->layout; ++ struct pnfs_osd_object_cred *creds = NULL; ++ struct pnfs_osd_layout layout; ++ __be32 *start; ++ bool in_recall; ++ int i, err; ++ enum nfsstat4 nfserr; ++ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ res->lg_seg.iomode = IOMODE_RW; ++ res->lg_return_on_close = true; /* TODO: unused but will be soon */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL); ++ if (!creds) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto out; ++ } ++ ++ /* Fill in a pnfs_osd_layout struct */ ++ layout.olo_map = sbi->data_map; ++ ++ for (i = 0; i < el->s_numdevs; i++) { ++ struct pnfs_osd_object_cred *cred = &creds[i]; ++ osd_id id = exofs_oi_objno(oi); ++ unsigned dev = exofs_layout_od_id(el, id, i); ++ ++ set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid, ++ dev); ++ cred->oc_object_id.oid_partition_id = el->s_pid; ++ cred->oc_object_id.oid_object_id = id; ++ cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ? 
++ PNFS_OSD_VERSION_1 : ++ PNFS_OSD_VERSION_2; ++ cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE; ++ ++ cred->oc_cap_key.cred_len = 0; ++ cred->oc_cap_key.cred = NULL; ++ ++ cred->oc_cap.cred_len = OSD_CAP_LEN; ++ cred->oc_cap.cred = oi->i_cred; ++ } ++ ++ layout.olo_comps_index = 0; ++ layout.olo_num_comps = el->s_numdevs; ++ layout.olo_comps = creds; ++ ++ err = pnfs_osd_xdr_encode_layout(xdr, &layout); ++ if (err) { ++ nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */ ++ goto out; ++ } ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ if (!in_recall) { ++ __set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ nfserr = NFS4_OK; ++ } else { ++ nfserr = NFS4ERR_RECALLCONFLICT; ++ } ++ spin_unlock(&oi->i_layout_lock); ++ ++out: ++ kfree(creds); ++ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n", ++ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start)); ++ return nfserr; ++} ++ ++/* NOTE: inode mutex must NOT be held */ ++static int exofs_layout_commit( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct timespec mtime; ++ loff_t i_size; ++ int in_recall; ++ ++ /* In case of a recall we ignore the new size and mtime since they ++ * are going to be changed again by truncate, and since we cannot take ++ * the inode lock in that case. ++ */ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ if (in_recall) { ++ EXOFS_DBGMSG("(0x%lx) commit was called during recall\n", ++ inode->i_ino); ++ return 0; ++ } ++ ++ /* NOTE: I would love to call inode_setattr here ++ * but i cannot since this will cause an eventual vmtruncate, ++ * which will cause a layout_recall. So open code the i_size ++ * and mtime/atime changes under i_mutex. 
++ */ ++ mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL); ++ ++ if (args->lc_mtime.seconds) { ++ mtime.tv_sec = args->lc_mtime.seconds; ++ mtime.tv_nsec = args->lc_mtime.nseconds; ++ ++ /* layout commit may only make time bigger, since there might ++ * be reordering of the notifications and it might arrive after ++ * A local change. ++ * TODO: if mtime > ctime then we know set_attr did an mtime ++ * in the future. and we can let this update through ++ */ ++ if (0 <= timespec_compare(&mtime, &inode->i_mtime)) ++ mtime = inode->i_mtime; ++ } else { ++ mtime = current_fs_time(inode->i_sb); ++ } ++ ++ /* TODO: Will below work? since mark_inode_dirty has it's own ++ * Time handling ++ */ ++ inode->i_atime = inode->i_mtime = mtime; ++ ++ i_size = i_size_read(inode); ++ if (args->lc_newoffset) { ++ loff_t new_size = args->lc_last_wr + 1; ++ ++ if (i_size < new_size) { ++ i_size_write(inode, i_size = new_size); ++ res->lc_size_chg = 1; ++ res->lc_newsize = new_size; ++ } ++ } ++ /* TODO: else { i_size = osd_get_object_length() } */ ++ ++/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */ ++ ++ mark_inode_dirty_sync(inode); ++ ++ mutex_unlock(&inode->i_mutex); ++ EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n", ++ inode->i_ino, i_size, args->lc_last_wr); ++ return 0; ++} ++ ++static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr) ++{ ++ EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ ioerr->oer_errno, ioerr->oer_iswrite, ++ _LLU(ioerr->oer_component.oid_object_id), ++ _LLU(ioerr->oer_comp_offset), ++ _LLU(ioerr->oer_comp_length)); ++} ++ ++static int exofs_layout_return( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ __be32 *p = args->lrf_body; ++ unsigned len = exp_xdr_qwords(args->lrf_body_len); ++ ++ EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n", ++ inode->i_ino, args->lr_cookie, len); ++ ++ while (len >= pnfs_osd_ioerr_xdr_sz()) { ++ 
struct pnfs_osd_ioerr ioerr; ++ ++ p = pnfs_osd_xdr_decode_ioerr(&ioerr, p); ++ len -= pnfs_osd_ioerr_xdr_sz(); ++ exofs_handle_error(&ioerr); ++ } ++ ++ if (args->lr_cookie) { ++ struct exofs_i_info *oi = exofs_i(inode); ++ bool in_recall; ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ __clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ /* TODO: how to communicate cookie with the waiter */ ++ if (in_recall) ++ wake_up(&oi->i_wq); /* wakeup any recalls */ ++ } ++ ++ return 0; ++} ++ ++int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct exofs_sb_info *sbi = sb->s_fs_info; ++ struct pnfs_osd_deviceaddr devaddr; ++ const struct osd_dev_info *odi; ++ u64 devno = devid->devid; ++ __be32 *start; ++ int err; ++ ++ memset(&devaddr, 0, sizeof(devaddr)); ++ ++ if (unlikely(devno >= sbi->layout.s_numdevs)) ++ return -ENODEV; ++ ++ odi = osduld_device_info(sbi->layout.s_ods[devno]); ++ ++ devaddr.oda_systemid.len = odi->systemid_len; ++ devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */ ++ ++ devaddr.oda_osdname.len = odi->osdname_len ; ++ devaddr.oda_osdname.data = (void *)odi->osdname;/* !const cast */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ err = -E2BIG; ++ goto err; ++ } ++ ++ err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr); ++ if (err) ++ goto err; ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n", ++ exp_xdr_qbytes(xdr->p - start), devno, odi->osdname); ++ return 0; ++ ++err: ++ EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n", ++ err, exp_xdr_qbytes(xdr->p - start)); ++ return err; ++} ++ ++struct pnfs_export_operations exofs_pnfs_ops = { ++ .layout_type = exofs_layout_type, ++ .layout_get = exofs_layout_get, ++ .layout_commit = 
exofs_layout_commit, ++ .layout_return = exofs_layout_return, ++ .get_device_info = exofs_get_device_info, ++}; ++ ++static bool is_layout_returned(struct exofs_i_info *oi) ++{ ++ bool layout_given; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ return !layout_given; /* true once OBJ_LAYOUT_IS_GIVEN is no longer set */ ++} ++ ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ int layout_given; ++ int error = 0; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ __set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); /* stays set until the err: path below clears it */ ++ spin_unlock(&oi->i_layout_lock); ++ ++ if (!layout_given) ++ goto exec; /* no layout was given out: run todo() without issuing a recall */ ++ ++ for (;;) { ++ EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n", ++ inode->i_ino); ++ error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64, ++ &oi->i_wq); ++ switch (error) { ++ case 0: ++ case -EAGAIN: ++ break; /* 0 or -EAGAIN: go on to the wait below, then loop and recall again */ ++ case -ENOENT: ++ goto exec; /* -ENOENT is the only loop exit that still runs todo() */ ++ default: ++ goto err; /* any other error is returned without calling todo() */ ++ } ++ ++ error = wait_event_interruptible(oi->i_wq, ++ is_layout_returned(oi)); ++ if (error) ++ goto err; /* non-zero wait result is returned as-is, todo() is skipped */ ++ } ++ ++exec: ++ error = todo(inode); /* the return value of todo() becomes the function result */ ++ ++err: /* reached on success and on failure: the recall flag is always cleared */ ++ spin_lock(&oi->i_layout_lock); ++ __clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error); ++ return error; ++} ++ ++void exofs_init_export(struct super_block *sb) ++{ ++ sb->s_pnfs_op = &exofs_pnfs_ops; ++} +diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c +--- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-23 12:09:03.279502002 -0400 +@@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) + if (unlikely(wait_obj_created(oi))) + goto fail; + +- ret = _do_truncate(inode); ++ ret = exofs_inode_recall_layout(inode, IOMODE_ANY, _do_truncate); + if
(ret) + goto fail; + +@@ -964,6 +964,7 @@ static void __oi_init(struct exofs_i_inf + { + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; ++ spin_lock_init(&oi->i_layout_lock); + } + /* + * Fill in an inode read from the OSD and set it up for use +diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild +--- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-23 12:09:03.279502002 -0400 +@@ -13,4 +13,5 @@ + # + + exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o ++exofs-$(CONFIG_PNFSD) += export.o + obj-$(CONFIG_EXOFS_FS) += exofs.o +diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig +--- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-23 12:09:03.280553663 -0400 +@@ -1,6 +1,7 @@ + config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD ++ select EXPORTFS_OSD_LAYOUT if PNFSD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. 
+diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c +--- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-23 12:09:03.281511951 -0400 +@@ -621,6 +621,7 @@ static int exofs_fill_super(struct super + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; ++ exofs_init_export(sb); + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); +diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c +--- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-23 12:09:03.282511528 -0400 +@@ -16,6 +16,13 @@ + #include + #include + ++#if defined(CONFIG_PNFSD) ++struct pnfsd_cb_ctl pnfsd_cb_ctl = { ++ .lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock) ++}; ++EXPORT_SYMBOL(pnfsd_cb_ctl); ++#endif /* CONFIG_PNFSD */ ++ + #define dprintk(fmt, args...) 
do{}while(0) + + +diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile +--- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-23 12:09:03.282511528 -0400 +@@ -3,4 +3,7 @@ + + obj-$(CONFIG_EXPORTFS) += exportfs.o + +-exportfs-objs := expfs.o ++exportfs-y := expfs.o ++exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o ++exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o ++exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +@@ -0,0 +1,158 @@ ++/* ++ * linux/fs/nfsd/nfs4blocklayoutxdr.c ++ * ++ * ++ * Created by Rick McNeal on 3/31/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. 
++ * ++ */ ++#include ++#include ++#include ++#include ++ ++static int ++bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 12 + 4 + bld->u.simple.bld_sig_len); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u32(p, 1); ++ p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset); ++ exp_xdr_encode_opaque(p, bld->u.simple.bld_sig, ++ bld->u.simple.bld_sig_len); ++ ++ return 0; ++} ++ ++static int ++bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_start); ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_len); ++ exp_xdr_encode_u32(p, bld->u.slice.bld_index); ++ ++ return 0; ++} ++ ++static int ++bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ return -ENOTSUPP; ++} ++ ++static int ++bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ int i; ++ __be32 *p = exp_xdr_reserve_qwords(xdr, ++ 2 + 1 + bld->u.stripe.bld_stripes); /* 32-bit words: u64 chunk size, u32 count, one u32 per stripe index */ ++ if (!p) return -ETOOSMALL; /* same failure code as the sibling encoders */ ++ p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size); ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes); ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]); ++ ++ return 0; ++} ++ ++int ++blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes) ++{ ++ u32 num_vols = 0, ++ *layoutlen_p = xdr->p; ++ pnfs_blocklayout_devinfo_t *bld; ++ int status = 0; ++ __be32 *p; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -ETOOSMALL; ++ p += 2; ++ ++ /* ++ * All simple volumes with their signature are required to be listed ++ * first.
++ */ ++ list_for_each_entry(bld, volumes, bld_list) { ++ num_vols++; ++ p = exp_xdr_reserve_qwords(xdr, 1); ++ if (!p) ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u32(p, bld->bld_type); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ status = bl_encode_simple(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_SLICE: ++ status = bl_encode_slice(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ status = bl_encode_concat(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ status = bl_encode_stripe(xdr, bld); ++ break; ++ default: ++ BUG(); ++ } ++ if (status) ++ goto error; ++ } ++ ++ /* ---- Fill in the overall length and number of volumes ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, num_vols); ++ ++error: ++ return status; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo); ++ ++enum nfsstat4 ++blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *bl_head) ++{ ++ struct pnfs_blocklayout_layout *b; ++ u32 *layoutlen_p = xdr->p, ++ extents = 0; ++ __be32 *p; ++ ++ /* ++ * Save spot for opaque block layout length and number of extents, ++ * fill-in later. 
++ */ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p += 2; ++ ++ list_for_each_entry(b, bl_head, bll_list) { ++ extents++; ++ p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid); ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.devid); ++ p = exp_xdr_encode_u64(p, b->bll_foff); ++ p = exp_xdr_encode_u64(p, b->bll_len); ++ p = exp_xdr_encode_u64(p, b->bll_soff); ++ p = exp_xdr_encode_u32(p, b->bll_es); ++ } ++ ++ /* ---- Fill in the overall length and number of extents ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, extents); ++ ++ return NFS4_OK; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_layout); +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +@@ -0,0 +1,218 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++/* We do our-own dprintk so filesystems are not dependent on sunrpc */ ++#ifdef dprintk ++#undef dprintk ++#endif ++#define dprintk(fmt, args, ...) do { } while (0) ++ ++/* Calculate the XDR length of the GETDEVICEINFO4resok structure ++ * excluding the gdir_notification and the gdir_device_addr da_layout_type. ++ * The result is a count of 32-bit XDR words, summed over every multipath entry. ++ */ ++static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev) ++{ ++ struct pnfs_filelayout_devaddr *fl_addr; ++ struct pnfs_filelayout_multipath *mp; ++ int i, j, nwords; ++ ++ /* da_addr_body length, indice length, indices, ++ * multipath_list4 length */ ++ nwords = 1 + 1 + fdev->fl_stripeindices_length + 1; ++ for (i = 0; i < fdev->fl_device_length; i++) { ++ mp = &fdev->fl_device_list[i]; ++ nwords++; /* multipath list length */ ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ fl_addr = &mp->fl_multipath_list[j]; /* size entry j itself, not always entry 0 */ ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len); ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len); ++ } ++ } ++ dprintk("<-- %s nwords %d\n", __func__, nwords); ++ return nwords; ++} ++ ++/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13 ++ * on the response stream.
++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. ++ */ ++int ++filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ unsigned int i, j, len = 0, opaque_words; ++ u32 *p_in; ++ u32 index_count = fdev->fl_stripeindices_length; ++ u32 dev_count = fdev->fl_device_length; ++ int error = 0; ++ __be32 *p; ++ ++ opaque_words = fl_devinfo_xdr_words(fdev); ++ dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n", ++ __func__, ++ index_count, ++ dev_count, ++ opaque_words*4); ++ ++ /* check space for opaque length */ ++ p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words); ++ if (!p) { ++ error = -ETOOSMALL; ++ goto out; ++ } ++ ++ /* Fill in length later */ ++ p++; ++ ++ /* encode device list indices */ ++ p = exp_xdr_encode_u32(p, index_count); ++ for (i = 0; i < index_count; i++) ++ p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]); ++ ++ /* encode device list */ ++ p = exp_xdr_encode_u32(p, dev_count); ++ for (i = 0; i < dev_count; i++) { ++ struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i]; ++ ++ p = exp_xdr_encode_u32(p, mp->fl_multipath_length); ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ struct pnfs_filelayout_devaddr *da = ++ &mp->fl_multipath_list[j]; ++ ++ /* Encode device info */ ++ p = exp_xdr_encode_opaque(p, da->r_netid.data, ++ da->r_netid.len); ++ p = exp_xdr_encode_opaque(p, da->r_addr.data, ++ da->r_addr.len); ++ } ++ } ++ ++ /* backfill in length. Subtract 4 for da_addr_body size */ ++ len = (char *)p - (char *)p_in; ++ exp_xdr_encode_u32(p_in, len - 4); ++ ++ error = 0; ++out: ++ dprintk("%s: End err %d xdrlen %d\n", ++ __func__, error, len); ++ return error; ++} ++EXPORT_SYMBOL(filelayout_encode_devinfo); ++ ++/* Encodes the loc_body structure from draft 13 ++ * on the response stream. ++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. 
++ */ ++enum nfsstat4 ++filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp) ++{ ++ u32 len = 0, nfl_util, fhlen, i; ++ u32 *layoutlen_p; ++ enum nfsstat4 nfserr; ++ __be32 *p; ++ ++ dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n", ++ __func__, ++ flp->device_id.pnfs_fsid, ++ flp->device_id.pnfs_devid, ++ flp->lg_first_stripe_index, ++ flp->lg_fh_length); ++ ++ /* Ensure file system added at least one file handle */ ++ if (flp->lg_fh_length <= 0) { ++ dprintk("%s: File Layout has no file handles!!\n", __func__); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto out; ++ } ++ ++ /* Ensure room for len, devid, util, first_stripe_index, ++ * pattern_offset, number of filehandles */ ++ p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ /* save spot for opaque file layout length, fill-in later*/ ++ p++; ++ ++ /* encode device id */ ++ p = exp_xdr_encode_u64(p, flp->device_id.sbid); ++ p = exp_xdr_encode_u64(p, flp->device_id.devid); ++ ++ /* set and encode flags */ ++ nfl_util = flp->lg_stripe_unit; ++ if (flp->lg_commit_through_mds) ++ nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS; ++ if (flp->lg_stripe_type == STRIPE_DENSE) ++ nfl_util |= NFL4_UFLG_DENSE; ++ p = exp_xdr_encode_u32(p, nfl_util); ++ ++ /* encode first stripe index */ ++ p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index); ++ ++ /* encode striping pattern start */ ++ p = exp_xdr_encode_u64(p, flp->lg_pattern_offset); ++ ++ /* encode number of file handles */ ++ p = exp_xdr_encode_u32(p, flp->lg_fh_length); ++ ++ /* encode file handles */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ fhlen = flp->lg_fh_list[i].fh_size; ++ p = exp_xdr_reserve_space(xdr, 4 + fhlen); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen); ++ } ++ ++ /* Set number of bytes encoded = total_bytes_encoded - length var */ ++ len = (char 
*)p - (char *)layoutlen_p; ++ exp_xdr_encode_u32(layoutlen_p, len - 4); ++ ++ nfserr = NFS4_OK; ++out: ++ dprintk("%s: End err %u xdrlen %d\n", ++ __func__, nfserr, len); ++ return nfserr; ++} ++EXPORT_SYMBOL(filelayout_encode_layout); +diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c +--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-23 12:09:03.284511493 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-23 12:09:03.284511493 -0400 +@@ -0,0 +1,289 @@ ++/* ++ * pnfs_osd_xdr_enc.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_data_map( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_data_map *data_map) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1); ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, data_map->odm_num_comps); ++ p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_width); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_depth); ++ p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt); ++ p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline int pnfs_osd_xdr_encode_objid( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2); ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id; ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u64(p, dev_id->sbid); ++ p = exp_xdr_encode_u64(p, dev_id->devid); ++ p = exp_xdr_encode_u64(p, object_id->oid_partition_id); ++ p = exp_xdr_encode_u64(p, 
object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * enum pnfs_osd_cap_key_sec4 { ++ * PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ * PNFS_OSD_CAP_KEY_SEC_SSV = 1 ++ * }; ++ * ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_object_cred( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_object_cred *olo_comp) ++{ ++ __be32 *p; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version); ++ ++ /* No sec for now */ ++ p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE); ++ p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */ ++ ++ exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred, ++ olo_comp->oc_cap.cred_len); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_layout { ++ * struct pnfs_osd_data_map olo_map; ++ * u32 olo_comps_index; ++ * u32 olo_num_comps; ++ * struct pnfs_osd_object_cred *olo_comps; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *pol) ++{ ++ __be32 *p; ++ u32 i; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, pol->olo_comps_index); ++ p = exp_xdr_encode_u32(p, pol->olo_num_comps); ++ ++ for (i = 0; i < pol->olo_num_comps; i++) { ++ err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout); ++ ++static int _encode_string(struct exp_xdr_stream *xdr, ++ const struct nfs4_string *str) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, 4 
+ str->len); ++ ++ if (!p) ++ return -E2BIG; ++ exp_xdr_encode_opaque(p, str->data, str->len); ++ return 0; ++} ++ ++/* struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr) ++{ ++ __be32 *p; ++ int err; ++ ++ p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun)); ++ if (!p) ++ return -E2BIG; ++ ++ /* Empty oda_targetid */ ++ p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON); ++ ++ /* Empty oda_targetaddr for now */ ++ p = exp_xdr_encode_u32(p, 0); ++ ++ /* oda_lun */ ++ exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun)); ++ ++ err = _encode_string(xdr, &devaddr->oda_systemid); ++ if (err) ++ return err; ++ ++ err = pnfs_osd_xdr_encode_object_cred(xdr, ++ &devaddr->oda_root_obj_cred); ++ if (err) ++ return err; ++ ++ err = _encode_string(xdr, &devaddr->oda_osdname); ++ if (err) ++ return err; ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr); ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p) ++{ ++ lou->dsu_valid = be32_to_cpu(*p++); ++ if (lou->dsu_valid) ++ p = xdr_decode_hyper(p, &lou->dsu_delta); ++ lou->olu_ioerr_flag = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate); ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline __be32 * ++pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) ++{ ++ /* FIXME: p = xdr_decode_fixed(...) 
*/ ++ memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data)); ++ p += XDR_QUADLEN(sizeof(objid->oid_device_id.data)); ++ ++ p = xdr_decode_hyper(p, &objid->oid_partition_id); ++ p = xdr_decode_hyper(p, &objid->oid_object_id); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p) ++{ ++ p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_offset); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_length); ++ ioerr->oer_iswrite = be32_to_cpu(*p++); ++ ioerr->oer_errno = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); +diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c +--- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-23 12:09:03.285539075 -0400 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include "gfs2.h" + #include "incore.h" +@@ -1146,6 +1147,9 @@ static int fill_super(struct super_block + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_export_op = &gfs2_export_ops; ++#if defined(CONFIG_PNFSD) ++ sb->s_pnfs_op = &pnfs_dlm_export_ops; ++#endif /* CONFIG_PNFSD */ + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; +diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig +--- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/Kconfig 2010-08-23 12:09:03.286512316 -0400 +@@ -224,6 +224,31 @@ config LOCKD_V4 + config EXPORTFS + tristate + ++config EXPORTFS_FILE_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 
files layout type. ++ Must be automatically selected by supporting filesystems. ++ ++config EXPORTFS_OSD_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 objects layout type. ++ Must be automatically selected by supporting osd ++ filesystems. ++ ++ If unsure, say N. ++ ++config EXPORTFS_BLOCK_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 blocks layout type. ++ Must be automatically selected by supporting filesystems. ++ ++ + config NFS_ACL_SUPPORT + tristate + select FS_POSIX_ACL +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-23 12:09:03.287381619 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-23 12:09:03.287381619 -0400 +@@ -0,0 +1,66 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pipefs_list bl_device_list; ++struct dentry *bl_device_pipe; ++ ++ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len) ++{ ++ int err; ++ struct pipefs_hdr *msg; ++ ++ dprintk("Entering %s...\n", __func__); ++ ++ msg = pipefs_readmsg(filp, src, len); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: unable to read pipefs message.\n"); ++ return PTR_ERR(msg); ++ } ++ ++ /* now assign the result, which wakes the blocked thread */ ++ err = pipefs_assign_upcall_reply(msg, &bl_device_list); ++ if (err) { ++ dprintk("ERROR: failed to assign upcall with id %u\n", ++ msg->msgid); ++ kfree(msg); ++ } ++ return len; ++} ++ ++static const struct rpc_pipe_ops bl_pipe_ops = { ++ .upcall = pipefs_generic_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = pipefs_generic_destroy_msg, ++}; ++ ++int bl_pipe_init(void) ++{ ++ dprintk("%s: block_device 
pipefs registering...\n", __func__); ++ bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1); ++ if (IS_ERR(bl_device_pipe)) ++ dprintk("ERROR, unable to make block_device pipe\n"); ++ ++ if (!bl_device_pipe) ++ dprintk("bl_device_pipe is NULL!\n"); ++ else ++ dprintk("bl_device_pipe created!\n"); ++ pipefs_init_list(&bl_device_list); ++ return 0; ++} ++ ++void bl_pipe_exit(void) ++{ ++ dprintk("%s: block_device pipefs unregistering...\n", __func__); ++ if (IS_ERR(bl_device_pipe)) ++ return ; ++ pipefs_closepipe(bl_device_pipe); ++ return; ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-23 12:09:03.288501648 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-23 12:09:03.288501648 -0400 +@@ -0,0 +1,1160 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. 
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#include ++#include ++ ++#include /* various write calls */ ++#include /* struct bio */ ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Andy Adamson "); ++MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); ++ ++/* Callback operations to the pNFS client */ ++static struct pnfs_client_operations *pnfs_block_callback_ops; ++ ++static void print_page(struct page *page) ++{ ++ dprintk("PRINTPAGE page %p\n", page); ++ dprintk(" PagePrivate %d\n", PagePrivate(page)); ++ dprintk(" PageUptodate %d\n", PageUptodate(page)); ++ dprintk(" PageError %d\n", PageError(page)); ++ dprintk(" PageDirty %d\n", PageDirty(page)); ++ dprintk(" PageReferenced %d\n", PageReferenced(page)); ++ dprintk(" PageLocked %d\n", PageLocked(page)); ++ dprintk(" PageWriteback %d\n", PageWriteback(page)); ++ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); ++ dprintk("\n"); ++} ++ ++/* Given the be associated with isect, determine if page data needs to be ++ * initialized. 
++ */ ++static int is_hole(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return !is_sector_initialized(be->be_inval, isect); ++} ++ ++/* Given the be associated with isect, determine if page data can be ++ * written to disk. ++ */ ++static int is_writable(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return is_sector_initialized(be->be_inval, isect); ++} ++ ++static int ++dont_like_caller(struct nfs_page *req) ++{ ++ if (atomic_read(&req->wb_complete)) { ++ /* Called by _multi */ ++ return 1; ++ } else { ++ /* Called by _one */ ++ return 0; ++ } ++} ++ ++static enum pnfs_try_status ++bl_commit(struct nfs_write_data *nfs_data, ++ int sync) ++{ ++ dprintk("%s enter\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++/* The data we are handed might be spread across several bios. We need ++ * to track when the last one is finished. 
++ */ ++struct parallel_io { ++ struct kref refcnt; ++ struct rpc_call_ops call_ops; ++ void (*pnfs_callback) (void *data); ++ void *data; ++}; ++ ++static inline struct parallel_io *alloc_parallel(void *data) ++{ ++ struct parallel_io *rv; ++ ++ rv = kmalloc(sizeof(*rv), GFP_KERNEL); ++ if (rv) { ++ rv->data = data; ++ kref_init(&rv->refcnt); ++ } ++ return rv; ++} ++ ++static inline void get_parallel(struct parallel_io *p) ++{ ++ kref_get(&p->refcnt); ++} ++ ++static void destroy_parallel(struct kref *kref) ++{ ++ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); ++ ++ dprintk("%s enter\n", __func__); ++ p->pnfs_callback(p->data); ++ kfree(p); ++} ++ ++static inline void put_parallel(struct parallel_io *p) ++{ ++ kref_put(&p->refcnt, destroy_parallel); ++} ++ ++static struct bio * ++bl_submit_bio(int rw, struct bio *bio) ++{ ++ if (bio) { ++ get_parallel(bio->bi_private); ++ dprintk("%s submitting %s bio %u@%llu\n", __func__, ++ rw == READ ? "read" : "write", ++ bio->bi_size, (u64)bio->bi_sector); ++ submit_bio(rw, bio); ++ } ++ return NULL; ++} ++ ++static inline void ++bl_done_with_rpage(struct page *page, const int ok) ++{ ++ if (ok) { ++ ClearPagePnfsErr(page); ++ SetPageUptodate(page); ++ } else { ++ ClearPageUptodate(page); ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ } ++ /* Page is unlocked via rpc_release. Should really be done here. 
*/ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_read(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_rpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++static void bl_read_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ pnfs_block_callback_ops->nfs_readlist_complete(rdata); ++} ++ ++static void ++bl_end_par_io_read(void *data) ++{ ++ struct nfs_read_data *rdata = data; ++ ++ INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); ++ schedule_work(&rdata->task.u.tk_work); ++} ++ ++/* We don't want normal .rpc_call_done callback used, so we replace it ++ * with this stub. 
++ */ ++static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) ++{ ++ return; ++} ++ ++static enum pnfs_try_status ++bl_read_pagelist(struct nfs_read_data *rdata, ++ unsigned nr_pages) ++{ ++ int i, hole; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t f_offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct page **pages = rdata->args.pages; ++ int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, ++ nr_pages, f_offset, count); ++ ++ if (dont_like_caller(rdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ goto use_mds; ++ } ++ if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) { ++ /* We want to fall back to mds in case of read_page ++ * after error on read_pages. ++ */ ++ dprintk("%s PG_pnfserr set\n", __func__); ++ goto use_mds; ++ } ++ par = alloc_parallel(rdata); ++ if (!par) ++ goto use_mds; ++ par->call_ops = *rdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_read; ++ /* At this point, we can no longer jump to use_mds */ ++ ++ isect = (sector_t) (f_offset >> 9); ++ /* Code assumes extents are page-aligned */ ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ put_extent(cow_read); ++ bio = bl_submit_bio(READ, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg), ++ isect, &cow_read); ++ if (!be) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ if (cow_read) { ++ sector_t cow_length = cow_read->be_length - ++ (isect - cow_read->be_f_offset); ++ extent_length = min(extent_length, cow_length); ++ } ++ } ++ hole = is_hole(be, isect); ++ if (hole 
&& !cow_read) { ++ bio = bl_submit_bio(READ, bio); ++ /* Fill hole w/ zeroes w/o accessing device */ ++ dprintk("%s Zeroing page for hole\n", __func__); ++ zero_user(pages[i], 0, ++ min_t(int, PAGE_CACHE_SIZE, count)); ++ print_page(pages[i]); ++ bl_done_with_rpage(pages[i], 1); ++ } else { ++ struct pnfs_block_extent *be_read; ++ ++ be_read = (hole && cow_read) ? cow_read : be; ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - ++ be_read->be_f_offset + ++ be_read->be_v_offset; ++ bio->bi_bdev = be_read->be_mdev; ++ bio->bi_end_io = bl_end_io_read; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(READ, bio); ++ } ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ if ((isect << 9) >= rdata->inode->i_size) { ++ rdata->res.eof = 1; ++ rdata->res.count = rdata->inode->i_size - f_offset; ++ } else { ++ rdata->res.count = (isect << 9) - f_offset; ++ } ++ put_extent(be); ++ put_extent(cow_read); ++ bl_submit_bio(READ, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++ ++ use_mds: ++ dprintk("Giving up and using normal NFS\n"); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static void mark_extents_written(struct pnfs_block_layout *bl, ++ __u64 offset, __u32 count) ++{ ++ sector_t isect, end; ++ struct pnfs_block_extent *be; ++ ++ dprintk("%s(%llu, %u)\n", __func__, offset, count); ++ if (count == 0) ++ return; ++ isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9; ++ end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); ++ end >>= 9; ++ while (isect < end) { ++ sector_t len; ++ be = find_get_extent(bl, isect, NULL); ++ BUG_ON(!be); /* FIXME */ ++ len = min(end, be->be_f_offset + be->be_length) - isect; ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ mark_for_commit(be, isect, len); /* What if fails? 
*/ ++ isect += len; ++ put_extent(be); ++ } ++} ++ ++/* STUB - this needs thought */ ++static inline void ++bl_done_with_wpage(struct page *page, const int ok) ++{ ++ if (!ok) { ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ /* This is an inline copy of nfs_zap_mapping */ ++ /* This is oh so fishy, and needs deep thought */ ++ if (page->mapping->nrpages != 0) { ++ struct inode *inode = page->mapping->host; ++ spin_lock(&inode->i_lock); ++ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; ++ spin_unlock(&inode->i_lock); ++ } ++ } ++ /* end_page_writeback called in rpc_release. Should be done here. */ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_write(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_wpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++/* Function scheduled for call during bl_end_par_io_write, ++ * it marks sectors as written and extends the commitlist. 
++ */ ++static void bl_write_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ if (!wdata->task.tk_status) { ++ /* Marks for LAYOUTCOMMIT */ ++ /* BUG - this should be called after each bio, not after ++ * all finish, unless have some way of storing success/failure ++ */ ++ mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg), ++ wdata->args.offset, wdata->args.count); ++ } ++ pnfs_block_callback_ops->nfs_writelist_complete(wdata); ++} ++ ++/* Called when last of bios associated with a bl_write_pagelist call finishes */ ++static void ++bl_end_par_io_write(void *data) ++{ ++ struct nfs_write_data *wdata = data; ++ ++ /* STUB - ignoring error handling */ ++ wdata->task.tk_status = 0; ++ wdata->verf.committed = NFS_FILE_SYNC; ++ INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); ++ schedule_work(&wdata->task.u.tk_work); ++} ++ ++static enum pnfs_try_status ++bl_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int sync) ++{ ++ int i; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t offset = wdata->args.offset; ++ size_t count = wdata->args.count; ++ struct page **pages = wdata->args.pages; ++ int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); ++ if (!wdata->req->wb_lseg) { ++ dprintk("%s no lseg, falling back to MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ if (dont_like_caller(wdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ /* At this point, wdata->pages is a (sequential) list of nfs_pages. 
++ * We want to write each, and if there is an error remove it from ++ * list and call ++ * nfs_retry_request(req) to have it redone using nfs. ++ * QUEST? Do as block or per req? Think have to do per block ++ * as part of end_bio ++ */ ++ par = alloc_parallel(wdata); ++ if (!par) ++ return PNFS_NOT_ATTEMPTED; ++ par->call_ops = *wdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_write; ++ /* At this point, have to be more careful with error handling */ ++ ++ isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9); ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ bio = bl_submit_bio(WRITE, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg), ++ isect, NULL); ++ if (!be || !is_writable(be, isect)) { ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ } ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - be->be_f_offset + ++ be->be_v_offset; ++ bio->bi_bdev = be->be_mdev; ++ bio->bi_end_io = bl_end_io_write; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(WRITE, bio); ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK); ++ put_extent(be); ++ bl_submit_bio(WRITE, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++} ++ ++/* FIXME - range ignored */ ++static void ++release_extents(struct pnfs_block_layout *bl, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ int i; ++ struct pnfs_block_extent *be; ++ ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ while 
(!list_empty(&bl->bl_extents[i])) { ++ be = list_first_entry(&bl->bl_extents[i], ++ struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ put_extent(be); ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++} ++ ++static void ++release_inval_marks(struct pnfs_inval_markings *marks) ++{ ++ struct pnfs_inval_tracking *pos, *temp; ++ ++ list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { ++ list_del(&pos->it_link); ++ kfree(pos); ++ } ++ return; ++} ++ ++/* Note we are relying on caller locking to prevent nasty races. */ ++static void ++bl_free_layout(struct pnfs_layout_type *lo) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ ++ dprintk("%s enter\n", __func__); ++ release_extents(bl, NULL); ++ release_inval_marks(&bl->bl_inval); ++ kfree(bl); ++} ++ ++static struct pnfs_layout_type * ++bl_alloc_layout(struct inode *inode) ++{ ++ struct pnfs_block_layout *bl; ++ ++ dprintk("%s enter\n", __func__); ++ bl = kzalloc(sizeof(*bl), GFP_KERNEL); ++ if (!bl) ++ return NULL; ++ spin_lock_init(&bl->bl_ext_lock); ++ INIT_LIST_HEAD(&bl->bl_extents[0]); ++ INIT_LIST_HEAD(&bl->bl_extents[1]); ++ INIT_LIST_HEAD(&bl->bl_commit); ++ bl->bl_count = 0; ++ bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9; ++ INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); ++ return &bl->bl_layout; ++} ++ ++static void ++bl_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter\n", __func__); ++ kfree(lseg); ++} ++ ++/* Because the generic infrastructure does not correctly merge layouts, ++ * we pretty much ignore lseg, and store all data layout wide, so we ++ * can correctly merge. Eventually we should push some correct merge ++ * behavior up to the generic code, as the current behavior tends to ++ * cause lots of unnecessary overlapping LAYOUTGET requests. 
++ */ ++static struct pnfs_layout_segment * ++bl_alloc_lseg(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layoutget_res *lgr) ++{ ++ struct pnfs_layout_segment *lseg; ++ int status; ++ ++ dprintk("%s enter\n", __func__); ++ lseg = kzalloc(sizeof(*lseg) + 0, GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ status = nfs4_blk_process_layoutget(lo, lgr); ++ if (status) { ++ /* We don't want to call the full-blown bl_free_lseg, ++ * since on error extents were not touched. ++ */ ++ /* STUB - we really want to distinguish between 2 error ++ * conditions here. This lseg failed, but lo data structures ++ * are OK, or we hosed the lo data structures. The calling ++ * code probably needs to distinguish this too. ++ */ ++ kfree(lseg); ++ return ERR_PTR(status); ++ } ++ return lseg; ++} ++ ++static int ++bl_setup_layoutcommit(struct pnfs_layout_type *lo, ++ struct pnfs_layoutcommit_arg *arg) ++{ ++ struct nfs_server *nfss = PNFS_NFS_SERVER(lo); ++ struct bl_layoutupdate_data *layoutupdate_data; ++ ++ dprintk("%s enter\n", __func__); ++ /* Need to ensure commit is block-size aligned */ ++ if (nfss->pnfs_blksize) { ++ u64 mask = nfss->pnfs_blksize - 1; ++ u64 offset = arg->lseg.offset & mask; ++ ++ arg->lseg.offset -= offset; ++ arg->lseg.length += offset + mask; ++ arg->lseg.length &= ~mask; ++ } ++ ++ layoutupdate_data = kmalloc(sizeof(struct bl_layoutupdate_data), ++ GFP_KERNEL); ++ if (unlikely(!layoutupdate_data)) ++ return -ENOMEM; ++ INIT_LIST_HEAD(&layoutupdate_data->ranges); ++ arg->layoutdriver_data = layoutupdate_data; ++ ++ return 0; ++} ++ ++static void ++bl_encode_layoutcommit(struct pnfs_layout_type *lo, struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *arg) ++{ ++ dprintk("%s enter\n", __func__); ++ encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); ++} ++ ++static void ++bl_cleanup_layoutcommit(struct pnfs_layout_type *lo, ++ struct pnfs_layoutcommit_arg *arg, int status) ++{ ++ dprintk("%s enter\n", __func__); ++ 
clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), arg, status); ++ kfree(arg->layoutdriver_data); ++} ++ ++static void free_blk_mountid(struct block_mount_id *mid) ++{ ++ if (mid) { ++ struct pnfs_block_dev *dev; ++ spin_lock(&mid->bm_lock); ++ while (!list_empty(&mid->bm_devlist)) { ++ dev = list_first_entry(&mid->bm_devlist, ++ struct pnfs_block_dev, ++ bm_node); ++ list_del(&dev->bm_node); ++ free_block_dev(dev); ++ } ++ spin_unlock(&mid->bm_lock); ++ kfree(mid); ++ } ++} ++ ++/* This is mostly copied form the filelayout's get_device_info function. ++ * It seems much of this should be at the generic pnfs level. ++ */ ++static struct pnfs_block_dev * ++nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, ++ struct pnfs_deviceid *d_id, ++ struct list_head *sdlist) ++{ ++ struct pnfs_device *dev; ++ struct pnfs_block_dev *rv = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ int i, rc; ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s max_resp_sz %u max_pages %d\n", ++ __func__, max_resp_sz, max_pages); ++ ++ dev = kmalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) { ++ dprintk("%s kmalloc failed\n", __func__); ++ return NULL; ++ } ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(dev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set dev->area */ ++ dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!dev->area) ++ goto out_free; ++ ++ memcpy(&dev->dev_id, d_id, sizeof(*d_id)); ++ dev->layout_type = LAYOUT_BLOCK_VOLUME; ++ dev->dev_notify_types = 0; ++ dev->pages = pages; ++ dev->pgbase = 0; ++ dev->pglen = PAGE_SIZE * max_pages; ++ dev->mincount = 0; ++ ++ dprintk("%s: 
dev_id: %s\n", __func__, dev->dev_id.data); ++ rc = pnfs_block_callback_ops->nfs_getdeviceinfo(server, dev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ rv = nfs4_blk_decode_device(server, dev, sdlist); ++ out_free: ++ if (dev->area != NULL) ++ vunmap(dev->area); ++ for (i = 0; i < max_pages; i++) ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(dev); ++ return rv; ++} ++ ++ ++/* ++ * Retrieve the list of available devices for the mountpoint. ++ */ ++static int ++bl_initialize_mountpoint(struct nfs_server *server, const struct nfs_fh *fh) ++{ ++ struct block_mount_id *b_mt_id = NULL; ++ struct pnfs_mount_type *mtype = NULL; ++ struct pnfs_devicelist *dlist = NULL; ++ struct pnfs_block_dev *bdev; ++ LIST_HEAD(block_disklist); ++ int status = 0, i; ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (server->pnfs_blksize == 0) { ++ dprintk("%s Server did not return blksize\n", __func__); ++ return -EINVAL; ++ } ++ b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL); ++ if (!b_mt_id) { ++ status = -ENOMEM; ++ goto out_error; ++ } ++ /* Initialize nfs4 block layout mount id */ ++ spin_lock_init(&b_mt_id->bm_lock); ++ INIT_LIST_HEAD(&b_mt_id->bm_devlist); ++ ++ dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL); ++ if (!dlist) ++ goto out_error; ++ dlist->eof = 0; ++ while (!dlist->eof) { ++ status = pnfs_block_callback_ops->nfs_getdevicelist( ++ server, fh, dlist); ++ if (status) ++ goto out_error; ++ dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", ++ __func__, dlist->num_devs, dlist->eof); ++ /* For each device returned in dlist, call GETDEVICEINFO, and ++ * decode the opaque topology encoding to create a flat ++ * volume topology, matching VOLUME_SIMPLE disk signatures ++ * to disks in the visible block disk list. ++ * Construct an LVM meta device from the flat volume topology. 
++ */ ++ for (i = 0; i < dlist->num_devs; i++) { ++ bdev = nfs4_blk_get_deviceinfo(server, fh, ++ &dlist->dev_id[i], ++ &block_disklist); ++ if (!bdev) ++ goto out_error; ++ spin_lock(&b_mt_id->bm_lock); ++ list_add(&bdev->bm_node, &b_mt_id->bm_devlist); ++ spin_unlock(&b_mt_id->bm_lock); ++ } ++ } ++ dprintk("%s SUCCESS\n", __func__); ++ server->pnfs_ld_data = b_mt_id; ++ ++ out_return: ++ kfree(dlist); ++ return status; ++ ++ out_error: ++ free_blk_mountid(b_mt_id); ++ kfree(mtype); ++ goto out_return; ++} ++ ++static int ++bl_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ struct block_mount_id *b_mt_id = server->pnfs_ld_data; ++ ++ dprintk("%s enter\n", __func__); ++ free_blk_mountid(b_mt_id); ++ dprintk("%s RETURNS\n", __func__); ++ return 0; ++} ++ ++/* STUB - mark intersection of layout and page as bad, so is not ++ * used again. ++ */ ++static void mark_bad_read(void) ++{ ++ return; ++} ++ ++/* Copied from buffer.c */ ++static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) ++{ ++ if (uptodate) { ++ set_buffer_uptodate(bh); ++ } else { ++ /* This happens, due to failed READA attempts. 
*/ ++ clear_buffer_uptodate(bh); ++ } ++ unlock_buffer(bh); ++} ++ ++/* Copied from buffer.c */ ++static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) ++{ ++ __end_buffer_read_notouch(bh, uptodate); ++} ++ ++/* ++ * map_block: map a requested I/0 block (isect) into an offset in the LVM ++ * meta block_device ++ */ ++static void ++map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh) ++{ ++ dprintk("%s enter be=%p\n", __func__, be); ++ ++ set_buffer_mapped(bh); ++ bh->b_bdev = be->be_mdev; ++ bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> ++ (be->be_mdev->bd_inode->i_blkbits - 9); ++ ++ dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n", ++ __func__, (long)isect, ++ (long)bh->b_blocknr, ++ bh->b_size); ++ return; ++} ++ ++/* Given an unmapped page, zero it (or read in page for COW), ++ * and set appropriate flags/markings, but it is safe to not initialize ++ * the range given in [from, to). ++ */ ++/* This is loosely based on nobh_write_begin */ ++static int ++init_page_for_write(struct pnfs_block_layout *bl, struct page *page, ++ unsigned from, unsigned to, sector_t **pages_to_mark) ++{ ++ struct buffer_head *bh; ++ int inval, ret = -EIO; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect; ++ ++ dprintk("%s enter, %p\n", __func__, page); ++ bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); ++ if (!bh) { ++ ret = -ENOMEM; ++ goto cleanup; ++ } ++ ++ isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9); ++ be = find_get_extent(bl, isect, &cow_read); ++ if (!be) ++ goto cleanup; ++ inval = is_hole(be, isect); ++ dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to); ++ if (inval) { ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) { ++ dprintk("%s PANIC - got NONE_DATA extent %p\n", ++ __func__, be); ++ goto cleanup; ++ } ++ map_block(isect, be, bh); ++ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); ++ } ++ if (PageUptodate(page)) { ++ /* Do nothing */ ++ 
} else if (inval & !cow_read) { ++ zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE); ++ } else if (0 < from || PAGE_CACHE_SIZE > to) { ++ struct pnfs_block_extent *read_extent; ++ ++ read_extent = (inval && cow_read) ? cow_read : be; ++ map_block(isect, read_extent, bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_read_nobh; ++ submit_bh(READ, bh); ++ dprintk("%s: Waiting for buffer read\n", __func__); ++ /* XXX Don't really want to hold layout lock here */ ++ wait_on_buffer(bh); ++ if (!buffer_uptodate(bh)) ++ goto cleanup; ++ } ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ /* There is a BUG here if is a short copy after write_begin, ++ * but I think this is a generic fs bug. The problem is that ++ * we have marked the page as initialized, but it is possible ++ * that the section not copied may never get copied. ++ */ ++ ret = mark_initialized_sectors(be->be_inval, isect, ++ PAGE_CACHE_SECTORS, ++ pages_to_mark); ++ /* Want to preallocate mem so above can't fail */ ++ if (ret) ++ goto cleanup; ++ } ++ SetPageMappedToDisk(page); ++ ret = 0; ++ ++cleanup: ++ free_buffer_head(bh); ++ put_extent(be); ++ put_extent(cow_read); ++ if (ret) { ++ /* Need to mark layout with bad read...should now ++ * just use nfs4 for reads and writes. 
++ */ ++ mark_bad_read(); ++ } ++ return ret; ++} ++ ++static int ++bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos, ++ unsigned count, struct pnfs_fsdata *fsdata) ++{ ++ unsigned from, to; ++ int ret; ++ sector_t *pages_to_mark = NULL; ++ struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg); ++ ++ dprintk("%s enter, %u@%lld\n", __func__, count, pos); ++ print_page(page); ++ /* The following code assumes blocksize >= PAGE_CACHE_SIZE */ ++ if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) { ++ dprintk("%s Can't handle blocksize %llu\n", __func__, ++ (u64)bl->bl_blocksize); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ return 0; ++ } ++ if (PageMappedToDisk(page)) { ++ /* Basically, this is a flag that says we have ++ * successfully called write_begin already on this page. ++ */ ++ /* NOTE - there are cache consistency issues here. ++ * For example, what if the layout is recalled, then regained? ++ * If the file is closed and reopened, will the page flags ++ * be reset? If not, we'll have to use layout info instead of ++ * the page flag. ++ */ ++ return 0; ++ } ++ from = pos & (PAGE_CACHE_SIZE - 1); ++ to = from + count; ++ ret = init_page_for_write(bl, page, from, to, &pages_to_mark); ++ if (ret) { ++ dprintk("%s init page failed with %i", __func__, ret); ++ /* Revert back to plain NFS and just continue on with ++ * write. This assumes there is no request attached, which ++ * should be true if we get here. ++ */ ++ BUG_ON(PagePrivate(page)); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ kfree(pages_to_mark); ++ ret = 0; ++ } else { ++ fsdata->private = pages_to_mark; ++ } ++ return ret; ++} ++ ++/* CAREFUL - what happens if copied < count??? 
*/ ++static int ++bl_write_end(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg); ++ print_page(page); ++ if (lseg) ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* Return any memory allocated to fsdata->private, and take advantage ++ * of no page locks to mark pages noted in write_begin as needing ++ * initialization. ++ */ ++static void ++bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata) ++{ ++ struct page *page; ++ pgoff_t index; ++ sector_t *pos; ++ struct address_space *mapping = filp->f_mapping; ++ struct pnfs_fsdata *fake_data; ++ struct pnfs_layout_segment *lseg; ++ ++ if (!fsdata) ++ return; ++ lseg = fsdata->lseg; ++ if (!lseg) ++ return; ++ pos = fsdata->private; ++ if (!pos) ++ return; ++ dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos)); ++ for (; *pos != ~0; pos++) { ++ index = *pos >> (PAGE_CACHE_SHIFT - 9); ++ /* XXX How do we properly deal with failures here??? */ ++ page = grab_cache_page_write_begin(mapping, index, 0); ++ if (!page) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__); ++ continue; ++ } ++ dprintk("%s: Examining block page\n", __func__); ++ print_page(page); ++ if (!PageMappedToDisk(page)) { ++ /* XXX How do we properly deal with failures here??? 
*/ ++ dprintk("%s Marking block page\n", __func__); ++ init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page, ++ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, ++ NULL); ++ print_page(page); ++ fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL); ++ if (!fake_data) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", ++ __func__); ++ unlock_page(page); ++ continue; ++ } ++ get_lseg(lseg); ++ fake_data->lseg = lseg; ++ fake_data->bypass_eof = 1; ++ mapping->a_ops->write_end(filp, mapping, ++ index << PAGE_CACHE_SHIFT, ++ PAGE_CACHE_SIZE, ++ PAGE_CACHE_SIZE, ++ page, fake_data); ++ /* Note fake_data is freed by nfs_write_end */ ++ } else ++ unlock_page(page); ++ } ++ kfree(fsdata->private); ++ fsdata->private = NULL; ++} ++ ++static ssize_t ++bl_get_stripesize(struct pnfs_layout_type *lo) ++{ ++ dprintk("%s enter\n", __func__); ++ return 0; ++} ++ ++/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request. ++ * Should return False if there is a reason requests can not be coalesced, ++ * otherwise, should default to returning True. 
++ */ ++static int ++bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ dprintk("%s enter\n", __func__); ++ if (pgio->pg_iswrite) ++ return prev->wb_lseg == req->wb_lseg; ++ else ++ return 1; ++} ++ ++static struct layoutdriver_io_operations blocklayout_io_operations = { ++ .commit = bl_commit, ++ .read_pagelist = bl_read_pagelist, ++ .write_pagelist = bl_write_pagelist, ++ .write_begin = bl_write_begin, ++ .write_end = bl_write_end, ++ .write_end_cleanup = bl_write_end_cleanup, ++ .alloc_layout = bl_alloc_layout, ++ .free_layout = bl_free_layout, ++ .alloc_lseg = bl_alloc_lseg, ++ .free_lseg = bl_free_lseg, ++ .setup_layoutcommit = bl_setup_layoutcommit, ++ .encode_layoutcommit = bl_encode_layoutcommit, ++ .cleanup_layoutcommit = bl_cleanup_layoutcommit, ++ .initialize_mountpoint = bl_initialize_mountpoint, ++ .uninitialize_mountpoint = bl_uninitialize_mountpoint, ++}; ++ ++static struct layoutdriver_policy_operations blocklayout_policy_operations = { ++ .get_stripesize = bl_get_stripesize, ++ .pg_test = bl_pg_test, ++}; ++ ++static struct pnfs_layoutdriver_type blocklayout_type = { ++ .id = LAYOUT_BLOCK_VOLUME, ++ .name = "LAYOUT_BLOCK_VOLUME", ++ .ld_io_ops = &blocklayout_io_operations, ++ .ld_policy_ops = &blocklayout_policy_operations, ++}; ++ ++static int __init nfs4blocklayout_init(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); ++ ++ pnfs_block_callback_ops = pnfs_register_layoutdriver(&blocklayout_type); ++ bl_pipe_init(); ++ return 0; ++} ++ ++static void __exit nfs4blocklayout_exit(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", ++ __func__); ++ ++ pnfs_unregister_layoutdriver(&blocklayout_type); ++ bl_pipe_exit(); ++} ++ ++module_init(nfs4blocklayout_init); ++module_exit(nfs4blocklayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c +--- 
linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-23 12:09:03.289501933 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-23 12:09:03.289501933 -0400 +@@ -0,0 +1,335 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 file layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. 
++ */ ++#include ++#include /* __bread */ ++ ++#include ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes) ++{ ++ uint32_t *q = p + XDR_QUADLEN(nbytes); ++ if (unlikely(q > end || q < p)) ++ return NULL; ++ return p; ++} ++EXPORT_SYMBOL(blk_overflow); ++ ++/* Open a block_device by device number. */ ++struct block_device *nfs4_blkdev_get(dev_t dev) ++{ ++ struct block_device *bd; ++ ++ dprintk("%s enter\n", __func__); ++ bd = open_by_devnum(dev, FMODE_READ); ++ if (IS_ERR(bd)) ++ goto fail; ++ return bd; ++fail: ++ dprintk("%s failed to open device : %ld\n", ++ __func__, PTR_ERR(bd)); ++ return NULL; ++} ++ ++/* ++ * Release the block device ++ */ ++int nfs4_blkdev_put(struct block_device *bdev) ++{ ++ dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), ++ MINOR(bdev->bd_dev)); ++ bd_release(bdev); ++ return blkdev_put(bdev, FMODE_READ); ++} ++ ++/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded ++ * in dev->dev_addr_buf. 
++ */
++/*
++ * Sends dev->area (dev->mincount bytes) to the userspace daemon over
++ * bl_device_pipe as a BL_DEVICE_MOUNT request, reads a major/minor pair
++ * from the reply payload, opens that block device and wraps it in a newly
++ * allocated pnfs_block_dev tagged with dev->dev_id.
++ *
++ * Returns the new pnfs_block_dev, or NULL on any failure; on failure the
++ * block device opened here (if any) is released again.  The server and
++ * sdlist arguments are not used by this function.
++ */
++struct pnfs_block_dev *
++nfs4_blk_decode_device(struct nfs_server *server,
++		       struct pnfs_device *dev,
++		       struct list_head *sdlist)
++{
++	struct pnfs_block_dev *rv = NULL;
++	struct block_device *bd = NULL;
++	struct pipefs_hdr *msg = NULL, *reply = NULL;
++	uint32_t major, minor;
++
++	dprintk("%s enter\n", __func__);
++
++	if (IS_ERR(bl_device_pipe))
++		return NULL;
++	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
++	/* NOTE(review): dev_id.data is printed with %s; confirm it is
++	 * NUL-terminated text and not a raw fixed-size deviceid.
++	 */
++	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
++		dev->mincount);
++	msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area,
++				    dev->mincount);
++	if (IS_ERR(msg)) {
++		dprintk("ERROR: couldn't make pipefs message.\n");
++		goto out_err;
++	}
++	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
++	msg->status = BL_DEVICE_REQUEST_INIT;
++
++	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
++	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
++					      &bl_device_list, 0, 0);
++
++	if (IS_ERR(reply)) {
++		dprintk("ERROR: upcall_waitreply failed\n");
++		goto out_err;
++	}
++	if (reply->status != BL_DEVICE_REQUEST_PROC) {
++		/* No device has been opened at this point, so report the
++		 * daemon's status instead of PTR_ERR() of a NULL pointer.
++		 */
++		dprintk("%s userspace daemon failed, status %d\n",
++			__func__, (int)reply->status);
++		goto out_err;
++	}
++	memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t));
++	memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)),
++		sizeof(uint32_t));
++	bd = nfs4_blkdev_get(MKDEV(major, minor));
++	/* nfs4_blkdev_get() signals failure with NULL, never an ERR_PTR,
++	 * so an IS_ERR() test here would let a failed open fall through
++	 * to the bd->bd_disk dereference below.
++	 */
++	if (!bd) {
++		dprintk("%s failed to open device %u:%u\n",
++			__func__, major, minor);
++		goto out_err;
++	}
++
++	rv = kzalloc(sizeof(*rv), GFP_KERNEL);
++	if (!rv)
++		goto out_put;
++
++	rv->bm_mdev = bd;
++	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct pnfs_deviceid));
++	dprintk("%s Created device %s with bd_block_size %u\n",
++		__func__,
++		bd->bd_disk->disk_name,
++		bd->bd_block_size);
++	kfree(reply);
++	kfree(msg);
++	return rv;
++
++out_put:
++	/* Undo the open_by_devnum(FMODE_READ) done by nfs4_blkdev_get();
++	 * nothing owns bd yet, so it would otherwise stay open forever.
++	 */
++	blkdev_put(bd, FMODE_READ);
++out_err:
++	if (!IS_ERR(reply))
++		kfree(reply);
++	if (!IS_ERR(msg))
++		kfree(msg);
++	return NULL;
++}
++
++/* Map
deviceid returned by the server to constructed block_device */ ++static struct block_device *translate_devid(struct pnfs_layout_type *lo, ++ struct pnfs_deviceid *id) ++{ ++ struct block_device *rv = NULL; ++ struct block_mount_id *mid; ++ struct pnfs_block_dev *dev; ++ ++ dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); ++ mid = BLK_ID(lo); ++ spin_lock(&mid->bm_lock); ++ list_for_each_entry(dev, &mid->bm_devlist, bm_node) { ++ if (memcmp(id->data, dev->bm_mdevid.data, ++ NFS4_PNFS_DEVICEID4_SIZE) == 0) { ++ rv = dev->bm_mdev; ++ goto out; ++ } ++ } ++ out: ++ spin_unlock(&mid->bm_lock); ++ dprintk("%s returning %p\n", __func__, rv); ++ return rv; ++} ++ ++/* Tracks info needed to ensure extents in layout obey constraints of spec */ ++struct layout_verification { ++ u32 mode; /* R or RW */ ++ u64 start; /* Expected start of next non-COW extent */ ++ u64 inval; /* Start of INVAL coverage */ ++ u64 cowread; /* End of COW read coverage */ ++}; ++ ++/* Verify the extent meets the layout requirements of the pnfs-block draft, ++ * section 2.3.1. 
++ */
++static int verify_extent(struct pnfs_block_extent *be,
++			 struct layout_verification *lv)
++{
++	if (lv->mode == IOMODE_READ) {
++		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
++		    be->be_state == PNFS_BLOCK_INVALID_DATA)
++			return -EIO;
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		return 0;
++	}
++	/* lv->mode == IOMODE_RW */
++	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		if (lv->cowread > lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		lv->inval = lv->start;
++		return 0;
++	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		return 0;
++	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
++		if (be->be_f_offset > lv->start)
++			return -EIO;
++		if (be->be_f_offset < lv->inval)
++			return -EIO;
++		if (be->be_f_offset < lv->cowread)
++			return -EIO;
++		/* It looks like you might want to min this with lv->start,
++		 * but you really don't.
++		 */
++		lv->inval = lv->inval + be->be_length;
++		lv->cowread = be->be_f_offset + be->be_length;
++		return 0;
++	} else
++		return -EIO;
++}
++
++/* XDR decode pnfs_block_layout4 structure.
++ *
++ * Decodes the extent list in lgr->layout into a private staging list,
++ * checking every extent with verify_extent(), and only then merges the
++ * staged extents into the layout under bl_ext_lock.
++ *
++ * Returns 0 on success, -ENOMEM if an extent cannot be allocated, the
++ * status of add_and_merge_extent() if merging fails, and -EIO for any
++ * malformed or inconsistent layout.  Extents still staged on failure
++ * are released.
++ */
++int
++nfs4_blk_process_layoutget(struct pnfs_layout_type *lo,
++			   struct nfs4_pnfs_layoutget_res *lgr)
++{
++	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
++	uint32_t *p = (uint32_t *)lgr->layout.buf;
++	uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len);
++	int i, status = -EIO;
++	uint32_t count;
++	struct pnfs_block_extent *be = NULL, *save;
++	uint64_t tmp; /* Used by READSECTOR */
++	struct layout_verification lv = {
++		.mode = lgr->lseg.iomode,
++		.start = lgr->lseg.offset >> 9,
++		.inval = lgr->lseg.offset >> 9,
++		.cowread = lgr->lseg.offset >> 9,
++	};
++
++	LIST_HEAD(extents);
++
++	BLK_READBUF(p, end, 4);
++	READ32(count);
++
++	dprintk("%s enter, number of extents %i\n", __func__, count);
++	/* Each encoded extent is a deviceid followed by 28 bytes (three
++	 * 64-bit values and a 32-bit state).  count comes straight off the
++	 * wire, so bound it by division first: the multiplication in the
++	 * check below could otherwise wrap and let a short buffer through,
++	 * after which the decode loop would run past end.
++	 */
++	if (count > (size_t)((char *)end - (char *)p) /
++		    (28 + NFS4_PNFS_DEVICEID4_SIZE)) {
++		dprintk("%s extent count %u exceeds reply buffer\n",
++			__func__, count);
++		goto out_err;
++	}
++	BLK_READBUF(p, end, (28 + NFS4_PNFS_DEVICEID4_SIZE) * count);
++
++	/* Decode individual extents, putting them in temporary
++	 * staging area until whole layout is decoded to make error
++	 * recovery easier.
++	 */
++	for (i = 0; i < count; i++) {
++		be = alloc_extent();
++		if (!be) {
++			status = -ENOMEM;
++			goto out_err;
++		}
++		READ_DEVID(&be->be_devid);
++		be->be_mdev = translate_devid(lo, &be->be_devid);
++		if (!be->be_mdev)
++			goto out_err;
++		/* The next three values are read in as bytes,
++		 * but stored as 512-byte sector lengths
++		 */
++		READ_SECTOR(be->be_f_offset);
++		READ_SECTOR(be->be_length);
++		READ_SECTOR(be->be_v_offset);
++		READ32(be->be_state);
++		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
++			be->be_inval = &bl->bl_inval;
++		if (verify_extent(be, &lv)) {
++			dprintk("%s verify failed\n", __func__);
++			goto out_err;
++		}
++		list_add_tail(&be->be_node, &extents);
++	}
++	/* From here on every decoded extent is on the staging list, so be
++	 * is cleared before each bail-out to keep out_err from putting the
++	 * last extent twice.
++	 */
++	if (p != end) {
++		dprintk("%s Undecoded cruft at end of opaque\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	if (lgr->lseg.offset + lgr->lseg.length != lv.start << 9) {
++		dprintk("%s Final length mismatch\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	if (lv.start < lv.cowread) {
++		dprintk("%s Final uncovered COW extent\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	/* Extents decoded properly, now try to merge them in to
++	 * existing layout extents.
++	 */
++	spin_lock(&bl->bl_ext_lock);
++	list_for_each_entry_safe(be, save, &extents, be_node) {
++		list_del(&be->be_node);
++		status = add_and_merge_extent(bl, be);
++		if (status) {
++			spin_unlock(&bl->bl_ext_lock);
++			/* This is a fairly catastrophic error, as the
++			 * entire layout extent lists are now corrupted.
++			 * We should have some way to distinguish this.
++			 */
++			be = NULL;
++			goto out_err;
++		}
++	}
++	spin_unlock(&bl->bl_ext_lock);
++	status = 0;
++ out:
++	dprintk("%s returns %i\n", __func__, status);
++	return status;
++
++ out_err:
++	put_extent(be);
++	while (!list_empty(&extents)) {
++		be = list_first_entry(&extents, struct pnfs_block_extent,
++				      be_node);
++		list_del(&be->be_node);
++		put_extent(be);
++	}
++	goto out;
++}
+diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c
+--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-23 12:09:03.290395707 -0400
++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-23 12:09:03.290395707 -0400
+@@ -0,0 +1,120 @@
++/*
++ * linux/fs/nfs/blocklayout/blocklayoutdm.c
++ *
++ * Module for the NFSv4.1 pNFS block layout driver.
++ *
++ * Copyright (c) 2007 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Fred Isaman
++ * Andy Adamson
++ *
++ * permission is granted to use, copy, create derivative works and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the university of michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. if
++ * the above copyright notice or any other identification of the
++ * university of michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ *
++ * this software is provided as is, without representation from the
++ * university of michigan as to its fitness for any purpose, and without
++ * warranty by the university of michigan of any kind, either express
++ * or implied, including without limitation the implied warranties of
++ * merchantability and fitness for a particular purpose.
the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include /* gendisk - used in a dprintk*/ ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Defines used for calculating memory usage in nfs4_blk_flatten() */ ++#define ARGSIZE 24 /* Max bytes needed for linear target arg string */ ++#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE) ++#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE) ++#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \ ++ (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE) ++#define roundup8(x) (((x)+7) & ~7) ++#define sizeof8(x) roundup8(sizeof(x)) ++ ++static int dev_remove(dev_t dev) ++{ ++ int ret = 1; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint64_t bl_dev; ++ uint32_t major = MAJOR(dev), minor = MINOR(dev); ++ ++ dprintk("Entering %s\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return ret; ++ ++ memcpy((void *)&bl_dev, &major, sizeof(uint32_t)); ++ memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t)); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev, ++ sizeof(uint64_t)); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out; ++ } ++ ++ if (reply->status == BL_DEVICE_REQUEST_PROC) ++ ret = 0; /*TODO: what to return*/ ++out: ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return ret; ++} ++ ++/* ++ * 
Release meta device
++ *
++ * Drops this driver's reference on bdev->bm_mdev via nfs4_blkdev_put(),
++ * then asks the userspace daemon (dev_remove()) to tear the device down.
++ * Returns whatever dev_remove() returns; the nfs4_blkdev_put() result is
++ * only logged.
++ */
++static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
++{
++	/* Sample the device number while our reference still pins bm_mdev;
++	 * once nfs4_blkdev_put() returns, the block_device must not be
++	 * dereferenced again.
++	 */
++	dev_t devt = bdev->bm_mdev->bd_dev;
++	int rv;
++
++	dprintk("%s Releasing\n", __func__);
++	/* XXX Check return? */
++	rv = nfs4_blkdev_put(bdev->bm_mdev);
++	dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv);
++
++	rv = dev_remove(devt);
++	dprintk("%s Returns %d\n", __func__, rv);
++	return rv;
++}
++
++/* Releases the meta device held by bdev (if any) and frees bdev itself.
++ * A NULL bdev is accepted and ignored.
++ */
++void free_block_dev(struct pnfs_block_dev *bdev)
++{
++	if (bdev) {
++		if (bdev->bm_mdev) {
++			dprintk("%s Removing DM device: %d:%d\n",
++				__func__,
++				MAJOR(bdev->bm_mdev->bd_dev),
++				MINOR(bdev->bm_mdev->bd_dev));
++			/* XXX Check status ?? */
++			nfs4_blk_metadev_release(bdev);
++		}
++		kfree(bdev);
++	}
++}
+diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h
+--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-23 12:09:03.290395707 -0400
++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-23 12:09:03.291501560 -0400
+@@ -0,0 +1,303 @@
++/*
++ * linux/fs/nfs/blocklayout/blocklayout.h
++ *
++ * Module for the NFSv4.1 pNFS block layout driver.
++ *
++ * Copyright (c) 2006 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Andy Adamson
++ * Fred Isaman
++ *
++ * permission is granted to use, copy, create derivative works and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the university of michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. if
++ * the above copyright notice or any other identification of the
++ * university of michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#ifndef FS_NFS_NFS4BLOCKLAYOUT_H ++#define FS_NFS_NFS4BLOCKLAYOUT_H ++ ++#include ++#include /* Needed by nfs4_pnfs.h */ ++#include ++#include /* Needed for struct dm_ioctl*/ ++ ++#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9) ++ ++#define PG_pnfserr PG_owner_priv_1 ++#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags) ++#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags) ++#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags) ++ ++extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_do_resume(struct dm_ioctl *param); ++extern int dm_table_load(struct dm_ioctl *param, size_t param_size); ++ ++struct block_mount_id { ++ spinlock_t bm_lock; /* protects list */ ++ struct list_head bm_devlist; /* holds pnfs_block_dev */ ++}; ++ ++struct pnfs_block_dev { ++ struct list_head bm_node; ++ struct pnfs_deviceid bm_mdevid; /* associated devid */ ++ struct block_device *bm_mdev; /* meta device itself */ ++}; ++ ++/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */ ++struct visible_block_device { ++ struct list_head vi_node; ++ struct block_device *vi_bdev; ++ int vi_mapped; ++ int vi_put_done; ++}; ++ ++enum 
blk_vol_type { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, /* maps to a single LU */ ++ PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */ ++ PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */ ++ PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */ ++}; ++ ++/* All disk offset/lengths are stored in 512-byte sectors */ ++struct pnfs_blk_volume { ++ uint32_t bv_type; ++ sector_t bv_size; ++ struct pnfs_blk_volume **bv_vols; ++ int bv_vol_n; ++ union { ++ dev_t bv_dev; ++ sector_t bv_stripe_unit; ++ sector_t bv_offset; ++ }; ++}; ++ ++/* Since components need not be aligned, cannot use sector_t */ ++struct pnfs_blk_sig_comp { ++ int64_t bs_offset; /* In bytes */ ++ uint32_t bs_length; /* In bytes */ ++ char *bs_string; ++}; ++ ++/* Maximum number of signatures components in a simple volume */ ++# define PNFS_BLOCK_MAX_SIG_COMP 16 ++ ++struct pnfs_blk_sig { ++ int si_num_comps; ++ struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP]; ++}; ++ ++enum exstate4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ ++ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ ++}; ++ ++#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ ++ ++struct my_tree_t { ++ sector_t mtt_step_size; /* Internal sector alignment */ ++ struct list_head mtt_stub; /* Should be a radix tree */ ++}; ++ ++struct pnfs_inval_markings { ++ spinlock_t im_lock; ++ struct my_tree_t im_tree; /* Sectors that need LAYOUTCOMMIT */ ++ sector_t im_block_size; /* Server blocksize in sectors */ ++}; ++ ++struct pnfs_inval_tracking { ++ struct list_head it_link; ++ int it_sector; ++ int it_tags; ++}; ++ ++/* sector_t fields are all in 512-byte sectors */ ++struct pnfs_block_extent { ++ struct kref be_refcnt; ++ struct list_head be_node; /* link into lseg list */ ++ struct pnfs_deviceid be_devid; /* STUB - remevable??? 
*/ ++ struct block_device *be_mdev; ++ sector_t be_f_offset; /* the starting offset in the file */ ++ sector_t be_length; /* the size of the extent */ ++ sector_t be_v_offset; /* the starting offset in the volume */ ++ enum exstate4 be_state; /* the state of this extent */ ++ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ ++}; ++ ++/* Shortened extent used by LAYOUTCOMMIT */ ++struct pnfs_block_short_extent { ++ struct list_head bse_node; ++ struct pnfs_deviceid bse_devid; /* STUB - removable??? */ ++ struct block_device *bse_mdev; ++ sector_t bse_f_offset; /* the starting offset in the file */ ++ sector_t bse_length; /* the size of the extent */ ++}; ++ ++static inline void ++INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) ++{ ++ spin_lock_init(&marks->im_lock); ++ INIT_LIST_HEAD(&marks->im_tree.mtt_stub); ++ marks->im_block_size = blocksize; ++ marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, ++ blocksize); ++} ++ ++enum extentclass4 { ++ RW_EXTENT = 0, /* READWRTE and INVAL */ ++ RO_EXTENT = 1, /* READ and NONE */ ++ EXTENT_LISTS = 2, ++}; ++ ++static inline int choose_list(enum exstate4 state) ++{ ++ if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) ++ return RO_EXTENT; ++ else ++ return RW_EXTENT; ++} ++ ++struct pnfs_block_layout { ++ struct pnfs_layout_type bl_layout; ++ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ ++ spinlock_t bl_ext_lock; /* Protects list manipulation */ ++ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ ++ struct list_head bl_commit; /* Needs layout commit */ ++ unsigned int bl_count; /* entries in bl_commit */ ++ sector_t bl_blocksize; /* Server blocksize in sectors */ ++}; ++ ++/* this struct is comunicated between: ++ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit ++ */ ++struct bl_layoutupdate_data { ++ struct list_head ranges; ++}; ++ ++#define BLK_ID(lo) ((struct block_mount_id 
*)(PNFS_NFS_SERVER(lo)->pnfs_ld_data)) ++ ++static inline struct pnfs_block_layout * ++BLK_LO2EXT(struct pnfs_layout_type *lo) ++{ ++ return container_of(lo, struct pnfs_block_layout, bl_layout); ++} ++ ++static inline struct pnfs_block_layout * ++BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) ++{ ++ return BLK_LO2EXT(lseg->layout); ++} ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes); ++ ++#define BLK_READBUF(p, e, nbytes) do { \ ++ p = blk_overflow(p, e, nbytes); \ ++ if (!p) { \ ++ printk(KERN_WARNING \ ++ "%s: reply buffer overflowed in line %d.\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++} while (0) ++ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (uint64_t)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++#define READ_DEVID(x) COPYMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++#define READ_SECTOR(x) do { \ ++ READ64(tmp); \ ++ if (tmp & 0x1ff) { \ ++ printk(KERN_WARNING \ ++ "%s Value not 512-byte aligned at line %d\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++ (x) = tmp >> 9; \ ++} while (0) ++ ++#define WRITE32(n) do { \ ++ *p++ = htonl(n); \ ++ } while (0) ++#define WRITE64(n) do { \ ++ *p++ = htonl((uint32_t)((n) >> 32)); \ ++ *p++ = htonl((uint32_t)(n)); \ ++} while (0) ++#define WRITEMEM(ptr, nbytes) do { \ ++ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ ++} while (0) ++#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++ ++/* blocklayoutdev.c */ ++struct block_device *nfs4_blkdev_get(dev_t dev); ++int nfs4_blkdev_put(struct block_device *bdev); ++struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist); ++int nfs4_blk_process_layoutget(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layoutget_res *lgr); ++int nfs4_blk_create_block_disk_list(struct list_head *); 
++void nfs4_blk_destroy_disk_list(struct list_head *); ++/* blocklayoutdm.c */ ++int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *); ++void free_block_dev(struct pnfs_block_dev *bdev); ++/* extents.c */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read); ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages); ++void put_extent(struct pnfs_block_extent *be); ++struct pnfs_block_extent *alloc_extent(void); ++struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be); ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect); ++int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *arg); ++void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct pnfs_layoutcommit_arg *arg, ++ int status); ++int add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new); ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length); ++ ++#include ++ ++extern struct pipefs_list bl_device_list; ++extern struct dentry *bl_device_pipe; ++ ++int bl_pipe_init(void); ++void bl_pipe_exit(void); ++ ++#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ ++#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ ++#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ ++#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ ++#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ ++ ++#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-23 12:09:03.292511531 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-23 12:09:03.292511531 -0400 +@@ 
-0,0 +1,948 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.h ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include "blocklayout.h" ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Bit numbers */ ++#define EXTENT_INITIALIZED 0 ++#define EXTENT_WRITTEN 1 ++#define EXTENT_IN_COMMIT 2 ++#define INTERNAL_EXISTS MY_MAX_TAGS ++#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) ++ ++/* Returns largest t<=s s.t. 
t%base==0 */ ++static inline sector_t normalize(sector_t s, int base) ++{ ++ sector_t tmp = s; /* Since do_div modifies its argument */ ++ return s - do_div(tmp, base); ++} ++ ++static inline sector_t normalize_up(sector_t s, int base) ++{ ++ return normalize(s + base - 1, base); ++} ++ ++/* Complete stub using list while determine API wanted */ ++ ++/* Returns tags, or negative */ ++static int32_t _find_entry(struct my_tree_t *tree, u64 s) ++{ ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu) enter\n", __func__, s); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) ++ return pos->it_tags & INTERNAL_MASK; ++ else ++ break; ++ } ++ return -ENOENT; ++} ++ ++static inline ++int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag) ++{ ++ int32_t tags; ++ ++ dprintk("%s(%llu, %i) enter\n", __func__, s, tag); ++ s = normalize(s, tree->mtt_step_size); ++ tags = _find_entry(tree, s); ++ if ((tags < 0) || !(tags & (1 << tag))) ++ return 0; ++ else ++ return 1; ++} ++ ++/* Creates entry with tag, or if entry already exists, unions tag to it. ++ * If storage is not NULL, newly created entry will use it. ++ * Returns number of entries added, or negative on error. 
++ */ ++static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag, ++ struct pnfs_inval_tracking *storage) ++{ ++ int found = 0; ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) { ++ found = 1; ++ break; ++ } else ++ break; ++ } ++ if (found) { ++ pos->it_tags |= (1 << tag); ++ return 0; ++ } else { ++ struct pnfs_inval_tracking *new; ++ if (storage) ++ new = storage; ++ else { ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ } ++ new->it_sector = s; ++ new->it_tags = (1 << tag); ++ list_add(&new->it_link, &pos->it_link); ++ return 1; ++ } ++} ++ ++/* XXXX Really want option to not create */ ++/* Over range, unions tag with existing entries, else creates entry with tag */ ++static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length) ++{ ++ u64 i; ++ ++ dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); ++ for (i = normalize(s, tree->mtt_step_size); i < s + length; ++ i += tree->mtt_step_size) ++ if (_add_entry(tree, i, tag, NULL)) ++ return -ENOMEM; ++ return 0; ++} ++ ++/* Ensure that future operations on given range of tree will not malloc */ ++static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length) ++{ ++ u64 start, end, s; ++ int count, i, used = 0, status = -ENOMEM; ++ struct pnfs_inval_tracking **storage; ++ ++ dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); ++ start = normalize(offset, tree->mtt_step_size); ++ end = normalize_up(offset + length, tree->mtt_step_size); ++ count = (int)(end - start) / (int)tree->mtt_step_size; ++ ++ /* Pre-malloc what memory we might need */ ++ storage = kmalloc(sizeof(*storage) * count, GFP_KERNEL); ++ if (!storage) ++ return -ENOMEM; ++ for (i = 0; i < count; i++) { ++ storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), ++ 
GFP_KERNEL); ++ if (!storage[i]) ++ goto out_cleanup; ++ } ++ ++ /* Now need lock - HOW??? */ ++ ++ for (s = start; s < end; s += tree->mtt_step_size) ++ used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); ++ ++ /* Unlock - HOW??? */ ++ status = 0; ++ ++ out_cleanup: ++ for (i = used; i < count; i++) { ++ if (!storage[i]) ++ break; ++ kfree(storage[i]); ++ } ++ kfree(storage); ++ return status; ++} ++ ++static void set_needs_init(sector_t *array, sector_t offset) ++{ ++ sector_t *p = array; ++ ++ dprintk("%s enter\n", __func__); ++ if (!p) ++ return; ++ while (*p < offset) ++ p++; ++ if (*p == offset) ++ return; ++ else if (*p == ~0) { ++ *p++ = offset; ++ *p = ~0; ++ return; ++ } else { ++ sector_t *save = p; ++ dprintk("%s Adding %llu\n", __func__, (u64)offset); ++ while (*p != ~0) ++ p++; ++ p++; ++ memmove(save + 1, save, (char *)p - (char *)save); ++ *save = offset; ++ return; ++ } ++} ++ ++/* We are relying on page lock to serialize this */ ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Assume start, end already sector aligned */ ++static int ++_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag) ++{ ++ struct pnfs_inval_tracking *pos; ++ u64 expect = 0; ++ ++ dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector >= end) ++ continue; ++ if (!expect) { ++ if ((pos->it_sector == end - tree->mtt_step_size) && ++ (pos->it_tags & (1 << tag))) { ++ expect = pos->it_sector - tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ continue; ++ } else { ++ return 0; ++ } ++ } ++ if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) ++ return 0; ++ expect -= tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ } ++ return 0; ++} 
++ ++static int is_range_written(struct pnfs_inval_markings *marks, ++ sector_t start, sector_t end) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Marks sectors in [offest, offset_length) as having been initialized. ++ * All lengths are step-aligned, where step is min(pagesize, blocksize). ++ * Notes where partial block is initialized, and helps prepare it for ++ * complete initialization later. ++ */ ++/* Currently assumes offset is page-aligned */ ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages) ++{ ++ sector_t s, start, end; ++ sector_t *array = NULL; /* Pages to mark */ ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", ++ __func__, (u64)offset, (u64)length); ++ s = max((sector_t) 3, ++ 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); ++ dprintk("%s set max=%llu\n", __func__, (u64)s); ++ if (pages) { ++ array = kmalloc(s * sizeof(sector_t), GFP_KERNEL); ++ if (!array) ++ goto outerr; ++ array[0] = ~0; ++ } ++ ++ start = normalize(offset, marks->im_block_size); ++ end = normalize_up(offset + length, marks->im_block_size); ++ if (_preload_range(&marks->im_tree, start, end - start)) ++ goto outerr; ++ ++ spin_lock(&marks->im_lock); ++ ++ for (s = normalize_up(start, PAGE_CACHE_SECTORS); ++ s < offset; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s pre-area pages\n", __func__); ++ /* Portion of used block is not initialized */ ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) ++ goto out_unlock; ++ for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); ++ s < end; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s post-area pages\n", __func__); ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ ++ 
spin_unlock(&marks->im_lock); ++ ++ if (pages) { ++ if (array[0] == ~0) { ++ kfree(array); ++ *pages = NULL; ++ } else ++ *pages = array; ++ } ++ return 0; ++ ++ out_unlock: ++ spin_unlock(&marks->im_lock); ++ outerr: ++ if (pages) { ++ kfree(array); ++ *pages = NULL; ++ } ++ return -ENOMEM; ++} ++ ++/* Marks sectors in [offset, offset+length) as having been written to disk. ++ * All lengths should be block aligned. ++ */ ++int mark_written_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length) ++{ ++ int status; ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, ++ (u64)offset, (u64)length); ++ spin_lock(&marks->im_lock); ++ status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); ++ spin_unlock(&marks->im_lock); ++ return status; ++} ++ ++static void print_short_extent(struct pnfs_block_short_extent *be) ++{ ++ dprintk("PRINT SHORT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->bse_length); ++ } ++} ++ ++void print_clist(struct list_head *list, unsigned int count) ++{ ++ struct pnfs_block_short_extent *be; ++ unsigned int i = 0; ++ ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, bse_node) { ++ i++; ++ print_short_extent(be); ++ } ++ if (i != count) ++ dprintk("\n\nExpected %u entries\n\n\n", count); ++ dprintk("****************\n"); ++} ++ ++/* Note: In theory, we should do more checking that devid's match between ++ * old and new, but if they don't, the lists are too corrupt to salvage anyway. 
++ */ ++/* Note this is very similar to add_and_merge_extent */ ++static void add_to_commitlist(struct pnfs_block_layout *bl, ++ struct pnfs_block_short_extent *new) ++{ ++ struct list_head *clist = &bl->bl_commit; ++ struct pnfs_block_short_extent *old, *save; ++ sector_t end = new->bse_f_offset + new->bse_length; ++ ++ dprintk("%s enter\n", __func__); ++ print_short_extent(new); ++ print_clist(clist, bl->bl_count); ++ bl->bl_count++; ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. ++ */ ++ list_for_each_entry_safe(old, save, clist, bse_node) { ++ if (new->bse_f_offset < old->bse_f_offset) ++ break; ++ if (end <= old->bse_f_offset + old->bse_length) { ++ /* Range is already in list */ ++ bl->bl_count--; ++ kfree(new); ++ return; ++ } else if (new->bse_f_offset <= ++ old->bse_f_offset + old->bse_length) { ++ /* new overlaps or abuts existing be */ ++ if (new->bse_mdev == old->bse_mdev) { ++ /* extend new to fully replace old */ ++ new->bse_length += new->bse_f_offset - ++ old->bse_f_offset; ++ new->bse_f_offset = old->bse_f_offset; ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ } ++ /* Note that if we never hit the above break, old will not point to a ++ * valid extent. However, in that case &old->bse_node==list. ++ */ ++ list_add_tail(&new->bse_node, &old->bse_node); ++ /* Scan forward for overlaps. If we find any, extend new and ++ * remove the overlapped extent. 
++ */ ++ old = list_prepare_entry(new, clist, bse_node); ++ list_for_each_entry_safe_continue(old, save, clist, bse_node) { ++ if (end < old->bse_f_offset) ++ break; ++ /* new overlaps or abuts old */ ++ if (new->bse_mdev == old->bse_mdev) { ++ if (end < old->bse_f_offset + old->bse_length) { ++ /* extend new to fully cover old */ ++ end = old->bse_f_offset + old->bse_length; ++ new->bse_length = end - new->bse_f_offset; ++ } ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ dprintk("%s: after merging\n", __func__); ++ print_clist(clist, bl->bl_count); ++} ++ ++/* Note the range described by offset, length is guaranteed to be contained ++ * within be. ++ */ ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length) ++{ ++ sector_t new_end, end = offset + length; ++ struct pnfs_block_short_extent *new; ++ struct pnfs_block_layout *bl = container_of(be->be_inval, ++ struct pnfs_block_layout, ++ bl_inval); ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ mark_written_sectors(be->be_inval, offset, length); ++ /* We want to add the range to commit list, but it must be ++ * block-normalized, and verified that the normalized range has ++ * been entirely written to disk. 
++ */ ++ new->bse_f_offset = offset; ++ offset = normalize(offset, bl->bl_blocksize); ++ if (offset < new->bse_f_offset) { ++ if (is_range_written(be->be_inval, offset, new->bse_f_offset)) ++ new->bse_f_offset = offset; ++ else ++ new->bse_f_offset = offset + bl->bl_blocksize; ++ } ++ new_end = normalize_up(end, bl->bl_blocksize); ++ if (end < new_end) { ++ if (is_range_written(be->be_inval, end, new_end)) ++ end = new_end; ++ else ++ end = new_end - bl->bl_blocksize; ++ } ++ if (end <= new->bse_f_offset) { ++ kfree(new); ++ return 0; ++ } ++ new->bse_length = end - new->bse_f_offset; ++ new->bse_devid = be->be_devid; ++ new->bse_mdev = be->be_mdev; ++ ++ spin_lock(&bl->bl_ext_lock); ++ /* new will be freed, either by add_to_commitlist if it decides not ++ * to use it, or after LAYOUTCOMMIT uses it in the commitlist. ++ */ ++ add_to_commitlist(bl, new); ++ spin_unlock(&bl->bl_ext_lock); ++ return 0; ++} ++ ++static void print_bl_extent(struct pnfs_block_extent *be) ++{ ++ dprintk("PRINT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->be_length); ++ dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); ++ dprintk(" be_state %d\n", be->be_state); ++ } ++} ++ ++static void ++destroy_extent(struct kref *kref) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = container_of(kref, struct pnfs_block_extent, be_refcnt); ++ dprintk("%s be=%p\n", __func__, be); ++ kfree(be); ++} ++ ++void ++put_extent(struct pnfs_block_extent *be) ++{ ++ if (be) { ++ dprintk("%s enter %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_put(&be->be_refcnt, destroy_extent); ++ } ++} ++ ++struct pnfs_block_extent *alloc_extent(void) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL); ++ if (!be) ++ return NULL; ++ INIT_LIST_HEAD(&be->be_node); ++ kref_init(&be->be_refcnt); ++ be->be_inval = NULL; ++ return be; ++} ++ ++struct 
pnfs_block_extent * ++get_extent(struct pnfs_block_extent *be) ++{ ++ if (be) ++ kref_get(&be->be_refcnt); ++ return be; ++} ++ ++void print_elist(struct list_head *list) ++{ ++ struct pnfs_block_extent *be; ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, be_node) { ++ print_bl_extent(be); ++ } ++ dprintk("****************\n"); ++} ++ ++static inline int ++extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) ++{ ++ /* Note this assumes new->be_f_offset >= old->be_f_offset */ ++ return (new->be_state == old->be_state) && ++ ((new->be_state == PNFS_BLOCK_NONE_DATA) || ++ ((new->be_v_offset - old->be_v_offset == ++ new->be_f_offset - old->be_f_offset) && ++ new->be_mdev == old->be_mdev)); ++} ++ ++/* Adds new to appropriate list in bl, modifying new and removing existing ++ * extents as appropriate to deal with overlaps. ++ * ++ * See find_get_extent for list constraints. ++ * ++ * Refcount on new is already set. If end up not using it, or error out, ++ * need to put the reference. ++ * ++ * Lock is held by caller. ++ */ ++int ++add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new) ++{ ++ struct pnfs_block_extent *be, *tmp; ++ sector_t end = new->be_f_offset + new->be_length; ++ struct list_head *list; ++ ++ dprintk("%s enter with be=%p\n", __func__, new); ++ print_bl_extent(new); ++ list = &bl->bl_extents[choose_list(new->be_state)]; ++ print_elist(list); ++ ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. 
++ */ ++ list_for_each_entry_safe_reverse(be, tmp, list, be_node) { ++ if (new->be_f_offset >= be->be_f_offset + be->be_length) ++ break; ++ if (new->be_f_offset >= be->be_f_offset) { ++ if (end <= be->be_f_offset + be->be_length) { ++ /* new is a subset of existing be*/ ++ if (extents_consistent(be, new)) { ++ dprintk("%s: new is subset, ignoring\n", ++ __func__); ++ put_extent(new); ++ return 0; ++ } else { ++ goto out_err; ++ } ++ } else { ++ /* |<-- be -->| ++ * |<-- new -->| */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ new->be_length += new->be_f_offset - ++ be->be_f_offset; ++ new->be_f_offset = be->be_f_offset; ++ new->be_v_offset = be->be_v_offset; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } else if (end >= be->be_f_offset + be->be_length) { ++ /* new extent overlap existing be */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } else if (end > be->be_f_offset) { ++ /* |<-- be -->| ++ *|<-- new -->| */ ++ if (extents_consistent(new, be)) { ++ /* extend new to fully replace be */ ++ new->be_length += be->be_f_offset + be->be_length - ++ new->be_f_offset - new->be_length; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } ++ /* Note that if we never hit the above break, be will not point to a ++ * valid extent. However, in that case &be->be_node==list. ++ */ ++ list_add(&new->be_node, &be->be_node); ++ dprintk("%s: inserting new\n", __func__); ++ print_elist(list); ++ /* STUB - The per-list consistency checks have all been done, ++ * should now check cross-list consistency. ++ */ ++ return 0; ++ ++ out_err: ++ put_extent(new); ++ return -EIO; ++} ++ ++/* Returns extent, or NULL. 
If a second READ extent exists, it is returned ++ * in cow_read, if given. ++ * ++ * The extents are kept in two separate ordered lists, one for READ and NONE, ++ * one for READWRITE and INVALID. Within each list, we assume: ++ * 1. Extents are ordered by file offset. ++ * 2. For any given isect, there is at most one extent that matches. ++ */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read) ++{ ++ struct pnfs_block_extent *be, *cow, *ret; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ cow = ret = NULL; ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret && ++ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ if (!ret) ++ ret = be; ++ else if (be->be_state != PNFS_BLOCK_READ_DATA) ++ put_extent(be); ++ else ++ cow = be; ++ break; ++ } ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ if (cow_read) ++ *cow_read = cow; ++ print_bl_extent(ret); ++ return ret; ++} ++ ++/* Similar to find_get_extent, but called with lock held, and ignores cow */ ++static struct pnfs_block_extent * ++find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) ++{ ++ struct pnfs_block_extent *be, *ret = NULL; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ 
atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ ret = be; ++ break; ++ } ++ } ++ } ++ print_bl_extent(ret); ++ return ret; ++} ++ ++int ++encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *arg) ++{ ++ sector_t start, end; ++ struct pnfs_block_short_extent *lce, *save; ++ unsigned int count = 0; ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct list_head *ranges = &bld->ranges; ++ __be32 *p, *xdr_start; ++ ++ dprintk("%s enter\n", __func__); ++ start = arg->lseg.offset >> 9; ++ end = start + (arg->lseg.length >> 9); ++ dprintk("%s set start=%llu, end=%llu\n", ++ __func__, (u64)start, (u64)end); ++ ++ /* BUG - creation of bl_commit is buggy - need to wait for ++ * entire block to be marked WRITTEN before it can be added. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ /* Want to adjust for possible truncate */ ++ /* We now want to adjust argument range */ ++ ++ /* XDR encode the ranges found */ ++ xdr_start = xdr_reserve_space(xdr, 8); ++ if (!xdr_start) ++ goto out; ++ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { ++ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); ++ if (!p) ++ break; ++ WRITE_DEVID(&lce->bse_devid); ++ WRITE64(lce->bse_f_offset << 9); ++ WRITE64(lce->bse_length << 9); ++ WRITE64(0LL); ++ WRITE32(PNFS_BLOCK_READWRITE_DATA); ++ list_del(&lce->bse_node); ++ list_add_tail(&lce->bse_node, ranges); ++ bl->bl_count--; ++ count++; ++ } ++ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); ++ xdr_start[1] = cpu_to_be32(count); ++out: ++ spin_unlock(&bl->bl_ext_lock); ++ dprintk("%s found %i ranges\n", __func__, count); ++ return 0; ++} ++ ++/* Helper function to set_to_rw that initialize a new extent */ ++static void ++_prep_new_extent(struct pnfs_block_extent *new, ++ struct pnfs_block_extent *orig, ++ sector_t offset, sector_t length, int state) ++{ ++ kref_init(&new->be_refcnt); ++ /* don't need to 
INIT_LIST_HEAD(&new->be_node) */ ++ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct pnfs_deviceid)); ++ new->be_mdev = orig->be_mdev; ++ new->be_f_offset = offset; ++ new->be_length = length; ++ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; ++ new->be_state = state; ++ new->be_inval = orig->be_inval; ++} ++ ++/* Tries to merge be with extent in front of it in list. ++ * Frees storage if not used. ++ */ ++static struct pnfs_block_extent * ++_front_merge(struct pnfs_block_extent *be, struct list_head *head, ++ struct pnfs_block_extent *storage) ++{ ++ struct pnfs_block_extent *prev; ++ ++ if (!storage) ++ goto no_merge; ++ if (&be->be_node == head || be->be_node.prev == head) ++ goto no_merge; ++ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); ++ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || ++ !extents_consistent(prev, be)) ++ goto no_merge; ++ _prep_new_extent(storage, prev, prev->be_f_offset, ++ prev->be_length + be->be_length, prev->be_state); ++ list_replace(&prev->be_node, &storage->be_node); ++ put_extent(prev); ++ list_del(&be->be_node); ++ put_extent(be); ++ return storage; ++ ++ no_merge: ++ kfree(storage); ++ return be; ++} ++ ++static u64 ++set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) ++{ ++ u64 rv = offset + length; ++ struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; ++ struct pnfs_block_extent *children[3]; ++ struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; ++ int i = 0, j; ++ ++ dprintk("%s(%llu, %llu)\n", __func__, offset, length); ++ /* Create storage for up to three new extents e1, e2, e3 */ ++ e1 = kmalloc(sizeof(*e1), GFP_KERNEL); ++ e2 = kmalloc(sizeof(*e2), GFP_KERNEL); ++ e3 = kmalloc(sizeof(*e3), GFP_KERNEL); ++ /* BUG - we are ignoring any failure */ ++ if (!e1 || !e2 || !e3) ++ goto out_nosplit; ++ ++ spin_lock(&bl->bl_ext_lock); ++ be = find_get_extent_locked(bl, offset); ++ rv = be->be_f_offset + be->be_length; ++ if (be->be_state != 
PNFS_BLOCK_INVALID_DATA) { ++ spin_unlock(&bl->bl_ext_lock); ++ goto out_nosplit; ++ } ++ /* Add e* to children, bumping e*'s krefs */ ++ if (be->be_f_offset != offset) { ++ _prep_new_extent(e1, be, be->be_f_offset, ++ offset - be->be_f_offset, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e1; ++ print_bl_extent(e1); ++ } else ++ merge1 = e1; ++ _prep_new_extent(e2, be, offset, ++ min(length, be->be_f_offset + be->be_length - offset), ++ PNFS_BLOCK_READWRITE_DATA); ++ children[i++] = e2; ++ print_bl_extent(e2); ++ if (offset + length < be->be_f_offset + be->be_length) { ++ _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, ++ be->be_f_offset + be->be_length - ++ offset - length, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e3; ++ print_bl_extent(e3); ++ } else ++ merge2 = e3; ++ ++ /* Remove be from list, and insert the e* */ ++ /* We don't get refs on e*, since this list is the base reference ++ * set when init'ed. ++ */ ++ if (i < 3) ++ children[i] = NULL; ++ new = children[0]; ++ list_replace(&be->be_node, &new->be_node); ++ put_extent(be); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); ++ for (j = 1; j < i; j++) { ++ old = new; ++ new = children[j]; ++ list_add(&new->be_node, &old->be_node); ++ } ++ if (merge2) { ++ /* This is a HACK, should just create a _back_merge function */ ++ new = list_entry(new->be_node.next, ++ struct pnfs_block_extent, be_node); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ ++ /* Since we removed the base reference above, be is now scheduled for ++ * destruction. 
++ */ ++ put_extent(be); ++ dprintk("%s returns %llu after split\n", __func__, rv); ++ return rv; ++ ++ out_nosplit: ++ kfree(e1); ++ kfree(e2); ++ kfree(e3); ++ dprintk("%s returns %llu without splitting\n", __func__, rv); ++ return rv; ++} ++ ++void ++clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct pnfs_layoutcommit_arg *arg, ++ int status) ++{ ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct pnfs_block_short_extent *lce, *save; ++ ++ dprintk("%s status %d\n", __func__, status); ++ list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) { ++ if (likely(!status)) { ++ u64 offset = lce->bse_f_offset; ++ u64 end = offset + lce->bse_length; ++ ++ do { ++ offset = set_to_rw(bl, offset, end - offset); ++ } while (offset < end); ++ ++ kfree(lce); ++ } else { ++ spin_lock(&bl->bl_ext_lock); ++ add_to_commitlist(bl, lce); ++ spin_unlock(&bl->bl_ext_lock); ++ } ++ } ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-23 12:09:03.292511531 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-23 12:09:03.293491476 -0400 +@@ -0,0 +1,6 @@ ++# ++# Makefile for the pNFS block layout driver kernel module ++# ++obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o ++blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \ ++ extents.o block-device-discovery-pipe.o +diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h +--- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-23 12:09:03.293491476 -0400 +@@ -8,6 +8,8 @@ + #ifndef __LINUX_FS_NFS_CALLBACK_H + #define __LINUX_FS_NFS_CALLBACK_H + ++#include ++ + #define NFS4_CALLBACK 0x40000000 + #define NFS4_CALLBACK_XDRSIZE 2048 + #define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE) +@@ -72,6 +74,8 @@ 
struct cb_recallargs { + + #if defined(CONFIG_NFS_V4_1) + ++#include ++ + struct referring_call { + uint32_t rc_sequenceid; + uint32_t rc_slotid; +@@ -111,6 +115,13 @@ extern int nfs41_validate_delegation_sta + + #define RCA4_TYPE_MASK_RDATA_DLG 0 + #define RCA4_TYPE_MASK_WDATA_DLG 1 ++#define RCA4_TYPE_MASK_DIR_DLG 2 ++#define RCA4_TYPE_MASK_FILE_LAYOUT 3 ++#define RCA4_TYPE_MASK_BLK_LAYOUT 4 ++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 ++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 + + struct cb_recallanyargs { + struct sockaddr *craa_addr; +@@ -127,6 +138,37 @@ struct cb_recallslotargs { + extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy); + ++struct cb_pnfs_layoutrecallargs { ++ struct sockaddr *cbl_addr; ++ struct nfs_fh cbl_fh; ++ struct nfs4_pnfs_layout_segment cbl_seg; ++ struct nfs_fsid cbl_fsid; ++ uint32_t cbl_recall_type; ++ uint32_t cbl_layout_type; ++ uint32_t cbl_layoutchanged; ++ nfs4_stateid cbl_stateid; ++}; ++ ++extern unsigned pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args, ++ void *dummy); ++ ++struct cb_pnfs_devicenotifyitem { ++ uint32_t cbd_notify_type; ++ uint32_t cbd_layout_type; ++ struct pnfs_deviceid cbd_dev_id; ++ uint32_t cbd_immediate; ++}; ++ ++/* XXX: Should be dynamic up to max compound size */ ++#define NFS4_DEV_NOTIFY_MAXENTRIES 10 ++struct cb_pnfs_devicenotifyargs { ++ struct sockaddr *addr; ++ int ndevs; ++ struct cb_pnfs_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES]; ++}; ++ ++extern unsigned pnfs_cb_devicenotify(struct cb_pnfs_devicenotifyargs *args, ++ void *dummy); + #endif /* CONFIG_NFS_V4_1 */ + + extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); +diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c +--- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ 
linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-23 12:09:03.294522414 -0400 +@@ -8,10 +8,15 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + #include "nfs4_fs.h" + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include "pnfs.h" + + #ifdef NFS_DEBUG + #define NFSDBG_FACILITY NFSDBG_CALLBACK +@@ -62,16 +67,6 @@ out: + return res->status; + } + +-static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) +-{ +-#if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion > 0) +- return nfs41_validate_delegation_stateid; +-#endif +- return nfs4_validate_delegation_stateid; +-} +- +- + __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) + { + struct nfs_client *clp; +@@ -92,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_re + inode = nfs_delegation_find_inode(clp, &args->fh); + if (inode != NULL) { + /* Set up a helper thread to actually return the delegation */ +- switch (nfs_async_inode_return_delegation(inode, &args->stateid, +- nfs_validate_delegation_stateid(clp))) { ++ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; +@@ -116,24 +110,364 @@ out: + + int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { +- if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (delegation == NULL || memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data))) + return 0; + return 1; + } + + #if defined(CONFIG_NFS_V4_1) + ++static bool ++pnfs_is_next_layout_stateid(const struct pnfs_layout_type *lo, ++ const nfs4_stateid stateid) ++{ ++ int seqlock; ++ bool res; ++ u32 oldseqid, newseqid; ++ ++ do { ++ seqlock = read_seqbegin(&lo->seqlock); ++ oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid); ++ newseqid = be32_to_cpu(stateid.u.stateid.seqid); ++ 
res = !memcmp(lo->stateid.u.stateid.other, ++ stateid.u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE); ++ if (res) { /* comparing layout stateids */ ++ if (oldseqid == ~0) ++ res = (newseqid == 1); ++ else ++ res = (newseqid == oldseqid + 1); ++ } else { /* open stateid */ ++ res = !memcmp(lo->stateid.u.data, ++ &zero_stateid, ++ NFS4_STATEID_SIZE); ++ if (res) ++ res = (newseqid == 1); ++ } ++ } while (read_seqretry(&lo->seqlock, seqlock)); ++ ++ return res; ++} ++ ++/* ++ * Retrieve an inode based on layout recall parameters ++ * ++ * Note: caller must iput(inode) to dereference the inode. ++ */ ++static struct inode * ++nfs_layoutrecall_find_inode(struct nfs_client *clp, ++ const struct cb_pnfs_layoutrecallargs *args) ++{ ++ struct nfs_inode *nfsi; ++ struct pnfs_layout_type *layout; ++ struct nfs_server *server; ++ struct inode *ino = NULL; ++ ++ dprintk("%s: Begin recall_type=%d clp %p\n", ++ __func__, args->cbl_recall_type, clp); ++ ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry(layout, &clp->cl_layouts, lo_layouts) { ++ nfsi = PNFS_NFS_INODE(layout); ++ if (!nfsi) ++ continue; ++ ++ dprintk("%s: Searching inode=%lu\n", ++ __func__, nfsi->vfs_inode.i_ino); ++ ++ if (args->cbl_recall_type == RETURN_FILE) { ++ if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh)) ++ continue; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ server = NFS_SERVER(&nfsi->vfs_inode); ++ if (server->fsid.major != args->cbl_fsid.major || ++ server->fsid.minor != args->cbl_fsid.minor) ++ continue; ++ } ++ ++ /* Make sure client didn't clean up layout without ++ * telling the server */ ++ if (!has_layout(nfsi)) ++ continue; ++ ++ ino = igrab(&nfsi->vfs_inode); ++ dprintk("%s: Found inode=%p\n", __func__, ino); ++ break; ++ } ++ spin_unlock(&clp->cl_lock); ++ return ino; ++} ++ ++struct recall_layout_threadargs { ++ struct inode *inode; ++ struct nfs_client *clp; ++ struct completion started; ++ struct cb_pnfs_layoutrecallargs *rl; ++ int result; ++}; ++ ++static int 
pnfs_recall_layout(void *data) ++{ ++ struct inode *inode, *ino; ++ struct nfs_client *clp; ++ struct cb_pnfs_layoutrecallargs rl; ++ struct nfs4_pnfs_layoutreturn *lrp; ++ struct recall_layout_threadargs *args = ++ (struct recall_layout_threadargs *)data; ++ int status = 0; ++ ++ daemonize("nfsv4-layoutreturn"); ++ ++ dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n", ++ __func__, args->rl->cbl_recall_type, ++ args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor); ++ ++ clp = args->clp; ++ inode = args->inode; ++ rl = *args->rl; ++ ++ /* support whole file layouts only */ ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ ++ if (rl.cbl_recall_type == RETURN_FILE) { ++ if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout, ++ rl.cbl_stateid)) ++ status = pnfs_return_layout(inode, &rl.cbl_seg, ++ &rl.cbl_stateid, RETURN_FILE, ++ false); ++ else ++ status = cpu_to_be32(NFS4ERR_DELAY); ++ if (status) ++ dprintk("%s RETURN_FILE error: %d\n", __func__, status); ++ else ++ status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ args->result = status; ++ complete(&args->started); ++ goto out; ++ } ++ ++ status = cpu_to_be32(NFS4_OK); ++ args->result = status; ++ complete(&args->started); ++ args = NULL; ++ ++ /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */ ++ while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) { ++ /* FIXME: need to check status on pnfs_return_layout */ ++ pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false); ++ iput(ino); ++ } ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (!lrp) { ++ dprintk("%s: allocation failed. 
Cannot send last LAYOUTRETURN\n", ++ __func__); ++ goto out; ++ } ++ ++ /* send final layoutreturn */ ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = rl.cbl_layout_type; ++ lrp->args.return_type = rl.cbl_recall_type; ++ lrp->args.lseg = rl.cbl_seg; ++ lrp->args.inode = inode; ++ pnfs4_proc_layoutreturn(lrp, true); ++ ++out: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ nfs_put_client(clp); ++ module_put_and_exit(0); ++ dprintk("%s: exit status %d\n", __func__, 0); ++ return 0; ++} ++ ++/* ++ * Asynchronous layout recall! ++ */ ++static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, ++ struct cb_pnfs_layoutrecallargs *rl) ++{ ++ struct recall_layout_threadargs data = { ++ .clp = clp, ++ .inode = inode, ++ .rl = rl, ++ }; ++ struct task_struct *t; ++ int status = -EAGAIN; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* FIXME: do not allow two concurrent layout recalls */ ++ if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) ++ return status; ++ ++ init_completion(&data.started); ++ __module_get(THIS_MODULE); ++ if (!atomic_inc_not_zero(&clp->cl_count)) ++ goto out_put_no_client; ++ ++ t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout"); ++ if (IS_ERR(t)) { ++ printk(KERN_INFO "NFS: Layout recall callback thread failed " ++ "for client (clientid %08x/%08x)\n", ++ (unsigned)(clp->cl_clientid >> 32), ++ (unsigned)(clp->cl_clientid)); ++ status = PTR_ERR(t); ++ goto out_module_put; ++ } ++ wait_for_completion(&data.started); ++ return data.result; ++out_module_put: ++ nfs_put_client(clp); ++out_put_no_client: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ module_put(THIS_MODULE); ++ return status; ++} ++ ++static int pnfs_recall_all_layouts(struct nfs_client *clp) ++{ ++ struct cb_pnfs_layoutrecallargs rl; ++ struct inode *inode; ++ int status = 0; ++ ++ rl.cbl_recall_type = RETURN_ALL; ++ rl.cbl_seg.iomode = IOMODE_ANY; ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ 
++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, &rl); ++ if (!inode) ++ return status; ++ status = pnfs_async_return_layout(clp, inode, &rl); ++ iput(inode); ++ ++ return status; ++} ++ ++__be32 pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ struct inode *inode = NULL; ++ __be32 res; ++ int status; ++ unsigned int num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); ++ clp = nfs_find_client(args->cbl_addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->cbl_addr)); ++ goto out; ++ } ++ ++ res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ /* the callback must come from the MDS personality */ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) ++ goto loop; ++ if (args->cbl_recall_type == RETURN_FILE) { ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (inode != NULL) { ++ status = pnfs_async_return_layout(clp, inode, ++ args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++ } else { /* _ALL or _FSID */ ++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (!inode) ++ goto loop; ++ status = pnfs_async_return_layout(clp, inode, args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++loop: ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ ++/* Remove the deviceid(s) from the nfs_client deviceid cache */ ++static __be32 pnfs_devicenotify_client(struct nfs_client *clp, ++ struct cb_pnfs_devicenotifyargs *args) ++{ ++ uint32_t type; ++ int i; ++ ++ dprintk("%s: --> clp %p\n", __func__, clp); ++ ++ for (i = 
0; i < args->ndevs; i++) { ++ struct cb_pnfs_devicenotifyitem *dev = &args->devs[i]; ++ type = dev->cbd_notify_type; ++ if (type == NOTIFY_DEVICEID4_DELETE && clp->cl_devid_cache) ++ nfs4_delete_device(clp->cl_devid_cache, ++ &dev->cbd_dev_id); ++ else if (type == NOTIFY_DEVICEID4_CHANGE) ++ printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE " ++ "not supported\n", __func__); ++ } ++ return 0; ++} ++ ++__be32 pnfs_cb_devicenotify(struct cb_pnfs_devicenotifyargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ __be32 res = 0; ++ unsigned int num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = __constant_htonl(NFS4ERR_INVAL); ++ clp = nfs_find_client(args->addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->addr)); ++ goto out; ++ } ++ ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ res = pnfs_devicenotify_client(clp, args); ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { + if (delegation == NULL) + return 0; + +- /* seqid is 4-bytes long */ +- if (((u32 *) &stateid->data)[0] != 0) ++ if (stateid->u.stateid.seqid != 0) + return 0; +- if (memcmp(&delegation->stateid.data[4], &stateid->data[4], +- sizeof(stateid->data)-4)) ++ if (memcmp(&delegation->stateid.u.stateid.other, ++ &stateid->u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE)) + return 0; + + return 1; +@@ -335,13 +669,37 @@ out: + return status; + } + ++static inline bool ++validate_bitmap_values(const unsigned long *mask) ++{ ++ int i; ++ ++ if (*mask == 0) ++ return true; ++ if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_WDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_DIR_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, mask) 
|| ++ test_bit(RCA4_TYPE_MASK_BLK_LAYOUT, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OBJ_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OBJ_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OTHER_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OTHER_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ return false; ++} ++ + __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) + { + struct nfs_client *clp; + __be32 status; + fmode_t flags = 0; + +- status = htonl(NFS4ERR_OP_NOT_IN_SESSION); ++ status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->craa_addr, 4); + if (clp == NULL) + goto out; +@@ -349,16 +707,25 @@ __be32 nfs4_callback_recallany(struct cb + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + ++ status = cpu_to_be32(NFS4ERR_INVAL); ++ if (!validate_bitmap_values((const unsigned long *) ++ &args->craa_type_mask)) ++ return status; ++ ++ status = cpu_to_be32(NFS4_OK); + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; ++ if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) ++ &args->craa_type_mask)) ++ if (pnfs_recall_all_layouts(clp) == -EAGAIN) ++ status = cpu_to_be32(NFS4ERR_DELAY); + + if (flags) + nfs_expire_all_delegation_types(clp, flags); +- status = htonl(NFS4_OK); + out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c +--- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-23 12:09:03.295502055 -0400 +@@ -22,6 +22,8 @@ + #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + + #if defined(CONFIG_NFS_V4_1) 
++#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) ++#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) + #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +@@ -136,7 +138,7 @@ static __be32 decode_stateid(struct xdr_ + p = read_buf(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); +- memcpy(stateid->data, p, 16); ++ memcpy(stateid->u.data, p, 16); + return 0; + } + +@@ -220,6 +222,148 @@ out: + + #if defined(CONFIG_NFS_V4_1) + ++static __be32 decode_pnfs_layoutrecall_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_pnfs_layoutrecallargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ ++ args->cbl_addr = svc_addr(rqstp); ++ p = read_buf(xdr, 4 * sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ ++ args->cbl_layout_type = ntohl(*p++); ++ args->cbl_seg.iomode = ntohl(*p++); ++ args->cbl_layoutchanged = ntohl(*p++); ++ args->cbl_recall_type = ntohl(*p++); ++ ++ if (likely(args->cbl_recall_type == RETURN_FILE)) { ++ status = decode_fh(xdr, &args->cbl_fh); ++ if (unlikely(status != 0)) ++ goto out; ++ ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_seg.offset); ++ p = xdr_decode_hyper(p, &args->cbl_seg.length); ++ status = decode_stateid(xdr, &args->cbl_stateid); ++ if (unlikely(status != 0)) ++ goto out; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_fsid.major); ++ p = xdr_decode_hyper(p, &args->cbl_fsid.minor); ++ } ++ dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d " ++ "fsid %llx-%llx fhsize %d\n", __func__, ++ args->cbl_layout_type, args->cbl_seg.iomode, ++ 
args->cbl_layoutchanged, args->cbl_recall_type, ++ args->cbl_fsid.major, args->cbl_fsid.minor, ++ args->cbl_fh.size); ++out: ++ dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); ++ return status; ++} ++ ++static ++__be32 decode_pnfs_devicenotify_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_pnfs_devicenotifyargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ u32 tmp; ++ int n, i; ++ args->ndevs = 0; ++ ++ args->addr = svc_addr(rqstp); ++ ++ /* Num of device notifications */ ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ n = ntohl(*p++); ++ if (n <= 0) ++ goto out; ++ ++ /* XXX: need to possibly return error in this case */ ++ if (n > NFS4_DEV_NOTIFY_MAXENTRIES) { ++ dprintk("%s: Processing (%d) notifications out of (%d)\n", ++ __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n); ++ n = NFS4_DEV_NOTIFY_MAXENTRIES; ++ } ++ ++ /* Decode each dev notification */ ++ for (i = 0; i < n; i++) { ++ struct cb_pnfs_devicenotifyitem *dev = &args->devs[i]; ++ ++ p = read_buf(xdr, (4 * sizeof(uint32_t)) ++ + NFS4_PNFS_DEVICEID4_SIZE); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* bitmap size */ ++ if (tmp != 1) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_notify_type = ntohl(*p++); ++ if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* opaque size */ ++ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 8)) || ++ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 4))) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_layout_type = ntohl(*p++); ++ memcpy(dev->cbd_dev_id.data, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += 
XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ ++ if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_DELAY); ++ goto out; ++ } ++ dev->cbd_immediate = ntohl(*p++); ++ } else { ++ dev->cbd_immediate = 0; ++ } ++ ++ args->ndevs++; ++ ++ dprintk("%s: type %d layout 0x%x immediate %d\n", ++ __func__, dev->cbd_notify_type, dev->cbd_layout_type, ++ dev->cbd_immediate); ++ } ++out: ++ dprintk("%s: status %d ndevs %d\n", ++ __func__, ntohl(status), args->ndevs); ++ return status; ++} ++ + static __be32 decode_sessionid(struct xdr_stream *xdr, + struct nfs4_sessionid *sid) + { +@@ -574,11 +718,11 @@ preprocess_nfs41_op(int nop, unsigned in + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: ++ case OP_CB_LAYOUTRECALL: ++ case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + +- case OP_CB_LAYOUTRECALL: +- case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: +@@ -739,6 +883,18 @@ static struct callback_op callback_ops[] + .res_maxsize = CB_OP_RECALL_RES_MAXSZ, + }, + #if defined(CONFIG_NFS_V4_1) ++ [OP_CB_LAYOUTRECALL] = { ++ .process_op = (callback_process_op_t)pnfs_cb_layoutrecall, ++ .decode_args = ++ (callback_decode_arg_t)decode_pnfs_layoutrecall_args, ++ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, ++ }, ++ [OP_CB_NOTIFY_DEVICEID] = { ++ .process_op = (callback_process_op_t)pnfs_cb_devicenotify, ++ .decode_args = ++ (callback_decode_arg_t)decode_pnfs_devicenotify_args, ++ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, ++ }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, +diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-23 12:08:29.037481540 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 
2010-08-23 12:09:03.297501650 -0400 +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + + #include + +@@ -48,6 +49,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_CLIENT + +@@ -150,11 +152,14 @@ static struct nfs_client *nfs_alloc_clie + clp->cl_boot_time = CURRENT_TIME; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; ++ clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + #endif + cred = rpc_lookup_machine_cred(); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; +- ++#if defined(CONFIG_NFS_V4_1) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++#endif + nfs_fscache_get_client_cookie(clp); + + return clp; +@@ -178,7 +183,7 @@ static void nfs4_clear_client_minor_vers + clp->cl_session = NULL; + } + +- clp->cl_call_sync = _nfs4_call_sync; ++ clp->cl_mvops = nfs_v4_minor_ops[0]; + #endif /* CONFIG_NFS_V4_1 */ + } + +@@ -188,7 +193,7 @@ static void nfs4_clear_client_minor_vers + static void nfs4_destroy_callback(struct nfs_client *clp) + { + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) +- nfs_callback_down(clp->cl_minorversion); ++ nfs_callback_down(clp->cl_mvops->minor_version); + } + + static void nfs4_shutdown_client(struct nfs_client *clp) +@@ -251,6 +256,7 @@ void nfs_put_client(struct nfs_client *c + nfs_free_client(clp); + } + } ++EXPORT_SYMBOL(nfs_put_client); + + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + /* +@@ -343,7 +349,7 @@ static int nfs_sockaddr_match_ipaddr(con + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. 
+ */ +-static int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) + { + if (sa1->sa_family != sa2->sa_family) +@@ -357,6 +363,7 @@ static int nfs_sockaddr_cmp(const struct + } + return 0; + } ++EXPORT_SYMBOL(nfs_sockaddr_cmp); + + /* + * Find a client by IP address and protocol version +@@ -548,6 +555,7 @@ int nfs4_check_client_ready(struct nfs_c + return -EPROTONOSUPPORT; + return 0; + } ++EXPORT_SYMBOL(nfs4_check_client_ready); + + /* + * Initialise the timeout values for a connection +@@ -865,9 +873,34 @@ error: + } + + /* ++ * Initialize the pNFS layout driver and setup pNFS related parameters ++ */ ++static void nfs4_init_pnfs(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ struct nfs_client *clp = server->nfs_client; ++ ++ if (nfs4_has_session(clp) && ++ (clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) { ++ server->pnfs_blksize = fsinfo->blksize; ++ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); ++ pnfs_set_ds_iosize(server); ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++static void nfs4_uninit_pnfs(struct nfs_server *server) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ if (server->nfs_client && nfs4_has_session(server->nfs_client)) ++ unmount_pnfs_layoutdriver(server); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++/* + * Load up the server record from information gained in an fsinfo record + */ +-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) ++static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) + { + unsigned long max_rpc_payload; + +@@ -897,6 +930,8 @@ static void nfs_server_set_fsinfo(struct + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; ++ nfs4_init_pnfs(server, mntfh, fsinfo); ++ + server->wtmult = 
nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); +@@ -938,7 +973,7 @@ static int nfs_probe_fsinfo(struct nfs_s + if (error < 0) + goto out_error; + +- nfs_server_set_fsinfo(server, &fsinfo); ++ nfs_server_set_fsinfo(server, mntfh, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { +@@ -1016,6 +1051,7 @@ void nfs_free_server(struct nfs_server * + { + dprintk("--> nfs_free_server()\n"); + ++ nfs4_uninit_pnfs(server); + spin_lock(&nfs_client_lock); + list_del(&server->client_link); + list_del(&server->master_link); +@@ -1126,7 +1162,7 @@ static int nfs4_init_callback(struct nfs + return error; + } + +- error = nfs_callback_up(clp->cl_minorversion, ++ error = nfs_callback_up(clp->cl_mvops->minor_version, + clp->cl_rpcclient->cl_xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", +@@ -1143,10 +1179,8 @@ static int nfs4_init_callback(struct nfs + */ + static int nfs4_init_client_minor_version(struct nfs_client *clp) + { +- clp->cl_call_sync = _nfs4_call_sync; +- + #if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion) { ++ if (clp->cl_mvops->minor_version) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. +@@ -1158,7 +1192,13 @@ static int nfs4_init_client_minor_versio + return -ENOMEM; + + clp->cl_session = session; +- clp->cl_call_sync = _nfs4_call_sync_session; ++ /* ++ * The create session reply races with the server back ++ * channel probe. 
Mark the client NFS_CS_SESSION_INITING ++ * so that the client back channel can find the ++ * nfs_client struct ++ */ ++ clp->cl_cons_state = NFS_CS_SESSION_INITING; + } + #endif /* CONFIG_NFS_V4_1 */ + +@@ -1216,7 +1256,7 @@ error: + /* + * Set up an NFS4 client + */ +-static int nfs4_set_client(struct nfs_server *server, ++int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, +@@ -1259,6 +1299,7 @@ error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; + } ++EXPORT_SYMBOL(nfs4_set_client); + + + /* +@@ -1448,7 +1489,7 @@ struct nfs_server *nfs4_create_referral_ + data->authflavor, + parent_server->client->cl_xprt->prot, + parent_server->client->cl_timeout, +- parent_client->cl_minorversion); ++ parent_client->cl_mvops->minor_version); + if (error < 0) + goto error; + +diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c +--- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-23 12:09:03.297501650 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-23 12:09:03.298501447 -0400 +@@ -0,0 +1,292 @@ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); ++static void bl_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops bl_upcall_ops = { ++ .upcall = bl_pipe_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = bl_pipe_destroy_msg, ++}; ++ ++bl_comm_t *bl_comm_global; ++ ++int ++nfsd_bl_start(void) ++{ ++ bl_comm_t *bl_comm = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ 
dprintk("%s: starting pipe\n", __func__); ++ if (bl_comm_global) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL); ++ if (!bl_comm) { ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ /* FIXME: rename to "spnfs_block" */ ++ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm, ++ &bl_upcall_ops, 0); ++ if (IS_ERR(bl_comm->pipe_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ mutex_init(&bl_comm->lock); ++ mutex_init(&bl_comm->pipe_lock); ++ init_waitqueue_head(&bl_comm->pipe_wq); ++ ++ bl_comm_global = bl_comm; ++ return 0; ++err: ++ rpc_put_mount(); ++ kfree(bl_comm); ++ return rc; ++} ++ ++void ++nfsd_bl_stop(void) ++{ ++ bl_comm_t *c = bl_comm_global; ++ ++ dprintk("%s: stopping pipe\n", __func__); ++ if (!c) ++ return; ++ rpc_unlink(c->pipe_dentry); ++ rpc_put_mount(); ++ bl_comm_global = NULL; ++ kfree(c); ++} ++ ++static ssize_t ++bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst, ++ size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied, ++ left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ ++ return mlen; ++} ++ ++static ssize_t ++bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ bl_comm_t *bc = (bl_comm_t *)rpci->private; ++ bl_comm_msg_t *im = &bc->msg; ++ int ret; ++ bl_comm_res_t *res; ++ ++ ++ if (mlen == 0) { ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ im->msg_res = NULL; ++ wake_up(&bc->pipe_wq); ++ return -EFAULT; ++ } ++ ++ if ((res = kmalloc(mlen, 
GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(res, src, mlen)) { ++ kfree(res); ++ return -EFAULT; ++ } ++ ++ mutex_lock(&bc->pipe_lock); ++ ++ ret = mlen; ++ im->msg_status = res->res_status; ++ im->msg_res = res; ++ ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++ return ret; ++} ++ ++static void ++bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ bl_comm_msg_t *im = msg->data; ++ bl_comm_t *bc = container_of(im, struct bl_comm, msg); ++ ++ if (msg->errno >= 0) ++ return; ++ ++ mutex_lock(&bc->pipe_lock); ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++} ++ ++int ++bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res) ++{ ++ struct rpc_pipe_msg msg; ++ DECLARE_WAITQUEUE(wq, current); ++ int rval = 1; ++ bl_comm_msg_t *m = &bc->msg; ++ ++ if (bc == NULL) { ++ dprintk("%s: No pNFS block daemon available\n", __func__); ++ return 1; ++ } ++ ++ mutex_lock(&bc->lock); ++ mutex_lock(&bc->pipe_lock); ++ ++ memcpy(m, upmsg, sizeof (*m)); ++ ++ memset(&msg, 0, sizeof (msg)); ++ msg.data = m; ++ msg.len = sizeof (*m); ++ ++ add_wait_queue(&bc->pipe_wq, &wq); ++ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&bc->pipe_lock); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ mutex_lock(&bc->pipe_lock); ++ ++ if (m->msg_status == PNFS_BLOCK_SUCCESS) { ++ *res = m->msg_res; ++ rval = 0; ++ } else ++ rval = 1; ++ ++out: ++ mutex_unlock(&bc->pipe_lock); ++ mutex_unlock(&bc->lock); ++ return rval; ++} ++ ++static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len, ++ loff_t *offset) ++{ ++ int cmd, ++ rc; ++ bl_comm_t *bc = bl_comm_global; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int))) ++ return 
-EFAULT; ++ switch (cmd) { ++ case PNFS_BLOCK_CTL_STOP: ++ msg.msg_type = PNFS_UPCALL_MSG_STOP; ++ (void) bl_upcall(bc, &msg, &res); ++ kfree(res); ++ nfsd_bl_stop(); ++ break; ++ ++ case PNFS_BLOCK_CTL_START: ++ rc = nfsd_bl_start(); ++ if (rc != 0) ++ return rc; ++ break; ++ ++ case PNFS_BLOCK_CTL_VERS: ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bc, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ kfree(res); ++ break; ++ ++ default: ++ dprintk("%s: unknown ctl command %d\n", __func__, cmd); ++ break; ++ } ++ return len; ++} ++ ++static struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++ ++/* ++ * bl_init_proc -- set up proc interfaces ++ * ++ * Creating a pnfs_block directory isn't really required at this point ++ * since we've only got a single node in that directory. If the need for ++ * more nodes doesn't present itself shortly this code should revert ++ * to a single top level node. McNeal 11-Aug-2008. ++ */ ++int ++bl_init_proc(void) ++{ ++ struct proc_dir_entry *e; ++ ++ e = proc_mkdir("fs/pnfs_block", NULL); ++ if (!e) ++ return -ENOMEM; ++ ++ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL); ++ if (!e) ++ return -ENOMEM; ++ e->proc_fops = &ctl_ops; ++ ++ return 0; ++} ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-23 12:09:03.299501445 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-23 12:09:03.299501445 -0400 +@@ -0,0 +1,1672 @@ ++/* ++ * bl_ops.c ++ * spNFS ++ * ++ * Created by Rick McNeal on 4/1/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. ++ * ++ */ ++ ++/* ++ * Block layout operations. ++ * ++ * These functions, with the exception of pnfs_block_enabled, are assigned to ++ * the super block s_export_op structure. 
++ */ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) ++ ++#define BL_LAYOUT_HASH_BITS 4 ++#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS) ++#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1) ++#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256) ++ ++#define bl_layout_hashval(id) \ ++ ((id) & BL_LAYOUT_HASH_MASK) ++ ++#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len) ++#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len) ++#define _2SECTS(v) ((v) >> 9) ++ ++#ifndef READ32 ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++(x) = (u64)ntohl(*p++) << 32; \ ++(x) |= ntohl(*p++); \ ++} while (0) ++#endif ++ ++ ++typedef enum {True, False} boolean_t; ++/* ---- block layoutget and commit structure ---- */ ++typedef struct bl_layout_rec { ++ struct list_head blr_hash, ++ blr_layouts; ++ dev_t blr_rdev; ++ struct inode *blr_inode; ++ int blr_recalled; // debug ++ u64 blr_orig_size, ++ blr_commit_size, ++ blr_ext_size; ++ spinlock_t blr_lock; // Protects blr_layouts ++} bl_layout_rec_t; ++ ++static struct list_head layout_hash; ++static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE]; ++static spinlock_t layout_hashtbl_lock; ++ ++/* ---- prototypes ---- */ ++static boolean_t device_slice(dev_t devid); ++static boolean_t device_dm(dev_t devid); ++static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **); ++static bl_layout_rec_t *layout_inode_find(struct inode *i); ++static void layout_inode_del(struct inode *i); ++static char *map_state2name(enum pnfs_block_extent_state4 s); ++static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type); ++static void bld_free(pnfs_blocklayout_devinfo_t *bld); ++static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head 
*volumes, ++ dev_t devid, int local_index); ++static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes, ++ dev_t devid, int my_loc, int idx); ++static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg); ++struct list_head *layout_cache_iter(bl_layout_rec_t *r, ++ struct list_head *bl_possible, struct nfsd4_layout_seg *seg); ++static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h); ++static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h); ++static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg); ++static void print_bll(pnfs_blocklayout_layout_t *b, char *); ++static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r, ++ struct list_head *h, struct nfsd4_layout_seg *seg); ++static inline void bll_collapse(bl_layout_rec_t *r, ++ pnfs_blocklayout_layout_t *c); ++static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len, ++ enum bl_cache_state state, struct list_head *h); ++static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b, ++ enum bl_cache_state c, struct list_head *h); ++static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s); ++static void extents_setup(struct fiemap_extent_info *fei); ++static void extents_count(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_process(struct fiemap_extent_info *fei, ++ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev, ++ pnfs_blocklayout_layout_t *b); ++static void extents_cleanup(struct fiemap_extent_info *fei); ++ ++void ++nfsd_bl_init(void) ++{ ++ int i; ++ dprintk("%s loaded\n", __func__); ++ ++ spin_lock_init(&layout_hashtbl_lock); ++ INIT_LIST_HEAD(&layout_hash); ++ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++) ++ 
INIT_LIST_HEAD(&layout_hashtbl[i]); ++ bl_init_proc(); ++} ++ ++/* ++ * pnfs_block_enabled -- check to see if this file system should be export as ++ * block pnfs ++ */ ++int ++pnfs_block_enabled(struct inode *inode, int ex_flags) ++{ ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ static int bl_comm_once = 0; ++ ++ dprintk("--> %s\n", __func__); ++ /* ++ * FIXME: Figure out method to determine if this file system should ++ * be exported. The following areas need to be checked. ++ * (1) Validate that this file system was exported as a pNFS ++ * block-layout ++ * (2) Has there been successful communication with the ++ * volume daemon? ++ */ ++ /* Check #1 */ ++#ifdef notyet ++ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) { ++ dprintk("%s: pnfs_block not set in export\n", __func__); ++ return 0; ++ } ++#endif ++ ++ /* Check #1 */ ++ if (!bl_comm_once) { ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ if (msg.u.msg_vers != res->u.vers) { ++ dprintk("%s: vers mismatch, kernel != daemon\n", ++ __func__); ++ kfree(res); ++ return 0; ++ } ++ } ++ bl_comm_once = 1; ++ ++ kfree(res); ++ ++ dprintk("<-- %s okay\n", __func__); ++ return 1; ++} ++ ++int ++bl_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_BLOCK_VOLUME; ++} ++ ++int ++bl_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_devid = sb->s_dev; ++ res->gd_verf = 1; ++ res->gd_cookie = 1; ++ return 0; ++} ++ ++static int ++bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld_slice_p, ++ *bld_simple_p, ++ *bld; ++ int status = -EIO, ++ location = 0; ++ struct list_head volumes; ++ ++ dprintk("--> %s\n", __func__); ++ 
INIT_LIST_HEAD(&volumes); ++ ++ bld_simple_p = bld_simple(&volumes, devid->devid, ++ location++); ++ if (!bld_simple_p) ++ goto out; ++ bld_slice_p = bld_slice(&volumes, devid->devid, location++, ++ bld_simple_p->bld_index_loc); ++ ++ if (!bld_slice_p) ++ goto out; ++ ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE) ++ kfree(bld->u.simple.bld_sig); ++ bld_free(bld); ++ } ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++static int ++bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ int status = -EIO, // default to error ++ i, ++ location = 0; ++ struct list_head volumes; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ INIT_LIST_HEAD(&volumes); ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMGET; ++ msg.u.msg_dev = devid->devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: upcall for DMGET failed\n", __func__); ++ goto out; ++ } ++ ++ /* ++ * Don't use bld_alloc() here. If used this will be the first volume ++ * type added to the list whereas the protocol requires it to be the ++ * last. 
++ */ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ goto out; ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE; ++ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes; ++ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL; ++ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__, ++ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL); ++ ++ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes * ++ sizeof (int), GFP_KERNEL); ++ if (!bld->u.stripe.bld_stripe_indexs) ++ goto out; ++ ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) { ++ dev_t dev; ++ pnfs_blocklayout_devinfo_t *bldp; ++ ++ dev = MKDEV(res->u.stripe.devs[i].major, ++ res->u.stripe.devs[i].minor); ++ if (dev == 0) ++ goto out; ++ ++ bldp = bld_simple(&volumes, dev, location++); ++ if (!bldp) { ++ dprintk("%s: bld_simple failed\n", __func__); ++ goto out; ++ } ++ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc); ++ ++ if (!bldp) { ++ dprintk("%s: bld_slice failed\n", __func__); ++ goto out; ++ } ++ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc; ++ ++ } ++ list_add_tail(&bld->bld_list, &volumes); ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SLICE: ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ // No memory to release for these ++ break; ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ kfree(bld->u.simple.bld_sig); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ kfree(bld->u.stripe.bld_stripe_indexs); ++ break; ++ } ++ bld_free(bld); ++ } ++ kfree(res); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++/* ++ * bl_getdeviceinfo -- determine device tree for requested devid ++ */ ++int ++bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct 
nfsd4_pnfs_deviceid *devid)
++{
++	if (device_slice(devid->devid) == True)
++		return bl_getdeviceinfo_slice(sb, xdr, devid);
++	else if (device_dm(devid->devid) == True)
++		return bl_getdeviceinfo_dm(sb, xdr, devid);
++	return -EINVAL;
++}
++
++/*
++ * bll_list_drain -- unlink and free every layout entry queued on 'h'.
++ *
++ * Only the entries are released.  The list head itself is left empty and
++ * remains owned by the caller.
++ */
++static void
++bll_list_drain(struct list_head *h)
++{
++	pnfs_blocklayout_layout_t *b;
++
++	while (!list_empty(h)) {
++		b = list_entry(h->next, struct pnfs_blocklayout_layout,
++		    bll_list);
++		list_del(&b->bll_list);
++		kfree(b);
++	}
++}
++
++/*
++ * bl_layoutget -- build and encode the block layout for a byte range
++ *
++ * The requested range is taken from, and the granted range written back
++ * to, res->lg_seg.  The range is trimmed against the file size where
++ * required, widened to 512 byte boundaries, backed with storage through
++ * fallocate() for non-read iomodes, resolved against the per-inode layout
++ * cache and finally encoded into 'xdr'.
++ *
++ * Returns NFS4_OK on success or an nfsstat4 error.  When a failure occurs
++ * after the layout record has been looked up the offset and length in
++ * res->lg_seg are reset to 0.
++ */
++enum nfsstat4
++bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr,
++	     const struct nfsd4_pnfs_layoutget_arg *arg,
++	     struct nfsd4_pnfs_layoutget_res *res)
++{
++	bl_layout_rec_t *r;
++	struct list_head bl_possible,
++	    *bl_candidates = NULL;
++	boolean_t del_on_error = False;
++	int adj;
++	enum nfsstat4 nfserr = NFS4_OK;
++
++	dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n",
++	    __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset),
++	    _2SECTS(res->lg_seg.length), res->lg_seg.iomode);
++
++	if (res->lg_seg.length == 0) {
++		printk("%s: request length of 0, error condition\n", __func__);
++		return NFS4ERR_BADLAYOUT;
++	}
++
++	/*
++	 * Adjust the length as required per spec.
++	 * - First case is where the length is set to (u64)-1. Cheap means to
++	 *   define the end of the file.
++	 * - Second case is where the I/O mode is read-only, but the request
++	 *   is past the end of the file so the request needs to be trimmed.
++	 */
++	if ((res->lg_seg.length == NFS4_MAX_UINT64) ||
++	    (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) &&
++	    (res->lg_seg.iomode == IOMODE_READ))) {
++		/*
++		 * Trimming a range which starts at or beyond the end of the
++		 * file would leave nothing (or wrap the unsigned length
++		 * around), so treat it like the zero length request above.
++		 */
++		if (res->lg_seg.offset >= (u64)i->i_size) {
++			dprintk("%s: request offset beyond end of file\n",
++			    __func__);
++			return NFS4ERR_BADLAYOUT;
++		}
++		res->lg_seg.length = i->i_size - res->lg_seg.offset;
++	}
++
++	/* ---- Widen the range so it starts and ends on 512 byte bounds ---- */
++	adj = res->lg_seg.offset & 511;
++	res->lg_seg.offset -= adj;
++	res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511;
++
++	if (res->lg_seg.iomode != IOMODE_READ) {
++		/*
++		 * layout_inode_add() only verifies the fallocate method
++		 * further down, so it must be checked here before use.
++		 */
++		if (!i->i_op->fallocate ||
++		    i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE,
++		    res->lg_seg.offset, res->lg_seg.length))
++			return NFS4ERR_IO;
++	}
++
++	INIT_LIST_HEAD(&bl_possible);
++
++	if ((r = layout_inode_find(i)) == NULL) {
++		if (layout_inode_add(i, &r) == False) {
++			printk("%s: layout_inode_add failed\n", __func__);
++			return NFS4ERR_IO;
++		}
++		del_on_error = True;
++	}
++	BUG_ON(!r);
++
++	spin_lock(&r->blr_lock);
++
++	if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) {
++		/*
++		 * This will send a LAYOUTTRYLATER error to the client.
++		 */
++		dprintk("%s: layout_cache_fill_from() failed\n", __func__);
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++
++	res->lg_return_on_close = 1;
++	res->lg_seg.length = 0;
++
++	bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg);
++	if (!bl_candidates) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++
++	layout_cache_merge(r, bl_candidates);
++	if (layout_cache_update(r, bl_candidates)) {
++		/* ---- Failed to allocate memory. ---- */
++		dprintk("%s: layout_cache_update() failed\n", __func__);
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++
++	nfserr = blocklayout_encode_layout(xdr, bl_candidates);
++	if (nfserr)
++		dprintk("%s: layoutget xdr routine failed\n", __func__);
++
++layoutget_cleanup:
++	if (bl_candidates) {
++		/*
++		 * The list head was allocated by layout_cache_iter() and is
++		 * owned by this routine, so release it with its entries.
++		 */
++		bll_list_drain(bl_candidates);
++		kfree(bl_candidates);
++	}
++	/*
++	 * layout_cache_iter() empties bl_possible only when it succeeds.
++	 * On the error paths the entries are still queued here.
++	 */
++	bll_list_drain(&bl_possible);
++
++	spin_unlock(&r->blr_lock);
++	if (unlikely(nfserr)) {
++		if (del_on_error == True)
++			layout_inode_del(i);
++		res->lg_seg.length = 0;
++		res->lg_seg.offset = 0;
++	}
++
++	dprintk("<-- %s (rval %u)\n", __func__, nfserr);
++	return nfserr;
++}
++
++/*
++ * bl_layoutcommit -- commit changes, especially size, to file system
++ *
++ * Currently this routine isn't called and everything is handled within
++ * nfsd4_layoutcommit(). By not calling this routine the server doesn't
++ * handle a partial return, a set of extents, of the layout. The extents
++ * are decoded here, but nothing is done with them. If this routine is
++ * to be called the interface must change to pass the 'dentry' pointer such
++ * that notify_change() can be called.
++ */ ++int ++bl_layoutcommit(struct inode *i, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ bl_layout_rec_t *r; ++ int status = 0; ++ u64 lw_plus; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ r = layout_inode_find(i); ++ if (r) { ++ lw_plus = args->lc_last_wr + 1; ++ if (args->lc_newoffset) { ++ dprintk(" lc_last_wr %Lu\n", lw_plus); ++ if (r->blr_orig_size < lw_plus) { ++ r->blr_orig_size = lw_plus; ++ res->lc_size_chg = 1; ++ res->lc_newsize = lw_plus; ++ } ++ } ++ ++ if (args->lc_up_len) { ++ int extents, ++ i; ++ struct pnfs_blocklayout_layout *b; ++ __be32 *p = args->lc_up_layout; ++ ++ /* ++ * Client is returning a set of extents which ++ * should/could be used to update the file system. ++ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08 ++ */ ++ READ32(extents); ++ dprintk(" Client returning %d extents: data size %d\n", ++ extents, args->lc_up_len); ++ b = kmalloc(sizeof (struct pnfs_blocklayout_layout) * ++ extents, GFP_KERNEL); ++ if (b) { ++ for (i = 0; i < extents; i++) { ++ READ64(b[i].bll_vol_id.sbid); ++ READ64(b[i].bll_vol_id.devid); ++ READ64(b[i].bll_foff); ++ READ64(b[i].bll_len); ++ READ64(b[i].bll_soff); ++ READ32(b[i].bll_es); ++ dprintk(" %d: foff %Lu, len %Lu, soff %Lu " ++ "state %s\n", ++ i, _2SECTS(b[i].bll_foff), ++ _2SECTS(b[i].bll_len), ++ _2SECTS(b[i].bll_soff), ++ map_state2name(b[i].bll_es)); ++ } ++ kfree(b); ++ } else { ++ status = -ENOMEM; ++ } ++ } ++ } else ++ dprintk("%s: Unexpected commit to inode %p\n", __func__, i); ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutreturn(struct inode *i, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ int status = 0; ++ bl_layout_rec_t *r; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ ++ r = layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); ++ layout_cache_del(r, &args->lr_seg); ++ 
spin_unlock(&r->blr_lock); ++ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n", ++ r->blr_ext_size, i->i_size, r->blr_orig_size); ++ } ++ ++ layout_inode_del(i); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ bl_layout_rec_t *r; ++ pnfs_blocklayout_layout_t *b; ++ u64 adj; ++ ++ dprintk("--> %s\n", __func__); ++ BUG_ON(!len); ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n", ++ inode->i_sb->s_dev, inode->i_ino, ++ _2SECTS(offset), _2SECTS(len)); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* ++ * XXX figure out how to get a sb since there's no ++ * inode ptr ++ */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++restart: ++ r = layout_inode_find(inode); ++ if (r && len && !r->blr_recalled) { ++ spin_lock(&r->blr_lock); ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (!r->blr_recalled && !b->bll_recalled && ++ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) { ++ b->bll_recalled = 1; ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = 0; ++ lr.cbl_seg.length = NFS4_MAX_UINT64; ++ r->blr_recalled = 1; ++ dprintk(" FULL LAYOUTRECALL\n"); ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ ++ /* ++ * Currently there are only two cases where the ++ * layout is being returned. ++ * (1) Someone is issuing a NFS_WRITE operation ++ * to this layout. ++ * (2) The file has been truncated which means ++ * the layout is immediately made invalid. ++ * In both cases the client must write any ++ * uncommitted modifications to the server via ++ * NFS_WRITE. 
++ */ ++ lr.cbl_layoutchanged = 1; ++ ++ /* ++ * Need to drop the lock because we'll get a ++ * layoutreturn which will block waiting for ++ * the lock. The request will come in on the ++ * same thread which will cause a deadlock. ++ */ ++ spin_unlock(&r->blr_lock); ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ adj = MIN(b->bll_len - (offset - b->bll_foff), ++ len); ++ offset += adj; ++ len -= adj; ++ if (!len) { ++ spin_lock(&r->blr_lock); ++ break; ++ } ++ /* ++ * Since layoutreturn will have been called we ++ * can't assume blr_layouts is still valid, ++ * so restart. ++ */ ++ goto restart; ++ } ++ } ++ spin_unlock(&r->blr_lock); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++/* ++ * []------------------------------------------------------------------[] ++ * | Support functions from here on down. | ++ * []------------------------------------------------------------------[] ++ */ ++ ++/* ++ * bld_simple -- given a dev_t build a simple volume structure ++ * ++ * Simple volume contains the device signature and offset to that data in ++ * the storage volume. 
++ *
++ * The signature is fetched with a PNFS_UPCALL_MSG_GETSIG upcall and copied
++ * into a newly allocated volume entry which is queued on 'volumes'.
++ * Returns the new entry, or NULL if the upcall or an allocation fails.
++ * NOTE(review): 'local_index' is not used by this routine.
++ */
++static pnfs_blocklayout_devinfo_t *
++bld_simple(struct list_head *volumes, dev_t devid, int local_index)
++{
++	pnfs_blocklayout_devinfo_t *bld = NULL;
++	bl_comm_msg_t msg;
++	bl_comm_res_t *res = NULL;
++
++	msg.msg_type = PNFS_UPCALL_MSG_GETSIG;
++	msg.u.msg_dev = devid;
++	if (bl_upcall(bl_comm_global, &msg, &res)) {
++		dprintk("%s: Failed to get signature information\n", __func__);
++		goto error;
++	}
++
++	bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE);
++	if (!bld)
++		goto error;	/* ---- 'res' must still be released ---- */
++
++	bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset;
++	bld->u.simple.bld_sig_len = res->u.sig.len;
++	bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL);
++	if (!bld->u.simple.bld_sig)
++		goto error;
++
++	memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len);
++	kfree(res);
++	return bld;
++
++error:
++	if (bld)
++		bld_free(bld);
++	kfree(res);
++	dprintk("%s: error in bld_simple\n", __func__);
++	return NULL;
++}
++
++/*
++ * bld_slice -- given a dev_t build a slice volume structure
++ *
++ * A slice volume contains the length of the slice/partition and its offset
++ * from the beginning of the storage volume. There's also a reference to
++ * the "simple" volume which contains this slice.
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE); ++ if (!bld) ++ return NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Upcall to get slice info failed\n"); ++ bld_free(bld); ++ return NULL; ++ } ++ ++ bld->bld_devid.devid = devid; ++ bld->bld_index_loc = my_loc; ++ bld->u.slice.bld_start = res->u.slice.start * 512LL; ++ bld->u.slice.bld_len = res->u.slice.length * 512LL; ++ bld->u.slice.bld_index = simple_loc; ++ ++ dprintk("%s: start %Lu, len %Lu\n", __func__, ++ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL); ++ ++ kfree(res); ++ dprintk("<-- %s (rval %p)\n", __func__, bld); ++ return bld; ++} ++ ++static int ++layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!list_empty(&r->blr_layouts)) ++ if (layout_cache_fill_from_list(r, h, seg) == False) ++ return -EIO; ++ ++ /* ++ * This deals with two conditions. ++ * (1) When blr_layouts is empty we need to create the first entry ++ * (2) When the range requested falls past the end of any current ++ * layout the residual must be taken care of. 
++ */ ++ if (seg->length) { ++ n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h); ++ if (!n) ++ return -ENOMEM; ++ dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len)); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++struct list_head * ++layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n = NULL; ++ struct list_head *bl_candidates = NULL; ++ struct fiemap_extent_info fei; ++ struct inode *i; ++ dev_t dev; ++ ++ dev = r->blr_rdev; ++ i = r->blr_inode; ++ ++ dprintk("--> %s\n", __func__); ++ bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL); ++ if (!bl_candidates) ++ return NULL; ++ INIT_LIST_HEAD(bl_candidates); ++ extents_setup(&fei); ++ ++ list_for_each_entry(b, bl_possible, bll_list) { ++ if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ extents_count(&fei, i, b->bll_foff, b->bll_len); ++ if (fei.fi_extents_mapped) { ++ ++ /* ++ * Common case here. Got a range which has ++ * extents. Now get those extents and process ++ * them into pNFS extents. ++ */ ++ if (extents_get(&fei, i, b->bll_foff, ++ b->bll_len) == False) ++ goto cleanup; ++ if (extents_process(&fei, bl_candidates, ++ seg, dev, b) == False) ++ goto cleanup; ++ extents_cleanup(&fei); ++ ++ } else if (seg->iomode == IOMODE_READ) { ++ ++ /* ++ * Found a hole in a file while reading. No ++ * problem, just create a pNFS extent for the ++ * range and let the client know there's no ++ * backing store. ++ */ ++ n = bll_alloc(b->bll_foff, b->bll_len, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += b->bll_len; ++ } else { ++ ++ /* ++ * There's a problem here. Since the iomode ++ * is read/write fallocate should have allocated ++ * any necessary storage for the given range. 
++ */ ++ dprintk(" Extent count for RW is 0\n"); ++ goto cleanup; ++ } ++ ++ } else { ++ n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates); ++ seg->length += n->bll_len; ++ } ++ ++ if (r->blr_ext_size < (b->bll_foff + b->bll_len)) ++ r->blr_ext_size = b->bll_foff + b->bll_len; ++ } ++ ++ while (!list_empty(bl_possible)) { ++ b = list_entry(bl_possible->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ ++ b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout, ++ bll_list); ++ seg->offset = b->bll_foff; ++ dprintk("<-- %s okay\n", __func__); ++ return bl_candidates; ++ ++cleanup: ++ extents_cleanup(&fei); ++ if (bl_candidates) ++ kfree(bl_candidates); ++ dprintk("<-- %s, error occurred\n", __func__); ++ return NULL; ++} ++ ++/* ++ * layout_cache_merge -- collapse layouts which make up a contiguous range. ++ */ ++static void ++layout_cache_merge(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *p; ++ ++ dprintk("--> %s\n", __func__); ++restart: ++ p = NULL; ++ list_for_each_entry(b, h, bll_list) { ++ if (p && (BLL_S_END(p) == b->bll_soff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ /* ++ * We've got a condidate. 
++ */ ++#ifdef too_verbose ++ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), ++ _2SECTS(p->bll_foff), _2SECTS(p->bll_len), ++ _2SECTS(b->bll_soff)); ++#endif ++ ++ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE) ++ p->bll_cache_state = BLOCK_LAYOUT_UPDATE; ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else if (p && (BLL_F_END(p) == b->bll_foff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es == PNFS_BLOCK_NONE_DATA)) { ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else ++ p = b; ++ } ++ dprintk("<-- %s\n", __func__); ++} ++ ++static int ++layout_cache_update(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *c, ++ *n; ++ boolean_t status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ if (list_empty(&r->blr_layouts)) { ++ /* ---- Just add entries and return ---- */ ++ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev, ++ r->blr_inode->i_ino); ++ list_for_each_entry(b, h, bll_list) { ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, ++ &r->blr_layouts); ++ if (!c) { ++ status = -ENOMEM; ++ break; ++ } ++ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), c->bll_es); ++ } ++ return status; ++ } ++ ++ list_for_each_entry(b, h, bll_list) { ++ BUG_ON(!b->bll_vol_id.devid); ++ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) { ++ boolean_t found = False; ++ list_for_each_entry(c, &r->blr_layouts, bll_list) { ++ if ((b->bll_soff >= c->bll_soff) && ++ (b->bll_soff < BLL_S_END(c)) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ u64 u; ++ ++ if ((b->bll_foff < c->bll_foff) || ++ (b->bll_foff > BLL_F_END(c))) ++ BUG(); ++ ++ u = BLL_S_END(b) - BLL_S_END(c); ++ /* ++ * The updated cache entry has to be ++ * different than the current. ++ * Otherwise the cache state for 'b' ++ * should be BLOCK_LAYOUT_CACHE. 
++ */ ++ BUG_ON(BLL_S_END(b) < BLL_S_END(c)); ++ ++ dprintk(" " ++ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n", ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), ++ _2SECTS(c->bll_len + u)); ++ c->bll_len += u; ++ bll_collapse(r, c); ++ found = True; ++ break; ++ } ++ } ++ ++ if (found == False) { ++ dprintk(" ERROR Expected to find" ++ " %Lu(f):%Lu(l):%Lu(s), but didn't\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_for_each_entry(c, &r->blr_layouts, bll_list) ++ print_bll(c, "Cached"); ++ BUG(); ++ } ++ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ c = list_first_entry(&r->blr_layouts, ++ struct pnfs_blocklayout_layout, bll_list); ++ if (b->bll_foff < c->bll_foff) { ++ /* ++ * Special case where new entry is before ++ * first cached entry. ++ */ ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL); ++ list_add(&c->bll_list, &r->blr_layouts); ++ dprintk(" new entry at head of list at %Lu, " ++ "len %Lu\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len)); ++ } else { ++ list_for_each_entry(c, &r->blr_layouts, ++ bll_list) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, ++ bll_list); ++ /* ++ * This is ugly, but can't think of ++ * another way to examine this case. ++ * Consider the following. Need to ++ * add an entry which starts at 40 ++ * and the cache has the following ++ * entries: ++ * Start Length ++ * 10 5 ++ * 30 5 ++ * 50 5 ++ * So, need to look and see if the new ++ * entry starts after the current ++ * cache, but before the next one. ++ * There's a catch in that the next ++ * entry might not be valid as it's ++ * really just a pointer to the list ++ * head. 
++ */ ++ if (((b->bll_foff >= ++ BLL_F_END(c)) && ++ (c->bll_list.next == &r->blr_layouts)) || ++ ((b->bll_foff >= ++ BLL_F_END(c)) && ++ (b->bll_foff < n->bll_foff))) { ++ ++ n = bll_alloc_dup(b, ++ BLOCK_LAYOUT_CACHE, NULL); ++ dprintk(" adding new %Lu:%Lu" ++ " after %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len), ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len)); ++ list_add(&n->bll_list, ++ &c->bll_list); ++ break; ++ } ++ } ++ } ++ } ++ } ++ dprintk("<-- %s\n", __func__); ++ return status; ++} ++ ++static void ++layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in) ++{ ++ struct pnfs_blocklayout_layout *b, ++ *n; ++ u64 len; ++ struct nfsd4_layout_seg seg = *seg_in; ++ ++ dprintk("--> %s\n", __func__); ++ if (seg.length == NFS4_MAX_UINT64) { ++ r->blr_recalled = 0; ++ dprintk(" Fast return of all layouts\n"); ++ while (!list_empty(&r->blr_layouts)) { ++ b = list_entry(r->blr_layouts.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ dprintk(" foff %Lu, len %Lu, soff %Lu\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ dprintk("<-- %s\n", __func__); ++ return; ++ } ++ ++restart: ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg.offset == b->bll_foff) { ++ /* ++ * This handle the following three cases: ++ * (1) return layout matches entire cache layout ++ * (2) return layout matches beginning portion of cache ++ * (3) return layout matches entire cache layout and ++ * into next entry. Varies from #1 in end case. 
++ */ ++ dprintk(" match on offsets, %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length)); ++ len = MIN(seg.length, b->bll_len); ++ b->bll_foff += len; ++ b->bll_soff += len; ++ b->bll_len -= len; ++ seg.length -= len; ++ seg.offset += len; ++ if (!b->bll_len) { ++ list_del(&b->bll_list); ++ kfree(b); ++ dprintk(" removing cache line\n"); ++ if (!seg.length) { ++ dprintk(" also finished\n"); ++ goto complete; ++ } ++ /* ++ * Since 'b' was freed we can't continue at the ++ * next entry which is referenced as ++ * b->bll_list.next by the list_for_each_entry ++ * macro. Need to restart the loop. ++ * TODO: Think about creating a dummy 'b' which ++ * would keep list_for_each_entry() happy. ++ */ ++ goto restart; ++ } ++ if (!seg.length) { ++ dprintk(" finished, but cache line not" ++ "empty\n"); ++ goto complete; ++ } ++ } else if ((seg.offset >= b->bll_foff) && ++ (seg.offset < BLL_F_END(b))) { ++ /* ++ * layout being returned is within this cache line. ++ */ ++ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ BUG_ON(!seg.length); ++ if ((seg.offset + seg.length) >= BLL_F_END(b)) { ++ /* ++ * Layout returned starts in the middle of ++ * cache entry and just need to trim back ++ * cache to shorter length. ++ */ ++ dprintk(" trim back cache line\n"); ++ len = seg.offset - b->bll_foff; ++ seg.offset += b->bll_len - len; ++ seg.length -= b->bll_len - len; ++ b->bll_len = len; ++ if (!seg.length) ++ return; ++ } else { ++ /* ++ * Need to split current cache layout because ++ * chunk is being removed from the middle. 
++ */ ++ dprintk(" split cache line\n"); ++ len = seg.offset + seg.length; ++ n = bll_alloc(len, ++ (b->bll_foff + b->bll_len) - len, ++ BLOCK_LAYOUT_CACHE, NULL); ++ n->bll_soff = b->bll_soff + len; ++ list_add(&n->bll_list, &b->bll_list); ++ b->bll_len = seg.offset - b->bll_foff; ++ return; ++ } ++ } ++ } ++complete: ++ if (list_empty(&r->blr_layouts)) ++ r->blr_recalled = 0; ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++ * layout_cache_fill_from_list -- fills from cache list ++ * ++ * NOTE: This routine was only seperated out from layout_cache_file_from() ++ * to reduce the indentation level which makes the code easier to read. ++ */ ++static inline boolean_t ++layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n; ++ enum pnfs_block_extent_state4 s; ++ ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg->offset < b->bll_foff) { ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, b->bll_foff - seg->offset), ++ BLOCK_LAYOUT_NEW, NULL); ++ if (!n) ++ return False; ++ ++ list_add(&n->bll_list, h->prev); ++ dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ ++ if ((seg->offset >= b->bll_foff) && ++ (seg->offset < BLL_F_END(b))) { ++ if (layout_conflict(b, seg->iomode, &s) == False) { ++ dprintk(" CONFLICT FOUND: " ++ "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), b->bll_es, ++ seg->iomode); ++ return False; ++ } ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, BLL_F_END(b) - seg->offset), ++ BLOCK_LAYOUT_CACHE, h); ++ dprintk(" CACHE hit: Found %Lu(f):%Lu(l): " ++ "in %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ 
_2SECTS(b->bll_soff), b->bll_es); ++ if (!n) ++ return False; ++ ++ n->bll_soff = b->bll_soff + seg->offset - b->bll_foff; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ n->bll_es = s; ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ } ++ return True; ++} ++ ++static u64 ++bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length, ++ dev_t dev) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates); ++ if (!n) ++ return 0; ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ ++ return n->bll_len; ++} ++ ++static void ++extents_setup(struct fiemap_extent_info *fei) ++{ ++ fei->fi_extents_start = NULL; ++} ++ ++/* ++ * extents_count -- Determine the number of extents for a given range. ++ * ++ * No need to call set_fs() here because the function ++ * doesn't use copy_to_user() if it's only counting ++ * the number of extents needed. ++ */ ++static void ++extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len)); ++ fei->fi_flags = FIEMAP_FLAG_SYNC; ++ fei->fi_extents_max = 0; ++ fei->fi_extents_start = NULL; ++ fei->fi_extents_mapped = 0; ++ i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1); ++} ++ ++/* ++ * extents_get -- Get list of extents for range ++ * ++ * extents_count() must have been called before this routine such that ++ * fi_extents_mapped is known. ++ */ ++static boolean_t ++extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ int m_space, ++ rval; ++ struct fiemap_extent *fe; ++ mm_segment_t old_fs = get_fs(); ++ ++ /* ++ * Now malloc the correct amount of space ++ * needed. It's possible for the file to have changed ++ * between calls which would require more space for ++ * the extents. 
If that occurs the last extent will ++ * not have FIEMAP_EXTENT_LAST set and the error will ++ * be caught in extents_process(). ++ */ ++ m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent); ++ fe = kmalloc(m_space, GFP_KERNEL); ++ if (!fe) ++ return False; ++ memset(fe, 0, m_space); ++ ++ fei->fi_extents_max = fei->fi_extents_mapped; ++ fei->fi_extents_mapped = 0; ++ fei->fi_extents_start = fe; ++ ++ set_fs(KERNEL_DS); ++ rval = i->i_op->fiemap(i, fei, foff, len + ++ (1 << i->i_sb->s_blocksize_bits) - 1); ++ set_fs(old_fs); ++ ++ if (rval || !fei->fi_extents_mapped) { ++ dprintk(" No extents. Wanted %d, got %d\n", ++ fei->fi_extents_max, fei->fi_extents_mapped); ++ kfree(fe); ++ fei->fi_extents_start = NULL; ++ return False; ++ } else ++ return True; ++} ++ ++/* ++ * extents_process -- runs through the extent returned from the file system and ++ * creates block layout entries. ++ */ ++static boolean_t ++extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates, ++ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b) ++{ ++ struct fiemap_extent *fep, ++ *fep_last = NULL; ++ int i; ++ pnfs_blocklayout_layout_t *n; ++ u64 last_end, ++ rval; ++ ++ dprintk("--> %s\n", __func__); ++ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped; ++ i++, fep++) { ++ ++ BUG_ON(!fep->fe_physical); ++ /* ++ * Deal with corner cases of hoel-y files. ++ */ ++ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) != ++ fep->fe_logical)) { ++ ++ /* ++ * If the last extent doesn't end logically ++ * at the beginning of the current we've got ++ * hole and need to create a pNFS extent. 
++			 */
++			dprintk(" Got a hole at %Ld:%Ld \n",
++			    _2SECTS(fep_last->fe_logical),
++			    _2SECTS(fep_last->fe_length));
++			last_end = fep_last->fe_logical + fep_last->fe_length;
++			rval = bll_alloc_holey(bl_candidates, last_end,
++			    fep->fe_logical - last_end, dev);
++			if (!rval)
++				return False;
++			seg->length += rval;
++		}
++
++		n = bll_alloc(fep->fe_logical, fep->fe_length,
++		    BLOCK_LAYOUT_NEW, bl_candidates);
++		if (unlikely(n == NULL)) {
++			dprintk("%s: bll_alloc failed\n", __func__);
++			return False;
++		}
++
++		n->bll_soff = fep->fe_physical;
++		n->bll_es = seg->iomode == IOMODE_READ ?
++		    PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA;
++		n->bll_vol_id.sbid = 0;
++		n->bll_vol_id.devid = dev;
++		seg->length += fep->fe_length;
++		print_bll(n, "New extent");
++		fep_last = fep;
++	}
++	dprintk("<-- %s (i=%d)\n", __func__, i);
++
++	return True;
++}
++
++/*
++ * extents_cleanup -- release the extent array attached to 'fei', if any.
++ */
++static void
++extents_cleanup(struct fiemap_extent_info *fei)
++{
++	kfree(fei->fi_extents_start);
++	fei->fi_extents_start = NULL;
++}
++
++/*
++ * device_slice -- check to see if device is a slice or DM
++ *
++ * Returns True when the device can be opened and its disk has more than
++ * one minor, False otherwise (including when the open fails).
++ */
++static boolean_t
++device_slice(dev_t devid)
++{
++	struct block_device *bd = open_by_devnum(devid, FMODE_READ);
++	boolean_t rval = False;
++
++	/*
++	 * open_by_devnum() reports failure with an ERR_PTR() value rather
++	 * than NULL, so a plain NULL test would dereference an error code.
++	 */
++	if (!bd || IS_ERR(bd))
++		return False;
++
++	if (bd->bd_disk->minors > 1)
++		rval = True;
++	blkdev_put(bd, FMODE_READ);
++
++	return rval;
++}
++
++/*
++ * device_dm -- check to see if device is a Device Mapper volume.
++ *
++ * Returns True for DM or False if not (or if the upcall fails)
++ */
++static boolean_t
++device_dm(dev_t devid)
++{
++	boolean_t rval = False;
++	bl_comm_msg_t msg;
++	/* ---- NULL so the kfree() below is safe if the upcall fails ---- */
++	bl_comm_res_t *res = NULL;
++
++	msg.msg_type = PNFS_UPCALL_MSG_DMCHK;
++	msg.u.msg_dev = devid;
++	if (bl_upcall(bl_comm_global, &msg, &res)) {
++		dprintk("Failed upcall to check on DM status\n");
++	} else if (res->u.dm_vol) {
++		rval = True;
++		dprintk("Device is DM volume\n");
++	} else
++		dprintk("Device is not DM volume\n");
++	kfree(res);
++
++	return rval;
++}
++
++/*
++ * layout_inode_add -- create the layout record for an inode
++ *
++ * Fails if the file system lacks the fiemap or fallocate methods or if
++ * memory can't be allocated. On success the new record is appended to
++ * layout_hash, stored through 'p' and True is returned.
++ */
++static boolean_t
++layout_inode_add(struct inode *i, bl_layout_rec_t **p)
++{
++	bl_layout_rec_t *r;
++
++	if (!i->i_op->fiemap || !i->i_op->fallocate) {
++		printk("pNFS: file system doesn't support required fiemap or "
++		    "fallocate methods\n");
++		return False;
++	}
++
++	r = kmalloc(sizeof (*r), GFP_KERNEL);
++	if (!r)
++		return False;
++
++	r->blr_rdev = i->i_sb->s_dev;
++	r->blr_inode = i;
++	r->blr_orig_size = i->i_size;
++	r->blr_ext_size = 0;
++	r->blr_recalled = 0;
++	INIT_LIST_HEAD(&r->blr_layouts);
++	spin_lock_init(&r->blr_lock);
++	spin_lock(&layout_hashtbl_lock);
++	list_add_tail(&r->blr_hash, &layout_hash);
++	spin_unlock(&layout_hashtbl_lock);
++	*p = r;
++	return True;
++}
++
++static bl_layout_rec_t *
++__layout_inode_find(struct inode *i)
++{
++	bl_layout_rec_t *r;
++
++	if (!list_empty(&layout_hash)) {
++		list_for_each_entry(r, &layout_hash, blr_hash) {
++			if ((r->blr_inode->i_ino == i->i_ino) &&
++			    (r->blr_rdev == i->i_sb->s_dev)) {
++				return r;
++			}
++		}
++	}
++	return NULL;
++}
++
++static bl_layout_rec_t *
++layout_inode_find(struct inode *i)
++{
++	bl_layout_rec_t *r;
++
++	spin_lock(&layout_hashtbl_lock);
++	r = __layout_inode_find(i);
++	spin_unlock(&layout_hashtbl_lock);
++
++	return r;
++}
++
++static void
++layout_inode_del(struct inode *i)
++{
++	bl_layout_rec_t *r;
++
++	spin_lock(&layout_hashtbl_lock);
++	r = __layout_inode_find(i);
++	if (r) {
++		spin_lock(&r->blr_lock);
++ if (list_empty(&r->blr_layouts)) { ++ list_del(&r->blr_hash); ++ spin_unlock(&r->blr_lock); ++ kfree(r); ++ } else { ++ spin_unlock(&r->blr_lock); ++ } ++ } else { ++ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n", ++ __func__, i->i_sb->s_dev, i->i_ino); ++ } ++ spin_unlock(&layout_hashtbl_lock); ++} ++ ++/* ++ * map_state2name -- converts state in ascii string. ++ * ++ * Used for debug messages only. ++ */ ++static char * ++map_state2name(enum pnfs_block_extent_state4 s) ++{ ++ switch (s) { ++ case PNFS_BLOCK_READWRITE_DATA: return " RW"; ++ case PNFS_BLOCK_READ_DATA: return " RO"; ++ case PNFS_BLOCK_INVALID_DATA: return "INVALID"; ++ case PNFS_BLOCK_NONE_DATA: return " NONE"; ++ default: ++ BUG(); ++ } ++} ++ ++static pnfs_blocklayout_devinfo_t * ++bld_alloc(struct list_head *volumes, int type) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ return NULL; ++ ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = type; ++ list_add_tail(&bld->bld_list, volumes); ++ ++ return bld; ++} ++ ++static void ++bld_free(pnfs_blocklayout_devinfo_t *bld) ++{ ++ list_del(&bld->bld_list); ++ kfree(bld); ++} ++ ++static void ++print_bll(pnfs_blocklayout_layout_t *b, char *text) ++{ ++ dprintk(" BLL: %s\n", text); ++ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len), ++ map_state2name(b->bll_es)); ++} ++ ++static inline void ++bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c) ++{ ++ pnfs_blocklayout_layout_t *n; ++ int dbg_count = 0; ++ u64 endpoint; ++ ++ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA); ++ while (c->bll_list.next != &r->blr_layouts) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ endpoint = BLL_S_END(c); ++ if ((n->bll_soff >= c->bll_soff) && ++ (n->bll_soff < endpoint)) { ++ if (endpoint < BLL_S_END(n)) { ++ /* ++ * The following is possible. 
++ * ++ * ++ * Existing: +---+ +---+ ++ * New: +-----------------------+ ++ * The client request merge entries together ++ * but didn't require picking up all of the ++ * last entry. So, we still need to delete ++ * the last entry and add the remaining space ++ * to the new entry. ++ */ ++ c->bll_len += BLL_S_END(n) - endpoint; ++ } ++ dbg_count++; ++ list_del(&n->bll_list); ++ kfree(n); ++ } else { ++ break; ++ } ++ } ++ /* ---- Debug only, remove before integration ---- */ ++ if (dbg_count) ++ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n", ++ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c))); ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = kmalloc(sizeof (*n), GFP_KERNEL); ++ if (n) { ++ memset(n, 0, sizeof (*n)); ++ n->bll_foff = offset; ++ n->bll_len = len; ++ n->bll_cache_state = state; ++ if (h) ++ list_add_tail(&n->bll_list, h); ++ } ++ return n; ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c, ++ struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = bll_alloc(b->bll_foff, b->bll_len, c, h); ++ if (n) { ++ n->bll_es = b->bll_es; ++ n->bll_soff = b->bll_soff; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ } ++ return n; ++} ++ ++static inline boolean_t ++layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s) ++{ ++ /* ---- Normal case ---- */ ++ *s = b->bll_es; ++ ++ switch (b->bll_es) { ++ case PNFS_BLOCK_READWRITE_DATA: ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_READ_DATA; ++ /* ---- Any use is permitted. ---- */ ++ break; ++ case PNFS_BLOCK_READ_DATA: ++ /* ---- Committed as read only data. 
---- */ ++ if (iomode == IOMODE_RW) ++ return False; ++ break; ++ case PNFS_BLOCK_INVALID_DATA: ++ /* ---- Blocks have been allocated, but not initialized ---- */ ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_NONE_DATA; ++ break; ++ case PNFS_BLOCK_NONE_DATA: ++ /* ---- Hole-y file. No backing store avail. ---- */ ++ if (iomode != IOMODE_READ) ++ return False; ++ break; ++ default: ++ BUG(); ++ } ++ return True; ++} ++ ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-23 12:08:29.037481540 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-23 12:09:03.300491952 -0400 +@@ -104,7 +104,8 @@ again: + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; +- if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) ++ if (memcmp(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)) != 0) + continue; + get_nfs_open_context(ctx); + spin_unlock(&inode->i_lock); +@@ -133,8 +134,8 @@ void nfs_inode_reclaim_delegation(struct + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; +@@ -187,8 +188,9 @@ static struct nfs_delegation *nfs_detach + if (delegation == NULL) + goto nomatch; + spin_lock(&delegation->lock); +- if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (stateid != NULL && memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data)) != 0) + goto nomatch_unlock; + list_del_rcu(&delegation->super_list); + 
delegation->inode = NULL; +@@ -216,8 +218,8 @@ int nfs_inode_set_delegation(struct inod + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + delegation->change_attr = nfsi->change_attr; +@@ -471,9 +473,7 @@ void nfs_expire_unreferenced_delegations + /* + * Asynchronous delegation recall! + */ +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)) ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) + { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; +@@ -481,7 +481,7 @@ int nfs_async_inode_return_delegation(st + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + +- if (!validate_stateid(delegation, stateid)) { ++ if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { + rcu_read_unlock(); + return -ENOENT; + } +@@ -562,7 +562,8 @@ int nfs4_copy_delegation_stateid(nfs4_st + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) { +- memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); ++ memcpy(dst->u.data, delegation->stateid.u.data, ++ sizeof(dst->u.data)); + ret = 1; + } + rcu_read_unlock(); +diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h +--- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-23 12:09:03.301431797 -0400 +@@ -34,9 +34,7 @@ enum { + int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct 
nfs_openres *res); + void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); + int nfs_inode_return_delegation(struct inode *inode); +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)); ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); + void nfs_inode_return_delegation_noreclaim(struct inode *inode); + + struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); +diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-23 12:08:29.089481525 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 12:09:03.302511603 -0400 +@@ -17,11 +17,19 @@ + #include + #include + ++#include ++#if defined(CONFIG_SPNFS) ++#include ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif ++#endif + #include + #include + + #include "nfsd.h" + #include "nfsfh.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_EXPORT + +@@ -352,6 +360,40 @@ static int svc_export_upcall(struct cach + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); + } + ++#if defined(CONFIG_PNFSD) ++static struct pnfsd_cb_operations pnfsd_cb_op = { ++ .cb_layout_recall = nfsd_layout_recall_cb, ++ .cb_device_notify = nfsd_device_notify_cb, ++ ++ .cb_get_state = nfs4_pnfs_cb_get_state, ++ .cb_change_state = nfs4_pnfs_cb_change_state, ++}; ++ ++#if defined(CONFIG_SPNFS) ++static struct pnfs_export_operations spnfs_export_ops = { ++ .layout_type = spnfs_layout_type, ++ .get_device_info = spnfs_getdeviceinfo, ++ .get_device_iter = spnfs_getdeviceiter, ++ .layout_get = spnfs_layoutget, ++ .layout_return = spnfs_layoutreturn, ++}; ++ ++static struct pnfs_export_operations spnfs_ds_export_ops = { ++ .get_state = spnfs_get_state, ++}; ++ ++#if 
defined(CONFIG_SPNFS_BLOCK) ++static struct pnfs_export_operations bl_export_ops = { ++ .layout_type = bl_layout_type, ++ .get_device_info = bl_getdeviceinfo, ++ .get_device_iter = bl_getdeviceiter, ++ .layout_get = bl_layoutget, ++ .layout_return = bl_layoutreturn, ++}; ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_SPNFS */ ++#endif /* CONFIG_PNFSD */ ++ + static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); + static struct svc_export *svc_export_lookup(struct svc_export *); +@@ -395,6 +437,47 @@ static int check_export(struct inode *in + return -EINVAL; + } + ++#if !defined(CONFIG_SPNFS) ++ if (inode->i_sb->s_pnfs_op && ++ (!inode->i_sb->s_pnfs_op->layout_type || ++ !inode->i_sb->s_pnfs_op->get_device_info || ++ !inode->i_sb->s_pnfs_op->layout_get)) { ++ dprintk("exp_export: export of invalid fs pnfs export ops.\n"); ++ return -EINVAL; ++ } ++#endif /* CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ if (!inode->i_sb->s_pnfs_op) ++ pnfsd_lexp_init(inode); ++ return 0; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, *flags)) { ++ dprintk("set pnfs block export structure... \n"); ++ inode->i_sb->s_pnfs_op = &bl_export_ops; ++ } else ++#endif /* CONFIG_SPNFS_BLOCK */ ++ /* ++ * spnfs_enabled() indicates we're an MDS. ++ * XXX Better to check an export time option as well. ++ */ ++ if (spnfs_enabled()) { ++ dprintk("set spnfs export structure...\n"); ++ inode->i_sb->s_pnfs_op = &spnfs_export_ops; ++ } else { ++ dprintk("%s spnfs not in use\n", __func__); ++ ++ /* ++ * get_state is needed if we're a DS using spnfs. ++ * XXX Better to check an export time option instead. 
++ */ ++ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops; ++ } ++#endif /* CONFIG_SPNFS */ ++ + return 0; + + } +@@ -586,6 +669,8 @@ static int svc_export_parse(struct cache + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } ++ } else if (strcmp(buf, "pnfs") == 0) { ++ exp.ex_pnfs = 1; + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else +@@ -660,6 +745,8 @@ static int svc_export_show(struct seq_fi + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } ++ if (exp->ex_pnfs) ++ seq_puts(m, ",pnfs"); + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); +@@ -687,6 +774,7 @@ static void svc_export_init(struct cache + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; ++ new->ex_pnfs = 0; + } + + static void export_update(struct cache_head *cnew, struct cache_head *citem) +@@ -699,6 +787,7 @@ static void export_update(struct cache_h + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; ++ new->ex_pnfs = item->ex_pnfs; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_pathname = item->ex_pathname; +@@ -1635,8 +1724,17 @@ nfsd_export_init(void) + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); +- if (rv) ++ if (rv) { + cache_unregister(&svc_export_cache); ++ goto out; ++ } ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = THIS_MODULE; ++ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ ++out: + return rv; + + } +@@ -1664,6 +1762,12 @@ nfsd_export_shutdown(void) + + exp_writelock(); + ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = NULL; ++ pnfsd_cb_ctl.cb_op = NULL; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); + svcauth_unix_purge(); +diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig 
linux-2.6.34.noarch/fs/nfs/direct.c +--- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-23 12:09:03.303491500 -0400 +@@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea + .rpc_release = nfs_direct_read_release, + }; + ++static long nfs_direct_read_execute(struct nfs_read_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ nfs_fattr_init(&data->fattr); ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ NFS_PROTO(inode)->read_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct read call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. 
If nfs_readdata_alloc() or get_user_pages() fails, +@@ -283,7 +315,6 @@ static ssize_t nfs_direct_read_schedule_ + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -343,26 +374,9 @@ static ssize_t nfs_direct_read_schedule_ + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; +- nfs_fattr_init(&data->fattr); +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct read call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_read_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +@@ -448,12 +462,15 @@ static void nfs_direct_free_writedata(st + } + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg); ++ + static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) + { + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; +@@ -487,25 +504,7 @@ static void nfs_direct_write_reschedule( + * Reuse data->task; data->args should not have changed + * since the original request was sent. 
+ */ +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- /* +- * We're called via an RPC callback, so BKL is already held. +- */ +- task = rpc_run_task(&task_setup_data); +- if (!IS_ERR(task)) +- rpc_put_task(task); +- +- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- data->args.count, +- (unsigned long long)data->args.offset); ++ nfs_direct_write_execute(data, &task_setup_data, &msg); + } + + if (put_dreq(dreq)) +@@ -548,10 +547,31 @@ static const struct rpc_call_ops nfs_com + .rpc_release = nfs_direct_commit_release, + }; + ++static long nfs_direct_commit_execute(struct nfs_direct_req *dreq, ++ struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct rpc_task *task; ++ ++ NFS_PROTO(data->inode)->commit_setup(data, msg); ++ ++ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ ++ dreq->commit_data = NULL; ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ return 0; ++} ++ + static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) + { + struct nfs_write_data *data = dreq->commit_data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +@@ -579,16 +599,7 @@ static void nfs_direct_commit_schedule(s + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- NFS_PROTO(data->inode)->commit_setup(data, &msg); +- +- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ +- dreq->commit_data = NULL; +- +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +- +- task = rpc_run_task(&task_setup_data); +- if 
(!IS_ERR(task)) +- rpc_put_task(task); ++ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg); + } + + static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +@@ -690,6 +701,36 @@ static const struct rpc_call_ops nfs_wri + .rpc_release = nfs_direct_write_release, + }; + ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ NFS_PROTO(inode)->write_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. 
If nfs_writedata_alloc() or get_user_pages() fails, +@@ -705,7 +746,6 @@ static ssize_t nfs_direct_write_schedule + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -771,24 +811,8 @@ static ssize_t nfs_direct_write_schedule + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct write call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_write_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig +--- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-23 12:09:03.304505472 -0400 +@@ -79,3 +79,52 @@ config NFSD_V4 + available from http://linux-nfs.org/. + + If unsure, say N. ++ ++config PNFSD ++ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)" ++ depends on NFSD_V4 && EXPERIMENTAL ++ select EXPORTFS_FILE_LAYOUT ++ help ++ This option enables support for the parallel NFS features of the ++ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) ++ in the kernel's NFS server. ++ ++ Unless you're an NFS developer, say N. 
++ ++config PNFSD_LOCAL_EXPORT ++ bool "Enable pNFS support for exporting local filesystems for debugging purposes" ++ depends on PNFSD ++ help ++ Say Y here if you want your pNFS server to export local file systems ++ over the files layout type. With this option the MDS (metadata ++ server) functions also as a single DS (data server). This is mostly ++ useful for development and debugging purposes. ++ ++ If unsure, say N. ++ ++config SPNFS ++ bool "Provide spNFS server support (EXPERIMENTAL)" ++ depends on PNFSD ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS server support. ++ ++ If unsure, say N. ++ ++config SPNFS_LAYOUTSEGMENTS ++ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)" ++ depends on SPNFS ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS to be able to return layout segments. ++ ++ If unsure, say N. ++ ++config SPNFS_BLOCK ++ bool "Provide Block Layout server support (EXPERIMENTAL)" ++ depends on SPNFS ++ select EXPORTFS_BLOCK_LAYOUT ++ help ++ Say Y here if you want spNFS block layout support ++ ++ If unsure, say N. 
+diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile +--- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-23 12:09:03.304505472 -0400 +@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs + nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o + nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o ++nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o ++nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o ++nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o ++nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-23 12:08:29.090501507 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 12:09:03.306491345 -0400 +@@ -40,7 +40,6 @@ + + #define NFSPROC4_CB_NULL 0 + #define NFSPROC4_CB_COMPOUND 1 +-#define NFS4_STATEID_SIZE 16 + + /* Index of predefined Linux callback client operations */ + +@@ -48,11 +47,17 @@ enum { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, ++#if defined(CONFIG_PNFSD) ++ NFSPROC4_CLNT_CB_LAYOUT, ++ NFSPROC4_CLNT_CB_DEVICE, ++#endif + }; + + enum nfs_cb_opnum4 { + OP_CB_RECALL = 4, ++ OP_CB_LAYOUT = 5, + OP_CB_SEQUENCE = 11, ++ OP_CB_DEVICE = 14, + }; + + #define NFS4_MAXTAGLEN 20 +@@ -78,6 +83,19 @@ enum nfs_cb_opnum4 { + #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) ++#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 3 + \ ++ enc_nfs4_fh_sz + 4) ++#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) ++#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 6) ++#define NFS4_dec_cb_device_sz 
(cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) + + /* + * Generic encode routines from fs/nfs/nfs4xdr.c +@@ -94,6 +112,10 @@ xdr_writemem(__be32 *p, const void *ptr, + } + + #define WRITE32(n) *p++ = htonl(n) ++#define WRITE64(n) do { \ ++ *p++ = htonl((u32)((n) >> 32)); \ ++ *p++ = htonl((u32)(n)); \ ++} while (0) + #define WRITEMEM(ptr,nbytes) do { \ + p = xdr_writemem(p, ptr, nbytes); \ + } while (0) +@@ -204,6 +226,16 @@ nfs_cb_stat_to_errno(int stat) + */ + + static void ++encode_stateid(struct xdr_stream *xdr, stateid_t *sid) ++{ ++ __be32 *p; ++ ++ RESERVE_SPACE(sizeof(stateid_t)); ++ WRITE32(sid->si_generation); ++ WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); ++} ++ ++static void + encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) + { + __be32 * p; +@@ -228,10 +260,10 @@ encode_cb_recall(struct xdr_stream *xdr, + __be32 *p; + int len = dp->dl_fh.fh_size; + +- RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); ++ RESERVE_SPACE(4); + WRITE32(OP_CB_RECALL); +- WRITE32(dp->dl_stateid.si_generation); +- WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); ++ encode_stateid(xdr, &dp->dl_stateid); ++ RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); + WRITE32(0); /* truncate optimization not implemented */ + WRITE32(len); + WRITEMEM(&dp->dl_fh.fh_base, len); +@@ -259,6 +291,111 @@ encode_cb_sequence(struct xdr_stream *xd + hdr->nops++; + } + ++#if defined(CONFIG_PNFSD) ++ ++#include "pnfsd.h" ++ ++static void ++encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(20); ++ WRITE32(OP_CB_LAYOUT); ++ WRITE32(clr->cb.cbl_seg.layout_type); ++ WRITE32(clr->cb.cbl_seg.iomode); ++ WRITE32(clr->cb.cbl_layoutchanged); ++ WRITE32(clr->cb.cbl_recall_type); ++ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) { ++ struct nfs4_fsid fsid = clr->cb.cbl_fsid; ++ ++ RESERVE_SPACE(16); ++ 
WRITE64(fsid.major); ++ WRITE64(fsid.minor); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "fsid 0x%llx-0x%llx\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, fsid.major, fsid.minor); ++ } else if (clr->cb.cbl_recall_type == RETURN_FILE) { ++ int len = clr->clr_file->fi_fhlen; ++ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid; ++ ++ RESERVE_SPACE(20 + len); ++ WRITE32(len); ++ WRITEMEM(clr->clr_file->fi_fhval, len); ++ WRITE64(clr->cb.cbl_seg.offset); ++ WRITE64(clr->cb.cbl_seg.length); ++ encode_stateid(xdr, cbl_sid); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "offset %lld length %lld stateid " STATEID_FMT "\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, ++ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length, ++ STATEID_VAL(cbl_sid)); ++ } else { ++ dprintk("%s: type %x iomode %d changed %d recall_type %d\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type); ++ } ++ hdr->nops++; ++} ++ ++static void ++encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ int i; ++ int len = nd->nd_list->cbd_len; ++ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list; ++ ++ dprintk("NFSD %s: --> num %d\n", __func__, len); ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(8); ++ WRITE32(OP_CB_DEVICE); ++ ++ /* notify4 cnda_changes<>; */ ++ WRITE32(len); ++ for (i = 0; i < len; i++) { ++ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n", ++ __func__, cbd[i].cbd_notify_type, ++ cbd[i].cbd_layout_type, ++ cbd[i].cbd_devid.sbid, ++ cbd[i].cbd_devid.devid, ++ cbd[i].cbd_immediate, i); ++ ++ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE); ++ 
RESERVE_SPACE(32); ++ /* bitmap4 notify_mask; */ ++ WRITE32(1); ++ WRITE32(cbd[i].cbd_notify_type); ++ /* opaque notify_vals<>; */ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) ++ WRITE32(24); ++ else ++ WRITE32(20); ++ WRITE32(cbd[i].cbd_layout_type); ++ WRITE64(cbd[i].cbd_devid.sbid); ++ WRITE64(cbd[i].cbd_devid.devid); ++ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { ++ RESERVE_SPACE(4); ++ WRITE32(cbd[i].cbd_immediate); ++ } ++ } ++ hdr->nops++; ++} ++#endif /* CONFIG_PNFSD */ ++ + static int + nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) + { +@@ -288,6 +425,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst * + return 0; + } + ++#if defined(CONFIG_PNFSD) ++static int ++nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_layoutrecall *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_layout(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++ ++static int ++nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_notify_device *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_device(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++#endif /* CONFIG_PNFSD */ + + static int + decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ +@@ -403,6 +579,48 @@ out: + return status; + } + ++#if defined(CONFIG_PNFSD) ++static int 
++nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT); ++out: ++ return status; ++} ++ ++static int ++nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE); ++out: ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * RPC procedure tables + */ +@@ -420,6 +638,10 @@ out: + static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), + PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), ++#if defined(CONFIG_PNFSD) ++ PROC(CB_LAYOUT, COMPOUND, enc_cb_layout, dec_cb_layout), ++ PROC(CB_DEVICE, COMPOUND, enc_cb_device, dec_cb_device), ++#endif + }; + + static struct rpc_version nfs_cb_version4 = { +@@ -606,10 +828,9 @@ out: + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. 
+ */ +-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_prepare_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) + { +- struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; + struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; + u32 minorversion = clp->cl_cb_conn.cb_minorversion; + int status = 0; +@@ -629,11 +850,15 @@ static void nfsd4_cb_prepare(struct rpc_ + rpc_call_start(task); + } + +-static void nfsd4_cb_done(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata) + { + struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; ++ nfsd4_cb_prepare_sequence(task, dp->dl_client); ++} + ++static void nfsd4_cb_done_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) ++{ + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_cb_conn.cb_minorversion); + +@@ -657,7 +882,7 @@ static void nfsd4_cb_recall_done(struct + struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + +- nfsd4_cb_done(task, calldata); ++ nfsd4_cb_done_sequence(task, clp); + + if (current_rpc_client == NULL) { + /* We're shutting down; give up. 
*/ +@@ -688,7 +913,7 @@ static void nfsd4_cb_recall_done(struct + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); + task->tk_status = 0; +- rpc_restart_call(task); ++ rpc_restart_call_prepare(task); + return; + } else { + atomic_set(&clp->cl_cb_set, 0); +@@ -704,7 +929,7 @@ static void nfsd4_cb_recall_release(void + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +- .rpc_call_prepare = nfsd4_cb_prepare, ++ .rpc_call_prepare = nfsd4_cb_recall_prepare, + .rpc_call_done = nfsd4_cb_recall_done, + .rpc_release = nfsd4_cb_recall_release, + }; +@@ -781,3 +1006,173 @@ void nfsd4_cb_recall(struct nfs4_delegat + { + queue_work(callback_wq, &dp->dl_recall.cb_work); + } ++ ++#if defined(CONFIG_PNFSD) ++static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ nfsd4_cb_prepare_sequence(task, clr->clr_client); ++} ++ ++static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ struct nfs4_client *clp = clr->clr_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ if (!task->tk_status) ++ return; ++ ++ printk("%s: clp %p cb_client %p fp %p failed with status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ clr->clr_file, ++ task->tk_status); ++ ++ switch (task->tk_status) { ++ case -EIO: ++ /* Network partition? 
*/ ++ atomic_set(&clp->cl_cb_set, 0); ++ warn_no_callback_path(clp, task->tk_status); ++ /* FIXME: ++ * The pnfs standard states that we need to only expire ++ * the client after at least "lease time", e.g. lease-time * 2, ++ * when failing to communicate a recall ++ */ ++ break; ++ case -NFS4ERR_DELAY: ++ /* Poll the client until it's done with the layout */ ++ rpc_delay(task, HZ/100); /* 10 milliseconds */ ++ task->tk_status = 0; ++ rpc_restart_call_prepare(task); ++ break; ++ case -NFS4ERR_NOMATCHING_LAYOUT: ++ task->tk_status = 0; ++ nomatching_layout(clr); ++ } ++} ++ ++static void nfsd4_cb_layout_release(void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ kfree(clr->clr_args); ++ clr->clr_args = NULL; ++ put_layoutrecall(clr); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_layout_ops = { ++ .rpc_call_prepare = nfsd4_cb_layout_prepare, ++ .rpc_call_done = nfsd4_cb_layout_done, ++ .rpc_release = nfsd4_cb_layout_release, ++}; ++ ++/* ++ * Called with state lock. ++ */ ++int ++nfsd4_cb_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfs4_client *clp = clr->clr_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT], ++ .rpc_cred = callback_cred ++ }; ++ int status; ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ clr->clr_args = args; ++ args->args_op = clr; ++ msg.rpc_argp = args; ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_layout_ops, clr); ++out: ++ if (status) { ++ kfree(args); ++ put_layoutrecall(clr); ++ } ++ dprintk("NFSD: nfsd4_cb_layout: status %d\n", status); ++ return status; ++} ++ ++static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ nfsd4_cb_prepare_sequence(task, cbnd->nd_client); ++} ++ ++static void nfsd4_cb_device_done(struct rpc_task *task, void
*calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ struct nfs4_client *clp = cbnd->nd_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ dprintk("%s: clp %p cb_client %p: status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ task->tk_status); ++ ++ if (task->tk_status == -EIO) { ++ /* Network partition? */ ++ atomic_set(&clp->cl_cb_set, 0); ++ warn_no_callback_path(clp, task->tk_status); ++ } ++} ++ ++static void nfsd4_cb_device_release(void *calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ kfree(cbnd->nd_args); ++ cbnd->nd_args = NULL; ++ kfree(cbnd); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_device_ops = { ++ .rpc_call_prepare = nfsd4_cb_device_prepare, ++ .rpc_call_done = nfsd4_cb_device_done, ++ .rpc_release = nfsd4_cb_device_release, ++}; ++ ++/* ++ * Called with state lock. ++ */ ++int ++nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd) ++{ ++ struct nfs4_client *clp = cbnd->nd_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE], ++ .rpc_cred = callback_cred ++ }; ++ int status = -EIO; ++ ++ dprintk("%s: clp %p\n", __func__, clp); ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ args->args_op = cbnd; ++ msg.rpc_argp = args; ++ ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_device_ops, cbnd); ++out: ++ if (status) ++ kfree(args); ++ dprintk("%s: status %d\n", __func__, status); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-23 12:09:03.307491492 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-23 12:09:03.308491262 -0400 +@@ -0,0 +1,1679 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network 
Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ *****************************************************************************/ ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Globals */ ++static u32 current_layoutid = 1; ++ ++/* ++ * Currently used for manipulating the layout state. 
++ */ ++static DEFINE_SPINLOCK(layout_lock); ++ ++#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP) ++# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock)) ++#else ++# define BUG_ON_UNLOCKED_LAYOUT() ++#endif ++ ++/* ++ * Layout state - NFSv4.1 pNFS ++ */ ++static struct kmem_cache *pnfs_layout_slab; ++static struct kmem_cache *pnfs_layoutrecall_slab; ++ ++/* hash table for nfsd4_pnfs_deviceid.sbid */ ++#define SBID_HASH_BITS 8 ++#define SBID_HASH_SIZE (1 << SBID_HASH_BITS) ++#define SBID_HASH_MASK (SBID_HASH_SIZE - 1) ++ ++struct sbid_tracker { ++ u64 id; ++ struct super_block *sb; ++ struct list_head hash; ++}; ++ ++static u64 current_sbid; ++static struct list_head sbid_hashtbl[SBID_HASH_SIZE]; ++ ++static inline unsigned long ++sbid_hashval(struct super_block *sb) ++{ ++ return hash_ptr(sb, SBID_HASH_BITS); ++} ++ ++static inline struct sbid_tracker * ++alloc_sbid(void) ++{ ++ return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL); ++} ++ ++static void ++destroy_sbid(struct sbid_tracker *sbid) ++{ ++ spin_lock(&layout_lock); ++ list_del(&sbid->hash); ++ spin_unlock(&layout_lock); ++ kfree(sbid); ++} ++ ++void ++nfsd4_free_pnfs_slabs(void) ++{ ++ int i; ++ struct sbid_tracker *sbid; ++ ++ nfsd4_free_slab(&pnfs_layout_slab); ++ nfsd4_free_slab(&pnfs_layoutrecall_slab); ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ while (!list_empty(&sbid_hashtbl[i])) { ++ sbid = list_first_entry(&sbid_hashtbl[i], ++ struct sbid_tracker, ++ hash); ++ destroy_sbid(sbid); ++ } ++ } ++} ++ ++int ++nfsd4_init_pnfs_slabs(void) ++{ ++ int i; ++ ++ pnfs_layout_slab = kmem_cache_create("pnfs_layouts", ++ sizeof(struct nfs4_layout), 0, 0, NULL); ++ if (pnfs_layout_slab == NULL) ++ return -ENOMEM; ++ pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls", ++ sizeof(struct nfs4_layoutrecall), 0, 0, NULL); ++ if (pnfs_layoutrecall_slab == NULL) ++ return -ENOMEM; ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ INIT_LIST_HEAD(&sbid_hashtbl[i]); ++ } ++ ++ 
return 0; ++} ++ ++/* XXX: Need to implement the notify types and track which ++ * clients have which devices. */ ++void pnfs_set_device_notify(clientid_t *clid, unsigned int types) ++{ ++ struct nfs4_client *clp; ++ dprintk("%s: -->\n", __func__); ++ ++ nfs4_lock_state(); ++ /* Indicate that client has a device so we can only notify ++ * the correct clients */ ++ clp = find_confirmed_client(clid); ++ if (clp) { ++ atomic_inc(&clp->cl_deviceref); ++ dprintk("%s: Incr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++ } ++ nfs4_unlock_state(); ++} ++ ++/* Clear notifications for this client ++ * XXX: Do we need to loop through a clean up all ++ * krefs when nfsd cleans up the client? */ ++void pnfs_clear_device_notify(struct nfs4_client *clp) ++{ ++ atomic_dec(&clp->cl_deviceref); ++ dprintk("%s: Decr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++} ++ ++static struct nfs4_layout_state * ++alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid) ++{ ++ struct nfs4_layout_state *new; ++ ++ /* FIXME: use a kmem_cache */ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return new; ++ get_nfs4_file(fp); ++ INIT_LIST_HEAD(&new->ls_perfile); ++ INIT_LIST_HEAD(&new->ls_layouts); ++ kref_init(&new->ls_ref); ++ new->ls_client = clp; ++ new->ls_file = fp; ++ new->ls_stateid.si_boot = stateid->si_boot; ++ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */ ++ new->ls_stateid.si_generation = 1; ++ spin_lock(&layout_lock); ++ new->ls_stateid.si_fileid = current_layoutid++; ++ list_add(&new->ls_perfile, &fp->fi_layout_states); ++ spin_unlock(&layout_lock); ++ return new; ++} ++ ++static inline void ++get_layout_state(struct nfs4_layout_state *ls) ++{ ++ kref_get(&ls->ls_ref); ++} ++ ++static void ++destroy_layout_state_common(struct nfs4_layout_state *ls) ++{ ++ struct nfs4_file *fp = ls->ls_file; ++ ++ dprintk("pNFS %s: ls %p fp %p clp 
%p\n", __func__, ls, fp, ++ ls->ls_client); ++ BUG_ON(!list_empty(&ls->ls_layouts)); ++ kfree(ls); ++ put_nfs4_file(fp); ++} ++ ++static void ++destroy_layout_state(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ spin_lock(&layout_lock); ++ list_del(&ls->ls_perfile); ++ spin_unlock(&layout_lock); ++ destroy_layout_state_common(ls); ++} ++ ++static void ++destroy_layout_state_locked(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ list_del(&ls->ls_perfile); ++ destroy_layout_state_common(ls); ++} ++ ++static inline void ++put_layout_state(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state); ++} ++ ++static inline void ++put_layout_state_locked(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state_locked); ++} ++ ++/* ++ * Search the fp->fi_layout_state list for a layout state with the clientid. ++ * If not found, then this is a 'first open/delegation/lock stateid' from ++ * the client for this file. ++ * Called under the layout_lock. 
++ */ ++static struct nfs4_layout_state * ++find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp) ++{ ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) { ++ if (ls->ls_client == clp) { ++ dprintk("pNFS %s: before GET ls %p ls_ref %d\n", ++ __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ get_layout_state(ls); ++ return ls; ++ } ++ } ++ return NULL; ++} ++ ++static __be32 ++verify_stateid(struct nfs4_file *fp, stateid_t *stateid) ++{ ++ struct nfs4_stateid *local = NULL; ++ struct nfs4_delegation *temp = NULL; ++ ++ /* check if open or lock stateid */ ++ local = find_stateid(stateid, RD_STATE); ++ if (local) ++ return 0; ++ temp = find_delegation_stateid(fp->fi_inode, stateid); ++ if (temp) ++ return 0; ++ return nfserr_bad_stateid; ++} ++ ++/* ++ * nfs4_preocess_layout_stateid () ++ * ++ * We have looked up the nfs4_file corresponding to the current_fh, and ++ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op() ++ * that make sense with a layout stateid. ++ * ++ * Called with the state_lock held ++ * Returns zero and stateid is updated, or error. ++ * ++ * Note: the struct nfs4_layout_state pointer is only set by layoutget. ++ */ ++static __be32 ++nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid, struct nfs4_layout_state **lsp) ++{ ++ struct nfs4_layout_state *ls = NULL; ++ __be32 status = 0; ++ ++ dprintk("--> %s clp %p fp %p \n", __func__, clp, fp); ++ ++ dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ goto out; ++ ++ /* Is this the first use of this layout ? */ ++ spin_lock(&layout_lock); ++ ls = find_get_layout_state(clp, fp); ++ spin_unlock(&layout_lock); ++ if (!ls) { ++ /* Only alloc layout state on layoutget (which sets lsp). 
*/ ++ if (!lsp) { ++ dprintk("%s ERROR: Not layoutget & no layout stateid\n", ++ __func__); ++ status = nfserr_bad_stateid; ++ goto out; ++ } ++ dprintk("%s Initial stateid for layout: file %p client %p\n", ++ __func__, fp, clp); ++ ++ /* verify input stateid */ ++ status = verify_stateid(fp, stateid); ++ if (status) { ++ dprintk("%s ERROR: invalid open/deleg/lock stateid\n", ++ __func__); ++ goto out; ++ } ++ ls = alloc_init_layout_state(clp, fp, stateid); ++ if (!ls) { ++ dprintk("%s pNFS ERROR: no memory for layout state\n", ++ __func__); ++ status = nfserr_resource; ++ goto out; ++ } ++ } else { ++ dprintk("%s Not initial stateid. Layout state %p file %p\n", ++ __func__, ls, fp); ++ ++ /* BAD STATEID */ ++ status = nfserr_bad_stateid; ++ if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque, ++ sizeof(stateid_opaque_t)) != 0) { ++ ++ /* if a LAYOUTGET operation and stateid is a valid ++ * open/deleg/lock stateid, accept it as a parallel ++ * initial layout stateid ++ */ ++ if (lsp && ((verify_stateid(fp, stateid)) == 0)) { ++ dprintk("%s parallel initial layout state\n", ++ __func__); ++ goto update; ++ } ++ ++ dprintk("%s ERROR bad opaque in stateid 1\n", __func__); ++ goto out_put; ++ } ++ ++ /* stateid is a valid layout stateid for this file. 
*/ ++ if (stateid->si_generation > ls->ls_stateid.si_generation) { ++ dprintk("%s bad stateid 1\n", __func__); ++ goto out_put; ++ } ++update: ++ update_stateid(&ls->ls_stateid); ++ dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", ++ __func__, ls->ls_stateid.si_generation, ls); ++ } ++ status = 0; ++ /* Set the stateid to be encoded */ ++ memcpy(stateid, &ls->ls_stateid, sizeof(stateid_t)); ++ ++ /* Return the layout state if requested */ ++ if (lsp) { ++ get_layout_state(ls); ++ *lsp = ls; ++ } ++ dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(&ls->ls_stateid)); ++out_put: ++ dprintk("%s PUT LO STATE:\n", __func__); ++ put_layout_state(ls); ++out: ++ dprintk("<-- %s status %d\n", __func__, htonl(status)); ++ ++ return status; ++} ++ ++static inline struct nfs4_layout * ++alloc_layout(void) ++{ ++ return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL); ++} ++ ++static inline void ++free_layout(struct nfs4_layout *lp) ++{ ++ kmem_cache_free(pnfs_layout_slab, lp); ++} ++ ++static void ++init_layout(struct nfs4_layout_state *ls, ++ struct nfs4_layout *lp, ++ struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__, ++ ls, lp, clp, fp, fp->fi_inode); ++ ++ get_nfs4_file(fp); ++ lp->lo_client = clp; ++ lp->lo_file = fp; ++ get_layout_state(ls); ++ lp->lo_state = ls; ++ memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg)); ++ spin_lock(&layout_lock); ++ list_add_tail(&lp->lo_perstate, &ls->ls_layouts); ++ list_add_tail(&lp->lo_perclnt, &clp->cl_layouts); ++ list_add_tail(&lp->lo_perfile, &fp->fi_layouts); ++ spin_unlock(&layout_lock); ++ dprintk("pNFS %s end\n", __func__); ++} ++ ++static void ++dequeue_layout(struct nfs4_layout *lp) ++{ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del(&lp->lo_perclnt); ++ list_del(&lp->lo_perfile); ++ list_del(&lp->lo_perstate); ++} ++ ++static void ++destroy_layout(struct nfs4_layout *lp) ++{ 
++ struct nfs4_client *clp; ++ struct nfs4_file *fp; ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ clp = lp->lo_client; ++ fp = lp->lo_file; ++ ls = lp->lo_state; ++ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n", ++ __func__, lp, clp, fp, fp->fi_inode, ++ list_empty(&ls->ls_layouts)); ++ ++ kmem_cache_free(pnfs_layout_slab, lp); ++ /* release references taken by init_layout */ ++ put_layout_state_locked(ls); ++ put_nfs4_file(fp); ++} ++ ++void fs_layout_return(struct super_block *sb, struct inode *ino, ++ struct nfsd4_pnfs_layoutreturn *lrp, int flags, ++ void *recall_cookie) ++{ ++ int ret; ++ ++ if (unlikely(!sb->s_pnfs_op->layout_return)) ++ return; ++ ++ lrp->lr_flags = flags; ++ lrp->args.lr_cookie = recall_cookie; ++ ++ if (!ino) /* FSID or ALL */ ++ ino = sb->s_root->d_inode; ++ ++ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args); ++ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx " ++ "cookie = %p flags 0x%x status=%d\n", ++ __func__, ino->i_ino, lrp->args.lr_seg.iomode, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, ++ recall_cookie, flags, ret); ++} ++ ++static u64 ++alloc_init_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ struct sbid_tracker *new = alloc_sbid(); ++ unsigned long hash_idx = sbid_hashval(sb); ++ u64 id = 0; ++ ++ if (likely(new)) { ++ spin_lock(&layout_lock); ++ id = ++current_sbid; ++ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK); ++ id = new->id; ++ BUG_ON(id == 0); ++ new->sb = sb; ++ ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) ++ if (sbid->sb == sb) { ++ kfree(new); ++ id = sbid->id; ++ spin_unlock(&layout_lock); ++ return id; ++ } ++ list_add(&new->hash, &sbid_hashtbl[hash_idx]); ++ spin_unlock(&layout_lock); ++ } ++ return id; ++} ++ ++struct super_block * ++find_sbid_id(u64 id) ++{ ++ struct sbid_tracker *sbid; ++ struct super_block *sb = NULL; ++ unsigned long hash_idx = id & SBID_HASH_MASK; ++ int pos = 0; ++ ++ 
spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->id != id) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ sb = sbid->sb; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return sb; ++} ++ ++u64 ++find_create_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ unsigned long hash_idx = sbid_hashval(sb); ++ int pos = 0; ++ u64 id = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->sb != sb) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ id = sbid->id; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ if (!id) ++ id = alloc_init_sbid(sb); ++ ++ return id; ++} ++ ++/* ++ * Create a layoutrecall structure ++ * An optional layoutrecall can be cloned (except for the layoutrecall lists) ++ */ ++static struct nfs4_layoutrecall * ++alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_client *clp, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ dprintk("NFSD %s\n", __func__); ++ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL); ++ if (clr == NULL) ++ return clr; ++ ++ dprintk("NFSD %s -->\n", __func__); ++ ++ memset(clr, 0, sizeof(*clr)); ++ if (lrfile) ++ get_nfs4_file(lrfile); ++ clr->clr_client = clp; ++ clr->clr_file = lrfile; ++ clr->cb = *cbl; ++ ++ kref_init(&clr->clr_ref); ++ INIT_LIST_HEAD(&clr->clr_perclnt); ++ ++ dprintk("NFSD %s return %p\n", __func__, clr); ++ return clr; ++} ++ ++static void ++get_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ kref_get(&clr->clr_ref); ++} ++ ++static void ++destroy_layoutrecall(struct kref *kref) ++{ ++ struct nfs4_layoutrecall *clr = ++ container_of(kref, struct nfs4_layoutrecall, clr_ref); ++ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr, ++ 
clr->clr_file, clr->clr_client); ++ BUG_ON(!list_empty(&clr->clr_perclnt)); ++ if (clr->clr_file) ++ put_nfs4_file(clr->clr_file); ++ kmem_cache_free(pnfs_layoutrecall_slab, clr); ++} ++ ++int ++put_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ return kref_put(&clr->clr_ref, destroy_layoutrecall); ++} ++ ++void * ++layoutrecall_done(struct nfs4_layoutrecall *clr) ++{ ++ void *recall_cookie = clr->cb.cbl_cookie; ++ struct nfs4_layoutrecall *parent = clr->parent; ++ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del_init(&clr->clr_perclnt); ++ put_layoutrecall(clr); ++ ++ if (parent && !put_layoutrecall(parent)) ++ recall_cookie = NULL; ++ ++ return recall_cookie; ++} ++ ++/* ++ * get_state() and cb_get_state() are ++ */ ++void ++release_pnfs_ds_dev_list(struct nfs4_stateid *stp) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ while (!list_empty(&stp->st_pnfs_ds_id)) { ++ ddp = list_entry(stp->st_pnfs_ds_id.next, ++ struct pnfs_ds_dev_entry, dd_dev_entry); ++ list_del(&ddp->dd_dev_entry); ++ kfree(ddp); ++ } ++} ++ ++static int ++nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL); ++ if (!ddp) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&ddp->dd_dev_entry); ++ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id); ++ ddp->dd_dsid = dsid; ++ return 0; ++} ++ ++/* ++ * are two octet ranges overlapping? 
++ * start1 last1 ++ * |-----------------| ++ * start2 last2 ++ * |----------------| ++ */ ++static inline int ++lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 last1 = last_byte_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 last2 = last_byte_offset(start2, l2->length); ++ int ret; ++ ++ /* if last1 == start2 there's a single byte overlap */ ++ ret = (last2 >= start1) && (last1 >= start2); ++ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__, ++ l1->offset, l1->length, l2->offset, l2->length, ret); ++ return ret; ++} ++ ++static inline int ++same_fsid_major(struct nfs4_fsid *fsid, u64 major) ++{ ++ return fsid->major == major; ++} ++ ++static inline int ++same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh) ++{ ++ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid); ++} ++ ++/* ++ * find a layout recall conflicting with the specified layoutget ++ */ ++static int ++is_layout_recalled(struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != seg->layout_type) ++ continue; ++ if (clr->cb.cbl_recall_type == RETURN_ALL) ++ goto found; ++ if (clr->cb.cbl_recall_type == RETURN_FSID) { ++ if (same_fsid(&clr->cb.cbl_fsid, current_fh)) ++ goto found; ++ else ++ continue; ++ } ++ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE); ++ if (clr->cb.cbl_seg.clientid == seg->clientid && ++ lo_seg_overlapping(&clr->cb.cbl_seg, seg)) ++ goto found; ++ } ++ spin_unlock(&layout_lock); ++ return 0; ++found: ++ spin_unlock(&layout_lock); ++ return 1; ++} ++ ++/* ++ * are two octet ranges overlapping or adjacent? 
++ */ ++static inline int ++lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ /* is end1 == start2 ranges are adjacent */ ++ return (end2 >= start1) && (end1 >= start2); ++} ++ ++static void ++extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lg_start = lg->offset; ++ u64 lg_end = end_offset(lg_start, lg->length); ++ ++ /* lo already covers lg? */ ++ if (lo_start <= lg_start && lg_end <= lo_end) ++ return; ++ ++ /* extend start offset */ ++ if (lo_start > lg_start) ++ lo_start = lg_start; ++ ++ /* extend end offset */ ++ if (lo_end < lg_end) ++ lo_end = lg_end; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? ++ lo_end : lo_end - lo_start; ++} ++ ++static struct nfs4_layout * ++merge_layout(struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layout *lp, *found = NULL; /* lp is never NULL after the walk */ ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (lp, &fp->fi_layouts, lo_perfile) ++ if (lp->lo_seg.layout_type == seg->layout_type && ++ lp->lo_seg.clientid == seg->clientid && ++ lp->lo_seg.iomode == seg->iomode && ++ lo_seg_mergeable(&lp->lo_seg, seg)) { ++ extend_layout(&lp->lo_seg, seg); ++ found = lp; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return found; ++} ++ ++__be32 ++nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp, ++ struct exp_xdr_stream *xdr) ++{ ++ u32 status; ++ __be32 nfserr; ++ struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode; ++ struct super_block *sb = ino->i_sb; ++ int can_merge; ++ struct nfs4_file *fp; ++ struct nfs4_client *clp; ++ struct nfs4_layout *lp = NULL; ++ struct nfs4_layout_state *ls = NULL; ++ struct nfsd4_pnfs_layoutget_arg args = { ++ .lg_minlength = lgp->lg_minlength, ++ .lg_fh = 
&lgp->lg_fhp->fh_handle, ++ }; ++ struct nfsd4_pnfs_layoutget_res res = { ++ .lg_seg = lgp->lg_seg, ++ }; ++ ++ dprintk("NFSD: %s Begin\n", __func__); ++ ++ args.lg_sbid = find_create_sbid(sb); ++ if (!args.lg_sbid) { ++ nfserr = nfserr_layouttrylater; ++ goto out; ++ } ++ ++ can_merge = sb->s_pnfs_op->can_merge_layouts != NULL && ++ sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type); ++ ++ nfs4_lock_state(); ++ fp = find_alloc_file(ino, lgp->lg_fhp); ++ clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid); ++ dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp); ++ if (!fp || !clp) { ++ nfserr = nfserr_inval; ++ goto out_unlock; ++ } ++ ++ /* Check decoded layout stateid */ ++ nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls); ++ if (nfserr) ++ goto out_unlock; ++ ++ if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) { ++ nfserr = nfserr_recallconflict; ++ goto out_unlock; /* state lock and the ls/fp references are held here */ ++ } ++ ++ /* pre-alloc layout in case we can't merge after we call ++ * the file system ++ */ ++ lp = alloc_layout(); ++ if (!lp) { ++ nfserr = nfserr_layouttrylater; ++ goto out_unlock; ++ } ++ ++ dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd " ++ "iomode %u offset %llu length %llu\n", ++ __func__, lgp->lg_seg.layout_type, ++ exp_xdr_qbytes(xdr->end - xdr->p), ++ lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length); ++ ++ /* FIXME: need to eliminate the use of the state lock */ ++ nfs4_unlock_state(); ++ status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res); ++ nfs4_lock_state(); ++ ++ dprintk("pNFS %s: post-export status %u " ++ "iomode %u offset %llu length %llu\n", ++ __func__, status, res.lg_seg.iomode, ++ res.lg_seg.offset, res.lg_seg.length); ++ ++ /* ++ * The allowable error codes for the layout_get pNFS export ++ * operations vector function (from the file system) can be ++ * expanded as needed to include other errors defined for ++ * the RFC 5661 LAYOUTGET operation. 
++ */ ++ switch (status) { ++ case 0: ++ nfserr = NFS4_OK; ++ break; ++ case NFS4ERR_ACCESS: ++ case NFS4ERR_BADIOMODE: ++ /* No support for LAYOUTIOMODE4_RW layouts */ ++ case NFS4ERR_BADLAYOUT: ++ /* No layout matching loga_minlength rules */ ++ case NFS4ERR_INVAL: ++ case NFS4ERR_IO: ++ case NFS4ERR_LAYOUTTRYLATER: ++ case NFS4ERR_LAYOUTUNAVAILABLE: ++ case NFS4ERR_LOCKED: ++ case NFS4ERR_NOSPC: ++ case NFS4ERR_RECALLCONFLICT: ++ case NFS4ERR_SERVERFAULT: ++ case NFS4ERR_TOOSMALL: ++ /* Requested layout too big for loga_maxcount */ ++ case NFS4ERR_WRONG_TYPE: ++ /* Not a regular file */ ++ nfserr = cpu_to_be32(status); ++ goto out_freelayout; ++ default: ++ BUG(); ++ nfserr = nfserr_serverfault; ++ } ++ ++ lgp->lg_seg = res.lg_seg; ++ lgp->lg_roc = res.lg_return_on_close; ++ ++ /* SUCCESS! ++ * Can the new layout be merged into an existing one? ++ * If so, free unused layout struct ++ */ ++ if (can_merge && merge_layout(fp, clp, &res.lg_seg)) ++ goto out_freelayout; ++ ++ /* Can't merge, so let's initialize this new layout */ ++ init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg); ++out_unlock: ++ if (ls) ++ put_layout_state(ls); ++ if (fp) ++ put_nfs4_file(fp); ++ nfs4_unlock_state(); ++out: ++ dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp, ++ be32_to_cpu(nfserr)); ++ return nfserr; ++out_freelayout: ++ free_layout(lp); ++ goto out_unlock; ++} ++ ++static void ++trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lr_start = lr->offset; ++ u64 lr_end = end_offset(lr_start, lr->length); ++ ++ dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__, ++ lo->offset, lo->length, lr->offset, lr->length); ++ ++ /* lr fully covers lo? */ ++ if (lr_start <= lo_start && lo_end <= lr_end) { ++ lo->length = 0; ++ goto out; ++ } ++ ++ /* ++ * split not supported yet. retain layout segment. 
++ * remains must be returned by the client ++ * on the final layout return. ++ */ ++ if (lo_start < lr_start && lr_end < lo_end) { ++ dprintk("%s: split not supported\n", __func__); ++ goto out; ++ } ++ ++ if (lo_start < lr_start) ++ lo_end = lr_start - 1; ++ else /* lr_end < lo_end */ ++ lo_start = lr_end + 1; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start; ++out: ++ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length); ++} ++ ++static int ++pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ dprintk("%s: clp %p fp %p\n", __func__, clp, fp); ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) { ++ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n", ++ __func__, lp, ++ lp->lo_client, clp, ++ lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type, ++ lp->lo_seg.iomode, lrp->args.lr_seg.iomode); ++ if (lp->lo_client != clp || ++ lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type || ++ (lp->lo_seg.iomode != lrp->args.lr_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) || ++ !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg)) ++ continue; ++ layouts_found++; ++ trim_layout(&lp->lo_seg, &lrp->args.lr_seg); ++ if (!lp->lo_seg.length) { ++ lrp->lrs_present = 0; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++pnfs_return_client_layouts(struct nfs4_client *clp, ++ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) { ++ if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type || ++ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode && ++ lrp->args.lr_seg.iomode != 
IOMODE_ANY)) ++ continue; ++ ++ if (lrp->args.lr_return_type == RETURN_FSID && ++ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid)) ++ continue; ++ ++ layouts_found++; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++recall_return_perfect_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode || ++ clr->cb.cbl_recall_type != lrp->args.lr_return_type) ++ return 0; ++ ++ return (clr->cb.cbl_recall_type == RETURN_FILE && ++ clr->clr_file == fp && ++ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset && ++ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) || ++ ++ (clr->cb.cbl_recall_type == RETURN_FSID && ++ same_fsid(&clr->cb.cbl_fsid, current_fh)) || ++ ++ clr->cb.cbl_recall_type == RETURN_ALL; ++} ++ ++static int ++recall_return_partial_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ /* iomode matching? */ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode && ++ clr->cb.cbl_seg.iomode != IOMODE_ANY && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) ++ return 0; ++ ++ if (clr->cb.cbl_recall_type == RETURN_ALL || ++ lrp->args.lr_return_type == RETURN_ALL) ++ return 1; ++ ++ /* fsid matches? */ ++ if (clr->cb.cbl_recall_type == RETURN_FSID || ++ lrp->args.lr_return_type == RETURN_FSID) ++ return same_fsid(&clr->cb.cbl_fsid, current_fh); ++ ++ /* file matches, range overlapping? 
*/ ++ return clr->clr_file == fp && ++ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg); ++} ++ ++int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status = 0; ++ int layouts_found = 0; ++ struct inode *ino = current_fh->fh_dentry->d_inode; ++ struct nfs4_file *fp = NULL; ++ struct nfs4_client *clp; ++ struct nfs4_layoutrecall *clr, *nextclr; ++ u64 ex_fsid = current_fh->fh_export->ex_fsid; ++ void *recall_cookie = NULL; ++ ++ dprintk("NFSD: %s\n", __func__); ++ ++ nfs4_lock_state(); ++ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid); ++ if (!clp) ++ goto out; ++ ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ fp = find_file(ino); ++ if (!fp) { ++ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for " ++ "ino %p:%lu\n", ++ __func__, ino, ino ? ino->i_ino : 0L); ++ goto out; ++ } ++ ++ /* Check the stateid */ ++ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino); ++ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, ++ NULL); ++ if (status) ++ goto out_put_file; ++ ++ /* update layouts */ ++ layouts_found = pnfs_return_file_layouts(clp, fp, lrp); ++ /* optimize for the all-empty case */ ++ if (list_empty(&fp->fi_layouts)) ++ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ } else { ++ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid); ++ } ++ ++ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d " ++ "return_type %d fsid 0x%llx offset %llu length %llu: " ++ "layouts_found %d\n", ++ __func__, clp, fp, lrp->args.lr_seg.layout_type, ++ lrp->args.lr_seg.iomode, lrp->args.lr_return_type, ++ ex_fsid, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found); ++ ++ /* update layoutrecalls ++ * note: for RETURN_{FSID,ALL}, fp may be NULL ++ */ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls, ++ clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != 
lrp->args.lr_seg.layout_type) ++ continue; ++ ++ if (recall_return_perfect_match(clr, lrp, fp, current_fh)) ++ recall_cookie = layoutrecall_done(clr); ++ else if (layouts_found && ++ recall_return_partial_match(clr, lrp, fp, current_fh)) ++ clr->clr_time = CURRENT_TIME; ++ } ++ spin_unlock(&layout_lock); ++ ++out_put_file: ++ if (fp) ++ put_nfs4_file(fp); ++out: ++ nfs4_unlock_state(); ++ ++ /* call exported filesystem layout_return (ignore return-code) */ ++ fs_layout_return(sb, ino, lrp, 0, recall_cookie); ++ ++ dprintk("pNFS %s: exit status %d \n", __func__, status); ++ return status; ++} ++ ++/* ++ * PNFS Metadata server export operations callback for get_state ++ * ++ * called by the cluster fs when it receives a get_state() from a data ++ * server. ++ * returns status, or pnfs_get_state* with pnfs_get_state->status set. ++ * ++ */ ++int ++nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg) ++{ ++ struct nfs4_stateid *stp; ++ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */ ++ int status = -EINVAL; ++ struct inode *ino; ++ struct nfs4_delegation *dl; ++ stateid_t *stid = (stateid_t *)&arg->stid; ++ ++ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__, ++ STATEID_VAL(stid), arg->ino); ++ ++ nfs4_lock_state(); ++ stp = find_stateid(stid, flags); ++ if (!stp) { ++ ino = iget_locked(sb, arg->ino); ++ if (!ino) ++ goto out; ++ ++ if (ino->i_state & I_NEW) { ++ iget_failed(ino); ++ goto out; ++ } ++ ++ dl = find_delegation_stateid(ino, stid); ++ if (dl) ++ status = 0; ++ ++ iput(ino); ++ } else { ++ /* XXX ANDROS: marc removed nfs4_check_fh - how come? 
*/ ++ ++ /* arg->devid is the Data server id, set by the cluster fs */ ++ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid); ++ if (status) ++ goto out; ++ ++ arg->access = stp->st_access_bmap; ++ *(clientid_t *)&arg->clid = ++ stp->st_stateowner->so_client->cl_clientid; ++ } ++out: ++ nfs4_unlock_state(); ++ return status; ++} ++ ++static int ++cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile, ++ stateid_t *lsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ struct nfs4_layout_state *ls; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) { ++ if (lp->lo_file != lrfile) ++ continue; ++ ++ ls = find_get_layout_state(clp, lrfile); ++ if (!ls) { ++ /* This shouldn't happen as the file should have a ++ * layout stateid if it has a layout. ++ */ ++ printk(KERN_ERR "%s: file %p has no layout stateid\n", ++ __func__, lrfile); ++ WARN_ON(1); ++ break; ++ } ++ update_stateid(&ls->ls_stateid); ++ memcpy(lsid, &ls->ls_stateid, sizeof(stateid_t)); ++ put_layout_state_locked(ls); ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ return found; ++} ++ ++static int ++cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ ++ /* note: minor version unused */ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) ++ if (lp->lo_file->fi_fsid.major == fsid->major) { ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return found; ++} ++ ++static int ++cl_has_any_layout(struct nfs4_client *clp) ++{ ++ return !list_empty(&clp->cl_layouts); ++} ++ ++static int ++cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile, stateid_t *lsid) ++{ ++ switch (cbl->cbl_recall_type) { ++ case RETURN_FILE: ++ return cl_has_file_layout(clp, lrfile, lsid); ++ case RETURN_FSID: ++ return cl_has_fsid_layout(clp, &cbl->cbl_fsid); ++ default: ++ return cl_has_any_layout(clp); ++ } ++} 
++ ++/* ++ * Called without the layout_lock. ++ */ ++void ++nomatching_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfsd4_pnfs_layoutreturn lr = { ++ .args.lr_return_type = clr->cb.cbl_recall_type, ++ .args.lr_seg = clr->cb.cbl_seg, ++ }; ++ struct inode *inode; ++ void *recall_cookie; ++ ++ if (clr->clr_file) { ++ inode = igrab(clr->clr_file->fi_inode); ++ if (WARN_ON(!inode)) ++ return; ++ } else { ++ inode = NULL; ++ } ++ ++ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__, ++ clr->clr_client, clr->clr_file); ++ ++ if (clr->cb.cbl_recall_type == RETURN_FILE) ++ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr); ++ else ++ pnfs_return_client_layouts(clr->clr_client, &lr, ++ clr->cb.cbl_fsid.major); ++ ++ spin_lock(&layout_lock); ++ recall_cookie = layoutrecall_done(clr); ++ spin_unlock(&layout_lock); ++ ++ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN, ++ recall_cookie); ++ iput(inode); ++} ++ ++void pnfs_expire_client(struct nfs4_client *clp) ++{ ++ for (;;) { ++ struct nfs4_layoutrecall *lrp = NULL; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layoutrecalls)) { ++ lrp = list_entry(clp->cl_layoutrecalls.next, ++ struct nfs4_layoutrecall, clr_perclnt); ++ get_layoutrecall(lrp); ++ } ++ spin_unlock(&layout_lock); ++ if (!lrp) ++ break; ++ ++ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file); ++ BUG_ON(lrp->clr_client != clp); ++ nomatching_layout(lrp); ++ put_layoutrecall(lrp); ++ } ++ ++ for (;;) { ++ struct nfs4_layout *lp = NULL; ++ struct inode *inode = NULL; ++ struct nfsd4_pnfs_layoutreturn lr; ++ bool empty = false; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layouts)) { ++ lp = list_entry(clp->cl_layouts.next, ++ struct nfs4_layout, lo_perclnt); ++ inode = igrab(lp->lo_file->fi_inode); ++ memset(&lr, 0, sizeof(lr)); ++ lr.args.lr_return_type = RETURN_FILE; ++ lr.args.lr_seg = lp->lo_seg; ++ empty = list_empty(&lp->lo_file->fi_layouts); ++ BUG_ON(lp->lo_client != clp); ++ 
dequeue_layout(lp); ++ destroy_layout(lp); /* do not access lp after this */ ++ } ++ spin_unlock(&layout_lock); ++ if (!lp) ++ break; ++ ++ if (WARN_ON(!inode)) ++ break; ++ ++ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino, ++ lp, clp); ++ ++ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE, ++ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL); ++ iput(inode); ++ } ++} ++ ++struct create_recall_list_arg { ++ struct nfsd4_pnfs_cb_layout *cbl; ++ struct nfs4_file *lrfile; ++ struct list_head *todolist; ++ unsigned todo_count; ++}; ++ ++/* ++ * look for matching layout for the given client ++ * and add a pending layout recall to the todo list ++ * if found any. ++ * returns: ++ * 0 if layouts found or negative error. ++ */ ++static int ++lo_recall_per_client(struct nfs4_client *clp, void *p) ++{ ++ stateid_t lsid; ++ struct nfs4_layoutrecall *pending; ++ struct create_recall_list_arg *arg = p; ++ ++ memset(&lsid, 0, sizeof(lsid)); ++ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid)) ++ return 0; ++ ++ /* Matching put done by layoutreturn */ ++ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile); ++ /* out of memory, drain todo queue */ ++ if (!pending) ++ return -ENOMEM; ++ ++ *(stateid_t *)&pending->cb.cbl_sid = lsid; ++ list_add(&pending->clr_perclnt, arg->todolist); ++ arg->todo_count++; ++ return 0; ++} ++ ++/* Create a layoutrecall structure for each client based on the ++ * original structure. 
*/ ++int ++create_layout_recall_list(struct list_head *todolist, unsigned *todo_len, ++ struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_client *clp; ++ struct create_recall_list_arg arg = { ++ .cbl = cbl, ++ .lrfile = lrfile, ++ .todolist = todolist, ++ }; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* If client given by fs, just do single client */ ++ if (cbl->cbl_seg.clientid) { ++ clp = find_confirmed_client( ++ (clientid_t *)&cbl->cbl_seg.clientid); ++ if (!clp) { ++ status = -ENOENT; ++ dprintk("%s: clientid %llx not found\n", __func__, ++ (unsigned long long)cbl->cbl_seg.clientid); ++ goto out; ++ } ++ ++ status = lo_recall_per_client(clp, &arg); ++ } else { ++ /* Check all clients for layout matches */ ++ status = filter_confirmed_clients(lo_recall_per_client, &arg); ++ } ++ ++out: ++ *todo_len = arg.todo_count; ++ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status); ++ return status; ++} ++ ++/* ++ * Recall layouts asynchronously ++ * Called with state lock. ++ */ ++static int ++spawn_layout_recall(struct super_block *sb, struct list_head *todolist, ++ unsigned todo_len) ++{ ++ struct nfs4_layoutrecall *pending; ++ struct nfs4_layoutrecall *parent = NULL; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ if (todo_len > 1) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ ++ parent = alloc_init_layoutrecall(&pending->cb, NULL, ++ pending->clr_file); ++ if (unlikely(!parent)) { ++ /* We want forward progress. If parent cannot be ++ * allocated take the first one as parent but don't ++ * execute it. Caller must check for -EAGAIN, if so ++ * When the partial recalls return, ++ * nfsd_layout_recall_cb should be called again. 
++ */ ++ list_del_init(&pending->clr_perclnt); ++ if (todo_len > 2) { ++ parent = pending; ++ } else { ++ parent = NULL; ++ put_layoutrecall(pending); ++ } ++ --todo_len; ++ status = -ENOMEM; ++ } ++ } ++ ++ while (!list_empty(todolist)) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ list_del_init(&pending->clr_perclnt); ++ dprintk("%s: clp %p cb_client %p fp %p\n", __func__, ++ pending->clr_client, ++ pending->clr_client->cl_cb_client, ++ pending->clr_file); ++ if (unlikely(!pending->clr_client->cl_cb_client)) { ++ printk(KERN_INFO ++ "%s: clientid %08x/%08x has no callback path\n", ++ __func__, ++ pending->clr_client->cl_clientid.cl_boot, ++ pending->clr_client->cl_clientid.cl_id); ++ put_layoutrecall(pending); ++ continue; ++ } ++ ++ pending->clr_time = CURRENT_TIME; ++ pending->clr_sb = sb; ++ if (parent) { ++ /* If we created a parent its initial ref count is 1. ++ * We will need to de-ref it eventually. So we just ++ * don't increment on behalf of the last one. 
++ */ ++ if (todo_len != 1) ++ get_layoutrecall(parent); ++ } ++ pending->parent = parent; ++ get_layoutrecall(pending); ++ /* Add to list so corresponding layoutreturn can find req */ ++ list_add(&pending->clr_perclnt, ++ &pending->clr_client->cl_layoutrecalls); ++ ++ nfsd4_cb_layout(pending); ++ --todo_len; ++ } ++ ++ return status; ++} ++ ++/* ++ * Spawn a thread to perform a recall layout ++ * ++ */ ++int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode, ++ struct nfsd4_pnfs_cb_layout *cbl) ++{ ++ int status; ++ struct nfs4_file *lrfile = NULL; ++ struct list_head todolist; ++ unsigned todo_len = 0; ++ ++ dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl); ++ BUG_ON(!cbl); ++ BUG_ON(cbl->cbl_recall_type != RETURN_FILE && ++ cbl->cbl_recall_type != RETURN_FSID && ++ cbl->cbl_recall_type != RETURN_ALL); ++ BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode); ++ BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ && ++ cbl->cbl_seg.iomode != IOMODE_RW && ++ cbl->cbl_seg.iomode != IOMODE_ANY); ++ ++ if (nfsd_serv == NULL) { ++ dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n"); ++ return -ENOENT; ++ } ++ ++ nfs4_lock_state(); ++ status = -ENOENT; ++ if (inode) { ++ lrfile = find_file(inode); ++ if (!lrfile) { ++ dprintk("NFSD nfsd_layout_recall_cb: " ++ "nfs4_file not found\n"); ++ goto err; ++ } ++ if (cbl->cbl_recall_type == RETURN_FSID) ++ cbl->cbl_fsid = lrfile->fi_fsid; ++ } ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ /* If no cookie provided by FS, return a default one */ ++ if (!cbl->cbl_cookie) ++ cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ ++ status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile); ++ if (list_empty(&todolist)) { ++ status = -ENOENT; ++ } else { ++ /* process todolist even if create_layout_recall_list ++ * returned an error */ ++ int status2 = spawn_layout_recall(sb, &todolist, todo_len); ++ if (status2) ++ status = status2; ++ } ++ ++err: ++ nfs4_unlock_state(); ++ if (lrfile) ++ 
put_nfs4_file(lrfile); ++ return (todo_len && status) ? -EAGAIN : status; ++} ++ ++struct create_device_notify_list_arg { ++ struct list_head *todolist; ++ struct nfsd4_pnfs_cb_dev_list *ndl; ++}; ++ ++static int ++create_device_notify_per_cl(struct nfs4_client *clp, void *p) ++{ ++ struct nfs4_notify_device *cbnd; ++ struct create_device_notify_list_arg *arg = p; ++ ++ if (atomic_read(&clp->cl_deviceref) <= 0) ++ return 0; ++ ++ cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL); ++ if (!cbnd) ++ return -ENOMEM; ++ ++ cbnd->nd_list = arg->ndl; ++ cbnd->nd_client = clp; ++ list_add(&cbnd->nd_perclnt, arg->todolist); ++ return 0; ++} ++ ++/* Create a list of clients to send device notifications. */ ++int ++create_device_notify_list(struct list_head *todolist, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ int status; ++ struct create_device_notify_list_arg arg = { ++ .todolist = todolist, ++ .ndl = ndl, ++ }; ++ ++ nfs4_lock_state(); ++ status = filter_confirmed_clients(create_device_notify_per_cl, &arg); ++ nfs4_unlock_state(); ++ ++ return status; ++} ++ ++/* ++ * For each client that a device, send a device notification. ++ * XXX: Need to track which clients have which devices. 
++ */ ++int nfsd_device_notify_cb(struct super_block *sb, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ struct nfs4_notify_device *cbnd; ++ unsigned int notify_num = 0; ++ int status2, status = 0; ++ struct list_head todolist; ++ ++ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list); ++ ++ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len); ++ ++ if (nfsd_serv == NULL) ++ return -ENOENT; ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ status = create_device_notify_list(&todolist, ndl); ++ ++ while (!list_empty(&todolist)) { ++ cbnd = list_entry(todolist.next, struct nfs4_notify_device, ++ nd_perclnt); ++ list_del_init(&cbnd->nd_perclnt); ++ status2 = nfsd4_cb_notify_device(cbnd); ++ pnfs_clear_device_notify(cbnd->nd_client); ++ if (status2) { ++ kfree(cbnd); ++ status = status2; ++ } ++ notify_num++; ++ } ++ ++ dprintk("NFSD %s: status %d clients %u\n", ++ __func__, status, notify_num); ++ return status; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-23 12:09:03.309501439 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-23 12:09:03.309501439 -0400 +@@ -0,0 +1,461 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsfh.h" ++#include "nfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Just use a linked list. Do not expect more than 32 dlm_device_entries ++ * the first implementation will just use one device per cluster file system ++ */ ++ ++static LIST_HEAD(dlm_device_list); ++static DEFINE_SPINLOCK(dlm_device_list_lock); ++ ++struct dlm_device_entry { ++ struct list_head dlm_dev_list; ++ char disk_name[DISK_NAME_LEN]; ++ int num_ds; ++ char ds_list[NFSD_DLM_DS_LIST_MAX]; ++}; ++ ++static struct dlm_device_entry * ++_nfsd4_find_pnfs_dlm_device(char *disk_name) ++{ ++ struct dlm_device_entry *dlm_pdev; ++ ++ dprintk("--> %s disk name %s\n", __func__, disk_name); ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) { ++ dprintk("%s Look for dlm_pdev %s\n", __func__, ++ dlm_pdev->disk_name); ++ if (!memcmp(dlm_pdev->disk_name, disk_name, strlen(disk_name))) { ++ spin_unlock(&dlm_device_list_lock); ++ return dlm_pdev; ++ } ++ } ++ spin_unlock(&dlm_device_list_lock); ++ return NULL; ++} ++ ++static struct dlm_device_entry * ++nfsd4_find_pnfs_dlm_device(struct super_block *sb) { ++ char dname[BDEVNAME_SIZE]; ++ ++ bdevname(sb->s_bdev, dname); ++ return _nfsd4_find_pnfs_dlm_device(dname); ++} ++ ++ssize_t 
++nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen)
++{
++	char *pos = buf;
++	ssize_t size = 0;
++	struct dlm_device_entry *dlm_pdev;
++	int ret = -EINVAL;
++
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list)
++	{
++		int advanced;
++		advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list);
++		/* output did not fit in what is left of buf: fail with -EINVAL */
++		if (advanced >= buflen - size)
++			goto out;
++		size += advanced;
++		pos += advanced;
++	}
++	ret = size;
++
++out:
++	spin_unlock(&dlm_device_list_lock);
++	return ret;
++}
++
++/*
++ * Check that @ds_list is a comma separated list of addresses that
++ * rpc_pton() accepts, and count them into *@num_ds.
++ *
++ * Returns false at the first address rpc_pton() rejects, true otherwise.
++ */
++bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds)
++{
++	char *start = ds_list;
++
++	*num_ds = 0;
++
++	while (*start) {
++		struct sockaddr_storage tempAddr;
++		int ipLen = strcspn(start, ",");
++
++		if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr)))
++			return false;
++		(*num_ds)++;
++		start += ipLen;
++		/* step over the separator only; after the last address
++		 * start already sits on the terminating NUL and must not
++		 * be moved past it
++		 */
++		if (*start == ',')
++			start++;
++	}
++	return true;
++}
++
++/*
++ * pnfs_dlm_device string format:
++ *     block-device-path:ds-address[,ds-address...]
++ *
++ * Examples
++ *     '/dev/sda:192.168.1.96,192.168.1.97' creates a data server list with
++ *     two data servers for the dlm cluster file system mounted on /dev/sda.
++ *
++ *     '/dev/sda:192.168.1.96,192.168.1.100'
++ *     replaces the data server list for /dev/sda
++ *
++ * Only the deviceid == 1 is supported. Can add device id to
++ * pnfs_dlm_device string when needed.
++ *
++ * Only the round robin each data server once stripe index is supported.
++ *
++ * The incoming @len is only logged; it is then reused as a scratch length.
++ * Returns 0 on success, -ENOMEM if the entry cannot be allocated and
++ * -EINVAL if the string is malformed or a field does not fit.
++ */
++int
++nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len)
++
++{
++	struct dlm_device_entry *new, *found;
++	char *bufp = pnfs_dlm_device;
++	char *endp = bufp + strlen(bufp);
++	int err = -ENOMEM;
++
++	dprintk("--> %s len %d\n", __func__, len);
++
++	new = kzalloc(sizeof(*new), GFP_KERNEL);
++	if (!new)
++		return err;
++
++	err = -EINVAL;
++	/* disk_name */
++	/* FIXME: need to check for valid disk_name. search superblocks?
++	 * check for slash dev slash ?
++	 */
++	len = strcspn(bufp, ":");
++	/* >= so that the zeroed last byte of disk_name[] stays as the NUL */
++	if (len >= DISK_NAME_LEN)
++		goto out_free;
++	memcpy(new->disk_name, bufp, len);
++
++	err = -EINVAL;
++	bufp += len + 1;
++	if (bufp >= endp)
++		goto out_free;
++
++	/* data server list */
++	/* FIXME: need to check for comma separated valid ip format */
++	len = strcspn(bufp, ":");
++	/* >= so that the zeroed last byte of ds_list[] stays as the NUL */
++	if (len >= NFSD_DLM_DS_LIST_MAX)
++		goto out_free;
++	memcpy(new->ds_list, bufp, len);
++
++
++	/* validate the ips */
++	if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds)))
++		goto out_free;
++
++	dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__,
++		new->disk_name, new->num_ds, new->ds_list);
++
++	found = _nfsd4_find_pnfs_dlm_device(new->disk_name);
++	if (found) {
++		/* FIXME: should compare found->ds_list with new->ds_list
++		 * and if it is different, kick off a CB_NOTIFY change
++		 * deviceid.
++		 */
++		/* NOTE(review): found is rewritten here without
++		 * dlm_device_list_lock held - confirm that is intended.
++		 */
++		dprintk("%s pnfs_dlm_device %s:%s already in cache "
++			" replace ds_list with new ds_list %s\n", __func__,
++			found->disk_name, found->ds_list, new->ds_list);
++		/* clear the whole old list, not just DISK_NAME_LEN bytes,
++		 * so no tail of a longer previous list survives
++		 */
++		memset(found->ds_list, 0, sizeof(found->ds_list));
++		memcpy(found->ds_list, new->ds_list, strlen(new->ds_list));
++		found->num_ds = new->num_ds;
++		kfree(new);
++	} else {
++		dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__,
++			new->disk_name, new->ds_list);
++		spin_lock(&dlm_device_list_lock);
++		list_add(&new->dlm_dev_list, &dlm_device_list);
++		spin_unlock(&dlm_device_list_lock);
++	}
++	dprintk("<-- %s Success\n", __func__);
++	return 0;
++
++out_free:
++	kfree(new);
++	dprintk("<-- %s returns %d\n", __func__, err);
++	return err;
++}
++
++/* Unlink and free every entry on dlm_device_list. */
++void nfsd4_pnfs_dlm_shutdown(void)
++{
++	struct dlm_device_entry *dlm_pdev, *next;
++
++	dprintk("--> %s\n", __func__);
++
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list,
++				  dlm_dev_list) {
++		list_del(&dlm_pdev->dlm_dev_list);
++		kfree(dlm_pdev);
++	}
++	spin_unlock(&dlm_device_list_lock);
++}
++
++static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb,
++				     u32
layout_type,
++				     struct nfsd4_pnfs_dev_iter_res *res)
++{
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		printk(KERN_ERR "%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return -ENOTSUPP;
++	}
++
++	/* a single device (id 1) is reported; any non-zero cookie means the
++	 * caller has already seen it
++	 */
++	res->gd_eof = 1;
++	if (res->gd_cookie)
++		return -ENOENT;
++
++	res->gd_cookie = 1;
++	res->gd_verf = 1;
++	res->gd_devid = 1;
++	return 0;
++}
++
++/*
++ * Build a file layout device description from the data server list
++ * registered for the block device of @sb and have nfsd encode it to @xdr.
++ *
++ * Returns the result of filelayout_encode_devinfo(), -ENOTSUPP for a
++ * layout type other than LAYOUT_NFSV4_1_FILES, -EINVAL for a device id
++ * other than 1 or when no data server list is registered, and -ENOMEM
++ * when an allocation fails.
++ */
++static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb,
++				     struct exp_xdr_stream *xdr,
++				     u32 layout_type,
++				     const struct nfsd4_pnfs_deviceid *devid)
++{
++	int err, len, i = 0;
++	struct pnfs_filelayout_device fdev;
++	struct pnfs_filelayout_devaddr *daddr;
++	struct dlm_device_entry *dlm_pdev;
++	char *bufp;
++
++	err = -ENOTSUPP;
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		dprintk("%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return err;
++	}
++
++	/* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO
++	 * with a gdia_device_id != 1 is invalid.
++	 */
++	err = -EINVAL;
++	if (devid->devid != 1) {
++		dprintk("%s: WARNING: didn't receive a deviceid of "
++			"1 (got: 0x%llx)\n", __func__, devid->devid);
++		return err;
++	}
++
++	/*
++	 * If the DS list has not been established, return -EINVAL
++	 */
++	dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
++	if (!dlm_pdev) {
++		dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
++			sb->s_bdev->bd_disk->disk_name);
++		return err;
++	}
++
++	dprintk("%s: Found disk %s with DS list |%s|\n",
++		__func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
++
++	memset(&fdev, '\0', sizeof(fdev));
++	fdev.fl_device_length = dlm_pdev->num_ds;
++
++	err = -ENOMEM;
++	len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
++	/* zeroed, so unfilled slots are NULL for the cleanup loop below */
++	fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
++	if (!fdev.fl_device_list) {
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list "
++			"buffer for %d DSes.\n", __func__, dlm_pdev->num_ds);
++		fdev.fl_device_length = 0;
++		goto out;
++	}
++
++	/* Set simple stripe indices: entry i refers to data server i */
++	fdev.fl_stripeindices_length = fdev.fl_device_length;
++	fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
++				fdev.fl_stripeindices_length, GFP_KERNEL);
++
++	if (!fdev.fl_stripeindices_list) {
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices "
++			"list buffer for %d DSes.\n", __func__,
++			dlm_pdev->num_ds);
++		goto out;
++	}
++	for (i = 0; i < fdev.fl_stripeindices_length; i++)
++		fdev.fl_stripeindices_list[i] = i;
++
++	/* Transfer the data server list with a single multipath entry */
++	bufp = dlm_pdev->ds_list;
++	for (i = 0; i < fdev.fl_device_length; i++) {
++		daddr = kmalloc(sizeof(*daddr), GFP_KERNEL);
++		if (!daddr) {
++			printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
++				"addr buffer.\n", __func__);
++			goto out;
++		}
++
++		daddr->r_netid.data = "tcp";
++		daddr->r_netid.len = 3;
++
++		len = strcspn(bufp, ",");
++		daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL);
++		if (!daddr->r_addr.data) {
++			printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
++				"addr string buffer.\n", __func__);
++			/* not yet linked into fdev: free it here */
++			kfree(daddr);
++			goto out;
++		}
++		memcpy(daddr->r_addr.data, bufp, len);
++		/*
++		 * append the port number. interpreted as two more bytes
++		 * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
++		 */
++		memcpy(daddr->r_addr.data + len, ".8.1", 4);
++		daddr->r_addr.len = len + 4;
++
++		fdev.fl_device_list[i].fl_multipath_length = 1;
++		fdev.fl_device_list[i].fl_multipath_list = daddr;
++
++		dprintk("%s: encoding DS |%s|\n", __func__, bufp);
++
++		bufp += len + 1;
++	}
++
++	/* have nfsd encode the device info */
++	err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++	for (i = 0; i < fdev.fl_device_length; i++) {
++		daddr = fdev.fl_device_list[i].fl_multipath_list;
++		if (!daddr)
++			continue;
++		/* the address string is a separate allocation */
++		kfree(daddr->r_addr.data);
++		kfree(daddr);
++	}
++	kfree(fdev.fl_device_list);
++	kfree(fdev.fl_stripeindices_list);
++	dprintk("<-- %s returns %d\n", __func__, err);
++	return err;
++}
++
++/* Largest multiple of @blocksize that does not exceed NFSSVC_MAXBLKSIZE,
++ * or @blocksize itself when it is already at least that large.
++ */
++static int get_stripe_unit(int blocksize)
++{
++	if (blocksize >= NFSSVC_MAXBLKSIZE)
++		return blocksize;
++	return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++}
++
++/*
++ * Look up inode block device in pnfs_dlm_device list.
++ * Hash on the inode->i_ino and number of data servers.
++ */
++static int dlm_ino_hash(struct inode *ino)
++{
++	struct dlm_device_entry *de;
++
++	/* If can't find the inode block device in the pnfs_dlm_device list
++	 * then don't hand out a layout
++	 */
++	de = nfsd4_find_pnfs_dlm_device(ino->i_sb);
++	if (!de)
++		return -1;
++	/* an entry without any data server cannot back a layout either */
++	if (de->num_ds <= 0)
++		return -1;
++	/* Modulo rather than a (num_ds - 1) bit mask: the mask gives the
++	 * same result for power-of-two counts but skips data servers for
++	 * every other count.
++	 */
++	return ino->i_ino % de->num_ds;
++}
++
++static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode,
++			   struct exp_xdr_stream *xdr,
++			   const struct nfsd4_pnfs_layoutget_arg *args,
++			   struct nfsd4_pnfs_layoutget_res *res)
++{
++	struct pnfs_filelayout_layout *layout = NULL;
++	struct knfsd_fh *fhp = NULL;
++	int index;
++	enum nfsstat4 rc = NFS4_OK;
++
++	dprintk("%s: LAYOUT_GET\n", __func__);
++
++	/* DLM exported file systems only support layouts for READ */
++	if (res->lg_seg.iomode == IOMODE_RW)
++		return NFS4ERR_BADIOMODE;
++
++	index = dlm_ino_hash(inode);
++	dprintk("%s first stripe index %d i_ino %lu\n", __func__, index,
++		inode->i_ino);
++	if (index < 0)
++		return NFS4ERR_LAYOUTUNAVAILABLE;
++
++	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	/* Always give out whole file layouts */
++	res->lg_seg.offset = 0;
++	res->lg_seg.length = NFS4_MAX_UINT64;
++	/* Always give out READ ONLY layouts */
++	res->lg_seg.iomode = IOMODE_READ;
++
++	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++	if (layout == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++	/* Set file layout response args */
++	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++	layout->lg_stripe_type = STRIPE_SPARSE;
++	layout->lg_commit_through_mds = false;
++	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++	layout->lg_fh_length = 1;
++	layout->device_id.sbid = args->lg_sbid;
++	layout->device_id.devid = 1; /*FSFTEMP*/
++	layout->lg_first_stripe_index = index; /*FSFTEMP*/
++	layout->lg_pattern_offset = 0;
++
++	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++	if (fhp == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++
memcpy(fhp, args->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc = filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++nfsd4_pnfs_dlm_layouttype(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++const struct pnfs_export_operations pnfs_dlm_export_ops = { ++ .layout_type = nfsd4_pnfs_dlm_layouttype, ++ .get_device_info = nfsd4_pnfs_dlm_getdevinfo, ++ .get_device_iter = nfsd4_pnfs_dlm_getdeviter, ++ .layout_get = nfsd4_pnfs_dlm_layoutget, ++}; ++EXPORT_SYMBOL(pnfs_dlm_export_ops); +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-23 12:09:03.310501390 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-23 12:09:03.310501390 -0400 +@@ -0,0 +1,620 @@ ++/* ++* linux/fs/nfsd/nfs4pnfsds.c ++* ++* Copyright (c) 2005 The Regents of the University of Michigan. ++* All rights reserved. ++* ++* Andy Adamson ++* ++* Redistribution and use in source and binary forms, with or without ++* modification, are permitted provided that the following conditions ++* are met: ++* ++* 1. Redistributions of source code must retain the above copyright ++* notice, this list of conditions and the following disclaimer. ++* 2. Redistributions in binary form must reproduce the above copyright ++* notice, this list of conditions and the following disclaimer in the ++* documentation and/or other materials provided with the distribution. ++* 3. Neither the name of the University nor the names of its ++* contributors may be used to endorse or promote products derived ++* from this software without specific prior written permission. 
++* ++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++* ++*/ ++#if defined(CONFIG_PNFSD) ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsd.h" ++#include "pnfsd.h" ++#include "state.h" ++ ++/* ++ ******************* ++ * PNFS ++ ******************* ++ */ ++/* ++ * Hash tables for pNFS Data Server state ++ * ++ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using ++ * this data server (DS). ++ * ++ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained ++ * from any MDS. ++ * ++ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained ++ * from any MDS. 
++ * ++ */ ++/* Hash tables for clientid state */ ++#define CLIENT_HASH_BITS 4 ++#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) ++#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) ++ ++#define clientid_hashval(id) \ ++ ((id) & CLIENT_HASH_MASK) ++ ++/* hash table for pnfs_ds_stateid */ ++#define STATEID_HASH_BITS 10 ++#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) ++#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) ++ ++#define stateid_hashval(owner_id, file_id) \ ++ (((owner_id) + (file_id)) & STATEID_HASH_MASK) ++ ++static struct list_head mds_id_tbl; ++static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE]; ++static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE]; ++ ++static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp); ++static inline void put_ds_mdsid(struct pnfs_mds_id *mdp); ++ ++/* Mutex for data server state. Needs to be separate from ++ * mds state mutex since a node can be both mds and ds */ ++static DEFINE_MUTEX(ds_mutex); ++static struct thread_info *ds_mutex_owner; ++ ++static void ++ds_lock_state(void) ++{ ++ mutex_lock(&ds_mutex); ++ ds_mutex_owner = current_thread_info(); ++} ++ ++static void ++ds_unlock_state(void) ++{ ++ BUG_ON(ds_mutex_owner != current_thread_info()); ++ ds_mutex_owner = NULL; ++ mutex_unlock(&ds_mutex); ++} ++ ++static int ++cmp_clid(const clientid_t *cl1, const clientid_t *cl2) ++{ ++ return (cl1->cl_boot == cl2->cl_boot) && ++ (cl1->cl_id == cl2->cl_id); ++} ++ ++void ++nfs4_pnfs_state_init(void) ++{ ++ int i; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&mds_clid_hashtbl[i]); ++ ++ for (i = 0; i < STATEID_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ds_stid_hashtbl[i]); ++ ++ INIT_LIST_HEAD(&mds_id_tbl); ++} ++ ++static struct pnfs_mds_id * ++find_pnfs_mds_id(u32 mdsid) ++{ ++ struct pnfs_mds_id *local = NULL; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ list_for_each_entry(local, &mds_id_tbl, di_hash) { ++ if (local->di_mdsid == mdsid) ++ return local; ++ } ++ return NULL; ++} ++ ++static 
struct pnfs_ds_clientid * ++find_pnfs_ds_clientid(const clientid_t *clid) ++{ ++ struct pnfs_ds_clientid *local = NULL; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = clientid_hashval(clid->cl_id); ++ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) { ++ if (cmp_clid(&local->dc_mdsclid, clid)) ++ return local; ++ } ++ return NULL; ++} ++ ++static struct pnfs_ds_stateid * ++find_pnfs_ds_stateid(stateid_t *stid) ++{ ++ struct pnfs_ds_stateid *local = NULL; ++ u32 st_id = stid->si_stateownerid; ++ u32 f_id = stid->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash) ++ if ((local->ds_stid.si_stateownerid == st_id) && ++ (local->ds_stid.si_fileid == f_id) && ++ (local->ds_stid.si_boot == stid->si_boot)) { ++ stateid_t *sid = &local->ds_stid; ++ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, local, local->ds_flags, ++ STATEID_VAL(sid)); ++ return local; ++ } ++ return NULL; ++} ++ ++static void ++release_ds_mdsid(struct kref *kref) ++{ ++ struct pnfs_mds_id *mdp = ++ container_of(kref, struct pnfs_mds_id, di_ref); ++ dprintk("pNFSD: %s\n", __func__); ++ ++ list_del(&mdp->di_hash); ++ list_del(&mdp->di_mdsclid); ++ kfree(mdp); ++} ++ ++static void ++release_ds_clientid(struct kref *kref) ++{ ++ struct pnfs_ds_clientid *dcp = ++ container_of(kref, struct pnfs_ds_clientid, dc_ref); ++ struct pnfs_mds_id *mdp; ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(dcp->dc_mdsid); ++ if (mdp) ++ put_ds_mdsid(mdp); ++ ++ list_del(&dcp->dc_hash); ++ list_del(&dcp->dc_stateid); ++ list_del(&dcp->dc_permdsid); ++ kfree(dcp); ++} ++ ++static void ++release_ds_stateid(struct kref *kref) ++{ ++ struct pnfs_ds_stateid *dsp = ++ container_of(kref, struct pnfs_ds_stateid, ds_ref); ++ struct pnfs_ds_clientid *dcp; ++ dprintk("pNFS %s: dsp %p\n", __func__, dsp); ++ ++ dcp 
= find_pnfs_ds_clientid(&dsp->ds_mdsclid); ++ if (dcp) ++ put_ds_clientid(dcp); ++ ++ list_del(&dsp->ds_hash); ++ list_del(&dsp->ds_perclid); ++ kfree(dsp); ++} ++ ++static inline void ++put_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_put(&dcp->dc_ref, release_ds_clientid); ++} ++ ++static inline void ++get_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_get(&dcp->dc_ref); ++} ++ ++static inline void ++put_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_put(&mdp->di_ref, release_ds_mdsid); ++} ++ ++static inline void ++get_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_get(&mdp->di_ref); ++} ++ ++static inline void ++put_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_put(&dsp->ds_ref, release_ds_stateid); ++} ++ ++static inline void ++get_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_get(&dsp->ds_ref); ++} ++ ++void ++nfs4_pnfs_state_shutdown(void) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int i; ++ ++ dprintk("pNFSD %s: -->\n", __func__); ++ ++ ds_lock_state(); ++ for (i = 0; i < STATEID_HASH_SIZE; i++) { ++ while (!list_empty(&ds_stid_hashtbl[i])) { ++ dsp = list_entry(ds_stid_hashtbl[i].next, ++ struct pnfs_ds_stateid, ds_hash); ++ put_ds_stateid(dsp); ++ } ++ } ++ ds_unlock_state(); ++} ++ ++static struct pnfs_mds_id * ++alloc_init_mds_id(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL); ++ 
if (!mdp) ++ return NULL; ++ INIT_LIST_HEAD(&mdp->di_hash); ++ INIT_LIST_HEAD(&mdp->di_mdsclid); ++ list_add(&mdp->di_hash, &mds_id_tbl); ++ mdp->di_mdsid = gsp->dsid; ++ mdp->di_mdsboot = 0; ++ kref_init(&mdp->di_ref); ++ return mdp; ++} ++ ++/* ++ * Create the data-server record for the MDS client id carried in @gsp. ++ * Takes a reference on the pnfs_mds_id matching gsp->dsid (allocating one ++ * if none is found), hashes the new entry on mds_clid_hashtbl[] and links ++ * it on that mds_id's di_mdsclid list. Returns NULL if either allocation ++ * fails; in that case no mds_id reference is left behind. ++ */ ++static struct pnfs_ds_clientid * ++alloc_init_ds_clientid(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ struct pnfs_ds_clientid *dcp; ++ clientid_t *clid = (clientid_t *)&gsp->clid; ++ unsigned int hashval = clientid_hashval(clid->cl_id); ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(gsp->dsid); ++ if (!mdp) { ++ mdp = alloc_init_mds_id(gsp); ++ if (!mdp) ++ return NULL; ++ } else { ++ get_ds_mdsid(mdp); ++ } ++ ++ dcp = kmalloc(sizeof(*dcp), GFP_KERNEL); ++ if (!dcp) { ++ /* undo the mds_id reference taken (or created) above */ ++ put_ds_mdsid(mdp); ++ return NULL; ++ } ++ ++ INIT_LIST_HEAD(&dcp->dc_hash); ++ INIT_LIST_HEAD(&dcp->dc_stateid); ++ INIT_LIST_HEAD(&dcp->dc_permdsid); ++ list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]); ++ list_add(&dcp->dc_permdsid, &mdp->di_mdsclid); ++ dcp->dc_mdsclid = *clid; ++ kref_init(&dcp->dc_ref); ++ dcp->dc_mdsid = gsp->dsid; ++ return dcp; ++} ++ ++static struct pnfs_ds_stateid * ++alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp) ++{ ++ struct pnfs_ds_stateid *dsp; ++ u32 st_id = stidp->si_stateownerid; ++ u32 f_id = stidp->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ dsp = kmalloc(sizeof(*dsp), GFP_KERNEL); ++ if (!dsp) ++ return dsp; ++ ++ INIT_LIST_HEAD(&dsp->ds_hash); ++ INIT_LIST_HEAD(&dsp->ds_perclid); ++ memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t)); ++ fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle); ++ dsp->ds_access = 0; ++ dsp->ds_status = 0; ++ dsp->ds_flags = 0L; ++ kref_init(&dsp->ds_ref); ++ set_bit(DS_STATEID_NEW, &dsp->ds_flags); ++ clear_bit(DS_STATEID_VALID, &dsp->ds_flags); ++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ init_waitqueue_head(&dsp->ds_waitq); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]); ++
dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return dsp; ++} ++ ++static int ++update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh, ++ struct pnfs_get_state *gsp) ++{ ++ struct pnfs_ds_clientid *dcp; ++ int new = 0; ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid); ++ if (!dcp) { ++ dcp = alloc_init_ds_clientid(gsp); ++ if (!dcp) ++ return 1; ++ new = 1; ++ } ++ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) { ++ list_add(&dsp->ds_perclid, &dcp->dc_stateid); ++ if (!new) ++ get_ds_clientid(dcp); ++ } ++ ++ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t)); ++ dsp->ds_access = gsp->access; ++ dsp->ds_status = 0; ++ dsp->ds_verifier[0] = gsp->verifier[0]; ++ dsp->ds_verifier[1] = gsp->verifier[1]; ++ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t)); ++ set_bit(DS_STATEID_VALID, &dsp->ds_flags); ++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ clear_bit(DS_STATEID_NEW, &dsp->ds_flags); ++ return 0; ++} ++ ++int ++nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs) ++{ ++ stateid_t *stid = (stateid_t *)&gs->stid; ++ struct pnfs_ds_stateid *dsp; ++ ++ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stid)); ++ ++ ds_lock_state(); ++ dsp = find_pnfs_ds_stateid(stid); ++ if (dsp) ++ put_ds_stateid(dsp); ++ ds_unlock_state(); ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ if (dsp) ++ return 0; ++ return -ENOENT; ++} ++ ++/* Retrieves and validates stateid. ++ * If stateid exists and its fields match, return it. ++ * If stateid exists but either the generation or ++ * ownerids don't match, check with mds to see if it is valid. ++ * If the stateid doesn't exist, the first thread creates a ++ * invalid *marker* stateid, then checks to see if the ++ * stateid exists on the mds. If so, it validates the *marker* ++ * stateid and updates its fields. Subsequent threads that ++ * find the *marker* stateid wait until it is valid or an error ++ * occurs. 
++ * Called with ds_state_lock. ++ */ ++static struct pnfs_ds_stateid * ++nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp) ++{ ++ struct inode *ino = cfh->fh_dentry->d_inode; ++ struct super_block *sb; ++ struct pnfs_ds_stateid *dsp = NULL; ++ struct pnfs_get_state gs = { ++ .access = 0, ++ }; ++ int status = 0, waiter = 0; ++ ++ dprintk("pNFSD: %s -->\n", __func__); ++ ++ dsp = find_pnfs_ds_stateid(stidp); ++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) && ++ (stidp->si_generation == dsp->ds_stid.si_generation)) ++ goto out_noput; ++ ++ sb = ino->i_sb; ++ if (!sb || !sb->s_pnfs_op->get_state) ++ goto out_noput; ++ ++ /* Uninitialize current state if it exists yet it doesn't match. ++ * If it is already invalid, another thread is checking state */ ++ if (dsp) { ++ if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags)) ++ waiter = 1; ++ } else { ++ dsp = alloc_init_ds_stateid(cfh, stidp); ++ if (!dsp) ++ goto out_noput; ++ } ++ ++ dprintk("pNFSD: %s Starting loop\n", __func__); ++ get_ds_stateid(dsp); ++ while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { ++ ds_unlock_state(); ++ ++ /* Another thread is checking the state */ ++ if (waiter) { ++ dprintk("pNFSD: %s waiting\n", __func__); ++ wait_event_interruptible_timeout(dsp->ds_waitq, ++ (test_bit(DS_STATEID_VALID, &dsp->ds_flags) || ++ test_bit(DS_STATEID_ERROR, &dsp->ds_flags)), ++ msecs_to_jiffies(1024)); ++ dprintk("pNFSD: %s awake\n", __func__); ++ ds_lock_state(); ++ if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) ++ goto out; ++ ++ continue; ++ } ++ ++ /* Validate stateid on mds */ ++ dprintk("pNFSD: %s Checking state on MDS\n", __func__); ++ memcpy(&gs.stid, stidp, sizeof(stateid_t)); ++ status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs); ++ dprintk("pNFSD: %s from MDS status %d\n", __func__, status); ++ ds_lock_state(); ++ /* if !status and stateid is valid, update id and mark valid */ ++ if (status || update_ds_stateid(dsp, cfh, &gs)) { ++ set_bit(DS_STATEID_ERROR, 
&dsp->ds_flags); ++ /* remove invalid stateid from list */ ++ put_ds_stateid(dsp); ++ wake_up(&dsp->ds_waitq); ++ goto out; ++ } ++ ++ wake_up(&dsp->ds_waitq); ++ } ++out: ++ if (dsp) ++ put_ds_stateid(dsp); ++out_noput: ++ if (dsp) ++ dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid)); ++ /* If error, return null */ ++ if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) ++ dsp = NULL; ++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return dsp; ++} ++ ++int ++nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int status = 0; ++ ++ dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ /* Must release state lock while verifying stateid on mds */ ++ nfs4_unlock_state(); ++ ds_lock_state(); ++ dsp = nfsv4_ds_get_state(cfh, stateid); ++ if (dsp) { ++ get_ds_stateid(dsp); ++ dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__, ++ STATEID_VAL(&dsp->ds_stid)); ++ ++ dprintk("NFSD: %s: dsp %p fh_size %u:%u " ++ "fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] " ++ "gen %x:%x\n", ++ __func__, dsp, ++ cfh->fh_handle.fh_size, dsp->ds_fh.fh_size, ++ ((unsigned *)&cfh->fh_handle.fh_base)[0], ++ ((unsigned *)&cfh->fh_handle.fh_base)[1], ++ ((unsigned *)&cfh->fh_handle.fh_base)[2], ++ ((unsigned *)&cfh->fh_handle.fh_base)[3], ++ ((unsigned *)&dsp->ds_fh.fh_base)[0], ++ ((unsigned *)&dsp->ds_fh.fh_base)[1], ++ ((unsigned *)&dsp->ds_fh.fh_base)[2], ++ ((unsigned *)&dsp->ds_fh.fh_base)[3], ++ stateid->si_generation, dsp->ds_stid.si_generation); ++ } ++ ++ if (!dsp || ++ (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) || ++ (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base, ++ dsp->ds_fh.fh_size) != 0) || ++ (stateid->si_generation > dsp->ds_stid.si_generation)) ++ status = nfserr_bad_stateid; ++ else if (stateid->si_generation < dsp->ds_stid.si_generation) ++ status = nfserr_old_stateid; ++ ++ if (dsp) ++ 
put_ds_stateid(dsp); ++ ds_unlock_state(); ++ nfs4_lock_state(); ++ dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status)); ++ return status; ++} ++ ++/* ++ * Store the two-word write verifier at @p. If @stateid is non-NULL and a ++ * valid data-server stateid is cached for it, the cached ds_verifier is ++ * used; otherwise the filesystem is asked through ++ * sb->s_pnfs_op->get_verifier(), which is called with the DS state lock ++ * dropped and is dereferenced without a NULL check here. ++ */ ++void ++nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p) ++{ ++ struct pnfs_ds_stateid *dsp = NULL; ++ ++ dprintk("pNFSD: %s --> stid %p\n", __func__, stateid); ++ ++ ds_lock_state(); ++ if (stateid != NULL) { ++ dsp = find_pnfs_ds_stateid(stateid); ++ if (dsp) ++ get_ds_stateid(dsp); ++ } ++ ++ /* XXX: Should we fetch the stateid or wait if some other ++ * thread is currently retrieving the stateid ? */ ++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { ++ *p++ = dsp->ds_verifier[0]; ++ *p++ = dsp->ds_verifier[1]; ++ } else { ++ /* must be on MDS */ ++ ds_unlock_state(); ++ sb->s_pnfs_op->get_verifier(sb, p); ++ ds_lock_state(); ++ p += 2; ++ } ++ /* drop the lookup reference on both paths, not only the valid one */ ++ if (dsp) ++ put_ds_stateid(dsp); ++ ds_unlock_state(); ++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-23 12:08:29.091491685 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 12:09:03.311501496 -0400 +@@ -34,10 +34,14 @@ + */ + #include + #include ++#include ++#include ++#include + + #include "cache.h" + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_PROC + +@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc + * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
+ */ + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); ++#if defined(CONFIG_SPNFS) ++ if (!status && spnfs_enabled()) { ++ struct inode *inode = cstate->current_fh.fh_dentry->d_inode; ++ ++ status = spnfs_open(inode, open); ++ if (status) { ++ dprintk( ++ "nfsd: pNFS could not be enabled for inode: %lu\n", ++ inode->i_ino); ++ /* ++ * XXX When there's a failure then need to indicate to ++ * future ops that no pNFS is available. Should I save ++ * the status in the inode? It's kind of a big hammer. ++ * But there may be no stripes available? ++ */ ++ } ++ } ++#endif /* CONFIG_SPNFS */ + out: + if (open->op_stateowner) { + nfs4_get_stateowner(open->op_stateowner); +@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str + &access->ac_supported); + } + ++static void ++nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf) ++{ ++ u32 *p = (u32 *)verf->data; ++ ++#if defined(CONFIG_PNFSD) ++ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) { ++ nfs4_ds_get_verifier(NULL, sb, p); ++ return; ++ } ++#endif /* CONFIG_PNFSD */ ++ ++ *p++ = nfssvc_boot.tv_sec; ++ *p++ = nfssvc_boot.tv_usec; ++} ++ + static __be32 + nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) + { + __be32 status; + +- u32 *p = (u32 *)commit->co_verf.data; +- *p++ = nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; +- ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &commit->co_verf); + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); + if (status == nfserr_symlink) +@@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru + { + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; +- u32 *p; + __be32 status = nfs_ok; + unsigned long cnt; + +@@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; +- p = (u32 *)write->wr_verifier.data; +- *p++ = 
nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; + ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &write->wr_verifier); ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) { ++ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode, ++ RETURN_FILE, write->wr_offset, write->wr_buflen); ++ if (!status) { ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++ } ++ } else ++#endif ++ ++ if (spnfs_enabled()) { ++ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode, ++ write->wr_offset, write->wr_buflen, write->wr_vlen, ++ rqstp); ++ if (status == nfs_ok) { ++ /* DMXXX: HACK to get filesize set */ ++ /* write one byte at offset+length-1 */ ++ struct kvec k[1]; ++ char zero = 0; ++ unsigned long cnt = 1; ++ ++ k[0].iov_base = (void *)&zero; ++ k[0].iov_len = 1; ++ nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset+write->wr_buflen-1, k, 1, ++ &cnt, &write->wr_how_written); ++ } ++ } else /* we're not an MDS */ ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++#else + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, &write->wr_how_written); ++#endif /* CONFIG_SPNFS */ ++ + if (filp) + fput(filp); + +@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str + return status == nfserr_same ? nfs_ok : status; + } + ++#if defined(CONFIG_PNFSD) ++ ++static __be32 ++nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp, ++ unsigned int layout_type) ++{ ++ int status, type; ++ ++ /* check to see if pNFS is supported. 
*/ ++ status = nfserr_layoutunavailable; ++ if (exp && exp->ex_pnfs == 0) { ++ dprintk("%s: Underlying file system " ++ "is not exported over pNFS\n", __func__); ++ goto out; ++ } ++ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) { ++ dprintk("%s: Underlying file system " ++ "does not support pNFS\n", __func__); ++ goto out; ++ } ++ ++ type = sb->s_pnfs_op->layout_type(sb); ++ ++ /* check to see if requested layout type is supported. */ ++ status = nfserr_unknown_layouttype; ++ if (!type) ++ dprintk("BUG: %s: layout_type 0 is reserved and must not be " ++ "used by filesystem\n", __func__); ++ else if (type != layout_type) ++ dprintk("%s: requested layout type %d " ++ "does not match supported type %d\n", ++ __func__, layout_type, type); ++ else ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevlist(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevlist *gdlp) ++{ ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ int status; ++ ++ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n", ++ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices, ++ gdlp->gd_cookie, gdlp->gd_verf); ++ ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* We must be able to encode at least one device */ ++ if (!gdlp->gd_maxdevices) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ gdlp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Do nothing if underlying file system does not support ++ * getdevicelist */ ++ if (!sb->s_pnfs_op->get_device_iter) { ++ status = nfserr_notsupp; ++ goto out; ++ } ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdlp->gd_fhp = &cstate->current_fh;
++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevinfo(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevinfo *gdp) ++{ ++ struct super_block *sb; ++ int status; ++ clientid_t clid; ++ ++ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n", ++ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid, ++ gdp->gd_devid.devid, gdp->gd_maxcount); ++ ++ status = nfserr_inval; ++ sb = find_sbid_id(gdp->gd_devid.sbid); ++ dprintk("%s: sb %p\n", __func__, sb); ++ if (!sb) { ++ status = nfserr_noent; ++ goto out; ++ } ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdp->gd_sb = sb; ++ ++ /* Update notifications */ ++ copy_clientid(&clid, cstate->session); ++ pnfs_set_device_notify(&clid, gdp->gd_notify_types); ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutget(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lgp->lg_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_badiomode; ++ if (lgp->lg_seg.iomode != IOMODE_READ && ++ lgp->lg_seg.iomode != IOMODE_RW) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lgp->lg_seg.iomode); ++ goto out; ++ } ++ ++ /* Set up arguments so layout can be retrieved at encode time */ ++ lgp->lg_fhp = current_fh; ++ copy_clientid((clientid_t 
*)&lgp->lg_seg.clientid, cstate->session); ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutcommit(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ int status; ++ struct inode *ino = NULL; ++ struct iattr ia; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ dprintk("NFSD: nfsd4_layoutcommit \n"); ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ ino = current_fh->fh_dentry->d_inode; ++ if (!ino) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = ino->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lcp->args.lc_seg.layout_type); ++ if (status) ++ goto out; ++ ++ /* This will only extend the file length. Do a quick ++ * check to see if there is any point in waiting for the update ++ * locks. ++ * TODO: Is this correct for all back ends? 
++ */ ++ dprintk("%s:new offset: %d new size: %llu old size: %lld\n", ++ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1, ++ ino->i_size); ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session); ++ lcp->res.lc_size_chg = 0; ++ if (sb->s_pnfs_op->layout_commit) { ++ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res); ++ dprintk("%s:layout_commit result %d\n", __func__, status); ++ } else { ++ fh_lock(current_fh); ++ if ((lcp->args.lc_newoffset == 0) || ++ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) { ++ status = 0; ++ lcp->res.lc_size_chg = 0; ++ fh_unlock(current_fh); ++ goto out; ++ } ++ ++ /* Try our best to update the file size */ ++ dprintk("%s: Modifying file size\n", __func__); ++ ia.ia_valid = ATTR_SIZE; ++ ia.ia_size = lcp->args.lc_last_wr + 1; ++ status = notify_change(current_fh->fh_dentry, &ia); ++ fh_unlock(current_fh); ++ dprintk("%s:notify_change result %d\n", __func__, status); ++ } ++ ++ if (!status && lcp->res.lc_size_chg && ++ EX_ISSYNC(current_fh->fh_export)) { ++ dprintk("%s: Synchronously writing inode size %llu\n", ++ __func__, ino->i_size); ++ write_inode_now(ino, 1); ++ lcp->res.lc_newsize = i_size_read(ino); ++ } ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutreturn(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lrp->args.lr_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_return_type != 
RETURN_FILE && ++ lrp->args.lr_return_type != RETURN_FSID && ++ lrp->args.lr_return_type != RETURN_ALL) { ++ dprintk("pNFS %s: invalid return_type %d\n", __func__, ++ lrp->args.lr_return_type); ++ goto out; ++ } ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_seg.iomode != IOMODE_READ && ++ lrp->args.lr_seg.iomode != IOMODE_RW && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lrp->args.lr_seg.iomode); ++ goto out; ++ } ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session); ++ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE); ++ status = nfs4_pnfs_return_layout(sb, current_fh, lrp); ++out: ++ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n", ++ __func__, status, lrp->args.lr_return_type, lrp->lrs_present); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * NULL call. + */ +@@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[ + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_RECLAIM_COMPLETE", + }, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICELIST] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevlist, ++ .op_name = "OP_GETDEVICELIST", ++ }, ++ [OP_GETDEVICEINFO] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevinfo, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_GETDEVICEINFO", ++ }, ++ [OP_LAYOUTGET] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutget, ++ .op_name = "OP_LAYOUTGET", ++ }, ++ [OP_LAYOUTCOMMIT] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutcommit, ++ .op_name = "OP_LAYOUTCOMMIT", ++ }, ++ [OP_LAYOUTRETURN] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutreturn, ++ .op_name = "OP_LAYOUTRETURN", ++ }, ++#endif /* CONFIG_PNFSD */ + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-23 12:08:29.093491375 -0400 ++++ 
linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 12:09:03.313491310 -0400 +@@ -42,6 +42,8 @@ + #include "xdr4.h" + #include "vfs.h" + ++#include "pnfsd.h" ++ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +@@ -60,8 +62,6 @@ static u64 current_sessionid = 1; + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + + /* forward declarations */ +-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); + static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; + static void nfs4_set_recdir(char *recdir); + +@@ -69,6 +69,7 @@ static void nfs4_set_recdir(char *recdir + + /* Currently used for almost all code touching nfsv4 state: */ + static DEFINE_MUTEX(client_mutex); ++struct task_struct *client_mutex_owner; + + /* + * Currently used for the del_recall_lru and file hash table. In an +@@ -86,11 +87,21 @@ void + nfs4_lock_state(void) + { + mutex_lock(&client_mutex); ++ client_mutex_owner = current; ++} ++ ++#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current) ++ ++void ++nfs4_bug_on_unlocked_state(void) ++{ ++ BUG_ON(client_mutex_owner != current); + } + + void + nfs4_unlock_state(void) + { ++ client_mutex_owner = NULL; + mutex_unlock(&client_mutex); + } + +@@ -109,7 +120,7 @@ opaque_hashval(const void *ptr, int nbyt + + static struct list_head del_recall_lru; + +-static inline void ++inline void + put_nfs4_file(struct nfs4_file *fi) + { + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { +@@ -120,7 +131,7 @@ put_nfs4_file(struct nfs4_file *fi) + } + } + +-static inline void ++inline void + get_nfs4_file(struct nfs4_file *fi) + { + atomic_inc(&fi->fi_ref); +@@ -230,7 +241,10 @@ nfs4_close_delegation(struct nfs4_delega + * but we want to remove the lease in any case. 
*/ + if (dp->dl_flock) + vfs_setlease(filp, F_UNLCK, &dp->dl_flock); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(filp); ++ nfs4_lock_state(); + } + + /* Called under the state lock. */ +@@ -266,8 +280,8 @@ static DEFINE_SPINLOCK(client_lock); + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * +- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed +- * setclientid_confirmed info. ++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold ++ * confirmed setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * setclientid info. +@@ -292,6 +306,7 @@ static void unhash_generic_stateid(struc + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); ++ release_pnfs_ds_dev_list(stp); + } + + static void free_generic_stateid(struct nfs4_stateid *stp) +@@ -345,7 +360,10 @@ static void release_open_stateid(struct + { + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(stp->st_vfs_file); ++ nfs4_lock_state(); + free_generic_stateid(stp); + } + +@@ -739,6 +757,8 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + ++ BUG_ON_UNLOCKED_STATE(); ++ + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -758,6 +778,7 @@ expire_client(struct nfs4_client *clp) + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } ++ pnfs_expire_client(clp); + nfsd4_set_callback_client(clp, NULL); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); +@@ -770,6 +791,13 @@ expire_client(struct nfs4_client *clp) + spin_unlock(&client_lock); + } + ++void expire_client_lock(struct nfs4_client 
*clp) ++{ ++ nfs4_lock_state(); ++ expire_client(clp); ++ nfs4_unlock_state(); ++} ++ + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) + { + memcpy(target->cl_verifier.data, source->data, +@@ -859,6 +887,11 @@ static struct nfs4_client *create_client + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++ INIT_LIST_HEAD(&clp->cl_layoutrecalls); ++ atomic_set(&clp->cl_deviceref, 0); ++#endif /* CONFIG_PNFSD */ + INIT_LIST_HEAD(&clp->cl_sessions); + INIT_LIST_HEAD(&clp->cl_lru); + clp->cl_time = get_seconds(); +@@ -908,7 +941,7 @@ move_to_confirmed(struct nfs4_client *cl + renew_client(clp); + } + +-static struct nfs4_client * ++struct nfs4_client * + find_confirmed_client(clientid_t *clid) + { + struct nfs4_client *clp; +@@ -978,6 +1011,24 @@ find_unconfirmed_client_by_str(const cha + return NULL; + } + ++int ++filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), ++ void *arg) ++{ ++ struct nfs4_client *clp, *next; ++ int i, status = 0; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i], ++ cl_strhash) { ++ status = func(clp, arg); ++ if (status) ++ return status; /* end the whole scan; break only left this bucket */ ++ } ++ ++ return status; ++} ++ + static void + gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) + { +@@ -1110,8 +1161,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co + static void + nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) + { +- /* pNFS is not supported */ ++#if defined(CONFIG_PNFSD) ++ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS | ++ EXCHGID4_FLAG_USE_PNFS_DS; ++#else /* CONFIG_PNFSD */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; ++#endif /* CONFIG_PNFSD */ + + /* Referrals are supported, Migration is not.
*/ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; +@@ -1301,6 +1356,13 @@ nfsd4_create_session(struct svc_rqst *rq + struct nfsd4_clid_slot *cs_slot = NULL; + int status = 0; + ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ /* XXX hack to get local ip address */ ++ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local, ++ sizeof(pnfsd_lexp_addr)); ++ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); +@@ -1340,25 +1402,26 @@ nfsd4_create_session(struct svc_rqst *rq + cs_slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + +- if (cr_ses->flags & SESSION4_BACK_CHAN) { +- unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; +- svc_xprt_get(rqstp->rq_xprt); +- rpc_copy_addr( +- (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, +- sa); +- unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); +- unconf->cl_cb_conn.cb_minorversion = +- cstate->minorversion; +- unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; +- unconf->cl_cb_seq_nr = 1; +- nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); +- } ++ if (is_ds_only_session(unconf->cl_exchange_flags)) ++ cr_ses->flags &= ~SESSION4_BACK_CHAN; ++ + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + ++ if (cr_ses->flags & SESSION4_BACK_CHAN) { ++ conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; ++ svc_xprt_get(rqstp->rq_xprt); ++ rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa); ++ conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); ++ conf->cl_cb_conn.cb_minorversion = cstate->minorversion; ++ conf->cl_cb_conn.cb_prog = cr_ses->callback_prog; ++ conf->cl_cb_seq_nr = 1; ++ nfsd4_probe_callback(conf, &conf->cl_cb_conn); ++ } ++ + /* + * We do not support RDMA or persistent sessions + */ +@@ -1746,7 +1809,7 @@ out: + + /* OPEN Share state helper functions */ + static inline struct nfs4_file * +-alloc_init_file(struct 
inode *ino) ++alloc_init_file(struct inode *ino, struct svc_fh *current_fh) + { + struct nfs4_file *fp; + unsigned int hashval = file_hashval(ino); +@@ -1760,6 +1823,16 @@ alloc_init_file(struct inode *ino) + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&fp->fi_layouts); ++ INIT_LIST_HEAD(&fp->fi_layout_states); ++ fp->fi_fsid.major = current_fh->fh_export->ex_fsid; ++ fp->fi_fsid.minor = 0; ++ fp->fi_fhlen = current_fh->fh_handle.fh_size; ++ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval)); ++ memcpy(fp->fi_fhval, &current_fh->fh_handle.fh_base, ++ fp->fi_fhlen); ++#endif /* CONFIG_PNFSD */ + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); +@@ -1768,7 +1841,7 @@ alloc_init_file(struct inode *ino) + return NULL; + } + +-static void ++void + nfsd4_free_slab(struct kmem_cache **slab) + { + if (*slab == NULL) +@@ -1784,6 +1857,7 @@ nfsd4_free_slabs(void) + nfsd4_free_slab(&file_slab); + nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); ++ nfsd4_free_pnfs_slabs(); + } + + static int +@@ -1805,6 +1879,8 @@ nfsd4_init_slabs(void) + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_nomem; ++ if (nfsd4_init_pnfs_slabs()) ++ goto out_nomem; + return 0; + out_nomem: + nfsd4_free_slabs(); +@@ -1878,6 +1954,9 @@ init_stateid(struct nfs4_stateid *stp, s + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); + INIT_LIST_HEAD(&stp->st_perfile); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); +@@ -1919,6 +1998,7 @@ find_openstateowner_str(unsigned int has + { + struct nfs4_stateowner *so = NULL; + ++ BUG_ON_UNLOCKED_STATE(); + list_for_each_entry(so, 
&ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) + return so; +@@ -1927,7 +2007,7 @@ find_openstateowner_str(unsigned int has + } + + /* search file_hashtbl[] for file */ +-static struct nfs4_file * ++struct nfs4_file * + find_file(struct inode *ino) + { + unsigned int hashval = file_hashval(ino); +@@ -1945,6 +2025,18 @@ find_file(struct inode *ino) + return NULL; + } + ++struct nfs4_file * ++find_alloc_file(struct inode *ino, struct svc_fh *current_fh) ++{ ++ struct nfs4_file *fp; ++ ++ fp = find_file(ino); ++ if (fp) ++ return fp; ++ ++ return alloc_init_file(ino, current_fh); ++} ++ + static inline int access_valid(u32 x, u32 minorversion) + { + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) +@@ -2503,7 +2595,7 @@ nfsd4_process_open2(struct svc_rqst *rqs + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; + status = nfserr_resource; +- fp = alloc_init_file(ino); ++ fp = alloc_init_file(ino, current_fh); + if (fp == NULL) + goto out; + } +@@ -2730,7 +2822,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct + return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; + } + +-static int ++int + STALE_STATEID(stateid_t *stateid) + { + if (stateid->si_boot == boot_time) +@@ -2740,6 +2832,16 @@ STALE_STATEID(stateid_t *stateid) + return 1; + } + ++__be32 ++nfs4_check_stateid(stateid_t *stateid) ++{ ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ return nfserr_bad_stateid; ++ if (STALE_STATEID(stateid)) ++ return nfserr_stale_stateid; ++ return 0; ++} ++ + static inline int + access_permit_read(unsigned long access_bmap) + { +@@ -2848,6 +2950,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + if (grace_disallows_io(ino)) + return nfserr_grace; + ++#if defined(CONFIG_PNFSD) ++ if (pnfs_fh_is_ds(&current_fh->fh_handle)) { ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ status = nfserr_bad_stateid; ++ else ++#ifdef CONFIG_GFS2_FS_LOCKING_DLM ++ { ++ dprintk("%s 
Don't check DS stateid\n", __func__); ++ return 0; ++ } ++#else /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ status = nfs4_preprocess_pnfs_ds_stateid(current_fh, ++ stateid); ++#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ goto out; ++ } ++#endif /* CONFIG_PNFSD */ ++ + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + +@@ -2924,13 +3044,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + *stpp = NULL; + *sopp = NULL; + +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { +- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); +- return nfserr_bad_stateid; +- } +- +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ return status; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; +@@ -3205,11 +3321,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + nfs4_lock_state(); +- status = nfserr_bad_stateid; +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) +- goto out; +- status = nfserr_stale_stateid; +- if (STALE_STATEID(stateid)) ++ status = nfs4_check_stateid(stateid); ++ if (status) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) +@@ -3238,26 +3351,6 @@ out: + #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) + #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +-static inline u64 +-end_offset(u64 start, u64 len) +-{ +- u64 end; +- +- end = start + len; +- return end >= start ? end: NFS4_MAX_UINT64; +-} +- +-/* last octet in a range */ +-static inline u64 +-last_byte_offset(u64 start, u64 len) +-{ +- u64 end; +- +- BUG_ON(!len); +- end = start + len; +- return end > start ? 
end - 1: NFS4_MAX_UINT64; +-} +- + #define lockownerid_hashval(id) \ + ((id) & LOCK_HASH_MASK) + +@@ -3274,7 +3367,7 @@ static struct list_head lock_ownerid_has + static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; + static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; + +-static struct nfs4_stateid * ++struct nfs4_stateid * + find_stateid(stateid_t *stid, int flags) + { + struct nfs4_stateid *local; +@@ -3303,7 +3396,7 @@ find_stateid(stateid_t *stid, int flags) + return NULL; + } + +-static struct nfs4_delegation * ++struct nfs4_delegation * + find_delegation_stateid(struct inode *ino, stateid_t *stid) + { + struct nfs4_file *fp; +@@ -3436,6 +3529,9 @@ alloc_init_lock_stateid(struct nfs4_stat + INIT_LIST_HEAD(&stp->st_perfile); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); +@@ -3998,6 +4094,9 @@ nfs4_state_init(void) + INIT_LIST_HEAD(&client_lru); + INIT_LIST_HEAD(&del_recall_lru); + reclaim_str_hashtbl_size = 0; ++#if defined(CONFIG_PNFSD) ++ nfs4_pnfs_state_init(); ++#endif /* CONFIG_PNFSD */ + return 0; + } + +@@ -4110,6 +4209,7 @@ __nfs4_state_shutdown(void) + } + + nfsd4_shutdown_recdir(); ++ nfs4_pnfs_state_shutdown(); + nfs4_init = 0; + } + +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 12:09:03.315491356 -0400 +@@ -47,9 +47,14 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +@@ -1234,6 +1239,138 @@ 
nfsd4_decode_sequence(struct nfsd4_compo + DECODE_TAIL; + } + ++#if defined(CONFIG_PNFSD) ++static __be32 ++nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16 + sizeof(nfs4_verifier)); ++ READ32(gdevl->gd_layout_type); ++ READ32(gdevl->gd_maxdevices); ++ READ64(gdevl->gd_cookie); ++ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ u32 num; ++ DECODE_HEAD; ++ ++ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid)); ++ READ64(gdev->gd_devid.sbid); ++ READ64(gdev->gd_devid.devid); ++ READ32(gdev->gd_layout_type); ++ READ32(gdev->gd_maxcount); ++ READ32(num); ++ if (num) { ++ READ_BUF(4); ++ READ32(gdev->gd_notify_types); ++ } else { ++ gdev->gd_notify_types = 0; ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(36); ++ READ32(lgp->lg_signal); ++ READ32(lgp->lg_seg.layout_type); ++ READ32(lgp->lg_seg.iomode); ++ READ64(lgp->lg_seg.offset); ++ READ64(lgp->lg_seg.length); ++ READ64(lgp->lg_minlength); ++ nfsd4_decode_stateid(argp, &lgp->lg_sid); ++ READ_BUF(4); ++ READ32(lgp->lg_maxcount); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ DECODE_HEAD; ++ u32 timechange; ++ ++ READ_BUF(20); ++ READ64(lcp->args.lc_seg.offset); ++ READ64(lcp->args.lc_seg.length); ++ READ32(lcp->args.lc_reclaim); ++ nfsd4_decode_stateid(argp, &lcp->lc_sid); ++ READ_BUF(4); ++ READ32(lcp->args.lc_newoffset); ++ if (lcp->args.lc_newoffset) { ++ READ_BUF(8); ++ READ64(lcp->args.lc_last_wr); ++ } else ++ lcp->args.lc_last_wr = 0; ++ READ_BUF(4); ++ READ32(timechange); ++ if (timechange) { ++ READ_BUF(12); ++ READ64(lcp->args.lc_mtime.seconds); ++ 
READ32(lcp->args.lc_mtime.nseconds); ++ } else { ++ lcp->args.lc_mtime.seconds = 0; ++ lcp->args.lc_mtime.nseconds = 0; ++ } ++ READ_BUF(8); ++ READ32(lcp->args.lc_seg.layout_type); ++ /* XXX: saving XDR'ed layout update. Since we don't have the ++ * current_fh yet, and therefore no export_ops, we can't call ++ * the layout specific decode routines. File and pVFS2 ++ * do not use the layout update.... ++ */ ++ READ32(lcp->args.lc_up_len); ++ if (lcp->args.lc_up_len > 0) { ++ READ_BUF(lcp->args.lc_up_len); ++ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len); ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16); ++ READ32(lrp->args.lr_reclaim); ++ READ32(lrp->args.lr_seg.layout_type); ++ READ32(lrp->args.lr_seg.iomode); ++ READ32(lrp->args.lr_return_type); ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ READ_BUF(16); ++ READ64(lrp->args.lr_seg.offset); ++ READ64(lrp->args.lr_seg.length); ++ nfsd4_decode_stateid(argp, &lrp->lr_sid); ++ READ_BUF(4); ++ READ32(lrp->args.lrf_body_len); ++ if (lrp->args.lrf_body_len > 0) { ++ READ_BUF(lrp->args.lrf_body_len); ++ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len); ++ } ++ } ++ ++ DECODE_TAIL; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) + { +@@ -1335,11 +1472,19 @@ static nfsd4_dec nfsd41_dec_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, ++#else /* 
CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, +@@ -2136,6 +2281,36 @@ out_acl: + } + WRITE64(stat.ino); + } ++#if defined(CONFIG_PNFSD) ++ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { ++ struct super_block *sb = dentry->d_inode->i_sb; ++ int type = 0; ++ ++ /* Query the filesystem for supported pNFS layout types. ++ * Currently, we only support one layout type per file system. ++ * The export_ops->layout_type() returns the pnfs_layouttype4. ++ */ ++ buflen -= 4; ++ if (buflen < 0) /* length */ ++ goto out_resource; ++ ++ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type) ++ type = sb->s_pnfs_op->layout_type(sb); ++ if (type) { ++ if ((buflen -= 4) < 0) /* type */ ++ goto out_resource; ++ WRITE32(1); /* length */ ++ WRITE32(type); /* type */ ++ } else ++ WRITE32(0); /* length */ ++ } ++ ++ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ if ((buflen -= 4) < 0) ++ goto out_resource; ++ WRITE32(stat.blksize); ++ } ++#endif /* CONFIG_PNFSD */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); +@@ -2366,6 +2541,10 @@ nfsd4_encode_commit(struct nfsd4_compoun + if (!nfserr) { + RESERVE_SPACE(8); + WRITEMEM(commit->co_verf.data, 8); ++ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n", ++ ((u32 *)(&commit->co_verf.data))[0], ++ ((u32 *)(&commit->co_verf.data))[1]); ++ + ADJUST_ARGS(); + } + return nfserr; +@@ -2620,9 +2799,20 @@ nfsd4_encode_read(struct nfsd4_compoundr + } + read->rd_vlen = v; + ++#if defined(CONFIG_SPNFS) ++ if (spnfs_enabled()) ++ nfserr = 
spnfs_read(read->rd_fhp->fh_dentry->d_inode, ++ read->rd_offset, &maxcount, read->rd_vlen, ++ resp->rqstp); ++ else /* we're not an MDS */ ++ nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, ++ read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, ++ &maxcount); ++#else + nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); ++#endif /* CONFIG_SPNFS */ + + if (nfserr == nfserr_symlink) + nfserr = nfserr_inval; +@@ -2926,6 +3116,9 @@ nfsd4_encode_write(struct nfsd4_compound + WRITE32(write->wr_bytes_written); + WRITE32(write->wr_how_written); + WRITEMEM(write->wr_verifier.data, 8); ++ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n", ++ ((u32 *)(&write->wr_verifier.data))[0], ++ ((u32 *)(&write->wr_verifier.data))[1]); + ADJUST_ARGS(); + } + return nfserr; +@@ -3069,6 +3262,343 @@ nfsd4_encode_sequence(struct nfsd4_compo + return 0; + } + ++#if defined(CONFIG_PNFSD) ++ ++/* Uses the export interface to iterate through the available devices ++ * and encodes them on the response stream. 
++ */ ++static __be32 ++nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp, ++ struct nfsd4_pnfs_getdevlist *gdevl, ++ unsigned int *dev_count) ++{ ++ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb; ++ __be32 nfserr; ++ int status; ++ __be32 *p; ++ struct nfsd4_pnfs_dev_iter_res res = { ++ .gd_cookie = gdevl->gd_cookie, ++ .gd_verf = gdevl->gd_verf, ++ .gd_eof = 0 ++ }; ++ u64 sbid; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ sbid = find_create_sbid(sb); ++ *dev_count = 0; ++ do { ++ status = sb->s_pnfs_op->get_device_iter(sb, ++ gdevl->gd_layout_type, ++ &res); ++ if (status) { ++ if (status == -ENOENT) { ++ res.gd_eof = 1; ++ /* return success */ ++ break; ++ } ++ nfserr = nfserrno(status); ++ goto out_err; ++ } ++ ++ /* Encode device id and layout type */ ++ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid)); ++ WRITE64((__be64)sbid); ++ WRITE64(res.gd_devid); /* devid minor */ ++ ADJUST_ARGS(); ++ (*dev_count)++; ++ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof); ++ gdevl->gd_cookie = res.gd_cookie; ++ gdevl->gd_verf = res.gd_verf; ++ gdevl->gd_eof = res.gd_eof; ++ nfserr = nfs_ok; ++out_err: ++ dprintk("%s: Encoded %u devices\n", __func__, *dev_count); ++ return nfserr; ++} ++ ++/* Encodes the response of get device list. 
++*/ ++static __be32 ++nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ unsigned int dev_count = 0, lead_count; ++ u32 *p_in = resp->p; ++ __be32 *p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ /* Ensure we have room for cookie, verifier, and devlist len, ++ * which we will backfill in after we encode as many devices as possible ++ */ ++ lead_count = 8 + sizeof(nfs4_verifier) + 4; ++ RESERVE_SPACE(lead_count); ++ /* skip past these values */ ++ p += XDR_QUADLEN(lead_count); ++ ADJUST_ARGS(); ++ ++ /* Iterate over as many device ids as possible on the xdr stream */ ++ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count); ++ if (nfserr) ++ goto out_err; ++ ++ /* Backfill in cookie, verf and number of devices encoded */ ++ p = p_in; ++ WRITE64(gdevl->gd_cookie); ++ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ WRITE32(dev_count); ++ ++ /* Skip over devices */ ++ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid)); ++ ADJUST_ARGS(); ++ ++ /* are we at the end of devices? */ ++ RESERVE_SPACE(4); ++ WRITE32(gdevl->gd_eof); ++ ADJUST_ARGS(); ++ ++ dprintk("%s: done.\n", __func__); ++ ++ nfserr = nfs_ok; ++out: ++ return nfserr; ++out_err: ++ p = p_in; ++ ADJUST_ARGS(); ++ goto out; ++} ++ ++/* For a given device id, have the file system retrieve and encode the ++ * associated device. For file layout, the encoding function is ++ * passed down to the file system. The file system then has the option ++ * of using this encoding function or one of its own. ++ * ++ * Note: the file system must return the XDR size of struct device_addr4 ++ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the ++ * gdir_mincount calculation. 
++ */ ++static __be32 ++nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ struct super_block *sb; ++ int maxcount = 0, type_notify_len = 12; ++ __be32 *p, *p_save = NULL, *p_in = resp->p; ++ struct exp_xdr_stream xdr; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = gdev->gd_sb; ++ ++ if (gdev->gd_maxcount != 0) { ++ /* FIXME: this will be bound by the session max response */ ++ maxcount = svc_max_payload(resp->rqstp); ++ if (maxcount > gdev->gd_maxcount) ++ maxcount = gdev->gd_maxcount; ++ ++ /* Ensure have room for type and notify field */ ++ maxcount -= type_notify_len; ++ if (maxcount < 0) { ++ nfserr = -ETOOSMALL; ++ goto toosmall; ++ } ++ } ++ ++ RESERVE_SPACE(4); ++ WRITE32(gdev->gd_layout_type); ++ ADJUST_ARGS(); ++ ++ /* If maxcount is 0 then just update notifications */ ++ if (gdev->gd_maxcount == 0) ++ goto handle_notifications; ++ ++ xdr.p = p_save = resp->p; ++ xdr.end = resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type, ++ &gdev->gd_devid); ++ if (nfserr) ++ goto err; ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ BUG_ON(xdr.p > xdr.end); ++ ++ /* Update the xdr stream with the number of bytes encoded ++ * by the file system. 
++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++handle_notifications: ++ /* Encode supported device notifications */ ++ RESERVE_SPACE(4); ++ if (sb->s_pnfs_op->set_device_notify) { ++ struct pnfs_devnotify_arg dn_args; ++ ++ dn_args.dn_layout_type = gdev->gd_layout_type; ++ dn_args.dn_devid = gdev->gd_devid; ++ dn_args.dn_notify_types = gdev->gd_notify_types; ++ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args); ++ if (nfserr) ++ goto err; ++ WRITE32(dn_args.dn_notify_types); ++ } else { ++ WRITE32(0); ++ } ++ ADJUST_ARGS(); ++ ++out: ++ return nfserrno(nfserr); ++toosmall: ++ dprintk("%s: maxcount too small\n", __func__); ++ RESERVE_SPACE(4); ++ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len); ++ ADJUST_ARGS(); ++ goto out; ++err: ++ /* Rewind to the beginning */ ++ p = p_in; ++ ADJUST_ARGS(); ++ if (nfserr == -ETOOSMALL) ++ goto toosmall; ++ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr); ++ goto out; ++} ++ ++static __be32 ++nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, ++ __be32 nfserr, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int maxcount, leadcount; ++ struct super_block *sb; ++ struct exp_xdr_stream xdr; ++ __be32 *p, *p_save, *p_start = resp->p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb; ++ maxcount = PAGE_SIZE; ++ if (maxcount > lgp->lg_maxcount) ++ maxcount = lgp->lg_maxcount; ++ ++ /* Check for space on xdr stream */ ++ leadcount = 36 + sizeof(stateid_opaque_t); ++ RESERVE_SPACE(leadcount); ++ /* encode layout metadata after file system encodes layout */ ++ p += XDR_QUADLEN(leadcount); ++ ADJUST_ARGS(); ++ ++ /* Ensure have room for ret_on_close, off, len, iomode, type */ ++ maxcount -= leadcount; ++ if (maxcount < 0) { ++ printk(KERN_ERR "%s: buffer too small\n", __func__); ++ nfserr = nfserr_toosmall; ++ goto err; ++ } ++ ++ /* Set xdr info so file system can encode layout */ ++ xdr.p = p_save = resp->p; ++ xdr.end = 
resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ /* Retrieve, encode, and merge layout; process stateid */ ++ nfserr = nfs4_pnfs_get_layout(lgp, &xdr); ++ if (nfserr) ++ goto err; ++ ++ /* Ensure file system returned enough bytes for the client ++ * to access. ++ */ ++ if (lgp->lg_seg.length < lgp->lg_minlength) { ++ nfserr = nfserr_badlayout; ++ goto err; ++ } ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ ++ /* Rewind to beginning and encode attrs */ ++ resp->p = p_start; ++ RESERVE_SPACE(4); ++ WRITE32(lgp->lg_roc); /* return on close */ ++ ADJUST_ARGS(); ++ nfsd4_encode_stateid(resp, &lgp->lg_sid); ++ RESERVE_SPACE(28); ++ /* Note: response logr_layout array count, always one for now */ ++ WRITE32(1); ++ WRITE64(lgp->lg_seg.offset); ++ WRITE64(lgp->lg_seg.length); ++ WRITE32(lgp->lg_seg.iomode); ++ WRITE32(lgp->lg_seg.layout_type); ++ ++ /* Update the xdr stream with the number of bytes written ++ * by the file system ++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++ return nfs_ok; ++err: ++ resp->p = p_start; ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lcp->res.lc_size_chg); ++ ADJUST_ARGS(); ++ if (lcp->res.lc_size_chg) { ++ RESERVE_SPACE(8); ++ WRITE64(lcp->res.lc_newsize); ++ ADJUST_ARGS(); ++ } ++out: ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lrp->lrs_present != 0); /* got stateid? 
*/ ++ ADJUST_ARGS(); ++ if (lrp->lrs_present) ++ nfsd4_encode_stateid(resp, &lrp->lr_sid); ++out: ++ return nfserr; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) + { +@@ -3129,11 +3659,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, +diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-23 12:08:29.094491943 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 12:09:03.317501495 -0400 +@@ -13,10 +13,15 @@ + #include + #include + #include ++#include + + #include "nfsd.h" + #include "cache.h" + ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++#include ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + /* + * We have a single directory with 9 nodes in it. 
+ */ +@@ -49,6 +54,9 @@ enum { + NFSD_Gracetime, + NFSD_RecoveryDir, + #endif ++#ifdef CONFIG_PNFSD ++ NFSD_pnfs_dlm_device, ++#endif + }; + + /* +@@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct fi + static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif ++#ifdef CONFIG_PNFSD ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size); ++#endif + + static ssize_t (*write_op[])(struct file *, char *, size_t) = { + [NFSD_Svc] = write_svc, +@@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device, ++#endif + }; + + static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +@@ -1349,6 +1363,68 @@ static ssize_t write_recoverydir(struct + + #endif + ++#ifdef CONFIG_PNFSD ++ ++static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf, ++ size_t size) ++{ ++ char *mesg = buf; ++ char *pnfs_dlm_device; ++ int max_size = NFSD_PNFS_DLM_DEVICE_MAX; ++ int len, ret = 0; ++ ++ if (size > 0) { ++ ret = -EINVAL; ++ if (size > max_size || buf[size-1] != '\n') ++ return ret; ++ buf[size-1] = 0; ++ ++ pnfs_dlm_device = mesg; ++ len = qword_get(&mesg, pnfs_dlm_device, size); ++ if (len <= 0) ++ return ret; ++ ++ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len); ++ } else ++ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT); ++ ++ return ret <= 0 ? 
ret : strlen(buf); ++} ++ ++/** ++ * write_pnfs_dlm_device - Set or report the current pNFS data server list ++ * ++ * Input: ++ * buf: ignored ++ * size: zero ++ * ++ * OR ++ * ++ * Input: ++ * buf: C string containing a block device name, ++ * a colon, and then a comma separated ++ * list of pNFS data server IPv4 addresses ++ * size: non-zero length of C string in @buf ++ * Output: ++ * On success: passed-in buffer filled with '\n'-terminated C ++ * string containing a block device name, a colon, and ++ * then a comma separated list of pNFS ++ * data server IPv4 addresses. ++ * return code is the size in bytes of the string ++ * On error: return code is a negative errno value ++ */ ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __write_pnfs_dlm_device(file, buf, size); ++ mutex_unlock(&nfsd_mutex); ++ return rv; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ + /*----------------------------------------------------------------------------*/ + /* + * populating the filesystem. 
+@@ -1383,6 +1459,10 @@ static int nfsd_fill_super(struct super_ + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops, ++ S_IWUSR|S_IRUSR}, ++#endif + /* last one */ {""} + }; + return simple_fill_super(sb, 0x6e667364, nfsd_files); +@@ -1421,6 +1501,9 @@ static int create_proc_exports_entry(voi + } + #endif + ++#if defined(CONFIG_SPNFS_BLOCK) ++int nfsd_bl_init(void); ++#endif + static int __init init_nfsd(void) + { + int retval; +@@ -1443,6 +1526,15 @@ static int __init init_nfsd(void) + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ retval = spnfs_init_proc(); ++ if (retval != 0) ++ goto out_free_idmap; ++#if defined(CONFIG_SPNFS_BLOCK) ++ nfsd_bl_init(); ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_all; +@@ -1465,7 +1557,22 @@ out_free_stat: + + static void __exit exit_nfsd(void) + { ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ remove_proc_entry("fs/nfs/spnfs/recall", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/getfh", NULL); ++ remove_proc_entry("fs/nfs/spnfs/config", NULL); ++ remove_proc_entry("fs/nfs/spnfs/ctl", NULL); ++ remove_proc_entry("fs/nfs/spnfs", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ + nfsd_export_shutdown(); ++ nfsd4_pnfs_dlm_shutdown(); + nfsd_reply_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); + 
remove_proc_entry("fs/nfs", NULL); +diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-23 12:08:29.095491390 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 12:09:03.318355741 -0400 +@@ -285,11 +285,17 @@ extern time_t nfsd4_grace; + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + ++#if defined(CONFIG_PNFSD) ++#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ ++ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES) ++#else /* CONFIG_PNFSD */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 ++#endif /* CONFIG_PNFSD */ + + #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ +- (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) ++ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \ ++ FATTR4_WORD2_LAYOUT_BLKSIZE) + + static inline u32 nfsd_suppattrs0(u32 minorversion) + { +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-23 12:09:03.319511586 -0400 +@@ -10,6 +10,7 @@ + #include + + #include ++#include + #include "nfsd.h" + #include "vfs.h" + #include "auth.h" +@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s + static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) + { + struct knfsd_fh *fh = &fhp->fh_handle; ++ int fsid_type; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; +@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct + return error; + if (fh->fh_auth_type != 0) + return error; +- len = key_len(fh->fh_fsid_type) / 4; ++ fsid_type = pnfs_fh_fsid_type(fh); ++ len = key_len(fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { +@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct + data_left -= len; + if (data_left < 0) + 
return error; +- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); ++ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth); + fid = (struct fid *)(fh->fh_auth + len); + } else { + __u32 tfh[2]; +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-23 12:09:03.319511586 -0400 +@@ -14,6 +14,7 @@ enum nfsd_fsid { + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, ++ FSID_MAX + }; + + enum fsid_source { +@@ -205,4 +206,42 @@ fh_unlock(struct svc_fh *fhp) + } + } + ++#if defined(CONFIG_PNFSD) ++ ++/* ++ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied ++ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how ++ * to handle a given stateid. ++ */ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return fh->fh_fsid_type >= FSID_MAX; ++} ++ ++static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh) ++{ ++ BUG_ON(fh->fh_version != 1); ++ BUG_ON(pnfs_fh_is_ds(fh)); ++ fh->fh_fsid_type += FSID_MAX; ++} ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ ++/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). 
*/ ++static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh) ++{ ++ int fsid_type = fh->fh_fsid_type; ++ ++ if (pnfs_fh_is_ds(fh)) ++ return fsid_type - FSID_MAX; ++ return fsid_type; ++} ++ + #endif /* _LINUX_NFSD_FH_INT_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-23 12:08:27.631563969 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-23 12:09:03.320416974 -0400 +@@ -115,7 +115,7 @@ struct svc_program nfsd_program = { + + }; + +-u32 nfsd_supported_minorversion; ++u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION; + + int nfsd_vers(int vers, enum vers_op change) + { +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-23 12:09:03.321376171 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-23 12:09:03.321376171 -0400 +@@ -0,0 +1,143 @@ ++/* ++ * Copyright (c) 2005 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#ifndef LINUX_NFSD_PNFSD_H ++#define LINUX_NFSD_PNFSD_H ++ ++#include ++#include ++ ++#include "state.h" ++#include "xdr4.h" ++ ++/* outstanding layout stateid */ ++struct nfs4_layout_state { ++ struct list_head ls_perfile; ++ struct list_head ls_layouts; /* list of nfs4_layouts */ ++ struct kref ls_ref; ++ struct nfs4_client *ls_client; ++ struct nfs4_file *ls_file; ++ stateid_t ls_stateid; ++}; ++ ++/* outstanding layout */ ++struct nfs4_layout { ++ struct list_head lo_perfile; /* hash by f_id */ ++ struct list_head lo_perclnt; /* hash by clientid */ ++ struct list_head lo_perstate; ++ struct nfs4_file *lo_file; /* backpointer */ ++ struct nfs4_client *lo_client; ++ struct nfs4_layout_state *lo_state; ++ struct nfsd4_layout_seg lo_seg; ++}; ++ ++struct pnfs_inval_state { ++ struct knfsd_fh mdsfh; /* needed only by invalidate all */ ++ stateid_t stid; ++ clientid_t clid; ++ u32 status; ++}; ++ ++/* pNFS Data Server state */ ++#define DS_STATEID_VALID 0 ++#define DS_STATEID_ERROR 1 ++#define DS_STATEID_NEW 2 ++ ++struct pnfs_ds_stateid { ++ struct list_head ds_hash; /* ds_stateid hash entry */ ++ struct list_head ds_perclid; /* per client hash entry */ ++ stateid_t ds_stid; ++ struct knfsd_fh 
ds_fh; ++ unsigned long ds_access; ++ u32 ds_status; /* from MDS */ ++ u32 ds_verifier[2]; /* from MDS */ ++ wait_queue_head_t ds_waitq; ++ unsigned long ds_flags; ++ struct kref ds_ref; ++ clientid_t ds_mdsclid; ++}; ++ ++struct pnfs_ds_clientid { ++ struct list_head dc_hash; /* mds_clid_hashtbl entry */ ++ struct list_head dc_stateid; /* ds_stateid head */ ++ struct list_head dc_permdsid; /* per mdsid hash entry */ ++ clientid_t dc_mdsclid; ++ struct kref dc_ref; ++ uint32_t dc_mdsid; ++}; ++ ++struct pnfs_mds_id { ++ struct list_head di_hash; /* mds_nodeid list entry */ ++ struct list_head di_mdsclid; /* mds_clientid head */ ++ uint32_t di_mdsid; ++ time_t di_mdsboot; /* mds boot time */ ++ struct kref di_ref; ++}; ++ ++/* notify device request (from exported filesystem) */ ++struct nfs4_notify_device { ++ struct nfsd4_pnfs_cb_dev_list *nd_list; ++ struct nfs4_client *nd_client; ++ struct list_head nd_perclnt; ++ ++ void *nd_args; /* nfsd internal */ ++}; ++ ++u64 find_create_sbid(struct super_block *); ++struct super_block *find_sbid_id(u64); ++__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *); ++int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *, ++ struct nfsd4_pnfs_layoutreturn *); ++int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *); ++int nfs4_pnfs_cb_change_state(struct pnfs_get_state *); ++void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++int put_layoutrecall(struct nfs4_layoutrecall *); ++void nomatching_layout(struct nfs4_layoutrecall *); ++void *layoutrecall_done(struct nfs4_layoutrecall *); ++int nfsd4_cb_layout(struct nfs4_layoutrecall *); ++int nfsd_layout_recall_cb(struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++int nfsd_device_notify_cb(struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++int nfsd4_cb_notify_device(struct nfs4_notify_device *); ++void pnfs_set_device_notify(clientid_t *, unsigned int types); ++void 
pnfs_clear_device_notify(struct nfs4_client *); ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++extern struct sockaddr pnfsd_lexp_addr; ++extern size_t pnfs_lexp_addr_len; ++ ++extern void pnfsd_lexp_init(struct inode *); ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#endif /* LINUX_NFSD_PNFSD_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-23 12:09:03.321376171 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-23 12:09:03.322501672 -0400 +@@ -0,0 +1,225 @@ ++/* ++ * linux/fs/nfsd/pnfs_lexp.c ++ * ++ * pNFS export of local filesystems. ++ * ++ * Export local file systems over the files layout type. ++ * The MDS (metadata server) functions also as a single DS (data server). ++ * This is mostly useful for development and debugging purposes. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * Copyright (C) 2008 Benny Halevy, ++ * ++ * Initial implementation was based on the pnfs-gfs2 patches done ++ * by David M. 
Richter ++ */ ++ ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++struct sockaddr pnfsd_lexp_addr; ++size_t pnfs_lexp_addr_len; ++ ++static int ++pnfsd_lexp_layout_type(struct super_block *sb) ++{ ++ int ret = LAYOUT_NFSV4_1_FILES; ++ dprintk("<-- %s: return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++pnfsd_lexp_get_device_iter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ ++ dprintk("<-- %s: return 0\n", __func__); ++ return 0; ++} ++ ++static int ++pnfsd_lexp_get_device_info(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_multipath fl_devices[1]; ++ u32 fl_stripe_indices[1] = { 0 }; ++ struct pnfs_filelayout_devaddr daddr; ++ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */ ++ char daddr_buf[8*4 + 2*3 + 10]; ++ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ memset(&fdev, '\0', sizeof(fdev)); ++ ++ if (devid->devid != 1) { ++ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 " ++ "(got: 0x%llx)\n", __func__, devid->devid); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ /* count the number of comma-delimited DS IPs */ ++ fdev.fl_device_length = 1; ++ fdev.fl_device_list = fl_devices; ++ ++ fdev.fl_stripeindices_length = fdev.fl_device_length; ++ fdev.fl_stripeindices_list = fl_stripe_indices; ++ ++ daddr.r_addr.data = daddr_buf; ++ daddr.r_addr.len = sizeof(daddr_buf); ++ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr); ++ if (err < 0) ++ goto out; ++ daddr.r_addr.len = err; ++ switch (pnfsd_lexp_addr.sa_family) 
{
++	case AF_INET:
++		daddr.r_netid.data = "tcp";
++		daddr.r_netid.len = 3;
++		break;
++	case AF_INET6:
++		daddr.r_netid.data = "tcp6";
++		daddr.r_netid.len = 4;
++		break;
++	default:
++		BUG();
++	}
++	fdev.fl_device_list[0].fl_multipath_length = 1;
++	fdev.fl_device_list[0].fl_multipath_list = &daddr;
++
++	/* have nfsd encode the device info */
++	err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++	dprintk("<-- %s: return %d\n", __func__, err);
++	return err;
++}
++
++/*
++ * Pick the stripe unit: the largest multiple of @blocksize that fits in
++ * NFSSVC_MAXBLKSIZE, or @blocksize itself when it is already that large.
++ * A non-positive @blocksize is returned unchanged rather than used as a
++ * divisor.
++ */
++static int get_stripe_unit(int blocksize)
++{
++	if (blocksize > 0 && blocksize < NFSSVC_MAXBLKSIZE)
++		blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++	dprintk("%s: return %d\n", __func__, blocksize);
++	return blocksize;
++}
++
++/*
++ * Build and encode a whole-file files-layout whose single data-server
++ * filehandle is the MDS filehandle marked as a DS filehandle.
++ *
++ * Returns an nfsstat4 value.  Allocation failures are reported as
++ * NFS4ERR_LAYOUTTRYLATER (a negative errno is not a valid nfsstat4), and
++ * in that case the returned segment length is reset to 0.
++ */
++static enum nfsstat4
++pnfsd_lexp_layout_get(struct inode *inode,
++		      struct exp_xdr_stream *xdr,
++		      const struct nfsd4_pnfs_layoutget_arg *arg,
++		      struct nfsd4_pnfs_layoutget_res *res)
++{
++	enum nfsstat4 rc = NFS4_OK;
++	struct pnfs_filelayout_layout *layout = NULL;
++	struct knfsd_fh *fhp = NULL;
++
++	dprintk("--> %s: inode=%p\n", __func__, inode);
++
++	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	res->lg_seg.offset = 0;
++	res->lg_seg.length = NFS4_MAX_UINT64;
++
++	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++	if (layout == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++	/* Set file layout response args */
++	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++	layout->lg_stripe_type = STRIPE_SPARSE;
++	layout->lg_commit_through_mds = true;
++	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++	layout->lg_fh_length = 1;
++	layout->device_id.sbid = arg->lg_sbid;
++	layout->device_id.devid = 1;	/*FSFTEMP*/
++	layout->lg_first_stripe_index = 0;	/*FSFTEMP*/
++	layout->lg_pattern_offset = 0;
++
++	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++	if (fhp == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++	memcpy(fhp, arg->lg_fh, sizeof(*fhp));
++	pnfs_fh_mark_ds(fhp);
++	layout->lg_fh_list = fhp;
++
++	/* Call nfsd to encode layout */
++	rc
= filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ dprintk("<-- %s: return %d\n", __func__, rc); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++pnfsd_lexp_layout_commit(struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int ++pnfsd_lexp_layout_return(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh, ++ struct pnfs_get_state *p) ++{ ++ return 0; /* just use the current stateid */ ++} ++ ++static struct pnfs_export_operations pnfsd_lexp_ops = { ++ .layout_type = pnfsd_lexp_layout_type, ++ .get_device_info = pnfsd_lexp_get_device_info, ++ .get_device_iter = pnfsd_lexp_get_device_iter, ++ .layout_get = pnfsd_lexp_layout_get, ++ .layout_commit = pnfsd_lexp_layout_commit, ++ .layout_return = pnfsd_lexp_layout_return, ++ .get_state = pnfsd_lexp_get_state, ++}; ++ ++void ++pnfsd_lexp_init(struct inode *inode) ++{ ++ dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops); ++ inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c +--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-23 12:09:03.322501672 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-23 12:09:03.323511608 -0400 +@@ -0,0 +1,535 @@ ++/* ++ * fs/nfsd/spnfs_com.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * Based heavily on idmap.c ++ * ++ */ ++ ++/* ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Marius Aamodt Eriksen ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++
++#define NFSDDBG_FACILITY	NFSDDBG_PROC
++
++static ssize_t   spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *,
++		     char __user *, size_t);
++static ssize_t   spnfs_pipe_downcall(struct file *, const char __user *,
++		     size_t);
++static void      spnfs_pipe_destroy_msg(struct rpc_pipe_msg *);
++
++static struct rpc_pipe_ops spnfs_upcall_ops = {
++	.upcall		= spnfs_pipe_upcall,
++	.downcall	= spnfs_pipe_downcall,
++	.destroy_msg	= spnfs_pipe_destroy_msg,
++};
++
++/* evil global variable */
++struct spnfs *global_spnfs;
++struct spnfs_config *spnfs_config;
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++int spnfs_use_layoutsegments;
++uint64_t layoutsegment_size;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++
++/*
++ * Used by spnfs_enabled()
++ * Tracks if the subsystem has been initialized at some point.  It doesn't
++ * matter if it's not currently initialized.
++ */
++static int spnfs_enabled_at_some_point;
++
++/*
++ * Create the "spnfs" rpc_pipefs pipe and publish it in global_spnfs.
++ * Coded as if the global variable were going to be removed in the future.
++ *
++ * Returns 0 on success, -EEXIST if a pipe is already set up, or a negative
++ * errno.  On failure the rpc_pipefs mount reference, the looked-up path
++ * and the spnfs allocation are all released.
++ */
++int
++nfsd_spnfs_new(void)
++{
++	struct spnfs *spnfs = NULL;
++	struct path path;
++	struct nameidata nd;
++	int rc;
++
++	if (global_spnfs != NULL)
++		return -EEXIST;
++
++	path.mnt = rpc_get_mount();
++	if (IS_ERR(path.mnt))
++		return PTR_ERR(path.mnt);
++
++	/* FIXME: do not abuse rpc_pipefs/nfs */
++	rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
++	if (rc)
++		goto err;
++
++	spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL);
++	if (spnfs == NULL) {
++		rc = -ENOMEM;
++		goto err_put_path;
++	}
++
++	/*
++	 * Initialise the locks and the wait queue before rpc_mkpipe(): once
++	 * the pipe exists a downcall may use them.
++	 */
++	mutex_init(&spnfs->spnfs_lock);
++	mutex_init(&spnfs->spnfs_plock);
++	init_waitqueue_head(&spnfs->spnfs_wq);
++
++	spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs,
++					 &spnfs_upcall_ops, 0);
++	if (IS_ERR(spnfs->spnfs_dentry)) {
++		rc = -EPIPE;
++		goto err_put_path;
++	}
++
++	/* the references taken by vfs_path_lookup() are no longer needed */
++	path_put(&nd.path);
++
++	global_spnfs = spnfs;
++	spnfs_enabled_at_some_point = 1;
++
++	return 0;
++
++err_put_path:
++	path_put(&nd.path);
++err:
++	rpc_put_mount();
++	kfree(spnfs);
++	return rc;
++}
++
++/*
++ * Tear down the pipe created by nfsd_spnfs_new(); a no-op when none exists.
++ * Again coded as if the global variable were going to be removed.
++ */
++void
++nfsd_spnfs_delete(void)
++{
++	struct spnfs *spnfs = global_spnfs;
++
++	if (!spnfs)
++		return;
++	rpc_unlink(spnfs->spnfs_dentry);
++	rpc_put_mount();
++	global_spnfs = NULL;
++	kfree(spnfs);
++}
++
++/* RPC pipefs upcall/downcall routines */
++/* looks like this code is invoked by the rpc_pipe code */
++/* to handle upcalls on things we've queued elsewhere */
++/* See nfs_idmap_id for an example of enqueueing */
++/*
++ * Copy as much of the pending message as fits in the reader's buffer.
++ * Returns the number of bytes copied, or -EFAULT when nothing could be
++ * copied.  copy_to_user() returns the number of bytes NOT copied and is
++ * never negative, so failure is detected by comparing it with the
++ * requested length.
++ */
++static ssize_t
++spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
++		  char __user *dst, size_t buflen)
++{
++	char *data = (char *)msg->data + msg->copied;
++	ssize_t mlen = msg->len - msg->copied;
++	ssize_t left;
++
++	if (mlen > buflen)
++		mlen = buflen;
++
++	left = copy_to_user(dst, data, mlen);
++	if (mlen > 0 && left == mlen) {
++		msg->errno = -EFAULT;
++		return -EFAULT;
++	}
++	mlen -= left;
++	msg->copied += mlen;
++	msg->errno = 0;
++	return mlen;
++}
++
++/*
++ * Receive the daemon's reply.  The write must be exactly one
++ * struct spnfs_msg; the reply is staged in a temporary buffer, which is
++ * freed on every exit path.
++ */
++static ssize_t
++spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
++{
++	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
++	struct spnfs *spnfs = (struct spnfs *)rpci->private;
++	struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im;
++	int ret;
++
++	if (mlen != sizeof(struct spnfs_msg))
++		return -ENOSPC;
++
++	im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im_in == NULL)
++		return -ENOMEM;
++
++	if (copy_from_user(im_in, src, mlen) != 0) {
++		kfree(im_in);
++		return -EFAULT;
++	}
++
++	mutex_lock(&spnfs->spnfs_plock);
++
++	ret = mlen;
++	im->im_status = im_in->im_status;
++	/* If we got an error, terminate now, and wake up pending upcalls */
++	if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) {
++		wake_up(&spnfs->spnfs_wq);
++		goto out;
++	}
++
++	ret = -EINVAL;
++	/* Did we match the current upcall?
*/ ++ /* DMXXX: do not understand the comment above, from original code */ ++ /* DMXXX: when do we _not_ match the current upcall? */ ++ /* DMXXX: anyway, let's to a simplistic check */ ++ if (im_in->im_type == im->im_type) { ++ /* copy the response into the spnfs struct */ ++ memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res)); ++ ret = mlen; ++ } else ++ dprintk("spnfs: downcall type != upcall type\n"); ++ ++ ++ wake_up(&spnfs->spnfs_wq); ++/* DMXXX handle rval processing */ ++out: ++ mutex_unlock(&spnfs->spnfs_plock); ++ kfree(im_in); ++ return ret; ++} ++ ++static void ++spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ struct spnfs_msg *im = msg->data; ++ struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im); ++ ++ if (msg->errno >= 0) ++ return; ++ mutex_lock(&spnfs->spnfs_plock); ++ im->im_status = SPNFS_STATUS_FAIL; /* DMXXX */ ++ wake_up(&spnfs->spnfs_wq); ++ mutex_unlock(&spnfs->spnfs_plock); ++} ++ ++/* generic upcall. called by functions in spnfs_ops.c */ ++int ++spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg, ++ union spnfs_msg_res *res) ++{ ++ struct rpc_pipe_msg msg; ++ struct spnfs_msg *im; ++ DECLARE_WAITQUEUE(wq, current); ++ int ret = -EIO; ++ int rval; ++ ++ im = &spnfs->spnfs_im; ++ ++ mutex_lock(&spnfs->spnfs_lock); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ memset(im, 0, sizeof(*im)); ++ memcpy(im, upmsg, sizeof(*upmsg)); ++ ++ memset(&msg, 0, sizeof(msg)); ++ msg.data = im; ++ msg.len = sizeof(*im); ++ ++ add_wait_queue(&spnfs->spnfs_wq, &wq); ++ rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&spnfs->spnfs_plock); ++ schedule(); ++ current->state = TASK_RUNNING; ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ if (im->im_status & SPNFS_STATUS_SUCCESS) { ++ /* copy our result from the upcall */ ++ memcpy(res, 
&im->im_res, sizeof(*res));
++		ret = 0;
++	}
++
++out:
++	memset(im, 0, sizeof(*im));
++	mutex_unlock(&spnfs->spnfs_plock);
++	mutex_unlock(&spnfs->spnfs_lock);
++	return(ret);
++}
++
++/*
++ * This is used to determine if the spnfsd daemon has been started at
++ * least once since the system came up.  This is used by the export
++ * mechanism to decide if spnfs is in use.
++ *
++ * Returns non-zero if the spnfsd has initialized the communication pipe
++ * at least once.
++ */
++int spnfs_enabled(void)
++{
++	return spnfs_enabled_at_some_point;
++}
++
++#ifdef CONFIG_PROC_FS
++
++/*
++ * procfs virtual files for user/kernel space communication:
++ *
++ * ctl - currently just an on/off switch...can be expanded
++ * getfh - fd to fh conversion
++ * recall - recall a layout from the command line, for example:
++ *		echo > /proc/fs/spnfs/recall
++ * config - configuration info, e.g., stripe size, num ds, etc.
++ */
++
++/*-------------- start ctl -------------------------*/
++/*
++ * The write carries one binary int: non-zero creates the spnfs pipe,
++ * zero deletes it.  Writes shorter than an int are rejected so that no
++ * bytes beyond the caller's buffer are read.
++ */
++static ssize_t ctl_write(struct file *file, const char __user *buf,
++			 size_t count, loff_t *offset)
++{
++	int cmd, rc;
++
++	if (count < sizeof(cmd))
++		return -EINVAL;
++	if (copy_from_user(&cmd, buf, sizeof(cmd)))
++		return -EFAULT;
++	if (cmd) {
++		rc = nfsd_spnfs_new();
++		if (rc != 0)
++			return rc;
++	} else
++		nfsd_spnfs_delete();
++
++	return count;
++}
++
++static const struct file_operations ctl_ops = {
++	.write		= ctl_write,
++};
++/*-------------- end ctl ---------------------------*/
++
++/*-------------- start config -------------------------*/
++/*
++ * Copy a configuration blob from userspace into a static
++ * struct spnfs_config and publish it through spnfs_config.  Writes larger
++ * than the structure are rejected; they would otherwise overrun it.
++ *
++ * NOTE(review): returns 0 rather than count on success; kept as-is because
++ * the expectations of the userspace writer are not visible here.
++ */
++static ssize_t config_write(struct file *file, const char __user *buf,
++			    size_t count, loff_t *offset)
++{
++	static struct spnfs_config cfg;
++
++	if (count > sizeof(cfg))
++		return -EINVAL;
++	if (copy_from_user(&cfg, buf, count))
++		return -EFAULT;
++
++	spnfs_config = &cfg;
++	return 0;
++}
++
++static const struct file_operations config_ops = {
++	.write		= config_write,
++};
++/*-------------- end config ---------------------------*/
++
++/*-------------- start getfh -----------------------*/
++static int
getfh_open(struct inode *inode, struct file *file)
++{
++	/*
++	 * Zeroed so that a read issued before any write cannot expose
++	 * uninitialised kernel memory.
++	 */
++	file->private_data = kzalloc(sizeof(struct nfs_fh), GFP_KERNEL);
++	if (file->private_data == NULL)
++		return -ENOMEM;
++
++	return 0;
++}
++
++/*
++ * Hand the per-open struct nfs_fh back to userspace.  The caller's buffer
++ * must be able to hold the whole structure; the number of bytes actually
++ * copied is returned.
++ */
++static ssize_t getfh_read(struct file *file, char __user *buf, size_t count,
++			  loff_t *offset)
++{
++	if (count < sizeof(struct nfs_fh))
++		return -EINVAL;
++	if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh)))
++		return -EFAULT;
++
++	return sizeof(struct nfs_fh);
++}
++
++/*
++ * The write carries one binary int, a file descriptor, which
++ * spnfs_getfh() converts into the per-open struct nfs_fh.
++ */
++static ssize_t getfh_write(struct file *file, const char __user *buf,
++			   size_t count, loff_t *offset)
++{
++	int fd;
++
++	if (count < sizeof(fd))
++		return -EINVAL;
++	if (copy_from_user(&fd, buf, sizeof(fd)))
++		return -EFAULT;
++	if (spnfs_getfh(fd, file->private_data) != 0)
++		return -EIO;
++
++	return count;
++}
++
++static int getfh_release(struct inode *inode, struct file *file)
++{
++	kfree(file->private_data);
++	return 0;
++}
++
++static const struct file_operations getfh_ops = {
++	.open		= getfh_open,
++	.read		= getfh_read,
++	.write		= getfh_write,
++	.release	= getfh_release,
++};
++/*-------------- end getfh ------------------------*/
++
++
++/*-------------- start recall layout --------------*/
++/*
++ * Parse "path [offset [length]]\n" and recall the matching layout via
++ * spnfs_test_layoutrecall().  Input longer than the local buffer, or
++ * without a terminating newline, is rejected with -EINVAL.
++ */
++static ssize_t recall_write(struct file *file, const char __user *buf,
++			    size_t count, loff_t *offset)
++{
++	char input[128];
++	char *path, *str, *p;
++	int rc;
++	u64 off = 0, len = 0;
++
++	if (count > sizeof(input))
++		return -EINVAL;
++
++	if (copy_from_user(input, buf, count))
++		return -EFAULT;
++
++	/* assumes newline-terminated path */
++	p = memchr(input, '\n', count);
++	if (p == NULL)
++		return -EINVAL;
++	*p = '\0';
++
++	/*
++	 * Scan for path and, optionally, an offset and length
++	 * of a layout segment to be recalled; if there are two
++	 * fields, they're assumed to be path and offset.
++	 */
++	p = input;
++	path = strsep(&p, " ");
++	if (path == NULL)
++		return -EINVAL;
++
++	str = strsep(&p, " ");
++	if (str != NULL) {
++		rc = strict_strtoull(str, 10, &off);
++		if (rc != 0)
++			return -EINVAL;
++
++		str = strsep(&p, " ");
++		if (str != NULL) {
++			rc = strict_strtoull(str, 10, &len);
++			if (rc != 0)
++				return -EINVAL;
++		}
++	}
++
++	rc = spnfs_test_layoutrecall(path, off, len);
++	if (rc != 0)
++		return rc;
++
++	return count;
++}
++
++static const struct file_operations recall_ops = {
++	.write		= recall_write,
++};
++/*-------------- end recall layout --------------*/
++
++
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++/*-------------- start layoutseg -------------------------*/
++/*
++ * Writing a string that starts with '0' turns layout segments off; any
++ * other first character turns them on.  Empty writes are rejected.
++ */
++static ssize_t layoutseg_write(struct file *file, const char __user *buf,
++			       size_t count, loff_t *offset)
++{
++	char cmd[3];
++
++	if (count == 0)
++		return -EINVAL;
++	if (copy_from_user(cmd, buf, 1))
++		return -EFAULT;
++	if (cmd[0] == '0')
++		spnfs_use_layoutsegments = 0;
++	else
++		spnfs_use_layoutsegments = 1;
++
++	return count;
++}
++
++static const struct file_operations layoutseg_ops = {
++	.write		= layoutseg_write,
++};
++/*-------------- end layoutseg ---------------------------*/
++
++/*-------------- start layoutsegsize -------------------------*/
++/*
++ * Parse a decimal layout segment size.  Only the bytes the caller actually
++ * supplied are copied (at most sizeof(cmd) - 1), and the buffer is
++ * NUL-terminated before it is parsed.
++ */
++static ssize_t layoutsegsize_write(struct file *file, const char __user *buf,
++				   size_t count, loff_t *offset)
++{
++	char cmd[50];
++	size_t len = min_t(size_t, count, sizeof(cmd) - 1);
++
++	if (copy_from_user(cmd, buf, len))
++		return -EFAULT;
++	cmd[len] = '\0';
++	layoutsegment_size = simple_strtoull(cmd, NULL, 10);
++
++	return count;
++}
++
++static const struct file_operations layoutsegsize_ops = {
++	.write		= layoutsegsize_write,
++};
++/*-------------- end layoutsegsize ---------------------------*/
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++
++/*
++ * Create /proc/fs/spnfs and its control files.
++ *
++ * Returns 0 on success or -ENOMEM.  On failure every entry created so far
++ * is removed again, so nothing is left behind.
++ */
++int
++spnfs_init_proc(void)
++{
++	struct proc_dir_entry *entry;
++
++	entry = proc_mkdir("fs/spnfs", NULL);
++	if (!entry)
++		return -ENOMEM;
++
++	entry = create_proc_entry("fs/spnfs/ctl", 0, NULL);
++	if (!entry)
++		goto out_dir;
++	entry->proc_fops = &ctl_ops;
++
++	entry = create_proc_entry("fs/spnfs/config", 0, NULL);
++	if (!entry)
++		goto out_ctl;
++	entry->proc_fops = &config_ops;
++
++	entry = create_proc_entry("fs/spnfs/getfh", 0, NULL);
++	if (!entry)
++		goto out_config;
++	entry->proc_fops = &getfh_ops;
++
++	entry = create_proc_entry("fs/spnfs/recall", 0, NULL);
++	if (!entry)
++		goto out_getfh;
++	entry->proc_fops = &recall_ops;
++
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++	entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL);
++	if (!entry)
++		goto out_recall;
++	entry->proc_fops = &layoutseg_ops;
++
++	entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL);
++	if (!entry)
++		goto out_layoutseg;
++	entry->proc_fops = &layoutsegsize_ops;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++
++	return 0;
++
++	/* unwind in reverse order of creation */
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++out_layoutseg:
++	remove_proc_entry("fs/spnfs/layoutseg", NULL);
++out_recall:
++	remove_proc_entry("fs/spnfs/recall", NULL);
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++out_getfh:
++	remove_proc_entry("fs/spnfs/getfh", NULL);
++out_config:
++	remove_proc_entry("fs/spnfs/config", NULL);
++out_ctl:
++	remove_proc_entry("fs/spnfs/ctl", NULL);
++out_dir:
++	remove_proc_entry("fs/spnfs", NULL);
++	return -ENOMEM;
++}
++#endif /* CONFIG_PROC_FS */
+diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c
+--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig	2010-08-23 12:09:03.324501390 -0400
++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c	2010-08-23 12:09:03.324501390 -0400
+@@ -0,0 +1,878 @@
++/*
++ * fs/nfsd/spnfs_ops.c
++ *
++ * Communcation layer between spNFS kernel and userspace
++ *
++ */
++/******************************************************************************
++
++(c) 2007 Network Appliance, Inc.  All Rights Reserved.
++
++Network Appliance provides this source code under the GPL v2 License.
++The GPL v2 license is available at
++http://opensource.org/licenses/gpl-license.php.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++/* comment out CONFIG_SPNFS_TEST for non-test behaviour */ ++/* #define CONFIG_SPNFS_TEST 1 */ ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++/* ++ * The functions that are called from elsewhere in the kernel ++ * to perform tasks in userspace ++ * ++ */ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++extern int spnfs_use_layoutsegments; ++extern uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++extern struct spnfs *global_spnfs; ++ ++int ++spnfs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++enum nfsstat4 ++spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *lg_arg, ++ struct nfsd4_pnfs_layoutget_res *lg_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct pnfs_filelayout_layout *flp = NULL; ++ int status, i; ++ enum nfsstat4 nfserr; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ 
im->im_type = SPNFS_TYPE_LAYOUTGET; ++ im->im_args.layoutget_args.inode = inode->i_ino; ++ im->im_args.layoutget_args.generation = inode->i_generation; ++ ++ /* call function to queue the msg for upcall */ ++ if (spnfs_upcall(spnfs, im, res) != 0) { ++ dprintk("failed spnfs upcall: layoutget\n"); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto layoutget_cleanup; ++ } ++ status = res->layoutget_res.status; ++ if (status != 0) { ++ /* FIXME? until user mode is fixed, translate system error */ ++ switch (status) { ++ case -E2BIG: ++ case -ETOOSMALL: ++ nfserr = NFS4ERR_TOOSMALL; ++ break; ++ case -ENOMEM: ++ case -EAGAIN: ++ case -EINTR: ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ break; ++ case -ENOENT: ++ nfserr = NFS4ERR_BADLAYOUT; ++ break; ++ default: ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ } ++ dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n", ++ status, nfserr); ++ goto layoutget_cleanup; ++ } ++ ++ lg_res->lg_return_on_close = 0; ++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ /* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */ ++ /* the amount requested by the client. 
*/ ++ if (spnfs_use_layoutsegments) { ++ if (layoutsegment_size != 0) ++ lg_res->lg_seg.length = layoutsegment_size; ++ } else ++ lg_res->lg_seg.length = NFS4_MAX_UINT64; ++#else ++ lg_res->lg_seg.length = NFS4_MAX_UINT64; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL); ++ if (flp == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ flp->device_id.sbid = lg_arg->lg_sbid; ++ flp->device_id.devid = res->layoutget_res.devid; ++ flp->lg_layout_type = 1; /* XXX */ ++ flp->lg_stripe_type = res->layoutget_res.stripe_type; ++ flp->lg_commit_through_mds = 0; ++ flp->lg_stripe_unit = res->layoutget_res.stripe_size; ++ flp->lg_first_stripe_index = 0; ++ flp->lg_pattern_offset = 0; ++ flp->lg_fh_length = res->layoutget_res.stripe_count; ++ ++ flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh), ++ GFP_KERNEL); ++ if (flp->lg_fh_list == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ /* ++ * FIX: Doing an extra copy here. Should group res.flist's fh_len ++ * and fh_val into a knfsd_fh structure. 
++ */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len; ++ memcpy(&flp->lg_fh_list[i].fh_base, ++ res->layoutget_res.flist[i].fh_val, ++ res->layoutget_res.flist[i].fh_len); ++ } ++ ++ /* encode the layoutget body */ ++ nfserr = filelayout_encode_layout(xdr, flp); ++ ++layoutget_cleanup: ++ if (flp) { ++ if (flp->lg_fh_list) ++ kfree(flp->lg_fh_list); ++ kfree(flp); ++ } ++ kfree(im); ++ kfree(res); ++ ++ return nfserr; ++} ++ ++int ++spnfs_layoutcommit(void) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutreturn(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for ino = %lu\n", ++ __func__, inode->i_ino); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* XXX figure out how to get a sb since there's no inode ptr */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = offset; ++ lr.cbl_seg.length = len; ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ lr.cbl_layoutchanged = 0; ++ ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ ++ return 0; ++} ++ ++ ++int ++spnfs_test_layoutrecall(char *path, u64 offset, u64 len) ++{ ++ struct nameidata nd; ++ struct inode *inode; ++ int type, rc; ++ ++ dprintk("%s: path=%s, offset=%llu, len=%llu\n", ++ __func__, path, offset, len); ++ ++ if (strcmp(path, "all") == 0) { ++ inode = NULL; ++ type = RETURN_ALL; ++ } else { ++ rc = path_lookup(path, 0, &nd); ++ if (rc != 0) ++ return -ENOENT; 
++ ++ /* ++ * XXX todo: add a RETURN_FSID scenario here...maybe if ++ * inode is a dir... ++ */ ++ ++ inode = nd.path.dentry->d_inode; ++ type = RETURN_FILE; ++ } ++ ++ if (len == 0) ++ len = NFS4_MAX_UINT64; ++ ++ rc = spnfs_layoutrecall(inode, type, offset, len); ++ ++ if (type != RETURN_ALL) ++ path_put(&nd.path); ++ return rc; ++} ++ ++int ++spnfs_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *gd_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* XXX keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEITER; ++ im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie; ++ im->im_args.getdeviceiter_args.verf = gd_res->gd_verf; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceiter_out; ++ } ++ status = res->getdeviceiter_res.status; ++ ++ if (res->getdeviceiter_res.eof) ++ gd_res->gd_eof = 1; ++ else { ++ gd_res->gd_devid = res->getdeviceiter_res.devid; ++ gd_res->gd_cookie = res->getdeviceiter_res.cookie; ++ gd_res->gd_verf = res->getdeviceiter_res.verf; ++ gd_res->gd_eof = 0; ++ } ++ ++getdeviceiter_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++#ifdef CONFIG_SPNFS_TEST ++/* ++ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the ++ * 1024 encoded stripe indices. ++ * ++ * Skip the devaddr4 length and encode the indicies count (1024) in the ++ * rq_res.head and set the rq_res.head length. 
/*
 * Test-only (CONFIG_SPNFS_TEST) encoder for the stripe-indices array.
 *
 * Set the rq_res page_len to 4096 (for the 1024 stripe indices).
 * Set the rq_res xdr_buf tail base to rq_respages[0] just after the
 * rq_res head to hold the rest of the getdeviceinfo return.
 *
 * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and
 * rq_respages[rq_resused] contains the rq_res.pages.
 *
 * Always returns 0.
 *
 * NOTE(review): the pointer returned by nfsd4_xdr_reserve_space() is used
 * without a NULL check -- confirm that helper cannot fail here.
 */
static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info,
				  const struct pnfs_filelayout_device *fdev)
{
	struct nfsd4_compoundres *resp = info->resp;
	struct svc_rqst *rqstp = resp->rqstp;
	struct xdr_buf *xb = &resp->rqstp->rq_res;
	__be32 *p;

	/* two words: a length slot that is skipped, then the index count */
	p = nfsd4_xdr_reserve_space(resp, 8);
	p++; /* Fill in length later */
	*p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */
	resp->p = p;

	/* head ends at the current encode position */
	xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base;
	/* page data is taken from the response page at index rq_resused */
	xb->pages = &rqstp->rq_respages[rqstp->rq_resused];
	xb->page_base = 0;
	xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */
	/* tail starts right after the head and runs to PAGE_SIZE past its base */
	xb->tail[0].iov_base = resp->p;
	resp->end = xb->head[0].iov_base + PAGE_SIZE;
	xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p;
	return 0;
}
++ */ ++static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev, ++ struct spnfs_device *dev, ++ struct pnfs_devinfo_arg *info) ++{ ++ struct svc_rqst *rqstp = info->xdr.resp->rqstp; ++ __be32 *p; ++ int i, j = 0; ++ ++ p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]); ++ fldev->fl_stripeindices_length = 1024; ++ /* round-robin the data servers device index into the stripe indicie */ ++ for (i = 0; i < 1024; i++) { ++ *p++ = cpu_to_be32(j); ++ if (j < dev->dscount - 1) ++ j++; ++ else ++ j = 0; ++ } ++ fldev->fl_stripeindices_list = NULL; ++} ++#endif /* CONFIG_SPNFS_TEST */ ++ ++int ++spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct spnfs_device *dev; ++ struct pnfs_filelayout_device *fldev = NULL; ++ struct pnfs_filelayout_multipath *mp = NULL; ++ struct pnfs_filelayout_devaddr *fldap = NULL; ++ int status = 0, i, len; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEINFO; ++ /* XXX FIX: figure out what to do about fsid */ ++ im->im_args.getdeviceinfo_args.devid = devid->devid; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceinfo_out; ++ } ++ status = res->getdeviceinfo_res.status; ++ if (status != 0) ++ goto getdeviceinfo_out; ++ ++ dev = &res->getdeviceinfo_res.devinfo; ++ ++ /* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */ ++ fldev = kzalloc(sizeof(struct pnfs_filelayout_device), 
GFP_KERNEL); ++ if (fldev == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ /* ++ * Stripe count is the same as data server count for our purposes ++ */ ++ fldev->fl_stripeindices_length = dev->dscount; ++ fldev->fl_device_length = dev->dscount; ++ ++ /* Set stripe indices */ ++#ifdef CONFIG_SPNFS_TEST ++ spnfs_set_test_indices(fldev, dev, info); ++ fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr; ++#else /* CONFIG_SPNFS_TEST */ ++ fldev->fl_stripeindices_list = ++ kmalloc(fldev->fl_stripeindices_length * sizeof(u32), ++ GFP_KERNEL); ++ if (fldev->fl_stripeindices_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ for (i = 0; i < fldev->fl_stripeindices_length; i++) ++ fldev->fl_stripeindices_list[i] = i; ++#endif /* CONFIG_SPNFS_TEST */ ++ ++ /* ++ * Set the device's data server addresses No multipath for spnfs, ++ * so mp length is always 1. ++ * ++ */ ++ fldev->fl_device_list = ++ kmalloc(fldev->fl_device_length * ++ sizeof(struct pnfs_filelayout_multipath), ++ GFP_KERNEL); ++ if (fldev->fl_device_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ for (i = 0; i < fldev->fl_device_length; i++) { ++ mp = &fldev->fl_device_list[i]; ++ mp->fl_multipath_length = 1; ++ mp->fl_multipath_list = ++ kmalloc(sizeof(struct pnfs_filelayout_devaddr), ++ GFP_KERNEL); ++ if (mp->fl_multipath_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ fldap = mp->fl_multipath_list; ++ ++ /* ++ * Copy the netid into the device address, for example: "tcp" ++ */ ++ len = strlen(dev->dslist[i].netid); ++ fldap->r_netid.data = kmalloc(len, GFP_KERNEL); ++ if (fldap->r_netid.data == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ memcpy(fldap->r_netid.data, dev->dslist[i].netid, len); ++ fldap->r_netid.len = len; ++ ++ /* ++ * Copy the network address into the device address, ++ * for example: "10.35.9.16.08.01" ++ */ ++ len = strlen(dev->dslist[i].addr); ++ fldap->r_addr.data = 
kmalloc(len, GFP_KERNEL); ++ if (fldap->r_addr.data == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ memcpy(fldap->r_addr.data, dev->dslist[i].addr, len); ++ fldap->r_addr.len = len; ++ } ++ ++ /* encode the device data */ ++ status = filelayout_encode_devinfo(xdr, fldev); ++ ++getdeviceinfo_out: ++ if (fldev) { ++ kfree(fldev->fl_stripeindices_list); ++ if (fldev->fl_device_list) { ++ for (i = 0; i < fldev->fl_device_length; i++) { ++ fldap = ++ fldev->fl_device_list[i].fl_multipath_list; ++ kfree(fldap->r_netid.data); ++ kfree(fldap->r_addr.data); ++ kfree(fldap); ++ } ++ kfree(fldev->fl_device_list); ++ } ++ kfree(fldev); ++ } ++ ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++int ++spnfs_setattr(void) ++{ ++ return 0; ++} ++ ++int ++spnfs_open(struct inode *inode, struct nfsd4_open *open) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto open_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto open_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_OPEN; ++ im->im_args.open_args.inode = inode->i_ino; ++ im->im_args.open_args.generation = inode->i_generation; ++ im->im_args.open_args.create = open->op_create; ++ im->im_args.open_args.createmode = open->op_createmode; ++ im->im_args.open_args.truncate = open->op_truncate; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto open_out; ++ } ++ status = res->open_res.status; ++ ++open_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++int ++spnfs_create(void) ++{ ++ return 0; ++} ++ ++/* ++ * Invokes the spnfsd with the inode number of the object to 
remove. ++ * The file has already been removed on the MDS, so all the spnsfd ++ * daemon does is remove the stripes. ++ * Returns 0 on success otherwise error code ++ */ ++int ++spnfs_remove(unsigned long ino, unsigned long generation) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_REMOVE; ++ im->im_args.remove_args.inode = ino; ++ im->im_args.remove_args.generation = generation; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto remove_out; ++ } ++ status = res->remove_res.status; ++ ++remove_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++static int ++read_one(struct inode *inode, loff_t offset, size_t len, char *buf, ++ struct file **filp) ++{ ++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; ++ size_t iolen; ++ int completed = 0, ds, err; ++ ++ while (len > 0) { ++ tmp = offset; ++ soff = do_div(tmp, spnfs_config->stripe_size); ++ snum = tmp; ++ ds = do_div(tmp, spnfs_config->num_ds); ++ if (spnfs_config->dense_striping == 0) ++ soffset = offset; ++ else { ++ tmp = snum; ++ do_div(tmp, spnfs_config->num_ds); ++ soffset = tmp * spnfs_config->stripe_size + soff; ++ } ++ if (len < spnfs_config->stripe_size - soff) ++ iolen = len; ++ else ++ iolen = spnfs_config->stripe_size - soff; ++ ++ pos = soffset; ++ err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos); ++ if (err < 0) ++ return -EIO; ++ if (err == 0) ++ break; ++ filp[ds]->f_pos = pos; ++ iolen = err; ++ completed += iolen; ++ len -= iolen; ++ 
offset += iolen; ++ bufoffset += iolen; ++ } ++ ++ return completed; ++} ++ ++static __be32 ++read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ int i, vnum, err, bytecount = 0; ++ char path[128]; ++ struct file *filp[SPNFS_MAX_DATA_SERVERS]; ++ size_t iolen; ++ __be32 status = nfs_ok; ++ ++ /* ++ * XXX We should just be doing this at open time, but it gets ++ * kind of messy storing this info in nfsd's state structures ++ * and piggybacking its path through the various state handling ++ * functions. Revisit this. ++ */ ++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], ++ inode->i_ino, inode->i_generation); ++ filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0); ++ if (filp[i] == NULL) { ++ status = nfserr_io; ++ goto read_out; ++ } ++ get_file(filp[i]); ++ } ++ ++ for (vnum = 0 ; vnum < vlen ; vnum++) { ++ iolen = rqstp->rq_vec[vnum].iov_len; ++ err = read_one(inode, offset + bytecount, iolen, ++ (char *)rqstp->rq_vec[vnum].iov_base, filp); ++ if (err < 0) { ++ status = nfserr_io; ++ goto read_out; ++ } ++ if (err < iolen) { ++ bytecount += err; ++ goto read_out; ++ } ++ bytecount += rqstp->rq_vec[vnum].iov_len; ++ } ++ ++read_out: ++ *lenp = bytecount; ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ if (filp[i]) { ++ filp_close(filp[i], current->files); ++ fput(filp[i]); ++ } ++ } ++ return status; ++} ++ ++__be32 ++spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ if (spnfs_config) ++ return read(inode, offset, lenp, vlen, rqstp); ++ else { ++ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); ++ return nfserr_notsupp; ++ } ++} ++ ++static int ++write_one(struct inode *inode, loff_t offset, size_t len, char *buf, ++ struct file **filp) ++{ ++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; ++ size_t iolen; ++ int 
completed = 0, ds, err; ++ ++ while (len > 0) { ++ tmp = offset; ++ soff = do_div(tmp, spnfs_config->stripe_size); ++ snum = tmp; ++ ds = do_div(tmp, spnfs_config->num_ds); ++ if (spnfs_config->dense_striping == 0) ++ soffset = offset; ++ else { ++ tmp = snum; ++ do_div(tmp, spnfs_config->num_ds); ++ soffset = tmp * spnfs_config->stripe_size + soff; ++ } ++ if (len < spnfs_config->stripe_size - soff) ++ iolen = len; ++ else ++ iolen = spnfs_config->stripe_size - soff; ++ ++ pos = soffset; ++ err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos); ++ if (err < 0) ++ return -EIO; ++ filp[ds]->f_pos = pos; ++ iolen = err; ++ completed += iolen; ++ len -= iolen; ++ offset += iolen; ++ bufoffset += iolen; ++ } ++ ++ return completed; ++} ++ ++static __be32 ++write(struct inode *inode, loff_t offset, size_t len, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ int i, vnum, err, bytecount = 0; ++ char path[128]; ++ struct file *filp[SPNFS_MAX_DATA_SERVERS]; ++ size_t iolen; ++ __be32 status = nfs_ok; ++ ++ /* ++ * XXX We should just be doing this at open time, but it gets ++ * kind of messy storing this info in nfsd's state structures ++ * and piggybacking its path through the various state handling ++ * functions. Revisit this. 
++ */ ++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], ++ inode->i_ino, inode->i_generation); ++ filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0); ++ if (filp[i] == NULL) { ++ status = nfserr_io; ++ goto write_out; ++ } ++ get_file(filp[i]); ++ } ++ ++ for (vnum = 0; vnum < vlen; vnum++) { ++ iolen = rqstp->rq_vec[vnum].iov_len; ++ err = write_one(inode, offset + bytecount, iolen, ++ (char *)rqstp->rq_vec[vnum].iov_base, filp); ++ if (err != iolen) { ++ dprintk("spnfs_write: err=%d expected %Zd\n", err, len); ++ status = nfserr_io; ++ goto write_out; ++ } ++ bytecount += rqstp->rq_vec[vnum].iov_len; ++ } ++ ++write_out: ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ if (filp[i]) { ++ filp_close(filp[i], current->files); ++ fput(filp[i]); ++ } ++ } ++ ++ return status; ++} ++ ++__be32 ++spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ if (spnfs_config) ++ return write(inode, offset, len, vlen, rqstp); ++ else { ++ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); ++ return nfserr_notsupp; ++ } ++} ++ ++int ++spnfs_commit(void) ++{ ++ return 0; ++} ++ ++/* ++ * Return the state for this object. 
++ * At this time simply return 0 to indicate success and use the existing state ++ */ ++int ++spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg) ++{ ++ return 0; ++} ++ ++/* ++ * Return the filehandle for the specified file descriptor ++ */ ++int ++spnfs_getfh(int fd, struct nfs_fh *fh) ++{ ++ struct file *file; ++ ++ file = fget(fd); ++ if (file == NULL) ++ return -EIO; ++ ++ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh)); ++ fput(file); ++ return 0; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-23 12:08:29.096512142 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 12:09:03.325501424 -0400 +@@ -242,6 +242,12 @@ struct nfs4_client { + u32 cl_cb_seq_nr; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ ++#if defined(CONFIG_PNFSD) ++ struct list_head cl_layouts; /* outstanding layouts */ ++ struct list_head cl_layoutrecalls; /* outstanding layoutrecall ++ callbacks */ ++ atomic_t cl_deviceref; /* Num outstanding devs */ ++#endif /* CONFIG_PNFSD */ + }; + + static inline void +@@ -342,12 +348,31 @@ struct nfs4_file { + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_stateids; + struct list_head fi_delegations; ++#if defined(CONFIG_PNFSD) ++ struct list_head fi_layouts; ++ struct list_head fi_layout_states; ++#endif /* CONFIG_PNFSD */ + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; ++#if defined(CONFIG_PNFSD) ++ /* used by layoutget / layoutrecall */ ++ struct nfs4_fsid fi_fsid; ++ u32 fi_fhlen; ++ u8 fi_fhval[NFS4_FHSIZE]; ++#endif /* CONFIG_PNFSD */ + }; + ++#if defined(CONFIG_PNFSD) ++/* pNFS Metadata server state */ ++ ++struct pnfs_ds_dev_entry { ++ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */ ++ u32 dd_dsid; ++}; ++#endif /* CONFIG_PNFSD 
*/ ++ + /* + * nfs4_stateid can either be an open stateid or (eventually) a lock stateid + * +@@ -370,6 +395,9 @@ struct nfs4_stateid { + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; ++#if defined(CONFIG_PNFSD) ++ struct list_head st_pnfs_ds_id; ++#endif /* CONFIG_PNFSD */ + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; +@@ -421,6 +449,34 @@ extern void nfsd4_recdir_purge_old(void) + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + extern void release_session_client(struct nfsd4_session *); ++extern void nfsd4_free_slab(struct kmem_cache **); ++extern struct nfs4_file *find_file(struct inode *); ++extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *); ++extern void put_nfs4_file(struct nfs4_file *); ++extern void get_nfs4_file(struct nfs4_file *); ++extern struct nfs4_client *find_confirmed_client(clientid_t *); ++extern struct nfs4_stateid *find_stateid(stateid_t *, int flags); ++extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *); ++extern __be32 nfs4_check_stateid(stateid_t *); ++extern void expire_client_lock(struct nfs4_client *); ++extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *); ++ ++#if defined(CONFIG_PNFSD) ++extern int nfsd4_init_pnfs_slabs(void); ++extern void nfsd4_free_pnfs_slabs(void); ++extern void pnfs_expire_client(struct nfs4_client *); ++extern void release_pnfs_ds_dev_list(struct nfs4_stateid *); ++extern void nfs4_pnfs_state_init(void); ++extern void nfs4_pnfs_state_shutdown(void); ++extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *); ++#else /* CONFIG_PNFSD */ ++static inline void nfsd4_free_pnfs_slabs(void) {} ++static inline int nfsd4_init_pnfs_slabs(void) { return 0; } ++static 
inline void pnfs_expire_client(struct nfs4_client *clp) {} ++static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {} ++static inline void nfs4_pnfs_state_shutdown(void) {} ++#endif /* CONFIG_PNFSD */ + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +@@ -434,4 +490,24 @@ nfs4_get_stateowner(struct nfs4_stateown + kref_get(&so->so_ref); + } + ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1 : NFS4_MAX_UINT64; ++} ++ + #endif /* NFSD4_STATE_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-23 12:08:27.632564132 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-23 12:09:03.326501490 -0400 +@@ -37,7 +37,12 @@ + #ifdef CONFIG_NFSD_V4 + #include + #include ++#include ++#include + #endif /* CONFIG_NFSD_V4 */ ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif + + #include "nfsd.h" + #include "vfs.h" +@@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, str + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, 0)) { ++ err = bl_layoutrecall(inode, RETURN_FILE, ++ iap->ia_size, inode->i_size - iap->ia_size); ++ } ++#endif /* CONFIG_SPNFS_BLOCK */ + } + + /* +@@ -1703,6 +1714,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru + struct inode *fdir, *tdir; + __be32 err; + int host_err; ++#ifdef CONFIG_SPNFS ++ unsigned long ino = 0; ++ unsigned long generation = 0; ++ unsigned int nlink = 0; ++#endif /* CONFIG_SPNFS */ + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) +@@ -1766,7 +1782,26 @@ nfsd_rename(struct svc_rqst *rqstp, stru + if (host_err) + goto out_dput_new; + 
++#ifdef CONFIG_SPNFS ++ /* ++ * if the target is a preexisting regular file, remember the ++ * inode number and generation so we can delete the stripes; ++ * save the link count as well so that the stripes only get ++ * get deleted when the last link is deleted ++ */ ++ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) { ++ ino = ndentry->d_inode->i_ino; ++ generation = ndentry->d_inode->i_generation; ++ nlink = ndentry->d_inode->i_nlink; ++ } ++#endif /* CONFIG_SPNFS */ ++ + host_err = vfs_rename(fdir, odentry, tdir, ndentry); ++#ifdef CONFIG_SPNFS ++ if (spnfs_enabled() && (!host_err && ino && nlink == 1)) ++ spnfs_remove(ino, generation); ++#endif /* CONFIG_SPNFS */ ++ + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) +@@ -1807,6 +1842,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + struct inode *dirp; + __be32 err; + int host_err; ++#if defined(CONFIG_SPNFS) ++ unsigned long ino; ++ unsigned long generation; ++ unsigned int nlink; ++#endif /* defined(CONFIG_SPNFS) */ + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) +@@ -1830,6 +1870,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + goto out; + } + ++#if defined(CONFIG_SPNFS) ++ /* ++ * Remember the inode number to communicate to the spnfsd ++ * for removal of stripes; save the link count as well so that ++ * the stripes only get get deleted when the last link is deleted ++ */ ++ ino = rdentry->d_inode->i_ino; ++ generation = rdentry->d_inode->i_generation; ++ nlink = rdentry->d_inode->i_nlink; ++#endif /* defined(CONFIG_SPNFS) */ ++ + if (!type) + type = rdentry->d_inode->i_mode & S_IFMT; + +@@ -1854,6 +1905,29 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + if (!host_err) + host_err = commit_metadata(fhp); + ++#if defined(CONFIG_SPNFS) ++ /* ++ * spnfs: notify spnfsd of removal to destroy stripes ++ */ ++/* ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (sb->s_export_op->spnfs_remove) { ++*/ ++ dprintk("%s check if spnfs_enabled\n", __FUNCTION__); ++ 
if (spnfs_enabled() && nlink == 1) { ++ BUG_ON(ino == 0); ++ dprintk("%s calling spnfs_remove inumber=%ld\n", ++ __FUNCTION__, ino); ++ if (spnfs_remove(ino, generation) == 0) { ++ dprintk("%s spnfs_remove success\n", __FUNCTION__); ++ } else { ++ /* XXX How do we make this atomic? */ ++ printk(KERN_WARNING "nfsd: pNFS could not " ++ "remove stripes for inode: %ld\n", ino); ++ } ++ } ++#endif /* defined(CONFIG_SPNFS) */ ++ + mnt_drop_write(fhp->fh_export->ex_path.mnt); + out_nfserr: + err = nfserrno(host_err); +diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-23 12:08:29.097425997 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 12:09:03.327451643 -0400 +@@ -37,6 +37,8 @@ + #ifndef _LINUX_NFSD_XDR4_H + #define _LINUX_NFSD_XDR4_H + ++#include ++ + #include "state.h" + #include "nfsd.h" + +@@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete { + u32 rca_one_fs; + }; + ++struct nfsd4_pnfs_getdevinfo { ++ struct nfsd4_pnfs_deviceid gd_devid; /* request */ ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxcount; /* request */ ++ u32 gd_notify_types;/* request */ ++ struct super_block *gd_sb; ++}; ++ ++struct nfsd4_pnfs_getdevlist { ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxdevices; /* request */ ++ u64 gd_cookie; /* request - response */ ++ u64 gd_verf; /* request - response */ ++ struct svc_fh *gd_fhp; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutget { ++ u64 lg_minlength; /* request */ ++ u32 lg_signal; /* request */ ++ u32 lg_maxcount; /* request */ ++ struct svc_fh *lg_fhp; /* request */ ++ stateid_t lg_sid; /* request/response */ ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_roc; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit { ++ struct nfsd4_pnfs_layoutcommit_arg args; ++ stateid_t lc_sid; /* request */ ++ struct nfsd4_pnfs_layoutcommit_res res; ++}; ++ ++enum layoutreturn_flags { ++ LR_FLAG_INTERN 
= 1 << 0, /* internal return */ ++ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */ ++}; ++ ++struct nfsd4_pnfs_layoutreturn { ++ struct nfsd4_pnfs_layoutreturn_arg args; ++ u32 lr_flags; ++ stateid_t lr_sid; /* request/resopnse */ ++ u32 lrs_present; /* response */ ++}; ++ + struct nfsd4_op { + int opnum; + __be32 status; +@@ -426,6 +473,13 @@ struct nfsd4_op { + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; ++#if defined(CONFIG_PNFSD) ++ struct nfsd4_pnfs_getdevlist pnfs_getdevlist; ++ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo; ++ struct nfsd4_pnfs_layoutget pnfs_layoutget; ++ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit; ++ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn; ++#endif /* CONFIG_PNFSD */ + } u; + struct nfs4_replay * replay; + }; +diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-23 12:08:29.039491912 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-23 12:09:03.328501680 -0400 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -36,6 +37,7 @@ + #include "internal.h" + #include "iostat.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_FILE + +@@ -388,12 +390,17 @@ static int nfs_write_begin(struct file * + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + ++ pnfs_update_layout(mapping->host, ++ nfs_file_open_context(file), ++ 0, NFS4_MAX_UINT64, IOMODE_RW, ++ &lseg); + start: + /* + * Prevent starvation issues if someone is doing a consistency +@@ -402,17 +409,22 @@ start: + ret = wait_on_bit(&NFS_I(mapping->host)->flags, 
NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) +- return ret; ++ goto out; + + page = grab_cache_page_write_begin(mapping, index, flags); +- if (!page) +- return -ENOMEM; ++ if (!page) { ++ ret = -ENOMEM; ++ goto out; ++ } + *pagep = page; + +- ret = nfs_flush_incompatible(file, page); ++ ret = nfs_flush_incompatible(file, page, lseg); + if (ret) { + unlock_page(page); + page_cache_release(page); ++ *pagep = NULL; ++ *fsdata = NULL; ++ goto out; + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; +@@ -421,6 +433,12 @@ start: + if (!ret) + goto start; + } ++ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); ++ out: ++ if (ret) { ++ put_lseg(lseg); ++ *fsdata = NULL; ++ } + return ret; + } + +@@ -430,6 +448,7 @@ static int nfs_write_end(struct file *fi + { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, +@@ -456,10 +475,17 @@ static int nfs_write_end(struct file *fi + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + +- status = nfs_updatepage(file, page, offset, copied); ++ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); ++ status = pnfs_write_end(file, page, pos, len, copied, lseg); ++ if (status) ++ goto out; ++ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); + ++ out: + unlock_page(page); + page_cache_release(page); ++ pnfs_write_end_cleanup(file, fsdata); ++ put_lseg(lseg); + + if (status < 0) + return status; +@@ -570,6 +596,8 @@ static int nfs_vm_page_mkwrite(struct vm + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + ++ /* XXX Do we want to call pnfs_update_layout here? 
*/ ++ + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) +@@ -580,11 +608,11 @@ static int nfs_vm_page_mkwrite(struct vm + if (pagelen == 0) + goto out_unlock; + +- ret = nfs_flush_incompatible(filp, page); ++ ret = nfs_flush_incompatible(filp, page, NULL); + if (ret != 0) + goto out_unlock; + +- ret = nfs_updatepage(filp, page, 0, pagelen); ++ ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL); + out_unlock: + if (!ret) + return VM_FAULT_LOCKED; +diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-23 12:08:29.042511552 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-23 12:09:03.329501644 -0400 +@@ -48,6 +48,7 @@ + #include "internal.h" + #include "fscache.h" + #include "dns_resolve.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -278,7 +279,7 @@ nfs_fhget(struct super_block *sb, struct + */ + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { +- inode->i_fop = &nfs_file_operations; ++ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; + inode->i_data.a_ops = &nfs_file_aops; + inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; + } else if (S_ISDIR(inode->i_mode)) { +@@ -530,6 +531,68 @@ out: + return err; + } + ++static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ atomic_set(&l_ctx->count, 1); ++ l_ctx->lockowner = current->files; ++ l_ctx->pid = current->tgid; ++ INIT_LIST_HEAD(&l_ctx->list); ++} ++ ++static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) ++{ ++ struct nfs_lock_context *pos; ++ ++ list_for_each_entry(pos, &ctx->lock_context.list, list) { ++ if (pos->lockowner != current->files) ++ continue; ++ if (pos->pid != current->tgid) ++ continue; ++ atomic_inc(&pos->count); ++ return pos; ++ } ++ return NULL; ++} ++ ++struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context 
*ctx) ++{ ++ struct nfs_lock_context *res, *new = NULL; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ spin_unlock(&inode->i_lock); ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (new == NULL) ++ return NULL; ++ nfs_init_lock_context(new); ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ list_add_tail(&new->list, &ctx->lock_context.list); ++ new->open_context = ctx; ++ res = new; ++ new = NULL; ++ } ++ } ++ spin_unlock(&inode->i_lock); ++ kfree(new); ++ return res; ++} ++ ++void nfs_put_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ struct nfs_open_context *ctx = l_ctx->open_context; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) ++ return; ++ list_del(&l_ctx->list); ++ spin_unlock(&inode->i_lock); ++ kfree(l_ctx); ++} ++ + /** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context +@@ -566,11 +629,11 @@ static struct nfs_open_context *alloc_nf + path_get(&ctx->path); + ctx->cred = get_rpccred(cred); + ctx->state = NULL; +- ctx->lockowner = current->files; + ctx->flags = 0; + ctx->error = 0; + ctx->dir_cookie = 0; +- atomic_set(&ctx->count, 1); ++ nfs_init_lock_context(&ctx->lock_context); ++ ctx->lock_context.open_context = ctx; + } + return ctx; + } +@@ -578,15 +641,16 @@ static struct nfs_open_context *alloc_nf + struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) + { + if (ctx != NULL) +- atomic_inc(&ctx->count); ++ atomic_inc(&ctx->lock_context.count); + return ctx; + } ++EXPORT_SYMBOL(get_nfs_open_context); + + static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) + { + struct inode *inode = ctx->path.dentry->d_inode; + +- if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) ++ if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; 
+ list_del(&ctx->list); + spin_unlock(&inode->i_lock); +@@ -933,6 +997,7 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + } ++EXPORT_SYMBOL(nfs_fattr_init); + + struct nfs_fattr *nfs_alloc_fattr(void) + { +@@ -1142,6 +1207,14 @@ static int nfs_update_inode(struct inode + server->fsid = fattr->fsid; + + /* ++ * file needs layout commit, server attributes may be stale ++ */ ++ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) { ++ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n", ++ __func__, inode->i_sb->s_id, inode->i_ino); ++ return 0; ++ } ++ /* + * Update the read time so we don't revalidate too often. + */ + nfsi->read_cache_jiffies = fattr->time_start; +@@ -1340,9 +1413,10 @@ static int nfs_update_inode(struct inode + */ + void nfs4_clear_inode(struct inode *inode) + { ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + /* If we are holding a delegation, return it! 
*/ + nfs_inode_return_delegation_noreclaim(inode); +- /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); + } + #endif +@@ -1367,7 +1441,10 @@ struct inode *nfs_alloc_inode(struct sup + + void nfs_destroy_inode(struct inode *inode) + { +- kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); ++ struct nfs_inode *nfsi = NFS_I(inode); ++ ++ pnfs_destroy_layout(nfsi); ++ kmem_cache_free(nfs_inode_cachep, nfsi); + } + + static inline void nfs4_init_once(struct nfs_inode *nfsi) +@@ -1377,6 +1454,11 @@ static inline void nfs4_init_once(struct + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); ++#ifdef CONFIG_NFS_V4_1 ++ init_waitqueue_head(&nfsi->lo_waitq); ++ nfsi->pnfs_layout_suspend = 0; ++ nfsi->layout = NULL; ++#endif /* CONFIG_NFS_V4_1 */ + #endif + } + +@@ -1488,6 +1570,12 @@ static int __init init_nfs_fs(void) + if (err) + goto out0; + ++#ifdef CONFIG_NFS_V4_1 ++ err = pnfs_initialize(); ++ if (err) ++ goto out00; ++#endif /* CONFIG_NFS_V4_1 */ ++ + #ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); + #endif +@@ -1498,6 +1586,10 @@ out: + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++out00: ++ pnfs_uninitialize(); ++#endif /* CONFIG_NFS_V4_1 */ + nfs_destroy_directcache(); + out0: + nfs_destroy_writepagecache(); +@@ -1531,6 +1623,9 @@ static void __exit exit_nfs_fs(void) + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_uninitialize(); ++#endif + unregister_nfs_fs(); + nfs_fs_proc_exit(); + nfsiod_stop(); +diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-23 12:08:29.042511552 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-23 12:09:03.330502148 -0400 +@@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv + struct nfs_fattr *); + extern void nfs_mark_client_ready(struct nfs_client *clp, int state); + extern 
int nfs4_check_client_ready(struct nfs_client *clp); ++extern int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++ const struct sockaddr *sa2); ++extern int nfs4_set_client(struct nfs_server *server, ++ const char *hostname, ++ const struct sockaddr *addr, ++ const size_t addrlen, ++ const char *ip_addr, ++ rpc_authflavor_t authflavour, ++ int proto, const struct rpc_timeout *timeparms, ++ u32 minorversion); + #ifdef CONFIG_PROC_FS + extern int __init nfs_fs_proc_init(void); + extern void nfs_fs_proc_exit(void); +@@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead + extern struct rpc_procinfo nfs4_procedures[]; + #endif + ++extern int nfs4_recover_expired_lease(struct nfs_client *clp); ++ + /* proc.c */ + void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + +@@ -248,10 +260,31 @@ extern int nfs4_get_rootfh(struct nfs_se + #endif + + /* read.c */ ++extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); ++extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); + extern void nfs_read_prepare(struct rpc_task *task, void *calldata); + + /* write.c */ ++extern int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs); + extern void nfs_write_prepare(struct rpc_task *task, void *calldata); ++extern void nfs_mark_list_commit(struct list_head *head); + #ifdef CONFIG_MIGRATION + extern int nfs_migrate_page(struct 
address_space *, + struct page *, struct page *); +diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig +--- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-23 12:09:03.331395814 -0400 +@@ -79,10 +79,48 @@ config NFS_V4_1 + depends on NFS_V4 && EXPERIMENTAL + help + This option enables support for minor version 1 of the NFSv4 protocol +- (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. ++ (RFC5661) including support for the parallel NFS (pNFS) features ++ in the kernel's NFS client. + + Unless you're an NFS developer, say N. + ++config PNFS_FILE_LAYOUT ++ tristate "NFS client support for the pNFS nfs-files layout (DEVELOPER ONLY)" ++ depends on NFS_FS && NFS_V4_1 ++ default y ++ help ++ This option enables support for the pNFS nfs-files layout. ++ ++ Unless you're an NFS developer, say N. ++ ++config PNFS_OBJLAYOUT ++ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD ++ help ++ Say M here if you want your pNFS client to support the Objects Layout Driver. ++ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and ++ upper level driver (SCSI_OSD_ULD). ++ ++ If unsure, say N. ++ ++config PNFS_PANLAYOUT ++ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on PNFS_OBJLAYOUT ++ help ++ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver. ++ ++ If unsure, say N. ++ ++config PNFS_BLOCK ++ tristate "Provide a pNFS block client (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 ++ select MD ++ select BLK_DEV_DM ++ help ++ Say M or y here if you want your pNfs client to support the block protocol ++ ++ If unsure, say N. 
++ + config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP +diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile +--- linux-2.6.34.noarch/fs/nfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-23 12:09:03.331395814 -0400 +@@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x + delegation.o idmap.o \ + callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o ++nfs-$(CONFIG_NFS_V4_1) += pnfs.o + nfs-$(CONFIG_SYSCTL) += sysctl.o + nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o ++ ++obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o ++nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o ++ ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ ++obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-23 12:08:29.045525837 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-23 12:09:03.332511640 -0400 +@@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs3_dir_inode_operations, + .file_inode_ops = &nfs3_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-23 12:09:03.333512111 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-23 12:09:03.334491472 -0400 +@@ -0,0 +1,765 @@ ++/* ++ * linux/fs/nfs/nfs4filelayout.c ++ * ++ * Module for the pnfs nfs4 file layout driver. ++ * Defines all I/O and Policy interface operations, plus code ++ * to register itself with the pNFS client. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. 
++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfs4filelayout.h" ++#include "nfs4_fs.h" ++#include "internal.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Dean Hildebrand "); ++MODULE_DESCRIPTION("The NFSv4 file layout driver"); ++ ++/* Callback operations to the pNFS client */ ++struct pnfs_client_operations *pnfs_callback_ops; ++ ++/* Forward declaration */ ++struct layoutdriver_io_operations filelayout_io_operations; ++ ++int ++filelayout_initialize_mountpoint(struct nfs_server *nfss, ++ const struct nfs_fh *mntfh) ++{ ++ int status = nfs4_alloc_init_deviceid_cache(nfss->nfs_client, ++ nfs4_fl_free_deviceid_callback); ++ if (status) { ++ printk(KERN_WARNING "%s: deviceid cache could not be " ++ "initialized\n", __func__); ++ return status; ++ } ++ dprintk("%s: deviceid cache has been initialized successfully\n", ++ __func__); ++ return 0; ++} ++ ++/* Uninitialize a mountpoint by destroying its device list */ ++int ++filelayout_uninitialize_mountpoint(struct nfs_server *nfss) ++{ ++ dprintk("--> %s\n", __func__); ++ ++ if (nfss->pnfs_curr_ld && nfss->nfs_client->cl_devid_cache) ++ nfs4_put_deviceid_cache(nfss->nfs_client); ++ return 0; ++} ++ ++/* This function is used by the layout driver to calculate the ++ * offset of the file on the dserver based on whether the ++ * layout type is STRIPE_DENSE or STRIPE_SPARSE ++ */ ++static loff_t ++filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ ++ switch (flseg->stripe_type) { ++ case STRIPE_SPARSE: ++ return offset; ++ ++ case STRIPE_DENSE: ++ { ++ u32 stripe_width; ++ u64 tmp, off; ++ u32 unit = flseg->stripe_unit; ++ ++ stripe_width = unit * FILE_DSADDR(lseg)->stripe_count; ++ tmp = off = offset - flseg->pattern_offset; ++ 
do_div(tmp, stripe_width); ++ return tmp * unit + do_div(off, unit); ++ } ++ default: ++ BUG(); ++ } ++ ++ /* We should never get here... just to stop the gcc warning */ ++ return 0; ++} ++ ++/* ++ * Call ops for the async read/write cases ++ * In the case of dense layouts, the offset needs to be reset to its ++ * original value. ++ */ ++static void filelayout_read_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ if (rdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ rdata->args.offset, rdata->fldata.orig_offset); ++ rdata->args.offset = rdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ rdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_read_release(void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ put_lseg(rdata->pdata.lseg); ++ rdata->pdata.lseg = NULL; ++ rdata->pdata.call_ops->rpc_release(data); ++} ++ ++static void filelayout_write_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ if (wdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ wdata->args.offset, wdata->fldata.orig_offset); ++ wdata->args.offset = wdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_write_release(void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ put_lseg(wdata->pdata.lseg); ++ wdata->pdata.lseg = NULL; ++ wdata->pdata.call_ops->rpc_release(data); ++} ++ ++struct rpc_call_ops filelayout_read_call_ops = { ++ .rpc_call_prepare = nfs_read_prepare, ++ .rpc_call_done = filelayout_read_call_done, ++ .rpc_release = filelayout_read_release, ++}; ++ ++struct rpc_call_ops filelayout_write_call_ops = { ++ .rpc_call_prepare = 
nfs_write_prepare, ++ .rpc_call_done = filelayout_write_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* Perform sync or async reads. ++ * ++ * An optimization for the NFS file layout driver ++ * allows the original read/write data structs to be passed in the ++ * last argument. ++ * ++ * TODO: join with write_pagelist? ++ */ ++static enum pnfs_try_status ++filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n", ++ __func__, data->inode->i_ino, nr_pages, ++ data->args.pgbase, (size_t)data->args.count, offset); ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s USE DS:ip %x %s\n", __func__, ++ htonl(ds->ds_ip_addr), ds->r_addr); ++ ++ /* just try the first data server for the index */ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ ++ /* ++ * Now get the file offset on the dserver ++ * Set the read offset to this offset, and ++ * save the original offset in orig_offset ++ * In the case of async reads, the offset will be reset in the ++ * call_ops->rpc_call_done() routine. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* Perform an asynchronous read */ ++ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_read_call_ops); ++ ++ data->pdata.pnfs_error = 0; ++ ++ return PNFS_ATTEMPTED; ++} ++ ++/* Perform async writes. 
*/ ++static enum pnfs_try_status ++filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu %s\n", __func__, ++ data->inode->i_ino, sync, (size_t) data->args.count, offset, ++ htonl(ds->ds_ip_addr), ntohs(ds->ds_port), ds->r_addr); ++ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ /* ++ * Get the file offset on the dserver. Set the write offset to ++ * this offset and save the original offset. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* ++ * Perform an asynchronous write The offset will be reset in the ++ * call_ops->rpc_call_done() routine ++ */ ++ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_write_call_ops, sync); ++ ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++} ++ ++/* ++ * Create a filelayout layout structure and return it. The pNFS client ++ * will use the pnfs_layout_type type to refer to the layout for this ++ * inode from now on. ++ */ ++static struct pnfs_layout_type * ++filelayout_alloc_layout(struct inode *inode) ++{ ++ struct nfs4_filelayout *flp; ++ ++ dprintk("NFS_FILELAYOUT: allocating layout\n"); ++ flp = kzalloc(sizeof(struct nfs4_filelayout), GFP_KERNEL); ++ return flp ? 
&flp->fl_layout : NULL; ++} ++ ++/* Free a filelayout layout structure */ ++static void ++filelayout_free_layout(struct pnfs_layout_type *lo) ++{ ++ dprintk("NFS_FILELAYOUT: freeing layout\n"); ++ kfree(FILE_LO(lo)); ++} ++ ++/* ++ * filelayout_check_layout() ++ * ++ * Make sure layout segment parameters are sane WRT the device. ++ * ++ * Notes: ++ * 1) current code insists that # stripe index = # data servers in ds_list ++ * which is wrong. ++ * 2) pattern_offset is ignored and must == 0 which is wrong; ++ * 3) the pattern_offset needs to be a multiple of the stripe unit. ++ * 4) stripe unit is multiple of page size ++ */ ++ ++static int ++filelayout_check_layout(struct pnfs_layout_type *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs4_filelayout_segment *fl = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ int status = -EINVAL; ++ struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo)); ++ ++ dprintk("--> %s\n", __func__); ++ dsaddr = nfs4_pnfs_device_item_find(nfss->nfs_client, &fl->dev_id); ++ if (dsaddr == NULL) { ++ dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id); ++ if (dsaddr == NULL) { ++ dprintk("%s NO device for dev_id %s\n", ++ __func__, deviceid_fmt(&fl->dev_id)); ++ goto out; ++ } ++ } ++ if (fl->first_stripe_index < 0 || ++ fl->first_stripe_index >= dsaddr->stripe_count) { ++ dprintk("%s Bad first_stripe_index %d\n", ++ __func__, fl->first_stripe_index); ++ goto out; ++ } ++ ++ if (fl->pattern_offset != 0) { ++ dprintk("%s Unsupported no-zero pattern_offset %Ld\n", ++ __func__, fl->pattern_offset); ++ goto out; ++ } ++ ++ if (fl->stripe_unit % PAGE_SIZE) { ++ dprintk("%s Stripe unit (%u) not page aligned\n", ++ __func__, fl->stripe_unit); ++ goto out; ++ } ++ ++ /* XXX only support SPARSE packing. 
Don't support use MDS open fh */ ++ if (!(fl->num_fh == 1 || fl->num_fh == dsaddr->ds_num)) { ++ dprintk("%s num_fh %u not equal to 1 or ds_num %u\n", ++ __func__, fl->num_fh, dsaddr->ds_num); ++ goto out; ++ } ++ ++ if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { ++ dprintk("%s Stripe unit (%u) not aligned with rsize %u " ++ "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, ++ nfss->wsize); ++ } ++ ++ /* reference the device */ ++ nfs4_set_layout_deviceid(lseg, &dsaddr->deviceid); ++ ++ status = 0; ++out: ++ dprintk("--> %s returns %d\n", __func__, status); ++ return status; ++} ++ ++static void _filelayout_free_lseg(struct pnfs_layout_segment *lseg); ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl); ++ ++/* Decode layout and store in layoutid. Overwrite any existing layout ++ * information for this file. ++ */ ++static int ++filelayout_set_layout(struct nfs4_filelayout *flo, ++ struct nfs4_filelayout_segment *fl, ++ struct nfs4_pnfs_layoutget_res *lgr) ++{ ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t nfl_util; ++ int i; ++ ++ dprintk("%s: set_layout_map Begin\n", __func__); ++ ++ memcpy(&fl->dev_id, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ nfl_util = be32_to_cpup(p++); ++ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) ++ fl->commit_through_mds = 1; ++ if (nfl_util & NFL4_UFLG_DENSE) ++ fl->stripe_type = STRIPE_DENSE; ++ else ++ fl->stripe_type = STRIPE_SPARSE; ++ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; ++ ++ if (!flo->stripe_unit) ++ flo->stripe_unit = fl->stripe_unit; ++ else if (flo->stripe_unit != fl->stripe_unit) { ++ printk(KERN_NOTICE "%s: updating strip_unit from %u to %u\n", ++ __func__, flo->stripe_unit, fl->stripe_unit); ++ flo->stripe_unit = fl->stripe_unit; ++ } ++ ++ fl->first_stripe_index = be32_to_cpup(p++); ++ p = xdr_decode_hyper(p, &fl->pattern_offset); ++ fl->num_fh = be32_to_cpup(p++); ++ ++ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po 
%llu dev_id %s\n", ++ __func__, nfl_util, fl->num_fh, fl->first_stripe_index, ++ fl->pattern_offset, deviceid_fmt(&fl->dev_id)); ++ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) { ++ fl->fh_array = vmalloc(fl->num_fh * sizeof(struct nfs_fh)); ++ if (fl->fh_array) ++ memset(fl->fh_array, 0, ++ fl->num_fh * sizeof(struct nfs_fh)); ++ } else { ++ fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh), ++ GFP_KERNEL); ++ } ++ if (!fl->fh_array) ++ return -ENOMEM; ++ ++ for (i = 0; i < fl->num_fh; i++) { ++ /* fh */ ++ fl->fh_array[i].size = be32_to_cpup(p++); ++ if (sizeof(struct nfs_fh) < fl->fh_array[i].size) { ++ printk(KERN_ERR "Too big fh %d received %d\n", ++ i, fl->fh_array[i].size); ++ /* Layout is now invalid, pretend it doesn't exist */ ++ filelayout_free_fh_array(fl); ++ fl->num_fh = 0; ++ break; ++ } ++ memcpy(fl->fh_array[i].data, p, fl->fh_array[i].size); ++ p += XDR_QUADLEN(fl->fh_array[i].size); ++ dprintk("DEBUG: %s: fh len %d\n", __func__, ++ fl->fh_array[i].size); ++ } ++ ++ return 0; ++} ++ ++static struct pnfs_layout_segment * ++filelayout_alloc_lseg(struct pnfs_layout_type *layoutid, ++ struct nfs4_pnfs_layoutget_res *lgr) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(layoutid); ++ struct pnfs_layout_segment *lseg; ++ int rc; ++ ++ dprintk("--> %s\n", __func__); ++ lseg = kzalloc(sizeof(struct pnfs_layout_segment) + ++ sizeof(struct nfs4_filelayout_segment), GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ ++ rc = filelayout_set_layout(flo, LSEG_LD_DATA(lseg), lgr); ++ ++ if (rc != 0 || filelayout_check_layout(layoutid, lseg)) { ++ _filelayout_free_lseg(lseg); ++ lseg = NULL; ++ } ++ return lseg; ++} ++ ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) ++{ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) ++ vfree(fl->fh_array); ++ else ++ kfree(fl->fh_array); ++ ++ fl->fh_array = NULL; ++} ++ ++static void ++_filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ 
filelayout_free_fh_array(LSEG_LD_DATA(lseg)); ++ kfree(lseg); ++} ++ ++static void ++filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("--> %s\n", __func__); ++ nfs4_unset_layout_deviceid(lseg, lseg->deviceid, ++ nfs4_fl_free_deviceid_callback); ++ _filelayout_free_lseg(lseg); ++} ++ ++/* Allocate a new nfs_write_data struct and initialize */ ++static struct nfs_write_data * ++filelayout_clone_write_data(struct nfs_write_data *old) ++{ ++ struct nfs_write_data *new; ++ ++ new = nfs_commitdata_alloc(); ++ if (!new) ++ goto out; ++ kref_init(&new->refcount); ++ new->parent = old; ++ kref_get(&old->refcount); ++ new->inode = old->inode; ++ new->cred = old->cred; ++ new->args.offset = 0; ++ new->args.count = 0; ++ new->res.count = 0; ++ new->res.fattr = &new->fattr; ++ nfs_fattr_init(&new->fattr); ++ new->res.verf = &new->verf; ++ new->args.context = get_nfs_open_context(old->args.context); ++ new->pdata.lseg = NULL; ++ new->pdata.call_ops = old->pdata.call_ops; ++ new->pdata.how = old->pdata.how; ++out: ++ return new; ++} ++ ++static void filelayout_commit_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static struct rpc_call_ops filelayout_commit_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ .rpc_call_done = filelayout_commit_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* ++ * Execute a COMMIT op to the MDS or to each data server on which a page ++ * in 'pages' exists. ++ * Invoke the pnfs_commit_complete callback. 
++ */ ++enum pnfs_try_status ++filelayout_commit(struct nfs_write_data *data, int sync) ++{ ++ LIST_HEAD(head); ++ struct nfs_page *req; ++ loff_t file_offset = 0; ++ u16 idx, i; ++ struct list_head **ds_page_list = NULL; ++ u16 *indices_used; ++ int num_indices_seen = 0; ++ const struct rpc_call_ops *call_ops; ++ struct rpc_clnt *clnt; ++ struct nfs_write_data **clone_list = NULL; ++ struct nfs_write_data *dsdata; ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("%s data %p sync %d\n", __func__, data, sync); ++ ++ /* Alloc room for both in one go */ ++ ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) * ++ (sizeof(u16) + sizeof(struct list_head *)), ++ GFP_KERNEL); ++ if (!ds_page_list) ++ goto mem_error; ++ indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1); ++ /* ++ * Sort pages based on which ds to send to. ++ * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT. ++ * Note we are assuming there is only a single lseg in play. ++ * When that is not true, we could first sort on lseg, then ++ * sort within each as we do here. 
++ */ ++ while (!list_empty(&data->pages)) { ++ req = nfs_list_entry(data->pages.next); ++ nfs_list_remove_request(req); ++ if (!req->wb_lseg || ++ ((struct nfs4_filelayout_segment *) ++ LSEG_LD_DATA(req->wb_lseg))->commit_through_mds) ++ idx = NFS4_PNFS_MAX_MULTI_CNT; ++ else { ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset); ++ } ++ if (ds_page_list[idx]) { ++ /* Already seen this idx */ ++ list_add(&req->wb_list, ds_page_list[idx]); ++ } else { ++ /* New idx not seen so far */ ++ list_add_tail(&req->wb_list, &head); ++ indices_used[num_indices_seen++] = idx; ++ } ++ ds_page_list[idx] = &req->wb_list; ++ } ++ /* Once created, clone must be released via call_op */ ++ clone_list = kzalloc(num_indices_seen * ++ sizeof(struct nfs_write_data *), GFP_KERNEL); ++ if (!clone_list) ++ goto mem_error; ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ clone_list[i] = filelayout_clone_write_data(data); ++ if (!clone_list[i]) ++ goto mem_error; ++ } ++ clone_list[i] = data; ++ /* ++ * Now send off the RPCs to each ds. Note that it is important ++ * that any RPC to the MDS be sent last (or at least after all ++ * clones have been made.) 
++ */ ++ for (i = 0; i < num_indices_seen; i++) { ++ dsdata = clone_list[i]; ++ idx = indices_used[i]; ++ list_cut_position(&dsdata->pages, &head, ds_page_list[idx]); ++ if (idx == NFS4_PNFS_MAX_MULTI_CNT) { ++ call_ops = data->pdata.call_ops;; ++ clnt = NFS_CLIENT(dsdata->inode); ++ ds = NULL; ++ } else { ++ struct nfs_fh *fh; ++ ++ call_ops = &filelayout_commit_call_ops; ++ req = nfs_list_entry(dsdata->pages.next); ++ ds = nfs4_fl_prepare_ds(req->wb_lseg, idx); ++ if (!ds) { ++ /* Trigger retry of this chunk through MDS */ ++ dsdata->task.tk_status = -EIO; ++ data->pdata.call_ops->rpc_release(dsdata); ++ continue; ++ } ++ clnt = ds->ds_clp->cl_rpcclient; ++ dsdata->fldata.ds_nfs_client = ds->ds_clp; ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset); ++ if (fh) ++ dsdata->args.fh = fh; ++ } ++ dprintk("%s: Initiating commit: %llu USE DS:\n", ++ __func__, file_offset); ++ print_ds(ds); ++ ++ /* Send COMMIT to data server */ ++ nfs_initiate_commit(dsdata, clnt, call_ops, sync); ++ } ++ kfree(clone_list); ++ kfree(ds_page_list); ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++ ++ mem_error: ++ if (clone_list) { ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ if (!clone_list[i]) ++ break; ++ data->pdata.call_ops->rpc_release(clone_list[i]); ++ } ++ kfree(clone_list); ++ } ++ kfree(ds_page_list); ++ /* One of these will be empty, but doesn't hurt to do both */ ++ nfs_mark_list_commit(&head); ++ nfs_mark_list_commit(&data->pages); ++ data->pdata.call_ops->rpc_release(data); ++ return PNFS_ATTEMPTED; ++} ++ ++/* Return the stripesize for the specified file */ ++ssize_t ++filelayout_get_stripesize(struct pnfs_layout_type *layoutid) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(layoutid); ++ ++ return flo->stripe_unit; ++} ++ ++/* ++ * filelayout_pg_test(). 
Called by nfs_can_coalesce_requests() ++ * ++ * return 1 : coalesce page ++ * return 0 : don't coalesce page ++ */ ++int ++filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ u64 p_stripe, r_stripe; ++ ++ if (pgio->pg_boundary == 0) ++ return 1; ++ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; ++ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; ++ ++ do_div(p_stripe, pgio->pg_boundary); ++ do_div(r_stripe, pgio->pg_boundary); ++ ++ return (p_stripe == r_stripe); ++} ++ ++struct layoutdriver_io_operations filelayout_io_operations = { ++ .commit = filelayout_commit, ++ .read_pagelist = filelayout_read_pagelist, ++ .write_pagelist = filelayout_write_pagelist, ++ .alloc_layout = filelayout_alloc_layout, ++ .free_layout = filelayout_free_layout, ++ .alloc_lseg = filelayout_alloc_lseg, ++ .free_lseg = filelayout_free_lseg, ++ .initialize_mountpoint = filelayout_initialize_mountpoint, ++ .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, ++}; ++ ++struct layoutdriver_policy_operations filelayout_policy_operations = { ++ .flags = PNFS_USE_RPC_CODE, ++ .get_stripesize = filelayout_get_stripesize, ++ .pg_test = filelayout_pg_test, ++}; ++ ++struct pnfs_layoutdriver_type filelayout_type = { ++ .id = LAYOUT_NFSV4_1_FILES, ++ .name = "LAYOUT_NFSV4_1_FILES", ++ .ld_io_ops = &filelayout_io_operations, ++ .ld_policy_ops = &filelayout_policy_operations, ++}; ++ ++static int __init nfs4filelayout_init(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", ++ __func__); ++ ++ /* ++ * Need to register file_operations struct with global list to indicate ++ * that NFS4 file layout is a possible pNFS I/O module ++ */ ++ pnfs_callback_ops = pnfs_register_layoutdriver(&filelayout_type); ++ ++ return 0; ++} ++ ++static void __exit nfs4filelayout_exit(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", ++ __func__); ++ ++ /* Unregister NFS4 file layout driver 
with pNFS client*/ ++ pnfs_unregister_layoutdriver(&filelayout_type); ++} ++ ++module_init(nfs4filelayout_init); ++module_exit(nfs4filelayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-23 12:09:03.334491472 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-23 12:09:03.335501543 -0400 +@@ -0,0 +1,636 @@ ++/* ++ * linux/fs/nfs/nfs4filelayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 file layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * Garth Goodson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include "nfs4filelayout.h" ++#include "internal.h" ++#include "nfs4_fs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++DEFINE_SPINLOCK(nfs4_ds_cache_lock); ++static LIST_HEAD(nfs4_data_server_cache); ++ ++void ++print_ds(struct nfs4_pnfs_ds *ds) ++{ ++ if (ds == NULL) { ++ dprintk("%s NULL device \n", __func__); ++ return; ++ } ++ dprintk(" ip_addr %x\n", ntohl(ds->ds_ip_addr)); ++ dprintk(" port %hu\n", ntohs(ds->ds_port)); ++ dprintk(" client %p\n", ds->ds_clp); ++ dprintk(" ref count %d\n", atomic_read(&ds->ds_count)); ++ if (ds->ds_clp) ++ dprintk(" cl_exchange_flags %x\n", ++ ds->ds_clp->cl_exchange_flags); ++ dprintk(" ip:port %s\n", ds->r_addr); ++} ++ ++void ++print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ int i; ++ ++ dprintk("%s dsaddr->ds_num %d\n", __func__, ++ dsaddr->ds_num); ++ for (i = 0; i < dsaddr->ds_num; i++) ++ print_ds(dsaddr->ds_list[i]); ++} ++ ++/* ++ * Debugging function assuming a 64bit major/minor split of the deviceid. ++ * Returns a pointer to a static buffer, so the result is overwritten by ++ * the next call. The buffer holds two u64 values in decimal (up to 20 ++ * digits each), the separating space and the terminating NUL. ++ */ ++char * ++deviceid_fmt(const struct pnfs_deviceid *dev_id) ++{ ++ static char buf[42]; ++ uint32_t *p = (uint32_t *)dev_id->data; ++ uint64_t major, minor; ++ ++ p = xdr_decode_hyper(p, &major); ++ p = xdr_decode_hyper(p, &minor); ++ ++ /* Bounded write: "%08llu" is a minimum width, not a maximum */ ++ snprintf(buf, sizeof(buf), "%08llu %08llu", major, minor); ++ return buf; ++} ++ ++/* nfs4_ds_cache_lock is held */ ++static inline 
struct nfs4_pnfs_ds * ++_data_server_lookup(u32 ip_addr, u32 port) ++{ ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", ++ ntohl(ip_addr), ntohs(port)); ++ ++ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { ++ if (ds->ds_ip_addr == ip_addr && ++ ds->ds_port == port) { ++ return ds; ++ } ++ } ++ return NULL; ++} ++ ++/* Create an rpc to the data server defined in 'dev_list' */ ++static int ++nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) ++{ ++ struct nfs_server *tmp; ++ struct sockaddr_in sin; ++ struct rpc_clnt *mds_clnt = mds_srv->client; ++ struct nfs_client *clp = mds_srv->nfs_client; ++ struct sockaddr *mds_addr; ++ int err = 0; ++ ++ dprintk("--> %s ip:port %s au_flavor %d\n", __func__, ++ ds->r_addr, mds_clnt->cl_auth->au_flavor); ++ ++ sin.sin_family = AF_INET; ++ sin.sin_addr.s_addr = ds->ds_ip_addr; ++ sin.sin_port = ds->ds_port; ++ ++ /* ++ * If this DS is also the MDS, use the MDS session only if the ++ * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role. ++ */ ++ mds_addr = (struct sockaddr *)&clp->cl_addr; ++ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) { ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data " ++ "Server\n", ds->r_addr); ++ err = -ENODEV; ++ } else { ++ atomic_inc(&clp->cl_count); ++ ds->ds_clp = clp; ++ dprintk("%s Using MDS Session for DS\n", __func__); ++ } ++ goto out; ++ } ++ ++ /* Temporary server for nfs4_set_client */ ++ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); ++ if (!tmp) { ++ /* err is still 0 here; do not report success without a client */ ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ /* ++ * Set a retrans, timeout interval, and authflavor equal to the MDS ++ * values. Use the MDS nfs_client cl_ipaddr field so as to use the ++ * same co_ownerid as the MDS. 
++ */ ++ err = nfs4_set_client(tmp, ++ mds_srv->nfs_client->cl_hostname, ++ (struct sockaddr *)&sin, ++ sizeof(struct sockaddr), ++ mds_srv->nfs_client->cl_ipaddr, ++ mds_clnt->cl_auth->au_flavor, ++ IPPROTO_TCP, ++ mds_clnt->cl_xprt->timeout, ++ 1 /* minorversion */); ++ if (err < 0) ++ goto out_free; ++ ++ clp = tmp->nfs_client; ++ ++ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */ ++ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp); ++ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS; ++ ++ err = nfs4_recover_expired_lease(clp); ++ if (!err) ++ err = nfs4_check_client_ready(clp); ++ if (err) ++ goto out_put; ++ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data Server\n", ++ ds->r_addr); ++ err = -ENODEV; ++ goto out_put; ++ } ++ /* ++ * Mask the (possibly) returned EXCHGID4_FLAG_USE_PNFS_MDS pNFS role ++ * The is_ds_only_session depends on this. ++ */ ++ clp->cl_exchange_flags &= ~EXCHGID4_FLAG_USE_PNFS_MDS; ++ /* ++ * Set DS lease equal to the MDS lease, renewal is scheduled in ++ * create_session ++ */ ++ spin_lock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; ++ spin_unlock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_last_renewal = jiffies; ++ ++ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); ++ ds->ds_clp = clp; ++ ++ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), ++ clp->cl_rpcclient); ++out_free: ++ kfree(tmp); ++out: ++ dprintk("%s Returns %d\n", __func__, err); ++ return err; ++out_put: ++ nfs_put_client(clp); ++ goto out_free; ++} ++ ++static void ++destroy_ds(struct nfs4_pnfs_ds *ds) ++{ ++ dprintk("--> %s\n", __func__); ++ print_ds(ds); ++ ++ if (ds->ds_clp) ++ nfs_put_client(ds->ds_clp); ++ kfree(ds); ++} ++ ++static void ++nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ struct nfs4_pnfs_ds *ds; ++ int i; ++ ++ dprintk("%s: device id=%s\n", 
__func__, ++ deviceid_fmt(&dsaddr->deviceid.de_id)); ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ ds = dsaddr->ds_list[i]; ++ if (ds != NULL) { ++ if (atomic_dec_and_lock(&ds->ds_count, ++ &nfs4_ds_cache_lock)) { ++ list_del_init(&ds->ds_node); ++ spin_unlock(&nfs4_ds_cache_lock); ++ destroy_ds(ds); ++ } ++ } ++ } ++ kfree(dsaddr->stripe_indices); ++ kfree(dsaddr); ++} ++ ++void ++nfs4_fl_free_deviceid_callback(struct kref *kref) ++{ ++ struct nfs4_deviceid *device = ++ container_of(kref, struct nfs4_deviceid, de_kref); ++ struct nfs4_file_layout_dsaddr *dsaddr = ++ container_of(device, struct nfs4_file_layout_dsaddr, deviceid); ++ ++ nfs4_fl_free_deviceid(dsaddr); ++} ++ ++static void ++nfs4_pnfs_ds_add(struct inode *inode, struct nfs4_pnfs_ds **dsp, ++ u32 ip_addr, u32 port, char *r_addr, int len) ++{ ++ struct nfs4_pnfs_ds *tmp_ds, *ds; ++ ++ *dsp = NULL; ++ ++ ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); ++ if (!ds) ++ return; ++ ++ spin_lock(&nfs4_ds_cache_lock); ++ tmp_ds = _data_server_lookup(ip_addr, port); ++ if (tmp_ds == NULL) { ++ ds->ds_ip_addr = ip_addr; ++ ds->ds_port = port; ++ strncpy(ds->r_addr, r_addr, len); ++ atomic_set(&ds->ds_count, 1); ++ INIT_LIST_HEAD(&ds->ds_node); ++ ds->ds_clp = NULL; ++ list_add(&ds->ds_node, &nfs4_data_server_cache); ++ *dsp = ds; ++ dprintk("%s add new data server ip 0x%x\n", __func__, ++ ds->ds_ip_addr); ++ spin_unlock(&nfs4_ds_cache_lock); ++ } else { ++ atomic_inc(&tmp_ds->ds_count); ++ *dsp = tmp_ds; ++ dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", ++ __func__, tmp_ds->ds_ip_addr, ++ atomic_read(&tmp_ds->ds_count)); ++ spin_unlock(&nfs4_ds_cache_lock); ++ kfree(ds); ++ } ++} ++ ++static struct nfs4_pnfs_ds * ++decode_and_add_ds(uint32_t **pp, struct inode *inode) ++{ ++ struct nfs4_pnfs_ds *ds = NULL; ++ char r_addr[29]; /* max size of ip/port string */ ++ int len; ++ u32 ip_addr, port; ++ int tmp[6]; ++ uint32_t *p = *pp; ++ ++ dprintk("%s enter\n", __func__); ++ /* check and skip r_netid */ 
++ len = be32_to_cpup(p++); ++ /* "tcp" */ ++ if (len != 3) { ++ printk("%s: ERROR: non TCP r_netid len %d\n", ++ __func__, len); ++ goto out_err; ++ } ++ /* ++ * Read the bytes into a temporary buffer ++ * XXX: should probably sanity check them ++ */ ++ tmp[0] = be32_to_cpup(p++); ++ ++ len = be32_to_cpup(p++); ++ if (len >= sizeof(r_addr)) { ++ printk("%s: ERROR: Device ip/port too long (%d)\n", ++ __func__, len); ++ goto out_err; ++ } ++ memcpy(r_addr, p, len); ++ p += XDR_QUADLEN(len); ++ *pp = p; ++ r_addr[len] = '\0'; ++ /* All six fields (4 address octets, 2 port octets) must parse */ ++ if (sscanf(r_addr, "%d.%d.%d.%d.%d.%d", &tmp[0], &tmp[1], ++ &tmp[2], &tmp[3], &tmp[4], &tmp[5]) != 6) { ++ printk("%s: ERROR: malformed ip/port string %s\n", ++ __func__, r_addr); ++ goto out_err; ++ } ++ ip_addr = htonl((tmp[0]<<24) | (tmp[1]<<16) | (tmp[2]<<8) | (tmp[3])); ++ port = htons((tmp[4] << 8) | (tmp[5])); ++ ++ nfs4_pnfs_ds_add(inode, &ds, ip_addr, port, r_addr, len); ++ ++ dprintk("%s: addr:port string = %s\n", __func__, r_addr); ++ return ds; ++out_err: ++ dprintk("%s returned NULL\n", __func__); ++ return NULL; ++} ++ ++/* Decode opaque device data and return the result */ ++static struct nfs4_file_layout_dsaddr* ++decode_device(struct inode *ino, struct pnfs_device *pdev) ++{ ++ int i, dummy; ++ u32 cnt, num; ++ u8 *indexp; ++ uint32_t *p = (u32 *)pdev->area, *indicesp; ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ /* Get the stripe count (number of stripe index) */ ++ cnt = be32_to_cpup(p++); ++ dprintk("%s stripe count %d\n", __func__, cnt); ++ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { ++ printk(KERN_WARNING "%s: stripe count %d greater than " ++ "supported maximum %d\n", __func__, ++ cnt, NFS4_PNFS_MAX_STRIPE_CNT); ++ goto out_err; ++ } ++ ++ /* Check the multipath list count */ ++ indicesp = p; ++ p += XDR_QUADLEN(cnt << 2); ++ num = be32_to_cpup(p++); ++ dprintk("%s ds_num %u\n", __func__, num); ++ if (num > NFS4_PNFS_MAX_MULTI_CNT) { ++ printk(KERN_WARNING "%s: multipath count %d greater than " ++ "supported maximum %d\n", __func__, ++ num, NFS4_PNFS_MAX_MULTI_CNT); ++ goto out_err; ++ } ++ dsaddr = kzalloc(sizeof(*dsaddr) + ++ 
(sizeof(struct nfs4_pnfs_ds *) * (num - 1)), ++ GFP_KERNEL); ++ if (!dsaddr) ++ goto out_err; ++ ++ dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); ++ if (!dsaddr->stripe_indices) ++ goto out_err_free; ++ ++ dsaddr->stripe_count = cnt; ++ dsaddr->ds_num = num; ++ ++ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ ++ /* Go back and read stripe indices */ ++ p = indicesp; ++ indexp = &dsaddr->stripe_indices[0]; ++ for (i = 0; i < dsaddr->stripe_count; i++) { ++ dummy = be32_to_cpup(p++); ++ /* Each stripe index is later used to index ds_list[] */ ++ if ((u32)dummy >= dsaddr->ds_num) { ++ printk(KERN_WARNING "%s: stripe index %d out of " ++ "range (ds_num %u)\n", __func__, ++ dummy, dsaddr->ds_num); ++ goto out_err_free; ++ } ++ *indexp = dummy; /* fits a u8: ds_num <= NFS4_PNFS_MAX_MULTI_CNT */ ++ indexp++; ++ } ++ /* Skip already read multipath list count */ ++ p++; ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ int j; ++ ++ dummy = be32_to_cpup(p++); /* multipath count */ ++ if (dummy > 1) { ++ printk(KERN_WARNING ++ "%s: Multipath count %d not supported, " ++ "skipping all greater than 1\n", __func__, ++ dummy); ++ } ++ for (j = 0; j < dummy; j++) { ++ if (j == 0) { ++ dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); ++ if (dsaddr->ds_list[i] == NULL) ++ goto out_err_free; ++ } else { ++ u32 len; ++ /* skip extra multipath */ ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ continue; ++ } ++ } ++ } ++ nfs4_init_deviceid_node(&dsaddr->deviceid); ++ ++ return dsaddr; ++ ++out_err_free: ++ nfs4_fl_free_deviceid(dsaddr); ++out_err: ++ dprintk("%s ERROR: returning NULL\n", __func__); ++ return NULL; ++} ++ ++/* ++ * Decode the opaque device specified in 'dev' ++ * and add it to the list of available devices. ++ * If the deviceid is already cached, nfs4_add_deviceid will return ++ * a pointer to the cached struct and throw away the new. 
++ */ ++static struct nfs4_file_layout_dsaddr* ++decode_and_add_device(struct inode *inode, struct pnfs_device *dev) ++{ ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ struct nfs4_deviceid *d; ++ ++ dsaddr = decode_device(inode, dev); ++ if (!dsaddr) { ++ printk(KERN_WARNING "%s: Could not decode or add device\n", ++ __func__); ++ return NULL; ++ } ++ ++ d = nfs4_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, ++ &dsaddr->deviceid); ++ ++ return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Retrieve the information for dev_id, add it to the list ++ * of available devices, and return it. ++ */ ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id) ++{ ++ struct pnfs_device *pdev = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ struct nfs4_file_layout_dsaddr *dsaddr = NULL; ++ int rc, i; ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s inode %p max_resp_sz %u max_pages %d\n", ++ __func__, inode, max_resp_sz, max_pages); ++ ++ pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); ++ if (pdev == NULL) ++ return NULL; ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(pdev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set pdev->area */ ++ pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!pdev->area) ++ goto out_free; ++ ++ memcpy(&pdev->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE); ++ pdev->layout_type = LAYOUT_NFSV4_1_FILES; ++ pdev->pages = pages; ++ pdev->pgbase = 0; ++ pdev->pglen = PAGE_SIZE * max_pages; ++ pdev->mincount = 0; ++ /* TODO: Update 
types when CB_NOTIFY_DEVICEID is available */ ++ pdev->dev_notify_types = 0; ++ ++ rc = pnfs_callback_ops->nfs_getdeviceinfo(server, pdev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ /* ++ * Found new device, need to decode it and then add it to the ++ * list of known devices for this mountpoint. ++ */ ++ dsaddr = decode_and_add_device(inode, pdev); ++out_free: ++ if (pdev->area != NULL) ++ vunmap(pdev->area); ++ /* Allocation may have stopped early; unfilled slots are NULL */ ++ for (i = 0; i < max_pages; i++) { ++ if (pages[i]) ++ __free_page(pages[i]); ++ } ++ kfree(pages); ++ kfree(pdev); ++ dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); ++ return dsaddr; ++} ++ ++struct nfs4_file_layout_dsaddr * ++nfs4_pnfs_device_item_find(struct nfs_client *clp, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ ++ d = nfs4_find_deviceid(clp->cl_devid_cache, id); ++ dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__, ++ deviceid_fmt(id), d); ++ return (d == NULL) ? NULL : ++ container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit ++ * Then: ((res + fsi) % dsaddr->stripe_count) ++ */ ++static inline u32 ++_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u64 tmp; ++ ++ tmp = offset - flseg->pattern_offset; ++ do_div(tmp, flseg->stripe_unit); ++ tmp += flseg->first_stripe_index; ++ return do_div(tmp, FILE_DSADDR(lseg)->stripe_count); ++} ++ ++u32 ++nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ u32 j; ++ ++ j = _nfs4_fl_calc_j_index(lseg, offset); ++ return FILE_DSADDR(lseg)->stripe_indices[j]; ++} ++ ++struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u32 i; ++ ++ if (flseg->stripe_type == STRIPE_SPARSE) { ++ if (flseg->num_fh == 1) ++ i = 0; ++ else if (flseg->num_fh == 0) ++ return 
NULL; ++ else ++ i = nfs4_fl_calc_ds_index(lseg, offset); ++ } else ++ i = _nfs4_fl_calc_j_index(lseg, offset); ++ return &flseg->fh_array[i]; ++} ++ ++struct nfs4_pnfs_ds * ++nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ dsaddr = FILE_DSADDR(lseg); ++ if (dsaddr->ds_list[ds_idx] == NULL) { ++ printk(KERN_ERR "%s: No data server for device id (%s)!!\n", ++ __func__, deviceid_fmt(&flseg->dev_id)); ++ return NULL; ++ } ++ ++ if (!dsaddr->ds_list[ds_idx]->ds_clp) { ++ int err; ++ ++ err = nfs4_pnfs_ds_create(PNFS_NFS_SERVER(lseg->layout), ++ dsaddr->ds_list[ds_idx]); ++ if (err) { ++ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n", ++ __func__, err); ++ return NULL; ++ } ++ } ++ dprintk("%s: dev_id=%s, ds_idx=%u\n", ++ __func__, deviceid_fmt(&flseg->dev_id), ds_idx); ++ ++ return dsaddr->ds_list[ds_idx]; ++} ++ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-23 12:09:03.335501543 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-23 12:09:03.335501543 -0400 +@@ -0,0 +1,97 @@ ++/* ++ * pnfs_nfs4filelayout.h ++ * ++ * NFSv4 file layout driver data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_NFS4FILELAYOUT_H ++#define FS_NFS_NFS4FILELAYOUT_H ++ ++#include ++#include ++#include ++ ++#define NFS4_PNFS_DEV_HASH_BITS 5 ++#define NFS4_PNFS_DEV_HASH_SIZE (1 << NFS4_PNFS_DEV_HASH_BITS) ++#define NFS4_PNFS_DEV_HASH_MASK (NFS4_PNFS_DEV_HASH_SIZE - 1) ++ ++#define NFS4_PNFS_MAX_STRIPE_CNT 4096 ++#define NFS4_PNFS_MAX_MULTI_CNT 64 /* 256 fit into a u8 stripe_index */ ++#define NFS4_PNFS_MAX_MULTI_DS 2 ++ ++#define FILE_DSADDR(lseg) (container_of(lseg->deviceid, \ ++ struct nfs4_file_layout_dsaddr, \ ++ deviceid)) ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++/* Individual ip address */ ++struct nfs4_pnfs_ds { ++ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ ++ u32 ds_ip_addr; ++ u32 ds_port; ++ struct nfs_client *ds_clp; ++ atomic_t ds_count; ++ char r_addr[29]; ++}; ++ ++struct nfs4_file_layout_dsaddr { ++ struct nfs4_deviceid deviceid; ++ u32 stripe_count; ++ u8 *stripe_indices; ++ u32 ds_num; ++ struct nfs4_pnfs_ds *ds_list[1]; ++}; ++ ++struct nfs4_pnfs_dev_hlist { ++ rwlock_t dev_lock; ++ struct hlist_head dev_list[NFS4_PNFS_DEV_HASH_SIZE]; ++}; ++ ++struct nfs4_filelayout_segment { ++ u32 stripe_type; ++ u32 commit_through_mds; ++ u32 stripe_unit; ++ u32 first_stripe_index; ++ u64 pattern_offset; ++ struct pnfs_deviceid dev_id; ++ unsigned int num_fh; ++ struct nfs_fh *fh_array; ++}; ++ ++struct nfs4_filelayout { ++ struct pnfs_layout_type fl_layout; ++ u32 stripe_unit; ++}; ++ ++extern struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset); ++ ++static inline struct nfs4_filelayout * ++FILE_LO(struct pnfs_layout_type *lo) ++{ ++ return container_of(lo, struct nfs4_filelayout, fl_layout); ++} ++ ++extern struct pnfs_client_operations *pnfs_callback_ops; ++ ++extern void nfs4_fl_free_deviceid_callback(struct kref *); ++extern void print_ds(struct nfs4_pnfs_ds *ds); ++char *deviceid_fmt(const struct pnfs_deviceid *dev_id); ++u32 
nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset); ++struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, ++ u32 ds_idx); ++extern struct nfs4_file_layout_dsaddr * ++nfs4_pnfs_device_item_find(struct nfs_client *, struct pnfs_deviceid *dev_id); ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id); ++ ++#endif /* FS_NFS_NFS4FILELAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-23 12:08:29.047512264 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-23 12:09:03.336490079 -0400 +@@ -45,8 +45,28 @@ enum nfs4_client_state { + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, + NFS4CLNT_SESSION_RESET, +- NFS4CLNT_SESSION_DRAINING, + NFS4CLNT_RECALL_SLOT, ++ NFS4CLNT_LAYOUT_RECALL, ++}; ++ ++enum nfs4_session_state { ++ NFS4_SESSION_INITING, ++ NFS4_SESSION_DRAINING, ++}; ++ ++struct nfs4_minor_version_ops { ++ u32 minor_version; ++ ++ int (*call_sync)(struct nfs_server *server, ++ struct rpc_message *msg, ++ struct nfs4_sequence_args *args, ++ struct nfs4_sequence_res *res, ++ int cache_reply); ++ int (*validate_stateid)(struct nfs_delegation *, ++ const nfs4_stateid *); ++ const struct nfs4_state_recovery_ops *reboot_recovery_ops; ++ const struct nfs4_state_recovery_ops *nograce_recovery_ops; ++ const struct nfs4_state_maintenance_ops *state_renewal_ops; + }; + + /* +@@ -89,7 +109,6 @@ struct nfs_unique_id { + */ + struct nfs4_state_owner { + struct nfs_unique_id so_owner_id; +- struct nfs_client *so_client; + struct nfs_server *so_server; + struct rb_node so_client_node; + +@@ -99,7 +118,6 @@ struct nfs4_state_owner { + atomic_t so_count; + unsigned long so_flags; + struct list_head so_states; +- struct list_head so_delegations; + struct nfs_seqid_counter so_seqid; + struct rpc_sequence so_sequence; + }; +@@ -125,10 +143,20 @@ enum { + * LOCK: one nfs4_state (LOCK) to 
hold the lock stateid nfs4_state(OPEN) + */ + ++struct nfs4_lock_owner { ++ unsigned int lo_type; ++#define NFS4_ANY_LOCK_TYPE (0U) ++#define NFS4_FLOCK_LOCK_TYPE (1U << 0) ++#define NFS4_POSIX_LOCK_TYPE (1U << 1) ++ union { ++ fl_owner_t posix_owner; ++ pid_t flock_owner; ++ } lo_u; ++}; ++ + struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ +- fl_owner_t ls_owner; /* POSIX lock owner */ + #define NFS_LOCK_INITIALIZED 1 + int ls_flags; + struct nfs_seqid_counter ls_seqid; +@@ -136,6 +164,7 @@ struct nfs4_lock_state { + struct nfs_unique_id ls_id; + nfs4_stateid ls_stateid; + atomic_t ls_count; ++ struct nfs4_lock_owner ls_owner; + }; + + /* bits for nfs4_state->flags */ +@@ -219,22 +248,34 @@ extern int nfs4_open_revalidate(struct i + extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); + extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations *fs_locations, struct page *page); ++extern void nfs4_release_lockowner(const struct nfs4_lock_state *); + +-extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[]; +-extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[]; + #if defined(CONFIG_NFS_V4_1) +-extern int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return server->nfs_client->cl_session; ++} ++ ++extern int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); + extern void nfs4_destroy_session(struct nfs4_session *session); + extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); ++extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_create_session(struct nfs_client *); + 
extern int nfs4_proc_destroy_session(struct nfs4_session *); + extern int nfs4_init_session(struct nfs_server *server); + extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); + #else /* CONFIG_NFS_v4_1 */ +-static inline int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return NULL; ++} ++ ++static inline int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task) + { +@@ -247,12 +288,12 @@ static inline int nfs4_init_session(stru + } + #endif /* CONFIG_NFS_V4_1 */ + +-extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; ++extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; + + extern const u32 nfs4_fattr_bitmap[2]; + extern const u32 nfs4_statfs_bitmap[2]; + extern const u32 nfs4_pathconf_bitmap[2]; +-extern const u32 nfs4_fsinfo_bitmap[2]; ++extern const u32 nfs4_fsinfo_bitmap[3]; + extern const u32 nfs4_fs_locations_bitmap[2]; + + /* nfs4renewd.c */ +@@ -284,7 +325,7 @@ extern void nfs41_handle_sequence_flag_e + extern void nfs41_handle_recall_slot(struct nfs_client *clp); + extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); + extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); +-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); ++extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); + + extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); + extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); +@@ -293,6 +334,7 @@ extern void nfs_increment_lock_seqid(int + extern void nfs_release_seqid(struct nfs_seqid *seqid); + extern void nfs_free_seqid(struct nfs_seqid *seqid); + ++/* write.c */ + extern const 
nfs4_stateid zero_stateid; + + /* nfs4xdr.c */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-23 12:08:29.050481368 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-23 12:09:03.339481253 -0400 +@@ -49,12 +49,15 @@ + #include + #include + #include ++#include ++#include + + #include "nfs4_fs.h" + #include "delegation.h" + #include "internal.h" + #include "iostat.h" + #include "callback.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PROC + +@@ -67,7 +70,7 @@ struct nfs4_opendata; + static int _nfs4_proc_open(struct nfs4_opendata *data); + static int _nfs4_recover_proc_open(struct nfs4_opendata *data); + static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); ++static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, +@@ -125,11 +128,16 @@ const u32 nfs4_pathconf_bitmap[2] = { + 0 + }; + +-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE ++const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, ++#ifdef CONFIG_NFS_V4_1 ++ FATTR4_WORD1_FS_LAYOUT_TYPES, ++ FATTR4_WORD2_LAYOUT_BLKSIZE ++#else /* CONFIG_NFS_V4_1 */ + 0 ++#endif /* CONFIG_NFS_V4_1 */ + }; + + const u32 nfs4_fs_locations_bitmap[2] = { +@@ -356,7 +364,7 @@ static void nfs41_check_drain_session_co + { + struct rpc_task *task; + +- if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { ++ 
if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); + if (task) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +@@ -370,12 +378,11 @@ static void nfs41_check_drain_session_co + complete(&ses->complete); + } + +-static void nfs41_sequence_free_slot(const struct nfs_client *clp, +- struct nfs4_sequence_res *res) ++static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) + { + struct nfs4_slot_table *tbl; + +- tbl = &clp->cl_session->fc_slot_table; ++ tbl = &res->sr_session->fc_slot_table; + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { + /* just wake up the next guy waiting since + * we may have not consumed a slot after all */ +@@ -385,18 +392,17 @@ static void nfs41_sequence_free_slot(con + + spin_lock(&tbl->slot_tbl_lock); + nfs4_free_slot(tbl, res->sr_slotid); +- nfs41_check_drain_session_complete(clp->cl_session); ++ nfs41_check_drain_session_complete(res->sr_session); + spin_unlock(&tbl->slot_tbl_lock); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + } + +-static void nfs41_sequence_done(struct nfs_client *clp, +- struct nfs4_sequence_res *res, +- int rpc_status) ++static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) + { + unsigned long timestamp; + struct nfs4_slot_table *tbl; + struct nfs4_slot *slot; ++ struct nfs_client *clp; + + /* + * sr_status remains 1 if an RPC level error occurred. 
The server +@@ -411,13 +417,16 @@ static void nfs41_sequence_done(struct n + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) + goto out; + ++ tbl = &res->sr_session->fc_slot_table; ++ slot = tbl->slots + res->sr_slotid; ++ + /* Check the SEQUENCE operation status */ +- if (res->sr_status == 0) { +- tbl = &clp->cl_session->fc_slot_table; +- slot = tbl->slots + res->sr_slotid; ++ switch (res->sr_status) { ++ case 0: + /* Update the slot's sequence and clientid lease timer */ + ++slot->seq_nr; + timestamp = res->sr_renewal_time; ++ clp = res->sr_session->clp; + spin_lock(&clp->cl_lock); + if (time_before(clp->cl_last_renewal, timestamp)) + clp->cl_last_renewal = timestamp; +@@ -425,11 +434,39 @@ static void nfs41_sequence_done(struct n + /* Check sequence flags */ + if (atomic_read(&clp->cl_count) > 1) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); ++ break; ++ case -NFS4ERR_DELAY: ++ /* The server detected a resend of the RPC call and ++ * returned NFS4ERR_DELAY as per Section 2.10.6.2 ++ * of RFC5661. ++ */ ++ dprintk("%s: slot=%d seq=%d: Operation in progress\n", ++ __func__, res->sr_slotid, slot->seq_nr); ++ goto out_retry; ++ default: ++ /* Just update the slot sequence no. */ ++ ++slot->seq_nr; + } + out: + /* The session may be reset by one of the error handlers. 
*/ + dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); +- nfs41_sequence_free_slot(clp, res); ++ nfs41_sequence_free_slot(res); ++ return 1; ++out_retry: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ rpc_restart_call(task); ++ /* FIXME: rpc_restart_call() should be made to return success/fail */ ++ if (RPC_ASSASSINATED(task)) ++ goto out; ++ return 0; ++} ++ ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ if (res->sr_session == NULL) ++ return 1; ++ return nfs41_sequence_done(task, res); + } + + /* +@@ -480,12 +517,11 @@ static int nfs41_setup_sequence(struct n + if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) + return 0; + +- memset(res, 0, sizeof(*res)); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + tbl = &session->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && ++ if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + /* + * The state manager will wait until the slot table is empty. +@@ -525,6 +561,7 @@ static int nfs41_setup_sequence(struct n + res->sr_session = session; + res->sr_slotid = slotid; + res->sr_renewal_time = jiffies; ++ res->sr_status_flags = 0; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. 
+@@ -533,33 +570,36 @@ static int nfs41_setup_sequence(struct n + return 0; + } + +-int nfs4_setup_sequence(struct nfs_client *clp, ++int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + struct rpc_task *task) + { ++ struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + ++ if (ds_session) ++ session = ds_session; ++ if (session == NULL) { ++ args->sa_session = NULL; ++ res->sr_session = NULL; ++ goto out; ++ } ++ + dprintk("--> %s clp %p session %p sr_slotid %d\n", +- __func__, clp, clp->cl_session, res->sr_slotid); ++ __func__, session->clp, session, res->sr_slotid); + +- if (!nfs4_has_session(clp)) +- goto out; +- ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, ++ ret = nfs41_setup_sequence(session, args, res, cache_reply, + task); +- if (ret && ret != -EAGAIN) { +- /* terminate rpc task */ +- task->tk_status = ret; +- task->tk_action = NULL; +- } + out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; + } + + struct nfs41_call_sync_data { +- struct nfs_client *clp; ++ const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; + int cache_reply; +@@ -569,9 +609,9 @@ static void nfs41_call_sync_prepare(stru + { + struct nfs41_call_sync_data *data = calldata; + +- dprintk("--> %s data->clp->cl_session %p\n", __func__, +- data->clp->cl_session); +- if (nfs4_setup_sequence(data->clp, data->seq_args, ++ dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); ++ ++ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args, + data->seq_res, data->cache_reply, task)) + return; + rpc_call_start(task); +@@ -587,7 +627,7 @@ static void nfs41_call_sync_done(struct + { + struct nfs41_call_sync_data *data = calldata; + +- nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); ++ nfs41_sequence_done(task, 
data->seq_res); + } + + struct rpc_call_ops nfs41_call_sync_ops = { +@@ -600,8 +640,7 @@ struct rpc_call_ops nfs41_call_priv_sync + .rpc_call_done = nfs41_call_sync_done, + }; + +-static int nfs4_call_sync_sequence(struct nfs_client *clp, +- struct rpc_clnt *clnt, ++static int nfs4_call_sync_sequence(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, +@@ -611,13 +650,13 @@ static int nfs4_call_sync_sequence(struc + int ret; + struct rpc_task *task; + struct nfs41_call_sync_data data = { +- .clp = clp, ++ .seq_server = server, + .seq_args = args, + .seq_res = res, + .cache_reply = cache_reply, + }; + struct rpc_task_setup task_setup = { +- .rpc_client = clnt, ++ .rpc_client = server->client, + .rpc_message = msg, + .callback_ops = &nfs41_call_sync_ops, + .callback_data = &data +@@ -642,10 +681,15 @@ int _nfs4_call_sync_session(struct nfs_s + struct nfs4_sequence_res *res, + int cache_reply) + { +- return nfs4_call_sync_sequence(server->nfs_client, server->client, +- msg, args, res, cache_reply, 0); ++ return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); + } + ++#else ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ return 1; ++} + #endif /* CONFIG_NFS_V4_1 */ + + int _nfs4_call_sync(struct nfs_server *server, +@@ -659,18 +703,9 @@ int _nfs4_call_sync(struct nfs_server *s + } + + #define nfs4_call_sync(server, msg, args, res, cache_reply) \ +- (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ ++ (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ + &(res)->seq_res, (cache_reply)) + +-static void nfs4_sequence_done(const struct nfs_server *server, +- struct nfs4_sequence_res *res, int rpc_status) +-{ +-#ifdef CONFIG_NFS_V4_1 +- if (nfs4_has_session(server->nfs_client)) +- nfs41_sequence_done(server->nfs_client, res, rpc_status); +-#endif /* CONFIG_NFS_V4_1 */ +-} +- + static void 
update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) + { + struct nfs_inode *nfsi = NFS_I(dir); +@@ -745,19 +780,14 @@ static struct nfs4_opendata *nfs4_openda + p->o_arg.server = server; + p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; +- if (flags & O_EXCL) { +- if (nfs4_has_persistent_session(server->nfs_client)) { +- /* GUARDED */ +- p->o_arg.u.attrs = &p->attrs; +- memcpy(&p->attrs, attrs, sizeof(p->attrs)); +- } else { /* EXCLUSIVE4_1 */ +- u32 *s = (u32 *) p->o_arg.u.verifier.data; +- s[0] = jiffies; +- s[1] = current->pid; +- } +- } else if (flags & O_CREAT) { ++ if (flags & O_CREAT) { ++ u32 *s; ++ + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); ++ s = (u32 *) p->o_arg.u.verifier.data; ++ s[0] = jiffies; ++ s[1] = current->pid; + } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; +@@ -851,8 +881,10 @@ static void update_open_stateflags(struc + static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) + { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); +- memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); ++ memcpy(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)); ++ memcpy(state->open_stateid.u.data, stateid->u.data, ++ sizeof(state->open_stateid.u.data)); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); +@@ -880,7 +912,8 @@ static void __update_open_stateid(struct + */ + write_seqlock(&state->seqlock); + if (deleg_stateid != NULL) { +- memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, deleg_stateid->u.data, ++ sizeof(state->stateid.u.data)); + set_bit(NFS_DELEGATED_STATE, &state->flags); + } + if (open_stateid != NULL) +@@ -911,7 +944,8 @@ static int 
update_open_stateid(struct nf + + if (delegation == NULL) + delegation = &deleg_cur->stateid; +- else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) ++ else if (memcmp(deleg_cur->stateid.u.data, delegation->u.data, ++ NFS4_STATEID_SIZE) != 0) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); +@@ -973,7 +1007,8 @@ static struct nfs4_state *nfs4_try_open_ + break; + } + /* Save the delegation */ +- memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); ++ memcpy(stateid.u.data, delegation->stateid.u.data, ++ sizeof(stateid.u.data)); + rcu_read_unlock(); + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) +@@ -1127,10 +1162,13 @@ static int nfs4_open_recover(struct nfs4 + * Check if we need to update the current stateid. + */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && +- memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { ++ memcmp(state->stateid.u.data, state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)) != 0) { + write_seqlock(&state->seqlock); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, ++ state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)); + write_sequnlock(&state->seqlock); + } + return 0; +@@ -1199,8 +1237,8 @@ static int _nfs4_open_delegation_recall( + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; +- memcpy(opendata->o_arg.u.delegation.data, stateid->data, +- sizeof(opendata->o_arg.u.delegation.data)); ++ memcpy(opendata->o_arg.u.delegation.u.data, stateid->u.data, ++ sizeof(opendata->o_arg.u.delegation.u.data)); + ret = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return ret; +@@ -1258,8 +1296,8 @@ static void nfs4_open_confirm_done(struc + if (RPC_ASSASSINATED(task)) 
+ return; + if (data->rpc_status == 0) { +- memcpy(data->o_res.stateid.data, data->c_res.stateid.data, +- sizeof(data->o_res.stateid.data)); ++ memcpy(data->o_res.stateid.u.data, data->c_res.stateid.u.data, ++ sizeof(data->o_res.stateid.u.data)); + nfs_confirm_seqid(&data->owner->so_seqid, 0); + renew_lease(data->o_res.server, data->timestamp); + data->rpc_done = 1; +@@ -1356,13 +1394,13 @@ static void nfs4_open_prepare(struct rpc + } + /* Update sequence id. */ + data->o_arg.id = sp->so_owner_id.id; +- data->o_arg.clientid = sp->so_client->cl_clientid; ++ data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->o_arg.server->nfs_client, ++ if (nfs4_setup_sequence(data->o_arg.server, NULL, + &data->o_arg.seq_args, + &data->o_res.seq_res, 1, task)) + return; +@@ -1385,8 +1423,8 @@ static void nfs4_open_done(struct rpc_ta + + data->rpc_status = task->tk_status; + +- nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->o_res.seq_res)) ++ return; + + if (RPC_ASSASSINATED(task)) + return; +@@ -1539,9 +1577,8 @@ static int _nfs4_proc_open(struct nfs4_o + return 0; + } + +-static int nfs4_recover_expired_lease(struct nfs_server *server) ++int nfs4_recover_expired_lease(struct nfs_client *clp) + { +- struct nfs_client *clp = server->nfs_client; + unsigned int loop; + int ret; + +@@ -1557,6 +1594,7 @@ static int nfs4_recover_expired_lease(st + } + return ret; + } ++EXPORT_SYMBOL(nfs4_recover_expired_lease); + + /* + * OPEN_EXPIRED: +@@ -1646,7 +1684,7 @@ static int _nfs4_do_open(struct inode *d + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } +- status = nfs4_recover_expired_lease(server); ++ status = 
nfs4_recover_expired_lease(server->nfs_client); + if (status != 0) + goto err_put_state_owner; + if (path->dentry->d_inode != NULL) +@@ -1773,7 +1811,7 @@ static int _nfs4_do_setattr(struct inode + if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { + /* Use that stateid */ + } else if (state != NULL) { +- nfs4_copy_stateid(&arg.stateid, state, current->files); ++ nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); + } else + memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); + +@@ -1838,7 +1876,8 @@ static void nfs4_close_done(struct rpc_t + struct nfs4_state *state = calldata->state; + struct nfs_server *server = NFS_SERVER(calldata->inode); + +- nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + /* hmm. we are done with the inode, and in the process of freeing +@@ -1858,7 +1897,7 @@ static void nfs4_close_done(struct rpc_t + if (calldata->arg.fmode == 0) + break; + default: +- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) ++ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } + nfs_release_seqid(calldata->arg.seqid); +@@ -1903,7 +1942,7 @@ static void nfs4_close_prepare(struct rp + + nfs_fattr_init(calldata->res.fattr); + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, ++ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL, + &calldata->arg.seq_args, &calldata->res.seq_res, + 1, task)) + return; +@@ -2323,6 +2362,9 @@ nfs4_proc_setattr(struct dentry *dentry, + struct nfs4_state *state = NULL; + int status; + ++ if (pnfs_ld_layoutret_on_setattr(inode)) ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + nfs_fattr_init(fattr); + + /* Search for an existing open(O_WRITE) file */ +@@ -2648,8 +2690,9 @@ static int nfs4_proc_unlink_done(struct + { + struct nfs_removeres 
*res = task->tk_msg.rpc_resp; + +- nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); +- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) ++ if (!nfs4_sequence_done(task, &res->seq_res)) ++ return 0; ++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); + nfs_post_op_update_inode(dir, res->dir_attr); +@@ -3090,18 +3133,31 @@ static int nfs4_proc_pathconf(struct nfs + static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) + { + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; + + dprintk("--> %s\n", __func__); + +- nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; + +- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, server->nfs_client); ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } + + nfs_invalidate_atime(data->inode); +- if (task->tk_status > 0) ++ if (task->tk_status > 0 && client == server->nfs_client) + renew_lease(server, data->timestamp); + return 0; + } +@@ -3112,20 +3168,56 @@ static void nfs4_proc_read_setup(struct + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + } + ++static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ + static int 
nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs_client *client = server->nfs_client; + +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++#ifdef CONFIG_NFS_V4_1 ++ /* restore original count after retry? */ ++ if (data->pdata.orig_count) { ++ dprintk("%s: restoring original count %u\n", __func__, ++ data->pdata.orig_count); ++ data->args.count = data->pdata.orig_count; ++ } ++ ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } ++ ++ /* ++ * MDS write: renew lease ++ * DS write: update lastbyte written, mark for layout commit ++ */ + if (task->tk_status >= 0) { +- renew_lease(NFS_SERVER(inode), data->timestamp); +- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ if (client == server->nfs_client) { ++ renew_lease(server, data->timestamp); ++ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ } else ++ pnfs4_update_write_done(NFS_I(inode), data); + } + return 0; + } +@@ -3138,20 +3230,42 @@ static void nfs4_proc_write_setup(struct + data->res.server = server; + data->timestamp = jiffies; + ++#ifdef CONFIG_NFS_V4_1 ++ /* writes to DS use pnfs vector */ ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = 
&nfs4_procedures[NFSPROC4_CLNT_WRITE]; + } + + static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { ++ struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; ++ ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS commit\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) { + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + return -EAGAIN; + } +- nfs_refresh_inode(inode, data->res.fattr); ++ if (client == server->nfs_client) ++ nfs_refresh_inode(inode, data->res.fattr); + return 0; + } + +@@ -3161,6 +3275,12 @@ static void nfs4_proc_commit_setup(struc + + data->args.bitmask = server->cache_consistency_bitmask; + data->res.server = server; ++#if defined(CONFIG_NFS_V4_1) ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + } + +@@ -3464,9 +3584,12 @@ static int nfs4_proc_set_acl(struct inod + } + + static int +-_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state) ++nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp) + { +- if (!clp || task->tk_status >= 0) ++ if (!clp) ++ clp = server->nfs_client; ++ ++ if (task->tk_status >= 0) + return 0; + switch(task->tk_status) { + 
case -NFS4ERR_ADMIN_REVOKED: +@@ -3491,8 +3614,9 @@ _nfs4_async_handle_error(struct rpc_task + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: +- dprintk("%s ERROR %d, Reset session\n", __func__, +- task->tk_status); ++ dprintk("%s ERROR %d, Reset session. Exchangeid " ++ "flags 0x%x\n", __func__, task->tk_status, ++ clp->cl_exchange_flags); + nfs4_schedule_state_recovery(clp); + task->tk_status = 0; + return -EAGAIN; +@@ -3512,6 +3636,8 @@ _nfs4_async_handle_error(struct rpc_task + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; + do_state_recovery: ++ if (is_ds_only_client(clp)) ++ return 0; + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + nfs4_schedule_state_recovery(clp); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) +@@ -3520,12 +3646,6 @@ do_state_recovery: + return -EAGAIN; + } + +-static int +-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) +-{ +- return _nfs4_async_handle_error(task, server, server->nfs_client, state); +-} +- + int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, + unsigned short port, struct rpc_cred *cred, + struct nfs4_setclientid_res *res) +@@ -3641,8 +3761,8 @@ static void nfs4_delegreturn_done(struct + { + struct nfs4_delegreturndata *data = calldata; + +- nfs4_sequence_done(data->res.server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + switch (task->tk_status) { + case -NFS4ERR_STALE_STATEID: +@@ -3651,8 +3771,8 @@ static void nfs4_delegreturn_done(struct + renew_lease(data->res.server, data->timestamp); + break; + default: +- if (nfs4_async_handle_error(task, data->res.server, NULL) == +- -EAGAIN) { ++ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) ++ == -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } +@@ -3672,7 +3792,7 @@ static void nfs4_delegreturn_prepare(str 
+ + d_data = (struct nfs4_delegreturndata *)data; + +- if (nfs4_setup_sequence(d_data->res.server->nfs_client, ++ if (nfs4_setup_sequence(d_data->res.server, NULL, + &d_data->args.seq_args, + &d_data->res.seq_res, 1, task)) + return; +@@ -3892,15 +4012,16 @@ static void nfs4_locku_done(struct rpc_t + { + struct nfs4_unlockdata *calldata = data; + +- nfs4_sequence_done(calldata->server, &calldata->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + switch (task->tk_status) { + case 0: +- memcpy(calldata->lsp->ls_stateid.data, +- calldata->res.stateid.data, +- sizeof(calldata->lsp->ls_stateid.data)); ++ memcpy(calldata->lsp->ls_stateid.u.data, ++ calldata->res.stateid.u.data, ++ sizeof(calldata->lsp->ls_stateid.u. ++ data)); + renew_lease(calldata->server, calldata->timestamp); + break; + case -NFS4ERR_BAD_STATEID: +@@ -3909,7 +4030,7 @@ static void nfs4_locku_done(struct rpc_t + case -NFS4ERR_EXPIRED: + break; + default: +- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN) + nfs_restart_rpc(task, + calldata->server->nfs_client); + } +@@ -3927,7 +4048,7 @@ static void nfs4_locku_prepare(struct rp + return; + } + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence(calldata->server->nfs_client, ++ if (nfs4_setup_sequence(calldata->server, NULL, + &calldata->arg.seq_args, + &calldata->res.seq_res, 1, task)) + return; +@@ -4082,7 +4203,8 @@ static void nfs4_lock_prepare(struct rpc + } else + data->arg.new_lock_owner = 0; + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, ++ if (nfs4_setup_sequence(data->server, NULL, ++ &data->arg.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +@@ -4101,8 +4223,8 @@ static void nfs4_lock_done(struct rpc_ta + + dprintk("%s: begin!\n", __func__); + +- 
nfs4_sequence_done(data->server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) +@@ -4114,8 +4236,8 @@ static void nfs4_lock_done(struct rpc_ta + goto out; + } + if (data->rpc_status == 0) { +- memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, +- sizeof(data->lsp->ls_stateid.data)); ++ memcpy(data->lsp->ls_stateid.u.data, data->res.stateid.u.data, ++ sizeof(data->lsp->ls_stateid.u.data)); + data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); + } +@@ -4424,6 +4546,34 @@ out: + return err; + } + ++static void nfs4_release_lockowner_release(void *calldata) ++{ ++ kfree(calldata); ++} ++ ++const struct rpc_call_ops nfs4_release_lockowner_ops = { ++ .rpc_release = nfs4_release_lockowner_release, ++}; ++ ++void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) ++{ ++ struct nfs_server *server = lsp->ls_state->owner->so_server; ++ struct nfs_release_lockowner_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], ++ }; ++ ++ if (server->nfs_client->cl_mvops->minor_version != 0) ++ return; ++ args = kmalloc(sizeof(*args), GFP_NOFS); ++ if (!args) ++ return; ++ args->lock_owner.clientid = server->nfs_client->cl_clientid; ++ args->lock_owner.id = lsp->ls_id.id; ++ msg.rpc_argp = args; ++ rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); ++} ++ + #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" + + int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, +@@ -4526,7 +4676,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + nfs4_verifier verifier; + struct nfs41_exchange_id_args args = { + .client = clp, +- .flags = clp->cl_exchange_flags, ++ .flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R, + }; + struct nfs41_exchange_id_res res = { + .client = clp, +@@ -4574,6 
+4724,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + dprintk("<-- %s status= %d\n", __func__, status); + return status; + } ++EXPORT_SYMBOL(nfs4_proc_exchange_id); + + struct nfs4_get_lease_time_data { + struct nfs4_get_lease_time_args *args; +@@ -4611,7 +4762,8 @@ static void nfs4_get_lease_time_done(str + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); ++ if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) ++ return; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: +@@ -4805,13 +4957,6 @@ struct nfs4_session *nfs4_alloc_session( + if (!session) + return NULL; + +- /* +- * The create session reply races with the server back +- * channel probe. Mark the client NFS_CS_SESSION_INITING +- * so that the client back channel can find the +- * nfs_client struct +- */ +- clp->cl_cons_state = NFS_CS_SESSION_INITING; + init_completion(&session->complete); + + tbl = &session->fc_slot_table; +@@ -4824,6 +4969,8 @@ struct nfs4_session *nfs4_alloc_session( + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + ++ session->session_state = 1<<NFS4_SESSION_INITING; ++ + session->clp = clp; + return session; + } +@@ -5040,6 +5187,10 @@ int nfs4_init_session(struct nfs_server + if (!nfs4_has_session(clp)) + return 0; + ++ session = clp->cl_session; ++ if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) ++ return 0; ++ + rsize = server->rsize; + if (rsize == 0) + rsize = NFS_MAX_FILE_IO_SIZE; +@@ -5047,11 +5198,10 @@ int nfs4_init_session(struct nfs_server + if (wsize == 0) + wsize = NFS_MAX_FILE_IO_SIZE; + +- session = clp->cl_session; + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + +- ret = nfs4_recover_expired_lease(server); ++ ret = nfs4_recover_expired_lease(server->nfs_client); + if (!ret) + ret = 
nfs4_check_client_ready(clp); + return ret; +@@ -5060,69 +5210,70 @@ int nfs4_init_session(struct nfs_server + /* + * Renew the cl_session lease. + */ +-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +-{ ++struct nfs4_sequence_data { ++ struct nfs_client *clp; + struct nfs4_sequence_args args; + struct nfs4_sequence_res res; +- +- struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], +- .rpc_argp = &args, +- .rpc_resp = &res, +- .rpc_cred = cred, +- }; +- +- args.sa_cache_this = 0; +- +- return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, +- &res, args.sa_cache_this, 1); +-} ++}; + + static void nfs41_sequence_release(void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); ++ kfree(calldata); ++} ++ ++static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; + } + + static void nfs41_sequence_call_done(struct rpc_task *task, void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + +- nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status); ++ if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) ++ return; + + if (task->tk_status < 0) { + dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; + +- if (_nfs4_async_handle_error(task, NULL, clp, NULL) +- == -EAGAIN) { +- nfs_restart_rpc(task, clp); ++ if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); + 
return; + } + } + dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); + out: +- kfree(task->tk_msg.rpc_argp); +- kfree(task->tk_msg.rpc_resp); +- + dprintk("<-- %s\n", __func__); + } + + static void nfs41_sequence_prepare(struct rpc_task *task, void *data) + { +- struct nfs_client *clp; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_args *args; + struct nfs4_sequence_res *res; + +- clp = (struct nfs_client *)data; + args = task->tk_msg.rpc_argp; + res = task->tk_msg.rpc_resp; + +- if (nfs4_setup_sequence(clp, args, res, 0, task)) ++ if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) + return; + rpc_call_start(task); + } +@@ -5133,32 +5284,67 @@ static const struct rpc_call_ops nfs41_s + .rpc_release = nfs41_sequence_release, + }; + +-static int nfs41_proc_async_sequence(struct nfs_client *clp, +- struct rpc_cred *cred) ++static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) + { +- struct nfs4_sequence_args *args; +- struct nfs4_sequence_res *res; ++ struct nfs4_sequence_data *calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], + .rpc_cred = cred, + }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = clp->cl_rpcclient, ++ .rpc_message = &msg, ++ .callback_ops = &nfs41_sequence_ops, ++ .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, ++ }; + + if (!atomic_inc_not_zero(&clp->cl_count)) +- return -EIO; +- args = kzalloc(sizeof(*args), GFP_NOFS); +- res = kzalloc(sizeof(*res), GFP_NOFS); +- if (!args || !res) { +- kfree(args); +- kfree(res); ++ return ERR_PTR(-EIO); ++ calldata = kmalloc(sizeof(*calldata), GFP_NOFS); ++ if (calldata == NULL) { + nfs_put_client(clp); +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + } +- res->sr_slotid = NFS4_MAX_SLOT_TABLE; +- msg.rpc_argp = args; +- msg.rpc_resp = res; ++ calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ msg.rpc_argp = &calldata->args; ++ 
msg.rpc_resp = &calldata->res; ++ calldata->clp = clp; ++ task_setup_data.callback_data = calldata; + +- return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, +- &nfs41_sequence_ops, (void *)clp); ++ return rpc_run_task(&task_setup_data); ++} ++ ++static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret = 0; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) ++ ret = PTR_ERR(task); ++ else ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; ++} ++ ++static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ ret = rpc_wait_for_completion_task(task); ++ if (!ret) ++ ret = task->tk_status; ++ rpc_put_task(task); ++out: ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; + } + + struct nfs4_reclaim_complete_data { +@@ -5172,13 +5358,31 @@ static void nfs4_reclaim_complete_prepar + struct nfs4_reclaim_complete_data *calldata = data; + + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +- if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, ++ if (nfs41_setup_sequence(calldata->clp->cl_session, ++ &calldata->arg.seq_args, + &calldata->res.seq_res, 0, task)) + return; + + rpc_call_start(task); + } + ++static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case 0: ++ case -NFS4ERR_COMPLETE_ALREADY: ++ case -NFS4ERR_WRONG_CRED: /* What to do here? 
*/ ++ break; ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; ++} ++ + static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) + { + struct nfs4_reclaim_complete_data *calldata = data; +@@ -5186,32 +5390,13 @@ static void nfs4_reclaim_complete_done(s + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(clp, res, task->tk_status); +- switch (task->tk_status) { +- case 0: +- case -NFS4ERR_COMPLETE_ALREADY: +- break; +- case -NFS4ERR_BADSESSION: +- case -NFS4ERR_DEADSESSION: +- /* +- * Handle the session error, but do not retry the operation, as +- * we have no way of telling whether the clientid had to be +- * reset before we got our reply. If reset, a new wave of +- * reclaim operations will follow, containing their own reclaim +- * complete. We don't want our retry to get on the way of +- * recovery by incorrectly indicating to the server that we're +- * done reclaiming state since the process had to be restarted. 
+- */ +- _nfs4_async_handle_error(task, NULL, clp, NULL); +- break; +- default: +- if (_nfs4_async_handle_error( +- task, NULL, clp, NULL) == -EAGAIN) { +- rpc_restart_call_prepare(task); +- return; +- } +- } ++ if (!nfs41_sequence_done(task, res)) ++ return; + ++ if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); ++ return; ++ } + dprintk("<-- %s\n", __func__); + } + +@@ -5268,6 +5453,404 @@ out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; + } ++ ++static void ++nfs4_pnfs_layoutget_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args, ++ &lgp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_pnfs_layoutget_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lgp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ pnfs_get_layout_done(lgp, task->tk_status); ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ lgp->status = task->tk_status; ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_pnfs_layoutget_release(void *calldata) ++{ ++ struct nfs4_pnfs_layoutget *lgp = calldata; ++ ++ dprintk("--> %s\n", __func__); ++ pnfs_layout_release(NFS_I(lgp->args.inode)->layout, NULL); ++ if (lgp->res.layout.buf != NULL) ++ free_page((unsigned long) lgp->res.layout.buf); ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_pnfs_layoutget_call_ops = { ++ .rpc_call_prepare = 
nfs4_pnfs_layoutget_prepare, ++ .rpc_call_done = nfs4_pnfs_layoutget_done, ++ .rpc_release = nfs4_pnfs_layoutget_release, ++}; ++ ++/* FIXME: We need to call nfs4_handle_exception ++ * and deal with retries. ++ * Currently we can't since we release lgp and its contents. ++ */ ++static int _pnfs4_proc_layoutget(struct nfs4_pnfs_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_LAYOUTGET], ++ .rpc_argp = &lgp->args, ++ .rpc_resp = &lgp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_pnfs_layoutget_call_ops, ++ .callback_data = lgp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); ++ if (lgp->res.layout.buf == NULL) { ++ nfs4_pnfs_layoutget_release(lgp); ++ return -ENOMEM; ++ } ++ ++ lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = lgp->status; ++ if (status != 0) ++ goto out; ++ status = pnfs_layout_process(lgp); ++out: ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int pnfs4_proc_layoutget(struct nfs4_pnfs_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, _pnfs4_proc_layoutget(lgp), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void pnfs_layoutcommit_prepare(struct rpc_task *task, void *data) ++{ ++ struct pnfs_layoutcommit_data *ldata = ++ (struct pnfs_layoutcommit_data *)data; ++ struct nfs_server *server = NFS_SERVER(ldata->args.inode); ++ 
++ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args, ++ &ldata->res.seq_res, 1, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void ++pnfs_layoutcommit_done(struct rpc_task *task, void *calldata) ++{ ++ struct pnfs_layoutcommit_data *data = ++ (struct pnfs_layoutcommit_data *)calldata; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ data->status = task->tk_status; ++} ++ ++static void pnfs_layoutcommit_release(void *lcdata) ++{ ++ struct pnfs_layoutcommit_data *data = ++ (struct pnfs_layoutcommit_data *)lcdata; ++ ++ put_rpccred(data->cred); ++ pnfs_cleanup_layoutcommit(lcdata); ++ pnfs_layoutcommit_free(lcdata); ++ /* Matched by get_layout in pnfs_layoutcommit_inode */ ++ put_layout(data->args.inode); ++} ++ ++static const struct rpc_call_ops pnfs_layoutcommit_ops = { ++ .rpc_call_prepare = pnfs_layoutcommit_prepare, ++ .rpc_call_done = pnfs_layoutcommit_done, ++ .rpc_release = pnfs_layoutcommit_release, ++}; ++ ++/* Execute a layoutcommit to the server */ ++static int ++_pnfs4_proc_layoutcommit(struct pnfs_layoutcommit_data *data, int issync) ++{ ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_LAYOUTCOMMIT], ++ .rpc_argp = &data->args, ++ .rpc_resp = &data->res, ++ .rpc_cred = data->cred, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .task = &data->task, ++ .rpc_client = NFS_CLIENT(data->args.inode), ++ .rpc_message = &msg, ++ .callback_ops = &pnfs_layoutcommit_ops, ++ .callback_data = data, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ struct rpc_task *task; ++ int status = 0; ++ ++ dprintk("NFS: %4d initiating layoutcommit call. 
%llu@%llu lbw: %llu " ++ "type: %d issync %d\n", ++ data->task.tk_pid, ++ data->args.lseg.length, ++ data->args.lseg.offset, ++ data->args.lastbytewritten, ++ data->args.layout_type, issync); ++ ++ data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = data->status; ++out: ++ dprintk("%s: status %d\n", __func__, status); ++ rpc_put_task(task); ++ return 0; ++} ++ ++int pnfs4_proc_layoutcommit(struct pnfs_layoutcommit_data *data, int issync) ++{ ++ struct nfs4_exception exception = { }; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _pnfs4_proc_layoutcommit(data, issync), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void ++nfs4_pnfs_layoutreturn_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args, ++ &lrp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_pnfs_layoutreturn_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_pnfs_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lrp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_pnfs_layoutreturn_release(void *calldata) ++{ ++ struct nfs4_pnfs_layoutreturn *lrp = calldata; ++ struct 
pnfs_layout_type *lo = NFS_I(lrp->args.inode)->layout; ++ ++ dprintk("--> %s return_type %d lo %p\n", __func__, ++ lrp->args.return_type, lo); ++ ++ if (lrp->args.return_type == RETURN_FILE) { ++ if (!lrp->res.lrs_present) ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ pnfs_layout_release(lo, &lrp->args.lseg); ++ } ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_pnfs_layoutreturn_call_ops = { ++ .rpc_call_prepare = nfs4_pnfs_layoutreturn_prepare, ++ .rpc_call_done = nfs4_pnfs_layoutreturn_done, ++ .rpc_release = nfs4_pnfs_layoutreturn_release, ++}; ++ ++int _pnfs4_proc_layoutreturn(struct nfs4_pnfs_layoutreturn *lrp, bool issync) ++{ ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_LAYOUTRETURN], ++ .rpc_argp = &lrp->args, ++ .rpc_resp = &lrp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_pnfs_layoutreturn_call_ops, ++ .callback_data = lrp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = task->tk_status; ++out: ++ dprintk("<-- %s\n", __func__); ++ rpc_put_task(task); ++ return status; ++} ++ ++int pnfs4_proc_layoutreturn(struct nfs4_pnfs_layoutreturn *lrp, bool issync) ++{ ++ struct nfs_server *server = NFS_SERVER(lrp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, ++ _pnfs4_proc_layoutreturn(lrp, issync), ++ &exception); ++ } while (exception.retry); ++ ++ return err; ++} ++ ++/* ++ * Retrieve the list of 
Data Server devices from the MDS. ++ */ ++static int _nfs4_pnfs_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_pnfs_getdevicelist_arg arg = { ++ .fh = fh, ++ .layoutclass = server->pnfs_curr_ld->id, ++ }; ++ struct nfs4_pnfs_getdevicelist_res res = { ++ .devlist = devlist, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_GETDEVICELIST], ++ .rpc_argp = &arg, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &arg, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int nfs4_pnfs_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_pnfs_getdevicelist(server, fh, devlist), ++ &exception); ++ } while (exception.retry); ++ ++ dprintk("nfs4_pnfs_getdevlist: err=%d, num_devs=%u\n", ++ err, devlist->num_devs); ++ ++ return err; ++} ++ ++int nfs4_pnfs_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) ++{ ++ struct nfs4_pnfs_getdeviceinfo_arg args = { ++ .pdev = pdev, ++ }; ++ struct nfs4_pnfs_getdeviceinfo_res res = { ++ .pdev = pdev, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_GETDEVICEINFO], ++ .rpc_argp = &args, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ ++ return status; ++} ++ + #endif /* CONFIG_NFS_V4_1 */ + + struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { +@@ -5325,28 +5908,30 @@ struct nfs4_state_maintenance_ops nfs41_ + }; + #endif + +-/* +- * Per minor version reboot and network partition recovery ops +- */ +- +-struct nfs4_state_recovery_ops 
*nfs4_reboot_recovery_ops[] = { +- &nfs40_reboot_recovery_ops, +-#if defined(CONFIG_NFS_V4_1) +- &nfs41_reboot_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { ++ .minor_version = 0, ++ .call_sync = _nfs4_call_sync, ++ .validate_stateid = nfs4_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs40_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs40_nograce_recovery_ops, ++ .state_renewal_ops = &nfs40_state_renewal_ops, + }; + +-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = { +- &nfs40_nograce_recovery_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_nograce_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { ++ .minor_version = 1, ++ .call_sync = _nfs4_call_sync_session, ++ .validate_stateid = nfs41_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs41_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs41_nograce_recovery_ops, ++ .state_renewal_ops = &nfs41_state_renewal_ops, + }; ++#endif + +-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { +- &nfs40_state_renewal_ops, ++const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { ++ [0] = &nfs_v4_0_minor_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_state_renewal_ops, ++ [1] = &nfs_v4_1_minor_ops, + #endif + }; + +@@ -5364,6 +5949,7 @@ const struct nfs_rpc_ops nfs_v4_clientop + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, + .file_inode_ops = &nfs4_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs4_proc_get_root, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c +--- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-23 12:09:03.341491726 -0400 +@@ -54,17 +54,17 @@ + void + nfs4_renew_state(struct work_struct *work) + { +- struct 
nfs4_state_maintenance_ops *ops; ++ const struct nfs4_state_maintenance_ops *ops; + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_renewd.work); + struct rpc_cred *cred; + long lease; + unsigned long last, now; + +- ops = nfs4_state_renewal_ops[clp->cl_minorversion]; ++ ops = clp->cl_mvops->state_renewal_ops; + dprintk("%s: start\n", __func__); + /* Are there any active superblocks? */ +- if (list_empty(&clp->cl_superblocks)) ++ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) + goto out; + spin_lock(&clp->cl_lock); + lease = clp->cl_lease_time; +diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-23 12:08:29.052491341 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-23 12:09:03.342373443 -0400 +@@ -53,6 +53,9 @@ + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include ++#include ++#include "pnfs.h" + + #define OPENOWNER_POOL_SIZE 8 + +@@ -126,6 +129,11 @@ static int nfs41_setup_state_renewal(str + int status; + struct nfs_fsinfo fsinfo; + ++ if (is_ds_only_client(clp)) { ++ nfs4_schedule_state_renewal(clp); ++ return 0; ++ } ++ + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ +@@ -145,7 +153,9 @@ static void nfs4_end_drain_session(struc + struct nfs4_session *ses = clp->cl_session; + int max_slots; + +- if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { ++ if (ses == NULL) ++ return; ++ if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + spin_lock(&ses->fc_slot_table.slot_tbl_lock); + max_slots = ses->fc_slot_table.max_slots; + while (max_slots--) { +@@ -167,7 +177,7 @@ static int nfs4_begin_drain_session(stru + struct nfs4_slot_table *tbl = &ses->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); ++ set_bit(NFS4_SESSION_DRAINING, 
&ses->session_state); + if (tbl->highest_used_slotid != -1) { + INIT_COMPLETION(ses->complete); + spin_unlock(&tbl->slot_tbl_lock); +@@ -371,7 +381,6 @@ nfs4_alloc_state_owner(void) + return NULL; + spin_lock_init(&sp->so_lock); + INIT_LIST_HEAD(&sp->so_states); +- INIT_LIST_HEAD(&sp->so_delegations); + rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); + sp->so_seqid.sequence = &sp->so_sequence; + spin_lock_init(&sp->so_sequence.lock); +@@ -384,7 +393,7 @@ static void + nfs4_drop_state_owner(struct nfs4_state_owner *sp) + { + if (!RB_EMPTY_NODE(&sp->so_client_node)) { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + rb_erase(&sp->so_client_node, &clp->cl_state_owners); +@@ -406,7 +415,6 @@ struct nfs4_state_owner *nfs4_get_state_ + new = nfs4_alloc_state_owner(); + if (new == NULL) + return NULL; +- new->so_client = clp; + new->so_server = server; + new->so_cred = cred; + spin_lock(&clp->cl_lock); +@@ -423,7 +431,7 @@ struct nfs4_state_owner *nfs4_get_state_ + + void nfs4_put_state_owner(struct nfs4_state_owner *sp) + { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + struct rpc_cred *cred = sp->so_cred; + + if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) +@@ -583,8 +591,24 @@ static void __nfs4_close(struct path *pa + if (!call_close) { + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); +- } else ++ } else { ++ u32 roc_iomode; ++ struct nfs_inode *nfsi = NFS_I(state->inode); ++ ++ if (has_layout(nfsi) && ++ (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) { ++ struct nfs4_pnfs_layout_segment range = { ++ .iomode = roc_iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ pnfs_return_layout(state->inode, &range, NULL, ++ RETURN_FILE, wait); ++ } ++ + nfs4_do_close(path, state, gfp_mask, wait); ++ } + } + + void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) +@@ 
-602,12 +626,21 @@ void nfs4_close_sync(struct path *path, + * that is compatible with current->files + */ + static struct nfs4_lock_state * +-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *pos; + list_for_each_entry(pos, &state->lock_states, ls_locks) { +- if (pos->ls_owner != fl_owner) ++ if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) + continue; ++ switch (pos->ls_owner.lo_type) { ++ case NFS4_POSIX_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.posix_owner != fl_owner) ++ continue; ++ break; ++ case NFS4_FLOCK_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.flock_owner != fl_pid) ++ continue; ++ } + atomic_inc(&pos->ls_count); + return pos; + } +@@ -619,10 +652,10 @@ __nfs4_find_lock_state(struct nfs4_state + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *lsp; +- struct nfs_client *clp = state->owner->so_client; ++ struct nfs_client *clp = state->owner->so_server->nfs_client; + + lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) +@@ -633,7 +666,18 @@ static struct nfs4_lock_state *nfs4_allo + lsp->ls_seqid.sequence = &lsp->ls_sequence; + atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; +- lsp->ls_owner = fl_owner; ++ lsp->ls_owner.lo_type = type; ++ switch (lsp->ls_owner.lo_type) { ++ case NFS4_FLOCK_LOCK_TYPE: ++ lsp->ls_owner.lo_u.flock_owner = fl_pid; ++ break; ++ case NFS4_POSIX_LOCK_TYPE: ++ lsp->ls_owner.lo_u.posix_owner = fl_owner; ++ break; ++ default: ++ kfree(lsp); ++ return NULL; ++ } + spin_lock(&clp->cl_lock); + nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); + spin_unlock(&clp->cl_lock); +@@ -643,7 
+687,7 @@ static struct nfs4_lock_state *nfs4_allo + + static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) + { +- struct nfs_client *clp = lsp->ls_state->owner->so_client; ++ struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); +@@ -657,13 +701,13 @@ static void nfs4_free_lock_state(struct + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) ++static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) + { + struct nfs4_lock_state *lsp, *new = NULL; + + for(;;) { + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, owner); ++ lsp = __nfs4_find_lock_state(state, owner, pid, type); + if (lsp != NULL) + break; + if (new != NULL) { +@@ -674,7 +718,7 @@ static struct nfs4_lock_state *nfs4_get_ + break; + } + spin_unlock(&state->state_lock); +- new = nfs4_alloc_lock_state(state, owner); ++ new = nfs4_alloc_lock_state(state, owner, pid, type); + if (new == NULL) + return NULL; + } +@@ -701,6 +745,8 @@ void nfs4_put_lock_state(struct nfs4_loc + if (list_empty(&state->lock_states)) + clear_bit(LK_STATE_IN_USE, &state->flags); + spin_unlock(&state->state_lock); ++ if (lsp->ls_flags & NFS_LOCK_INITIALIZED) ++ nfs4_release_lockowner(lsp); + nfs4_free_lock_state(lsp); + } + +@@ -728,7 +774,12 @@ int nfs4_set_lock_state(struct nfs4_stat + + if (fl->fl_ops != NULL) + return 0; +- lsp = nfs4_get_lock_state(state, fl->fl_owner); ++ if (fl->fl_flags & FL_POSIX) ++ lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); ++ else if (fl->fl_flags & FL_FLOCK) ++ lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); ++ else ++ return -EINVAL; + if (lsp == NULL) + return -ENOMEM; + fl->fl_u.nfs4_fl.owner = lsp; +@@ -740,7 +791,7 @@ int nfs4_set_lock_state(struct 
nfs4_stat + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. + */ +-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) ++void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) + { + struct nfs4_lock_state *lsp; + int seq; +@@ -753,7 +804,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst + return; + + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, fl_owner); ++ lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); + if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + spin_unlock(&state->state_lock); +@@ -1031,8 +1082,8 @@ restart: + * Open state on this file cannot be recovered + * All we can do is revert to using the zero stateid. + */ +- memset(state->stateid.data, 0, +- sizeof(state->stateid.data)); ++ memset(state->stateid.u.data, 0, ++ sizeof(state->stateid.u.data)); + /* Mark the file as being 'closed' */ + state->state = 0; + break; +@@ -1041,11 +1092,11 @@ restart: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: +@@ -1120,8 +1171,7 @@ static void nfs4_state_end_reclaim_reboo + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return; + +- nfs4_reclaim_complete(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); + + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { + sp = rb_entry(pos, 
struct nfs4_state_owner, so_client_node); +@@ -1211,8 +1261,8 @@ restart: + static int nfs4_check_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_maintenance_ops *ops = +- nfs4_state_renewal_ops[clp->cl_minorversion]; ++ const struct nfs4_state_maintenance_ops *ops = ++ clp->cl_mvops->state_renewal_ops; + int status = -NFS4ERR_EXPIRED; + + /* Is the client already known to have an expired lease? */ +@@ -1235,8 +1285,8 @@ out: + static int nfs4_reclaim_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_recovery_ops *ops = +- nfs4_reboot_recovery_ops[clp->cl_minorversion]; ++ const struct nfs4_state_recovery_ops *ops = ++ clp->cl_mvops->reboot_recovery_ops; + int status = -ENOENT; + + cred = ops->get_clid_cred(clp); +@@ -1421,6 +1471,7 @@ static void nfs4_state_manager(struct nf + } + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); ++ pnfs_destroy_all_layouts(clp); + } + + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { +@@ -1444,7 +1495,7 @@ static void nfs4_state_manager(struct nf + /* First recover reboot state... */ + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->reboot_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + continue; +@@ -1458,7 +1509,7 @@ static void nfs4_state_manager(struct nf + /* Now recover expired state... 
*/ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_nograce_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->nograce_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || + test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) +diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-23 12:08:29.054481400 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-23 12:09:03.346481283 -0400 +@@ -50,8 +50,11 @@ + #include + #include + #include ++#include ++#include + #include "nfs4_fs.h" + #include "internal.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_XDR + +@@ -89,7 +92,7 @@ static int nfs4_stat_to_errno(int); + #define encode_getfh_maxsz (op_encode_hdr_maxsz) + #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +-#define nfs4_fattr_bitmap_maxsz 3 ++#define nfs4_fattr_bitmap_maxsz 4 + #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) + #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +@@ -111,7 +114,11 @@ static int nfs4_stat_to_errno(int); + #define encode_restorefh_maxsz (op_encode_hdr_maxsz) + #define decode_restorefh_maxsz (op_decode_hdr_maxsz) + #define encode_fsinfo_maxsz (encode_getattr_maxsz) +-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) ++/* The 5 accounts for the PNFS attributes, and assumes that at most three ++ * layout types will be returned. 
++ */ ++#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ ++ nfs4_fattr_bitmap_maxsz + 8 + 5) + #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) + #define decode_renew_maxsz (op_decode_hdr_maxsz) + #define encode_setclientid_maxsz \ +@@ -202,14 +209,17 @@ static int nfs4_stat_to_errno(int); + #define encode_link_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) + #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) ++#define encode_lockowner_maxsz (7) + #define encode_lock_maxsz (op_encode_hdr_maxsz + \ + 7 + \ +- 1 + encode_stateid_maxsz + 8) ++ 1 + encode_stateid_maxsz + 1 + \ ++ encode_lockowner_maxsz) + #define decode_lock_denied_maxsz \ + (8 + decode_lockowner_maxsz) + #define decode_lock_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +-#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) ++#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ ++ encode_lockowner_maxsz) + #define decode_lockt_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) + #define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ +@@ -217,6 +227,11 @@ static int nfs4_stat_to_errno(int); + 4) + #define decode_locku_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) ++#define encode_release_lockowner_maxsz \ ++ (op_encode_hdr_maxsz + \ ++ encode_lockowner_maxsz) ++#define decode_release_lockowner_maxsz \ ++ (op_decode_hdr_maxsz) + #define encode_access_maxsz (op_encode_hdr_maxsz + 1) + #define decode_access_maxsz (op_decode_hdr_maxsz + 2) + #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ +@@ -302,6 +317,35 @@ static int nfs4_stat_to_errno(int); + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) + #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) + #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) ++#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ ++ encode_verifier_maxsz) ++#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + \ ++ decode_verifier_maxsz + \ ++ 
XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ ++ NFS4_PNFS_DEVICEID4_SIZE)) ++#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ ++ XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE)) ++#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ ++ 4 /*layout type */ + \ ++ 4 /* opaque devaddr4 length */ +\ ++ 4 /* notification bitmap length */ + \ ++ 4 /* notification bitmap */) ++#define encode_layoutget_sz (op_encode_hdr_maxsz + 10 + \ ++ encode_stateid_maxsz) ++#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ ++ decode_stateid_maxsz + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) ++#define encode_layoutcommit_sz (18 + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ ++ op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz) ++#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz) ++#define encode_layoutreturn_sz (8 + op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz + \ ++ 1 /* FIXME: opaque lrf_body always empty at ++ *the moment */) ++#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ ++ 1 + decode_stateid_maxsz) + #else /* CONFIG_NFS_V4_1 */ + #define encode_sequence_maxsz 0 + #define decode_sequence_maxsz 0 +@@ -471,6 +515,12 @@ static int nfs4_stat_to_errno(int); + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_locku_maxsz) ++#define NFS4_enc_release_lockowner_sz \ ++ (compound_encode_hdr_maxsz + \ ++ encode_lockowner_maxsz) ++#define NFS4_dec_release_lockowner_sz \ ++ (compound_decode_hdr_maxsz + \ ++ decode_lockowner_maxsz) + #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ +@@ -685,6 +735,60 @@ static int nfs4_stat_to_errno(int); + #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) ++#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_getdevicelist_maxsz) ++#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ ++ 
decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_getdevicelist_maxsz) ++#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_getdeviceinfo_maxsz) ++#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_getdeviceinfo_maxsz) ++#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutget_sz) ++#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutget_maxsz) ++#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_layoutcommit_sz + \ ++ encode_getattr_maxsz) ++#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutcommit_maxsz + \ ++ decode_getattr_maxsz) ++#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutreturn_sz) ++#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutreturn_maxsz) ++#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_write_maxsz) ++#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_write_maxsz) ++#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_commit_maxsz) ++#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_commit_maxsz) + + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + +@@ -915,7 +1019,7 @@ static void encode_close(struct xdr_stre + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); + *p++ = 
cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); +- xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_close_maxsz; + } +@@ -989,6 +1093,35 @@ static void encode_getattr_two(struct xd + hdr->replen += decode_getattr_maxsz; + } + ++static void ++encode_getattr_three(struct xdr_stream *xdr, ++ uint32_t bm0, uint32_t bm1, uint32_t bm2, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_GETATTR); ++ if (bm2) { ++ p = reserve_space(xdr, 16); ++ *p++ = cpu_to_be32(3); ++ *p++ = cpu_to_be32(bm0); ++ *p++ = cpu_to_be32(bm1); ++ *p = cpu_to_be32(bm2); ++ } else if (bm1) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(2); ++ *p++ = cpu_to_be32(bm0); ++ *p = cpu_to_be32(bm1); ++ } else { ++ p = reserve_space(xdr, 8); ++ *p++ = cpu_to_be32(1); ++ *p = cpu_to_be32(bm0); ++ } ++ hdr->nops++; ++ hdr->replen += decode_getattr_maxsz; ++} ++ + static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { + encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], +@@ -997,8 +1130,11 @@ static void encode_getfattr(struct xdr_s + + static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { +- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], +- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); ++ encode_getattr_three(xdr, ++ bitmask[0] & nfs4_fsinfo_bitmap[0], ++ bitmask[1] & nfs4_fsinfo_bitmap[1], ++ bitmask[2] & nfs4_fsinfo_bitmap[2], ++ hdr); + } + + static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +@@ -1042,6 +1178,17 @@ static inline uint64_t nfs4_lock_length( + return fl->fl_end - fl->fl_start + 1; + } + ++static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) ++{ ++ __be32 *p; ++ ++ p = 
reserve_space(xdr, 28); ++ p = xdr_encode_hyper(p, lowner->clientid); ++ *p++ = cpu_to_be32(16); ++ p = xdr_encode_opaque_fixed(p, "lock id:", 8); ++ xdr_encode_hyper(p, lowner->id); ++} ++ + /* + * opcode,type,reclaim,offset,length,new_lock_owner = 32 + * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 +@@ -1058,18 +1205,16 @@ static void encode_lock(struct xdr_strea + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); + if (args->new_lock_owner){ +- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); ++ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->open_stateid->u.data, ++ NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + } + else { + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); +- p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->lock_stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); + } + hdr->nops++; +@@ -1080,15 +1225,12 @@ static void encode_lockt(struct xdr_stre + { + __be32 *p; + +- p = reserve_space(xdr, 52); ++ p = reserve_space(xdr, 24); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + hdr->nops++; + 
hdr->replen += decode_lockt_maxsz; + } +@@ -1101,13 +1243,25 @@ static void encode_locku(struct xdr_stre + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->stateid->u.data, ++ NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + hdr->nops++; + hdr->replen += decode_locku_maxsz; + } + ++static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); ++ encode_lockowner(xdr, lowner); ++ hdr->nops++; ++ hdr->replen += decode_release_lockowner_maxsz; ++} ++ + static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) + { + int len = name->len; +@@ -1172,7 +1326,7 @@ static inline void encode_createmode(str + break; + default: + clp = arg->server->nfs_client; +- if (clp->cl_minorversion > 0) { ++ if (clp->cl_mvops->minor_version > 0) { + if (nfs4_has_persistent_session(clp)) { + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); +@@ -1251,7 +1405,7 @@ static inline void encode_claim_delegate + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + encode_string(xdr, name->len, name->name); + } + +@@ -1282,7 +1436,7 @@ static void encode_open_confirm(struct x + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = 
cpu_to_be32(arg->seqid->sequence->counter); + hdr->nops++; + hdr->replen += decode_open_confirm_maxsz; +@@ -1294,7 +1448,7 @@ static void encode_open_downgrade(struct + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_share_access(xdr, arg->fmode); + hdr->nops++; +@@ -1324,17 +1478,17 @@ static void encode_putrootfh(struct xdr_ + hdr->replen += decode_putrootfh_maxsz; + } + +-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) ++static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) + { + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, NFS4_STATEID_SIZE); + if (ctx->state != NULL) { +- nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); +- xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); ++ nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); ++ xdr_encode_opaque_fixed(p, stateid.u.data, NFS4_STATEID_SIZE); + } else +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + } + + static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +@@ -1344,7 +1498,7 @@ static void encode_read(struct xdr_strea + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); +@@ -1448,7 +1602,7 @@ encode_setacl(struct xdr_stream *xdr, st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 
++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); +@@ -1479,7 +1633,7 @@ static void encode_setattr(struct xdr_st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid.u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_setattr_maxsz; + encode_attrs(xdr, arg->iap, server); +@@ -1523,7 +1677,7 @@ static void encode_write(struct xdr_stre + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); +@@ -1542,7 +1696,7 @@ static void encode_delegreturn(struct xd + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + + *p++ = cpu_to_be32(OP_DELEGRETURN); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_delegreturn_maxsz; + } +@@ -1696,6 +1850,162 @@ static void encode_sequence(struct xdr_s + #endif /* CONFIG_NFS_V4_1 */ + } + ++#ifdef CONFIG_NFS_V4_1 ++static void ++encode_getdevicelist(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_getdevicelist_arg *args, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ nfs4_verifier dummy = { ++ .data = "dummmmmy", ++ }; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_GETDEVICELIST); ++ *p++ = cpu_to_be32(args->layoutclass); ++ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); ++ xdr_encode_hyper(p, 0ULL); /* cookie */ ++ encode_nfs4_verifier(xdr, &dummy); ++ hdr->nops++; ++} ++ ++static void ++encode_getdeviceinfo(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_getdeviceinfo_arg *args, ++ struct compound_hdr *hdr) ++{ ++ int has_bitmap = (args->pdev->dev_notify_types != 
0); ++ int len = 16 + NFS4_PNFS_DEVICEID4_SIZE + (has_bitmap * 4); ++ __be32 *p; ++ ++ p = reserve_space(xdr, len); ++ *p++ = cpu_to_be32(OP_GETDEVICEINFO); ++ p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ *p++ = cpu_to_be32(args->pdev->layout_type); ++ *p++ = cpu_to_be32(args->pdev->pglen + len); /* gdia_maxcount */ ++ *p++ = cpu_to_be32(has_bitmap); /* bitmap length [01] */ ++ if (has_bitmap) ++ *p = cpu_to_be32(args->pdev->dev_notify_types); ++ hdr->nops++; ++} ++ ++static void ++encode_layoutget(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_layoutget_arg *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(OP_LAYOUTGET); ++ *p++ = cpu_to_be32(0); /* Signal layout available */ ++ *p++ = cpu_to_be32(args->type); ++ *p++ = cpu_to_be32(args->lseg.iomode); ++ p = xdr_encode_hyper(p, args->lseg.offset); ++ p = xdr_encode_hyper(p, args->lseg.length); ++ p = xdr_encode_hyper(p, args->minlength); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, NFS4_STATEID_SIZE); ++ *p = cpu_to_be32(args->maxcount); ++ ++ dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", ++ __func__, ++ args->type, ++ args->lseg.iomode, ++ (unsigned long)args->lseg.offset, ++ (unsigned long)args->lseg.length, ++ args->maxcount); ++ hdr->nops++; ++ hdr->replen += decode_layoutget_maxsz; ++} ++ ++static int ++encode_layoutcommit(struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *args, ++ struct compound_hdr *hdr) ++{ ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ __be32 *p; ++ ++ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__, ++ args->lseg.length, args->lseg.offset, args->lastbytewritten, ++ args->layout_type); ++ ++ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE); ++ *p++ = 
cpu_to_be32(OP_LAYOUTCOMMIT); ++ p = xdr_encode_hyper(p, args->lseg.offset); ++ p = xdr_encode_hyper(p, args->lseg.length); ++ *p++ = cpu_to_be32(0); /* reclaim */ ++ p = xdr_encode_opaque_fixed(p, args->stateid.u.data, NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(1); /* newoffset = TRUE */ ++ p = xdr_encode_hyper(p, args->lastbytewritten); ++ *p = cpu_to_be32(args->time_modify_changed != 0); ++ if (args->time_modify_changed) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(0); ++ *p++ = cpu_to_be32(args->time_modify.tv_sec); ++ *p = cpu_to_be32(args->time_modify.tv_nsec); ++ } ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(args->layout_type); ++ ++ if (ld_io_ops->encode_layoutcommit) { ++ ld_io_ops->encode_layoutcommit(NFS_I(args->inode)->layout, ++ xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ xdr_encode_opaque(p, NULL, 0); ++ } ++ ++ hdr->nops++; ++ hdr->replen += decode_layoutcommit_maxsz; ++ return 0; ++} ++ ++static void ++encode_layoutreturn(struct xdr_stream *xdr, ++ const struct nfs4_pnfs_layoutreturn_arg *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_LAYOUTRETURN); ++ *p++ = cpu_to_be32(args->reclaim); ++ *p++ = cpu_to_be32(args->layout_type); ++ *p++ = cpu_to_be32(args->lseg.iomode); ++ *p = cpu_to_be32(args->return_type); ++ if (args->return_type == RETURN_FILE) { ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ ++ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); ++ p = xdr_encode_hyper(p, args->lseg.offset); ++ p = xdr_encode_hyper(p, args->lseg.length); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, ++ NFS4_STATEID_SIZE); ++ dprintk("%s: call %pF\n", __func__, ++ ld_io_ops->encode_layoutreturn); ++ if (ld_io_ops->encode_layoutreturn) { ++ ld_io_ops->encode_layoutreturn( ++ NFS_I(args->inode)->layout, xdr, args); ++ } 
else { ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(0); ++ } ++ } ++ hdr->nops++; ++ hdr->replen += decode_layoutreturn_maxsz; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" ENCODE ROUTINES. + */ +@@ -1704,7 +2014,7 @@ static u32 nfs4_xdr_minorversion(const s + { + #if defined(CONFIG_NFS_V4_1) + if (args->sa_session) +- return args->sa_session->clp->cl_minorversion; ++ return args->sa_session->clp->cl_mvops->minor_version; + #endif /* CONFIG_NFS_V4_1 */ + return 0; + } +@@ -2048,6 +2358,20 @@ static int nfs4_xdr_enc_locku(struct rpc + return 0; + } + ++static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = 0, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_release_lockowner(&xdr, &args->lock_owner, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ + /* + * Encode a READLINK request + */ +@@ -2330,7 +2654,7 @@ static int nfs4_xdr_enc_setclientid_conf + struct compound_hdr hdr = { + .nops = 0, + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2395,7 +2719,7 @@ static int nfs4_xdr_enc_exchange_id(stru + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2413,7 +2737,7 @@ static int nfs4_xdr_enc_create_session(s + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2431,7 +2755,7 @@ static int nfs4_xdr_enc_destroy_session( + { + struct 
xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = session->clp->cl_minorversion, ++ .minorversion = session->clp->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2469,7 +2793,7 @@ static int nfs4_xdr_enc_get_lease_time(s + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2499,6 +2823,159 @@ static int nfs4_xdr_enc_reclaim_complete + return 0; + } + ++/* ++ * Encode GETDEVICELIST request ++ */ ++static int ++nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_getdevicelist_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_getdevicelist(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode GETDEVICEINFO request ++ */ ++static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_getdeviceinfo_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ int replen; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_getdeviceinfo(&xdr, args, &hdr); ++ ++ /* set up reply kvec. 
Subtract notification bitmap max size (8) ++ * so that notification bitmap is put in xdr_buf tail */ ++ replen = (RPC_REPHDRSIZE + auth->au_rslack + ++ NFS4_dec_getdeviceinfo_sz - 8) << 2; ++ xdr_inline_pages(&req->rq_rcv_buf, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", ++ __func__, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTGET request ++ */ ++static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_layoutget_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutget(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTCOMMIT request ++ */ ++static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p, ++ struct pnfs_layoutcommit_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_layoutcommit(&xdr, args, &hdr); ++ encode_getfattr(&xdr, args->bitmask, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTRETURN request ++ */ ++static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_pnfs_layoutreturn_arg *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, 
&hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutreturn(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server WRITE request ++ */ ++static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_write(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server COMMIT request ++ */ ++static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_commit(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} + #endif /* CONFIG_NFS_V4_1 */ + + static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +@@ -2599,14 +3076,17 @@ static int decode_attr_bitmap(struct xdr + goto out_overflow; + bmlen = be32_to_cpup(p); + +- bitmap[0] = bitmap[1] = 0; ++ bitmap[0] = bitmap[1] = bitmap[2] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); +- if (bmlen > 1) +- bitmap[1] = be32_to_cpup(p); ++ if (bmlen > 1) { ++ bitmap[1] = be32_to_cpup(p++); ++ if (bmlen > 2) ++ bitmap[2] = be32_to_cpup(p); ++ } + } + return 0; + out_overflow: +@@ -2635,8 +3115,9 @@ static int decode_attr_supported(struct + 
decode_attr_bitmap(xdr, bitmask); + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else +- bitmask[0] = bitmask[1] = 0; +- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); ++ bitmask[0] = bitmask[1] = bitmask[2] = 0; ++ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, ++ bitmask[0], bitmask[1], bitmask[2]); + return 0; + } + +@@ -3565,7 +4046,7 @@ static int decode_opaque_fixed(struct xd + + static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) + { +- return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); ++ return decode_opaque_fixed(xdr, stateid->u.data, NFS4_STATEID_SIZE); + } + + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +@@ -3621,7 +4102,7 @@ out_overflow: + static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3647,7 +4128,7 @@ xdr_error: + static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3679,7 +4160,7 @@ xdr_error: + static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3705,7 +4186,7 @@ static int decode_getfattr(struct xdr_st + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}, ++ bitmap[3] = {0}, + type; + int status; + umode_t fmode = 0; +@@ -3824,24 +4305,101 @@ xdr_error: + return status; + } + +- +-static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * Decode potentially multiple layout types. 
Currently we only support ++ * one layout driver per file system. ++ */ ++static int decode_pnfs_list(struct xdr_stream *xdr, uint32_t *layoutclass) + { +- __be32 *savep; +- uint32_t attrlen, bitmap[2]; +- int status; ++ uint32_t *p; ++ int num; + +- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +- goto xdr_error; +- if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) +- goto xdr_error; +- if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) +- goto xdr_error; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ num = be32_to_cpup(p); + +- fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ /* pNFS is not supported by the underlying file system */ ++ if (num == 0) { ++ *layoutclass = 0; ++ return 0; ++ } + +- if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) +- goto xdr_error; ++ /* TODO: We will eventually support multiple layout drivers ? */ ++ if (num > 1) ++ printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " ++ "per filesystem not supported\n", __func__); ++ ++ /* Decode and set first layout type */ ++ p = xdr_inline_decode(xdr, num * 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ *layoutclass = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++/* ++ * The type of file system exported ++ */ ++static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *layoutclass) ++{ ++ int status = 0; ++ ++ dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); ++ if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) ++ return -EIO; ++ if (likely(bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES)) { ++ status = decode_pnfs_list(xdr, layoutclass); ++ bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; ++ } ++ return status; ++} ++ ++/* ++ * The prefered block size for layout directed io ++ */ ++static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *res) ++{ ++ __be32 *p; ++ ++ 
dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); ++ *res = 0; ++ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) { ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++ } ++ *res = be32_to_cpup(p); ++ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; ++ } ++ return 0; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ ++static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++{ ++ __be32 *savep; ++ uint32_t attrlen, bitmap[3]; ++ int status; ++ ++ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) ++ goto xdr_error; ++ ++ fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ ++ if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) ++ goto xdr_error; + if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0) + goto xdr_error; + if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0) +@@ -3850,6 +4408,14 @@ static int decode_fsinfo(struct xdr_stre + if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) + goto xdr_error; + fsinfo->wtpref = fsinfo->wtmax; ++#if defined(CONFIG_NFS_V4_1) ++ status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); ++ if (status) ++ goto xdr_error; ++ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); ++ if (status) ++ goto xdr_error; ++#endif /* CONFIG_NFS_V4_1 */ + + status = verify_attr_len(xdr, savep, attrlen); + xdr_error: +@@ -3973,6 +4539,11 @@ static int decode_locku(struct xdr_strea + return status; + } + ++static int decode_release_lockowner(struct xdr_stream *xdr) ++{ ++ return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); ++} ++ + static int decode_lookup(struct xdr_stream *xdr) + { + return decode_op_hdr(xdr, OP_LOOKUP); +@@ -4333,7 +4904,7 @@ static int decode_getacl(struct xdr_stre + { + __be32 *savep; + 
uint32_t attrlen, +- bitmap[2] = {0}; ++ bitmap[3] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + +@@ -4682,6 +5253,226 @@ out_overflow: + #endif /* CONFIG_NFS_V4_1 */ + } + ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * TODO: Need to handle case when EOF != true; ++ */ ++static int decode_getdevicelist(struct xdr_stream *xdr, ++ struct pnfs_devicelist *res) ++{ ++ __be32 *p; ++ int status, i; ++ struct nfs_writeverf verftemp; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICELIST); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 8 + 8 + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ ++ /* TODO: Skip cookie for now */ ++ p += 2; ++ ++ /* Read verifier */ ++ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); ++ ++ res->num_devs = be32_to_cpup(p); ++ ++ dprintk("%s: num_dev %d\n", __func__, res->num_devs); ++ ++ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) ++ return -NFS4ERR_REP_TOO_BIG; ++ ++ p = xdr_inline_decode(xdr, ++ res->num_devs * NFS4_PNFS_DEVICEID4_SIZE + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ for (i = 0; i < res->num_devs; i++) ++ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ res->eof = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_getdeviceinfo(struct xdr_stream *xdr, ++ struct pnfs_device *pdev) ++{ ++ __be32 *p; ++ uint32_t len, type; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICEINFO); ++ if (status) { ++ if (status == -ETOOSMALL) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->mincount = be32_to_cpup(p); ++ dprintk("%s: Min count too small. 
mincnt = %u\n", ++ __func__, pdev->mincount); ++ } ++ return status; ++ } ++ ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ type = be32_to_cpup(p++); ++ if (type != pdev->layout_type) { ++ dprintk("%s: layout mismatch req: %u pdev: %u\n", ++ __func__, pdev->layout_type, type); ++ return -EINVAL; ++ } ++ /* ++ * Get the length of the opaque device_addr4. xdr_read_pages places ++ * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) ++ * and places the remaining xdr data in xdr_buf->tail ++ */ ++ pdev->mincount = be32_to_cpup(p); ++ xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ ++ ++ /* At most one bitmap word */ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ len = be32_to_cpup(p); ++ if (len) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->dev_notify_types = be32_to_cpup(p); ++ } else ++ pdev->dev_notify_types = 0; ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, ++ struct nfs4_pnfs_layoutget_res *res) ++{ ++ __be32 *p; ++ int status; ++ u32 layout_count, dummy; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTGET); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->return_on_close = be32_to_cpup(p++); ++ p = xdr_decode_opaque_fixed(p, res->stateid.u.data, NFS4_STATEID_SIZE); ++ layout_count = be32_to_cpup(p); ++ if (!layout_count) { ++ dprintk("%s: server responded with empty layout array\n", ++ __func__); ++ return -EINVAL; ++ } ++ ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ p = xdr_decode_hyper(p, &res->lseg.offset); ++ p = xdr_decode_hyper(p, &res->lseg.length); ++ res->lseg.iomode = be32_to_cpup(p++); ++ res->type = be32_to_cpup(p++); ++ ++ status = decode_opaque_inline(xdr, 
&res->layout.len, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ ++ dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", ++ __func__, ++ (unsigned long)res->lseg.offset, ++ (unsigned long)res->lseg.length, ++ res->lseg.iomode, ++ res->type, ++ res->layout.len); ++ ++ /* presumably, pnfs4_proc_layoutget allocated a single page */ ++ if (res->layout.len > PAGE_SIZE) ++ return -ENOMEM; ++ memcpy(res->layout.buf, p, res->layout.len); ++ ++ /* FIXME: the whole layout array should be passed up to the pnfs ++ * client */ ++ if (layout_count > 1) { ++ dprintk("%s: server responded with %d layouts, dropping tail\n", ++ __func__, layout_count); ++ ++ while (--layout_count) { ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ status = decode_opaque_inline(xdr, &dummy, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ } ++ } ++ ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutreturn(struct xdr_stream *xdr, ++ struct nfs4_pnfs_layoutreturn_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTRETURN); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->lrs_present = be32_to_cpup(p); ++ if (res->lrs_present) ++ status = decode_stateid(xdr, &res->stateid); ++ return status; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutcommit(struct xdr_stream *xdr, ++ struct rpc_rqst *req, ++ struct pnfs_layoutcommit_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->sizechanged = be32_to_cpup(p); ++ ++ if (res->sizechanged) { ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ xdr_decode_hyper(p, &res->newsize); ++ }
++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" DECODE ROUTINES. + */ +@@ -5259,6 +6050,19 @@ out: + return status; + } + ++static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (!status) ++ status = decode_release_lockowner(&xdr); ++ return status; ++} ++ + /* + * Decode READLINK response + */ +@@ -5696,6 +6500,186 @@ static int nfs4_xdr_dec_reclaim_complete + status = decode_reclaim_complete(&xdr, (void *)NULL); + return status; + } ++ ++/* ++ * Decode GETDEVICELIST response (COMPOUND header, SEQUENCE, PUTFH, GETDEVICELIST) ++ */ ++static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_getdevicelist_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ dprintk("decoding getdevicelist!\n"); ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status != 0) ++ goto out; ++ status = decode_getdevicelist(&xdr, res->devlist); ++out: ++ return status; ++} ++ ++/* ++ * Decode GETDEVICEINFO response ++ */ ++static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_getdeviceinfo_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_getdeviceinfo(&xdr, res->pdev); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTGET response ++ */ ++static
int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_layoutget_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutget(&xdr, rqstp, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTRETURN response ++ */ ++static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_pnfs_layoutreturn_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutreturn(&xdr, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTCOMMIT response ++ */ ++static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct pnfs_layoutcommit_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutcommit(&xdr, rqstp, res); ++ if (status) ++ goto out; ++ decode_getfattr(&xdr, res->fattr, res->server, ++ !RPC_IS_ASYNC(rqstp->rq_task)); ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server WRITE response ++ */ ++static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream 
xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_write(&xdr, res); ++ if (!status) ++ return res->count; ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server COMMIT response ++ */ ++static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_commit(&xdr, res); ++out: ++ return status; ++} + #endif /* CONFIG_NFS_V4_1 */ + + __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +@@ -5866,6 +6850,7 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), ++ PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + #if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), +@@ -5873,6 +6858,13 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), ++ PROC(PNFS_GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), ++ PROC(PNFS_GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), ++ PROC(PNFS_LAYOUTGET, enc_layoutget, dec_layoutget), ++ PROC(PNFS_LAYOUTCOMMIT, 
enc_layoutcommit, dec_layoutcommit), ++ PROC(PNFS_LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), ++ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite), ++ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit), + #endif /* CONFIG_NFS_V4_1 */ + }; + +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-23 12:09:03.348511665 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-23 12:09:03.348511665 -0400 +@@ -0,0 +1,11 @@ ++# ++# Makefile for the pNFS Objects Layout Driver kernel module ++# ++objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o ++ ++# ++# Panasas pNFS Layout Driver kernel module ++# ++panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o ++obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-23 12:09:03.349501459 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-23 12:09:03.349501459 -0400 +@@ -0,0 +1,1087 @@ ++/* ++ * objio_osd.c ++ * ++ * pNFS Objects layout implementation over open-osd initiator library ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++#define _LLU(x) ((unsigned long long)x) ++ ++enum { BIO_MAX_PAGES_KMALLOC = ++ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), ++}; ++ ++/* A per mountpoint struct currently for device cache */ ++struct objio_mount_type { ++ struct list_head dev_list; ++ spinlock_t dev_list_lock; ++}; ++ ++struct _dev_ent { ++ struct list_head list; ++ struct pnfs_deviceid d_id; ++ struct osd_dev *od; ++}; ++ ++static void _dev_list_remove_all(struct objio_mount_type *omt) ++{ ++ spin_lock(&omt->dev_list_lock); ++ ++ while (!list_empty(&omt->dev_list)) { ++ struct _dev_ent *de = list_entry(omt->dev_list.next, ++ struct _dev_ent, list); ++ ++ list_del_init(&de->list); ++ osduld_put_device(de->od); ++ kfree(de); ++ } ++ ++ spin_unlock(&omt->dev_list_lock); ++} ++ ++static struct osd_dev *___dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct list_head *le; ++ ++ list_for_each(le, &omt->dev_list) { ++ struct _dev_ent *de = list_entry(le, struct _dev_ent, list); ++ ++ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id))) ++ return de->od; ++ } ++ ++ return NULL; ++} ++ ++static struct osd_dev *_dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct osd_dev *od; ++ ++ spin_lock(&omt->dev_list_lock); ++ od = ___dev_list_find(omt, d_id); ++ spin_unlock(&omt->dev_list_lock); ++ return od; ++} ++ ++static int _dev_list_add(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id, struct osd_dev *od) ++{ ++ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL); ++ ++ if (!de) ++ return -ENOMEM; ++ ++ spin_lock(&omt->dev_list_lock); ++ ++ if (___dev_list_find(omt, d_id)) { ++ kfree(de); ++ goto out; ++ } ++ ++ de->d_id = *d_id; ++ de->od = od; ++ list_add(&de->list, &omt->dev_list); ++ ++out: ++ spin_unlock(&omt->dev_list_lock); ++ return 0; ++} ++ ++struct 
objio_segment { ++ struct pnfs_osd_layout *layout; ++ ++ unsigned mirrors_p1; ++ unsigned stripe_unit; ++ unsigned group_width; /* Data stripe_units without integrity comps */ ++ u64 group_depth; ++ unsigned group_count; ++ ++ unsigned num_comps; ++ /* variable length */ ++ struct osd_dev *ods[1]; ++}; ++ ++struct objio_state; ++typedef ssize_t (*objio_done_fn)(struct objio_state *ios); ++ ++struct objio_state { ++ /* Generic layer */ ++ struct objlayout_io_state ol_state; ++ ++ struct objio_segment *objio_seg; ++ ++ struct kref kref; ++ objio_done_fn done; ++ void *private; ++ ++ unsigned long length; ++ unsigned numdevs; /* Actually used devs in this IO */ ++ /* A per-device variable array of size numdevs */ ++ struct _objio_per_comp { ++ struct bio *bio; ++ struct osd_request *or; ++ unsigned long length; ++ u64 offset; ++ unsigned dev; ++ } per_dev[]; ++}; ++ ++/* Send and wait for a get_device_info of devices in the layout, ++ then look them up with the osd_initiator library */ ++static struct osd_dev *_device_lookup(struct pnfs_layout_type *pnfslay, ++ struct objio_segment *objio_seg, unsigned comp) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ struct pnfs_osd_deviceaddr *deviceaddr; ++ struct pnfs_deviceid *d_id; ++ struct osd_dev *od; ++ struct osd_dev_info odi; ++ struct objio_mount_type *omt = PNFS_NFS_SERVER(pnfslay)->pnfs_ld_data; ++ int err; ++ ++ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id; ++ ++ od = _dev_list_find(omt, d_id); ++ if (od) ++ return od; ++ ++ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr); ++ if (unlikely(err)) { ++ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err); ++ return ERR_PTR(err); ++ } ++ ++ odi.systemid_len = deviceaddr->oda_systemid.len; ++ if (odi.systemid_len > sizeof(odi.systemid)) { ++ err = -EINVAL; ++ goto out; ++ } else if (odi.systemid_len) ++ memcpy(odi.systemid, deviceaddr->oda_systemid.data, ++ odi.systemid_len); ++ odi.osdname_len = 
deviceaddr->oda_osdname.len; ++ odi.osdname = (u8 *)deviceaddr->oda_osdname.data; ++ ++ if (!odi.osdname_len && !odi.systemid_len) { ++ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", ++ __func__); ++ err = -ENODEV; ++ goto out; ++ } ++ ++ od = osduld_info_lookup(&odi); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ dprintk("%s: osduld_info_lookup => %d\n", __func__, err); ++ goto out; ++ } ++ ++ _dev_list_add(omt, d_id, od); ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ objlayout_put_deviceinfo(deviceaddr); ++ return err ? ERR_PTR(err) : od; ++} ++ ++static int objio_devices_lookup(struct pnfs_layout_type *pnfslay, ++ struct objio_segment *objio_seg) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ unsigned i, num_comps = layout->olo_num_comps; ++ int err; ++ ++ /* lookup all devices */ ++ for (i = 0; i < num_comps; i++) { ++ struct osd_dev *od; ++ ++ od = _device_lookup(pnfslay, objio_seg, i); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ goto out; ++ } ++ objio_seg->ods[i] = od; ++ } ++ objio_seg->num_comps = num_comps; ++ err = 0; ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ return err; ++} ++ ++static int _verify_data_map(struct pnfs_osd_layout *layout) ++{ ++ struct pnfs_osd_data_map *data_map = &layout->olo_map; ++ u64 stripe_length; ++ u32 group_width; ++ ++/* FIXME: Only raid0 for now. 
if not go through MDS */ ++ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { ++ printk(KERN_ERR "Only RAID_0 for now\n"); ++ return -ENOTSUPP; ++ } ++ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { ++ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", ++ data_map->odm_num_comps, data_map->odm_mirror_cnt); ++ return -EINVAL; ++ } ++ ++ if (data_map->odm_group_width) ++ group_width = data_map->odm_group_width; ++ else ++ group_width = data_map->odm_num_comps / ++ (data_map->odm_mirror_cnt + 1); ++ ++ stripe_length = (u64)data_map->odm_stripe_unit * group_width; ++ if (stripe_length >= (1ULL << 32)) { ++ printk(KERN_ERR "Total Stripe length(0x%llx)" ++ " >= 32bit is not supported\n", _LLU(stripe_length)); ++ return -ENOTSUPP; ++ } ++ ++ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { ++ printk(KERN_ERR "Stripe Unit(0x%llx)" ++ " must be Multples of PAGE_SIZE(0x%lx)\n", ++ _LLU(data_map->odm_stripe_unit), PAGE_SIZE); ++ return -ENOTSUPP; ++ } ++ ++ return 0; ++} ++ ++int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_type *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) ++{ ++ struct objio_segment *objio_seg; ++ int err; ++ ++ err = _verify_data_map(layout); ++ if (unlikely(err)) ++ return err; ++ ++ objio_seg = kzalloc(sizeof(*objio_seg) + ++ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]), ++ GFP_KERNEL); ++ if (!objio_seg) ++ return -ENOMEM; ++ ++ objio_seg->layout = layout; ++ err = objio_devices_lookup(pnfslay, objio_seg); ++ if (err) ++ goto free_seg; ++ ++ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1; ++ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit; ++ if (layout->olo_map.odm_group_width) { ++ objio_seg->group_width = layout->olo_map.odm_group_width; ++ objio_seg->group_depth = layout->olo_map.odm_group_depth; ++ objio_seg->group_count = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1 / ++ objio_seg->group_width; ++ } else { ++ 
objio_seg->group_width = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1; ++ objio_seg->group_depth = -1; ++ objio_seg->group_count = 1; ++ } ++ ++ *outp = objio_seg; ++ return 0; ++ ++free_seg: ++ dprintk("%s: Error: return %d\n", __func__, err); ++ kfree(objio_seg); ++ *outp = NULL; ++ return err; ++} ++ ++void objio_free_lseg(void *p) ++{ ++ struct objio_segment *objio_seg = p; ++ ++ kfree(objio_seg); ++} ++ ++int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct objio_segment *objio_seg = seg; ++ struct objio_state *ios; ++ const unsigned first_size = sizeof(*ios) + ++ objio_seg->num_comps * sizeof(ios->per_dev[0]); ++ const unsigned sec_size = objio_seg->num_comps * ++ sizeof(ios->ol_state.ioerrs[0]); ++ ++ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps); ++ ios = kzalloc(first_size + sec_size, GFP_KERNEL); ++ if (unlikely(!ios)) ++ return -ENOMEM; ++ ++ ios->objio_seg = objio_seg; ++ ios->ol_state.ioerrs = ((void *)ios) + first_size; ++ ios->ol_state.num_comps = objio_seg->num_comps; ++ ++ *outp = &ios->ol_state; ++ return 0; ++} ++ ++void objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ ++ kfree(ios); ++} ++ ++enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) ++{ ++ switch (oep) { ++ case OSD_ERR_PRI_NO_ERROR: ++ return (enum pnfs_osd_errno)0; ++ ++ case OSD_ERR_PRI_CLEAR_PAGES: ++ BUG_ON(1); ++ return 0; ++ ++ case OSD_ERR_PRI_RESOURCE: ++ return PNFS_OSD_ERR_RESOURCE; ++ case OSD_ERR_PRI_BAD_CRED: ++ return PNFS_OSD_ERR_BAD_CRED; ++ case OSD_ERR_PRI_NO_ACCESS: ++ return PNFS_OSD_ERR_NO_ACCESS; ++ case OSD_ERR_PRI_UNREACHABLE: ++ return PNFS_OSD_ERR_UNREACHABLE; ++ case OSD_ERR_PRI_NOT_FOUND: ++ return PNFS_OSD_ERR_NOT_FOUND; ++ case OSD_ERR_PRI_NO_SPACE: ++ return PNFS_OSD_ERR_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case OSD_ERR_PRI_EIO: ++ return PNFS_OSD_ERR_EIO; ++ 
} ++} ++ ++static void _clear_bio(struct bio *bio) ++{ ++ struct bio_vec *bv; ++ unsigned i; ++ ++ __bio_for_each_segment(bv, bio, i, 0) { ++ unsigned this_count = bv->bv_len; ++ ++ if (likely(PAGE_SIZE == this_count)) ++ clear_highpage(bv->bv_page); ++ else ++ zero_user(bv->bv_page, bv->bv_offset, this_count); ++ } ++} ++ ++static int _io_check(struct objio_state *ios, bool is_write) ++{ ++ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; ++ int lin_ret = 0; ++ int i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_sense_info osi; ++ struct osd_request *or = ios->per_dev[i].or; ++ int ret; ++ ++ if (!or) ++ continue; ++ ++ ret = osd_req_decode_sense(or, &osi); ++ if (likely(!ret)) ++ continue; ++ ++ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { ++ /* start read offset is past the end of file */ ++ BUG_ON(is_write); ++ _clear_bio(ios->per_dev[i].bio); ++ dprintk("%s: start read offset passed end of file " ++ "offset=0x%llx, length=0x%lx\n", __func__, ++ _LLU(ios->per_dev[i].offset), ++ ios->per_dev[i].length); ++ ++ continue; /* we recovered */ ++ } ++ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev, ++ osd_pri_2_pnfs_err(osi.osd_err_pri), ++ ios->per_dev[i].offset, ++ ios->per_dev[i].length, ++ is_write); ++ ++ if (osi.osd_err_pri >= oep) { ++ oep = osi.osd_err_pri; ++ lin_ret = ret; ++ } ++ } ++ ++ return lin_ret; ++} ++ ++/* ++ * Common IO state helpers.
++ */ ++static void _io_free(struct objio_state *ios) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[i]; ++ ++ if (per_dev->or) { ++ osd_end_request(per_dev->or); ++ per_dev->or = NULL; ++ } ++ ++ if (per_dev->bio) { ++ bio_put(per_dev->bio); ++ per_dev->bio = NULL; ++ } ++ } ++} ++ ++struct osd_dev * _io_od(struct objio_state *ios, unsigned dev) ++{ ++ unsigned min_dev = ios->objio_seg->layout->olo_comps_index; ++ unsigned max_dev = min_dev + ios->ol_state.num_comps; ++ ++ BUG_ON(dev < min_dev || max_dev <= dev); ++ return ios->objio_seg->ods[dev - min_dev]; ++} ++ ++struct _striping_info { ++ u64 obj_offset; ++ u64 group_length; ++ u64 total_group_length; ++ u64 Major; ++ unsigned dev; ++ unsigned unit_off; ++}; ++ ++static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, ++ struct _striping_info *si) ++{ ++ u32 stripe_unit = ios->objio_seg->stripe_unit; ++ u32 group_width = ios->objio_seg->group_width; ++ u64 group_depth = ios->objio_seg->group_depth; ++ u32 U = stripe_unit * group_width; ++ ++ u64 T = U * group_depth; ++ u64 S = T * ios->objio_seg->group_count; ++ u64 M = div64_u64(file_offset, S); ++ ++ /* ++ G = (L - (M * S)) / T ++ H = (L - (M * S)) % T ++ */ ++ u64 LmodU = file_offset - M * S; ++ u32 G = div64_u64(LmodU, T); ++ u64 H = LmodU - G * T; ++ ++ u32 N = div_u64(H, U); ++ ++ div_u64_rem(file_offset, stripe_unit, &si->unit_off); ++ si->obj_offset = si->unit_off + (N * stripe_unit) + ++ (M * group_depth * stripe_unit); ++ ++ /* "H - (N * U)" is just "H % U" so it's bound to u32 */ ++ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; ++ si->dev *= ios->objio_seg->mirrors_p1; ++ ++ si->group_length = T - H; ++ si->total_group_length = T; ++ si->Major = M; ++} ++ ++static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, ++ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len) ++{ ++ unsigned pg = *cur_pg; ++ struct request_queue *q = ++ 
osd_request_queue(_io_od(ios, per_dev->dev)); ++ ++ per_dev->length += cur_len; ++ ++ if (per_dev->bio == NULL) { ++ unsigned stripes = ios->ol_state.num_comps / ++ ios->objio_seg->mirrors_p1; ++ unsigned pages_in_stripe = stripes * ++ (ios->objio_seg->stripe_unit / PAGE_SIZE); ++ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / ++ stripes; ++ ++ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); ++ if (unlikely(!per_dev->bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", bio_size); ++ return -ENOMEM; ++ } ++ } ++ ++ while (cur_len > 0) { ++ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); ++ unsigned added_len; ++ ++ BUG_ON(ios->ol_state.nr_pages <= pg); ++ cur_len -= pglen; ++ ++ added_len = bio_add_pc_page(q, per_dev->bio, ++ ios->ol_state.pages[pg], pglen, pgbase); ++ if (unlikely(pglen != added_len)) ++ return -ENOMEM; ++ pgbase = 0; ++ ++pg; ++ } ++ BUG_ON(cur_len); ++ ++ *cur_pg = pg; ++ return 0; ++} ++ ++static int _prepare_one_group(struct objio_state *ios, u64 length, ++ struct _striping_info *si, unsigned first_comp, ++ unsigned *last_pg) ++{ ++ unsigned stripe_unit = ios->objio_seg->stripe_unit; ++ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1; ++ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1; ++ unsigned dev = si->dev; ++ unsigned first_dev = dev - (dev % devs_in_group); ++ unsigned comp = first_comp + (dev - first_dev); ++ unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; ++ unsigned cur_pg = *last_pg; ++ int ret = 0; ++ ++ while (length) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[comp]; ++ unsigned cur_len, page_off = 0; ++ ++ if (!per_dev->length) { ++ per_dev->dev = dev; ++ if (dev < si->dev) { ++ per_dev->offset = si->obj_offset + stripe_unit - ++ si->unit_off; ++ cur_len = stripe_unit; ++ } else if (dev == si->dev) { ++ per_dev->offset = si->obj_offset; ++ cur_len = stripe_unit - si->unit_off; ++ page_off = si->unit_off & ~PAGE_MASK; ++ BUG_ON(page_off && ++ (page_off != ios->ol_state.pgbase)); ++ } else { /* dev > si->dev */ ++ per_dev->offset = si->obj_offset - si->unit_off; ++ cur_len = stripe_unit; ++ } ++ ++ if (max_comp < comp) ++ max_comp = comp; ++ ++ dev += mirrors_p1; ++ dev = (dev % devs_in_group) + first_dev; ++ } else { ++ cur_len = stripe_unit; ++ } ++ if (cur_len >= length) ++ cur_len = length; ++ ++ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, ++ cur_len); ++ if (unlikely(ret)) ++ goto out; ++ ++ comp += mirrors_p1; ++ comp = (comp % devs_in_group) + first_comp; ++ ++ length -= cur_len; ++ ios->length += cur_len; ++ } ++out: ++ ios->numdevs = max_comp + mirrors_p1; ++ *last_pg = cur_pg; ++ return ret; ++} ++ ++static int _io_rw_pagelist(struct objio_state *ios) ++{ ++ u64 length = ios->ol_state.count; ++ struct _striping_info si; ++ unsigned devs_in_group = ios->objio_seg->group_width * ++ ios->objio_seg->mirrors_p1; ++ unsigned first_comp = 0; ++ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps; ++ unsigned last_pg = 0; ++ int ret = 0; ++ ++ _calc_stripe_info(ios, ios->ol_state.offset, &si); ++ while (length) { ++ if (length < si.group_length) ++ si.group_length = length; ++ ++ ret = _prepare_one_group(ios, si.group_length, &si, first_comp, ++ &last_pg); ++ if (unlikely(ret)) ++ goto out; ++ ++ length -= si.group_length; ++ ++ si.group_length = si.total_group_length; ++ si.unit_off = 0; ++ ++si.Major; ++ si.obj_offset = si.Major * 
ios->objio_seg->stripe_unit * ++ ios->objio_seg->group_depth; ++ ++ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; ++ si.dev %= num_comps; ++ ++ first_comp += devs_in_group; ++ first_comp %= num_comps; ++ } ++ ++out: ++ if (!ios->length) ++ return ret; ++ ++ return 0; ++} ++ ++static ssize_t _sync_done(struct objio_state *ios) ++{ ++ struct completion *waiting = ios->private; ++ ++ complete(waiting); ++ return 0; ++} ++ ++static void _last_io(struct kref *kref) ++{ ++ struct objio_state *ios = container_of(kref, struct objio_state, kref); ++ ++ ios->done(ios); ++} ++ ++static void _done_io(struct osd_request *or, void *p) ++{ ++ struct objio_state *ios = p; ++ ++ kref_put(&ios->kref, _last_io); ++} ++ ++static ssize_t _io_exec(struct objio_state *ios) ++{ ++ DECLARE_COMPLETION_ONSTACK(wait); ++ ssize_t status = 0; /* sync status */ ++ unsigned i; ++ objio_done_fn saved_done_fn = ios->done; ++ bool sync = ios->ol_state.sync; ++ ++ if (sync) { ++ ios->done = _sync_done; ++ ios->private = &wait; ++ } ++ ++ kref_init(&ios->kref); ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_request *or = ios->per_dev[i].or; ++ ++ if (!or) ++ continue; ++ ++ kref_get(&ios->kref); ++ osd_execute_request_async(or, _done_io, ios); ++ } ++ ++ kref_put(&ios->kref, _last_io); ++ ++ if (sync) { ++ wait_for_completion(&wait); ++ status = saved_done_fn(ios); ++ } ++ ++ return status; ++} ++ ++/* ++ * read ++ */ ++static ssize_t _read_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, false); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) ++ status = ios->length; ++ else ++ status = ret; ++ ++ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct osd_request *or = NULL; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = per_dev->dev; ++ struct pnfs_osd_object_cred *cred = ++ 
&ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ int ret; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ ++err: ++ return ret; ++} ++ ++static ssize_t _read_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = _read_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _read_done; ++ return _io_exec(ios); /* In sync mode exec returns the io status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _read_exec(ios); ++} ++ ++/* ++ * write ++ */ ++static ssize_t _write_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, true); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) { ++ /* FIXME: should be based on the OSD's persistence model ++ * See OSD2r05 Section 4.13 Data persistence model */ ++ ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC; ++ status = ios->length; ++ } else { ++ status = ret; ++ } ++ ++ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int 
_write_mirrors(struct objio_state *ios, unsigned cur_comp)
++{	/* Build one OSD write request per mirror of the component group starting at cur_comp. */
++	struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
++	unsigned dev = ios->per_dev[cur_comp].dev;
++	unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1;
++	int ret;
++
++	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
++		struct osd_request *or = NULL;
++		struct pnfs_osd_object_cred *cred =
++					&ios->objio_seg->layout->olo_comps[dev];
++		struct osd_obj_id obj = {
++			.partition = cred->oc_object_id.oid_partition_id,
++			.id = cred->oc_object_id.oid_object_id,
++		};
++		struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
++		struct bio *bio;
++
++		or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
++		if (unlikely(!or)) {
++			ret = -ENOMEM;
++			goto err;
++		}
++		per_dev->or = or; /* stored before any later failure; the error path here does not release it */
++
++		if (per_dev != master_dev) { /* a mirror: clone the master's bio and copy its extent */
++			bio = bio_kmalloc(GFP_KERNEL,
++					  master_dev->bio->bi_max_vecs);
++			if (unlikely(!bio)) {
++				dprintk("Faild to allocate BIO size=%u\n",
++					master_dev->bio->bi_max_vecs);
++				ret = -ENOMEM;
++				goto err;
++			}
++
++			__bio_clone(bio, master_dev->bio);
++			bio->bi_bdev = NULL;
++			bio->bi_next = NULL;
++			per_dev->bio = bio;
++			per_dev->dev = dev;
++			per_dev->length = master_dev->length;
++			per_dev->offset = master_dev->offset;
++		} else { /* the master itself: reuse its bio and mark it as a write */
++			bio = master_dev->bio;
++			/* FIXME: bio_set_dir() */
++			bio->bi_rw |= (1 << BIO_RW);
++		}
++
++		osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
++
++		ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
++		if (ret) {
++			dprintk("%s: Faild to osd_finalize_request() => %d\n",
++				__func__, ret);
++			goto err;
++		}
++
++		dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
++			__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
++			per_dev->length);
++	}
++
++err:	/* also reached on success, with ret == 0 from the last osd_finalize_request() */
++	return ret;
++}
++
++static ssize_t _write_exec(struct objio_state *ios)
++{	/* Prepare writes for every mirror group that has data, then hand off to _io_exec(). */
++	unsigned i;
++	int ret;
++
++	for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
++		if (!ios->per_dev[i].length)
++			continue;
++		ret = _write_mirrors(ios, i);
++		if (unlikely(ret))
++			goto err;
++	}
++
++	ios->done = _write_done;
++	return _io_exec(ios); /* In sync mode exec returns the io->status */
++
++err:
++	_io_free(ios);
++	return ret;
++}
++
++ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
++{	/* Entry point for writes; @stable is currently unused (see TODO below). */
++	struct objio_state *ios = container_of(ol_state, struct objio_state,
++					       ol_state);
++	int ret;
++
++	/* TODO: ios->stable = stable; */
++	ret = _io_rw_pagelist(ios);
++	if (unlikely(ret))
++		return ret;
++
++	return _write_exec(ios);
++}
++
++/*
++ * Policy Operations
++ */
++
++/*
++ * Return the stripe size for the specified file: the largest stripe_unit * data-width over all segments, or -1 if there are none
++ */
++ssize_t
++objlayout_get_stripesize(struct pnfs_layout_type *pnfslay)
++{
++	ssize_t sz, maxsz = -1;
++	struct pnfs_layout_segment *lseg;
++
++	list_for_each_entry(lseg, &pnfslay->segs, fi_list) {
++		int n;
++		struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg);
++		struct pnfs_osd_layout *lo =
++			(struct pnfs_osd_layout *)objlseg->pnfs_osd_layout;
++		struct pnfs_osd_data_map *map = &lo->olo_map;
++
++		n = map->odm_group_width;
++		if (n == 0) /* no grouping: width is the component count divided by copies per stripe unit */
++			n = map->odm_num_comps / (map->odm_mirror_cnt + 1);
++
++		switch (map->odm_raid_algorithm) { /* subtract the devices that do not count as data width */
++		case PNFS_OSD_RAID_0:
++			break;
++
++		case PNFS_OSD_RAID_4:
++		case PNFS_OSD_RAID_5:
++			n -= 1;
++			break;
++
++		case PNFS_OSD_RAID_PQ:
++			n -= 2;
++			break;
++
++		default: /* NOTE(review): an unrecognised algorithm value in the layout map halts the kernel here */
++			BUG_ON(1);
++		}
++		sz = map->odm_stripe_unit * n;
++		if (sz > maxsz)
++			maxsz = sz;
++	}
++	dprintk("%s: Return %Zx\n", __func__, maxsz);
++	return maxsz;
++}
++
++/*
++ * Get the max [rw]size: BIO_MAX_PAGES_KMALLOC pages worth of bytes
++ */
++static ssize_t
++objlayout_get_blocksize(void)
++{
++	ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE;
++
++	return sz;
++}
++
++static struct layoutdriver_policy_operations objlayout_policy_operations = {
++/*
++ * Don't gather across stripes, but rather gather (coalesce) up to
++ * the stripe size.
++ *
++ * FIXME: change interface to use merge_align, merge_count
++ */
++	.flags = PNFS_LAYOUTRET_ON_SETATTR,
++	.get_stripesize = objlayout_get_stripesize,
++	.get_blocksize = objlayout_get_blocksize,
++};
++
++static struct pnfs_layoutdriver_type objlayout_type = {
++	.id = LAYOUT_OSD2_OBJECTS,
++	.name = "LAYOUT_OSD2_OBJECTS",
++	.ld_io_ops = &objlayout_io_operations,
++	.ld_policy_ops = &objlayout_policy_operations,
++};
++
++void *objio_init_mt(void)
++{	/* Allocate the per-mount state; returns ERR_PTR(-ENOMEM) on allocation failure. */
++	struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL);
++
++	if (!omt)
++		return ERR_PTR(-ENOMEM);
++
++	INIT_LIST_HEAD(&omt->dev_list);
++	spin_lock_init(&omt->dev_list_lock);
++	return omt;
++}
++
++void objio_fini_mt(void *mountid)
++{	/* Counterpart of objio_init_mt(): empty the device list, then free the state. */
++	_dev_list_remove_all(mountid);
++	kfree(mountid);
++}
++
++MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
++MODULE_AUTHOR("Benny Halevy ");
++MODULE_LICENSE("GPL");
++
++static int __init
++objlayout_init(void)
++{
++	pnfs_client_ops = pnfs_register_layoutdriver(&objlayout_type); /* NOTE(review): result is stored unchecked - confirm it cannot fail */
++	printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
++	       __func__);
++	return 0;
++}
++
++static void __exit
++objlayout_exit(void)
++{
++	pnfs_unregister_layoutdriver(&objlayout_type);
++	printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
++	       __func__);
++}
++
++module_init(objlayout_init);
++module_exit(objlayout_exit);
+diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c
+--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-23 12:09:03.350491564 -0400
++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-23 12:09:03.350491564 -0400
+@@ -0,0 +1,790 @@
++/*
++ * objlayout.c
++ *
++ * pNFS layout driver for Panasas OSDs
++ *
++ * Copyright (C) 2007-2009 Panasas Inc.
++ * All rights reserved.
++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pnfs_client_operations *pnfs_client_ops; ++ ++/* ++ * Create a objlayout layout structure for the given inode and return it. 
++ */
++static struct pnfs_layout_type *
++objlayout_alloc_layout(struct inode *inode)
++{
++	struct objlayout *objlay;
++
++	objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL);
++	if (objlay) {
++		spin_lock_init(&objlay->lock);
++		INIT_LIST_HEAD(&objlay->err_list);
++	}
++	dprintk("%s: Return %p\n", __func__, objlay);
++	return objlay ? &objlay->pnfs_layout : NULL; /* never form a member address from a NULL objlay */
++}
++
++/*
++ * Free an objlayout layout structure; warns if failed I/O states are still queued on err_list
++ */
++static void
++objlayout_free_layout(struct pnfs_layout_type *lo)
++{
++	struct objlayout *objlay = OBJLAYOUT(lo);
++
++	dprintk("%s: objlay %p\n", __func__, objlay);
++
++	WARN_ON(!list_empty(&objlay->err_list));
++	kfree(objlay);
++}
++
++/*
++ * Unmarshall layout and store it in pnfslay. Returns the new segment, or ERR_PTR(status) on failure.
++ */
++static struct pnfs_layout_segment *
++objlayout_alloc_lseg(struct pnfs_layout_type *pnfslay,
++		     struct nfs4_pnfs_layoutget_res *lgr)
++{
++	int status;
++	void *layout = lgr->layout.buf;
++	struct pnfs_layout_segment *lseg;
++	struct objlayout_segment *objlseg;
++	struct pnfs_osd_layout *pnfs_osd_layout;
++
++	dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout);
++
++	BUG_ON(!layout);
++
++	status = -ENOMEM;
++	lseg = kzalloc(sizeof(*lseg) + sizeof(*objlseg) + /* one allocation: generic lseg + driver data + decoded layout */
++		       pnfs_osd_layout_incore_sz(layout), GFP_KERNEL);
++	if (!lseg)
++		goto err;
++
++	objlseg = LSEG_LD_DATA(lseg);
++	pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout;
++	pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout);
++
++	status = objio_alloc_lseg(&objlseg->internal, pnfslay, lseg,
++				  pnfs_osd_layout);
++	if (status)
++		goto err;
++
++	dprintk("%s: Return %p\n", __func__, lseg);
++	return lseg;
++
++ err:
++	kfree(lseg); /* lseg is NULL when the allocation itself failed */
++	return ERR_PTR(status);
++}
++
++/*
++ * Free a layout segment (a NULL lseg is ignored)
++ */
++static void
++objlayout_free_lseg(struct pnfs_layout_segment *lseg)
++{
++	struct objlayout_segment *objlseg;
++
++	dprintk("%s: freeing layout segment %p\n", __func__, lseg);
++
++	if (unlikely(!lseg))
++		return;
++
++	objlseg = LSEG_LD_DATA(lseg);
++	objio_free_lseg(objlseg->internal);
++	kfree(lseg);
++}
++
++/*
++ * I/O Operations
++ */
++static inline u64
++end_offset(u64 start, u64 len)
++{
++	u64 end;
++
++	end = start + len;
++	return end >= start ? end : NFS4_MAX_UINT64; /* saturate when start + len wraps */
++}
++
++/* last octet in a range */
++static inline u64
++last_byte_offset(u64 start, u64 len)
++{
++	u64 end;
++
++	BUG_ON(!len);
++	end = start + len;
++	return end > start ? end - 1 : NFS4_MAX_UINT64; /* saturate when start + len wraps */
++}
++
++static struct objlayout_io_state *
++objlayout_alloc_io_state(struct pnfs_layout_type *pnfs_layout_type,
++			struct page **pages,
++			unsigned pgbase,
++			unsigned nr_pages,
++			loff_t offset,
++			size_t count,
++			struct pnfs_layout_segment *lseg,
++			void *rpcdata)
++{
++	struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg);
++	struct objlayout_io_state *state;
++	u64 lseg_end_offset;
++	size_t size_nr_pages;
++
++	dprintk("%s: allocating io_state\n", __func__);
++	if (objio_alloc_io_state(objlseg->internal, &state))
++		return NULL;
++
++	BUG_ON(offset < lseg->range.offset);
++	lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length);
++	BUG_ON(offset >= lseg_end_offset);
++	if (offset + count > lseg_end_offset) { /* clamp the request to the end of the segment */
++		count = lseg->range.length - (offset - lseg->range.offset);
++		dprintk("%s: truncated count %Zd\n", __func__, count);
++	}
++
++	if (pgbase >= PAGE_SIZE) { /* was '>': pgbase == PAGE_SIZE must also advance to the next page */
++		unsigned n = pgbase >> PAGE_SHIFT;
++
++		pgbase &= ~PAGE_MASK;
++		pages += n;
++		nr_pages -= n;
++	}
++
++	size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; /* pages actually spanned by the I/O */
++	BUG_ON(nr_pages < size_nr_pages);
++	if (nr_pages > size_nr_pages)
++		nr_pages = size_nr_pages;
++
++	INIT_LIST_HEAD(&state->err_list);
++	state->lseg = lseg;
++	state->rpcdata = rpcdata;
++	state->pages = pages;
++	state->pgbase = pgbase;
++	state->nr_pages = nr_pages;
++	state->offset = offset;
++	state->count = count;
++	state->sync = 0;
++
++	return state;
++}
++
++static void
++objlayout_free_io_state(struct objlayout_io_state *state)
++{
++	dprintk("%s: freeing io_state\n", __func__);
++	if (unlikely(!state))
++		return;
++
++	objio_free_io_state(state);
++}
++
++/*
++ * I/O done common code: free a successful state, or queue a failed one on the layout's err_list for layout-return
++ */
++static void
++objlayout_iodone(struct objlayout_io_state *state)
++{
++	dprintk("%s: state %p status\n", __func__, state);
++
++	if (likely(state->status >= 0)) {
++		objlayout_free_io_state(state);
++	} else {
++		struct objlayout *objlay = OBJLAYOUT(state->lseg->layout);
++
++		spin_lock(&objlay->lock);
++		objlay->delta_space_valid = OBJ_DSU_INVALID;
++		list_add(&state->err_list, &objlay->err_list); /* list_add(new, head): arguments were reversed, which dropped earlier failed states */
++		spin_unlock(&objlay->lock);
++	}
++}
++
++/*
++ * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
++ *
++ * The @index component IO failed (error returned from target). Register
++ * the error for later reporting at layout-return.
++ */
++void
++objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
++			int osd_error, u64 offset, u64 length, bool is_write)
++{
++	struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; /* address only; index is validated just below before any access */
++
++	BUG_ON(index >= state->num_comps);
++	if (osd_error) {
++		struct objlayout_segment *objlseg = LSEG_LD_DATA(state->lseg);
++		struct pnfs_osd_layout *layout =
++			(typeof(layout))objlseg->pnfs_osd_layout;
++
++		ioerr->oer_component = layout->olo_comps[index].oc_object_id;
++		ioerr->oer_comp_offset = offset;
++		ioerr->oer_comp_length = length;
++		ioerr->oer_iswrite = is_write;
++		ioerr->oer_errno = osd_error;
++
++		dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
++			"par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
++			__func__, index, ioerr->oer_errno,
++			ioerr->oer_iswrite,
++			_DEVID_LO(&ioerr->oer_component.oid_device_id),
++			_DEVID_HI(&ioerr->oer_component.oid_device_id),
++			ioerr->oer_component.oid_partition_id,
++			ioerr->oer_component.oid_object_id,
++			ioerr->oer_comp_offset,
++			ioerr->oer_comp_length);
++	} else {
++		/* User need not call if no error is reported */
++		ioerr->oer_errno = 0;
++	}
++}
++
++static void _rpc_commit_complete(struct work_struct *work)
++{	/* Work handler: recover the nfs_write_data from the embedded work item and complete the commit. */
++	struct rpc_task *task;
++	struct nfs_write_data *wdata;
++
++	dprintk("%s enter\n", __func__);
++	task = container_of(work, struct rpc_task, u.tk_work);
++	wdata = container_of(task, struct nfs_write_data, task);
++
++	pnfs_client_ops->nfs_commit_complete(wdata);
++}
++
++/*
++ * Commit data remotely on OSDs (as written, this only schedules the completion callback; @how is unused)
++ */
++enum pnfs_try_status
++objlayout_commit(struct nfs_write_data *wdata, int how)
++{
++	int status = PNFS_ATTEMPTED;
++
++	INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete);
++	schedule_work(&wdata->task.u.tk_work);
++	dprintk("%s: Return %d\n", __func__, status);
++	return status;
++}
++
++/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
++ * This is because the osd completion is called with ints-off from
++ * the block layer
++ */
++static void _rpc_read_complete(struct work_struct *work)
++{
++	struct rpc_task *task;
++	struct nfs_read_data *rdata;
++
++	dprintk("%s enter\n", __func__);
++	task = container_of(work, struct rpc_task, u.tk_work);
++	rdata = container_of(task, struct nfs_read_data, task);
++
++	pnfs_client_ops->nfs_readlist_complete(rdata);
++}
++
++void
++objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
++{	/* Publish the read result into the nfs_read_data, retire the state, then complete (directly if @sync). */
++	int eof = state->eof; /* copied out now: state must not be touched after objlayout_iodone() */
++	struct nfs_read_data *rdata;
++
++	state->status = status;
++	dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
++	rdata = state->rpcdata;
++	rdata->task.tk_status = status;
++	if (status >= 0) {
++		rdata->res.count = status;
++		rdata->res.eof = eof;
++	}
++	objlayout_iodone(state);
++	/* must not use state after this point */
++
++	if (sync)
++		pnfs_client_ops->nfs_readlist_complete(rdata);
++	else {
++		INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
++		schedule_work(&rdata->task.u.tk_work);
++	}
++}
++
++/*
++ * Perform sync or async reads.
++ */
++enum pnfs_try_status
++objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages)
++{	/* Always returns PNFS_ATTEMPTED; the I/O status is reported through rdata->pdata.pnfs_error. */
++	loff_t offset = rdata->args.offset;
++	size_t count = rdata->args.count;
++	struct objlayout_io_state *state;
++	ssize_t status = 0;
++	loff_t eof;
++
++	dprintk("%s: Begin inode %p offset %llu count %d\n",
++		__func__, rdata->inode, offset, (int)count);
++
++	eof = i_size_read(rdata->inode);
++	if (unlikely(offset + count > eof)) { /* request reaches past the cached file size */
++		if (offset >= eof) { /* nothing to read at all: report 0 bytes + EOF without issuing I/O */
++			status = 0;
++			rdata->res.count = 0;
++			rdata->res.eof = 1;
++			goto out; /* NOTE(review): no read-complete callback runs on this path - confirm the caller handles it */
++		}
++		count = eof - offset; /* trim the read to end at EOF */
++	}
++
++	state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
++					 rdata->args.pages, rdata->args.pgbase,
++					 nr_pages, offset, count,
++					 rdata->pdata.lseg, rdata);
++	if (unlikely(!state)) {
++		status = -ENOMEM;
++		goto out;
++	}
++
++	state->eof = state->offset + state->count >= eof; /* uses state->count, which the allocator may have clamped to the segment */
++
++	status = objio_read_pagelist(state);
++ out:
++	dprintk("%s: Return status %Zd\n", __func__, status);
++	rdata->pdata.pnfs_error = status;
++	return PNFS_ATTEMPTED;
++}
++
++/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
++ * This is because the osd completion is called with ints-off from
++ * the block layer
++ */
++static void _rpc_write_complete(struct work_struct *work)
++{
++	struct rpc_task *task;
++	struct nfs_write_data *wdata;
++
++	dprintk("%s enter\n", __func__);
++	task = container_of(work, struct rpc_task, u.tk_work); /* the work item is embedded in the rpc_task ... */
++	wdata = container_of(task, struct nfs_write_data, task); /* ... which is embedded in the nfs_write_data */
++
++	pnfs_client_ops->nfs_writelist_complete(wdata);
++}
++
++void
++objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
++		     bool sync)
++{	/* Publish the write result into the nfs_write_data, retire the state, then complete (directly if @sync). */
++	struct nfs_write_data *wdata;
++
++	dprintk("%s: Begin\n", __func__);
++	wdata = state->rpcdata; /* set by objlayout_alloc_io_state() from its rpcdata argument */
++	state->status = status;
++	wdata->task.tk_status = status;
++	if (status >= 0) { /* success: status is the byte count */
++		wdata->res.count = status;
++		wdata->verf.committed = state->committed;
++		dprintk("%s: Return status %d committed %d\n",
++			__func__, wdata->task.tk_status,
++			wdata->verf.committed);
++	} else
++		dprintk("%s: Return status %d\n",
++			__func__, wdata->task.tk_status);
++	objlayout_iodone(state); /* frees the state on success, queues it on the layout's error list on failure */
++	/* must not use state after this point */
++
++	if (sync)
++		pnfs_client_ops->nfs_writelist_complete(wdata);
++	else { /* defer the completion callback to a work item */
++		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
++		schedule_work(&wdata->task.u.tk_work);
++	}
++}
++
++/*
++ * Perform sync or async writes.
++ */
++enum pnfs_try_status
++objlayout_write_pagelist(struct nfs_write_data *wdata,
++			 unsigned nr_pages,
++			 int how)
++{	/* Always returns PNFS_ATTEMPTED; the I/O status is reported through wdata->pdata.pnfs_error. */
++	struct objlayout_io_state *state;
++	ssize_t status;
++
++	dprintk("%s: Begin inode %p offset %llu count %u\n",
++		__func__, wdata->inode, wdata->args.offset, wdata->args.count);
++
++	state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
++					 wdata->args.pages,
++					 wdata->args.pgbase,
++					 nr_pages,
++					 wdata->args.offset,
++					 wdata->args.count,
++					 wdata->pdata.lseg, wdata);
++	if (unlikely(!state)) {
++		status = -ENOMEM;
++		goto out;
++	}
++
++	state->sync = how & FLUSH_SYNC;
++
++	status = objio_write_pagelist(state, how & FLUSH_STABLE);
++ out:
++	dprintk("%s: Return status %Zd\n", __func__, status);
++	wdata->pdata.pnfs_error = status;
++	return PNFS_ATTEMPTED;
++}
++
++void
++objlayout_encode_layoutcommit(struct pnfs_layout_type *pnfslay,
++			      struct xdr_stream *xdr,
++			      const struct pnfs_layoutcommit_arg *args)
++{	/* Encode the layoutupdate body, prefixed by its length in bytes; resets the delta-space accounting. */
++	struct objlayout *objlay = OBJLAYOUT(pnfslay);
++	struct pnfs_osd_layoutupdate lou;
++	__be32 *start;
++
++	dprintk("%s: Begin\n", __func__);
++
++	spin_lock(&objlay->lock); /* snapshot and reset the counters under the layout lock */
++	lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
++	lou.dsu_delta = objlay->delta_space_used;
++	objlay->delta_space_used = 0;
++	objlay->delta_space_valid = OBJ_DSU_INIT;
++	lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
++	spin_unlock(&objlay->lock);
++
++	start = xdr_reserve_space(xdr, 4); /* placeholder for the length word, filled in below */
++	BUG_ON(!start); /* same check as objlayout_encode_layoutreturn(); start is dereferenced below */
++	BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
++
++	*start = cpu_to_be32((xdr->p - start - 1) * 4); /* words encoded after the placeholder, in bytes */
++
++	dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
++		lou.dsu_delta, lou.olu_ioerr_flag);
++}
++
++static int
++err_prio(u32 oer_errno)
++{	/* Map a PNFS_OSD_ERR_* code to its OSD_ERR_PRI_* rank; unknown codes warn and rank as EIO. */
++	switch (oer_errno) {
++	case 0:
++		return 0;
++
++	case PNFS_OSD_ERR_RESOURCE:
++		return OSD_ERR_PRI_RESOURCE;
++	case PNFS_OSD_ERR_BAD_CRED:
++		return OSD_ERR_PRI_BAD_CRED;
++	case PNFS_OSD_ERR_NO_ACCESS:
++		return OSD_ERR_PRI_NO_ACCESS;
++	case PNFS_OSD_ERR_UNREACHABLE:
++		return OSD_ERR_PRI_UNREACHABLE;
++	case PNFS_OSD_ERR_NOT_FOUND:
++		return OSD_ERR_PRI_NOT_FOUND;
++	case PNFS_OSD_ERR_NO_SPACE:
++		return OSD_ERR_PRI_NO_SPACE;
++	default:
++		WARN_ON(1);
++		/* fallthrough */
++	case PNFS_OSD_ERR_EIO:
++		return OSD_ERR_PRI_EIO;
++	}
++}
++
++static void
++merge_ioerr(struct pnfs_osd_ioerr *dest_err,
++	    const struct pnfs_osd_ioerr *src_err)
++{	/* Fold src_err into the accumulated dest_err: union of the byte ranges, blanked ids where they differ. */
++	u64 dest_end, src_end;
++
++	if (!dest_err->oer_errno) { /* first error seen: start from a copy of it */
++		*dest_err = *src_err;
++		/* accumulated device must be blank */
++		memset(&dest_err->oer_component.oid_device_id, 0,
++			sizeof(dest_err->oer_component.oid_device_id));
++
++		return;
++	}
++
++	if (dest_err->oer_component.oid_partition_id !=
++	    src_err->oer_component.oid_partition_id)
++		dest_err->oer_component.oid_partition_id = 0;
++
++	if (dest_err->oer_component.oid_object_id !=
++	    src_err->oer_component.oid_object_id)
++		dest_err->oer_component.oid_object_id = 0;
++
++	dest_end = end_offset(dest_err->oer_comp_offset, /* take both ends BEFORE moving the start, else the old end is lost */
++			      dest_err->oer_comp_length);
++	src_end = end_offset(src_err->oer_comp_offset,
++			     src_err->oer_comp_length);
++	if (dest_end < src_end)
++		dest_end = src_end;
++
++	if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
++		dest_err->oer_comp_offset = src_err->oer_comp_offset;
++
++	dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
++
++	if ((src_err->oer_iswrite == dest_err->oer_iswrite) && /* same direction: keep the higher-priority errno */
++	    (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
++			dest_err->oer_errno = src_err->oer_errno;
++	} else if (src_err->oer_iswrite && !dest_err->oer_iswrite) { /* a write error supersedes an accumulated read error only */
++		dest_err->oer_iswrite = true;
++		dest_err->oer_errno = src_err->oer_errno;
++	}
++}
++
++static void
++encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr)
++{	/* Merge every error still queued on objlay->err_list into one descriptor, encode it, and free the states. */
++	struct objlayout_io_state *state, *tmp;
++	struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
++
++	list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
++		unsigned i;
++
++		for (i = 0; i < state->num_comps; i++) {
++			struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
++
++			if (!ioerr->oer_errno)
++				continue;
++
++			printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
++				"dev(%llx:%llx) par=0x%llx obj=0x%llx "
++				"offset=0x%llx length=0x%llx\n",
++				__func__, i, ioerr->oer_errno,
++				ioerr->oer_iswrite,
++				_DEVID_LO(&ioerr->oer_component.oid_device_id),
++				_DEVID_HI(&ioerr->oer_component.oid_device_id),
++				ioerr->oer_component.oid_partition_id,
++				ioerr->oer_component.oid_object_id,
++				ioerr->oer_comp_offset,
++				ioerr->oer_comp_length);
++
++			merge_ioerr(&accumulated_err, ioerr);
++		}
++		list_del(&state->err_list);
++		objlayout_free_io_state(state);
++	}
++
++	BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err));
++}
++
++void
++objlayout_encode_layoutreturn(struct pnfs_layout_type *pnfslay,
++			      struct xdr_stream *xdr,
++			      const struct nfs4_pnfs_layoutreturn_arg *args)
++{	/* Encode the queued I/O error descriptors, prefixed by their total length in bytes, and free the states. */
++	struct objlayout *objlay = OBJLAYOUT(pnfslay);
++	struct objlayout_io_state *state, *tmp;
++	__be32 *start, *uninitialized_var(last_xdr);
++
++	dprintk("%s: Begin\n", __func__);
++	start = xdr_reserve_space(xdr, 4); /* placeholder for the length word, filled in at the end */
++	BUG_ON(!start);
++
++	spin_lock(&objlay->lock);
++
++	list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
++		unsigned i;
++		int res = 0;
++
++		for (i = 0; i < state->num_comps && !res; i++) { /* stops at the first descriptor that fails to encode */
++			struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
++
++			if (!ioerr->oer_errno)
++				continue;
++
++			dprintk("%s: err[%d]: errno=%d is_write=%d "
++				"dev(%llx:%llx) par=0x%llx obj=0x%llx "
++				"offset=0x%llx length=0x%llx\n",
++				__func__, i, ioerr->oer_errno,
++				ioerr->oer_iswrite,
++				_DEVID_LO(&ioerr->oer_component.oid_device_id),
++				_DEVID_HI(&ioerr->oer_component.oid_device_id),
++				ioerr->oer_component.oid_partition_id,
++				ioerr->oer_component.oid_object_id,
++				ioerr->oer_comp_offset,
++				ioerr->oer_comp_length);
++
++			last_xdr = xdr->p; /* stream position before this descriptor, used for the rewind below */
++			res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]);
++		}
++		if (unlikely(res)) {
++			/* no space for even one error descriptor */
++			BUG_ON(last_xdr == start + 1);
++
++			/* we've encountered a situation with lots and lots of
++			 * errors and no space to encode them all. Use the last
++			 * available slot to report the union of all the
++			 * remaining errors.
++			 */
++			xdr_rewind_stream(xdr, last_xdr -
++					  pnfs_osd_ioerr_xdr_sz() / 4);
++			encode_accumulated_error(objlay, xdr);
++			goto loop_done; /* the helper emptied err_list, so the iteration must not continue */
++		}
++		list_del(&state->err_list);
++		objlayout_free_io_state(state);
++	}
++loop_done:
++	spin_unlock(&objlay->lock);
++
++	*start = cpu_to_be32((xdr->p - start - 1) * 4);
++	dprintk("%s: Return\n", __func__);
++}
++
++struct objlayout_deviceinfo {
++	struct page *page; /* page that received the getdeviceinfo reply; freed in objlayout_put_deviceinfo() */
++	struct pnfs_osd_deviceaddr da; /* This must be last */
++};
++
++/* Initialize and call nfs_getdeviceinfo, then decode and return a
++ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
++ * should be called.
++ */
++int objlayout_get_deviceinfo(struct pnfs_layout_type *pnfslay,
++	struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr)
++{
++	struct objlayout_deviceinfo *odi;
++	struct pnfs_device pd;
++	struct super_block *sb;
++	struct page *page;
++	size_t sz;
++	u32 *p;
++	int err;
++
++	page = alloc_page(GFP_KERNEL);
++	if (!page)
++		return -ENOMEM;
++
++	pd.area = page_address(page);
++
++	memcpy(&pd.dev_id, d_id, sizeof(*d_id));
++	pd.layout_type = LAYOUT_OSD2_OBJECTS;
++	pd.dev_notify_types = 0;
++	pd.pages = &page;
++	pd.pgbase = 0;
++	pd.pglen = PAGE_SIZE;
++	pd.mincount = 0;
++
++	sb = PNFS_INODE(pnfslay)->i_sb; /* NOTE(review): sb is assigned but not used afterwards in this function */
++	err = pnfs_client_ops->nfs_getdeviceinfo(PNFS_NFS_SERVER(pnfslay), &pd);
++	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
++	if (err)
++		goto err_out;
++
++	p = pd.area;
++	sz = pnfs_osd_xdr_deviceaddr_incore_sz(p);
++	odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL); /* wrapper header plus the decoded size of da */
++	if (!odi) {
++		err = -ENOMEM;
++		goto err_out;
++	}
++	pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
++	odi->page = page; /* the page stays allocated until objlayout_put_deviceinfo() */
++	*deviceaddr = &odi->da;
++	return 0;
++
++err_out:
++	__free_page(page);
++	return err;
++}
++
++void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
++{	/* Release what objlayout_get_deviceinfo() returned: the reply page and the wrapper. */
++	struct objlayout_deviceinfo *odi = container_of(deviceaddr,
++						struct objlayout_deviceinfo,
++						da);
++
++	__free_page(odi->page);
++	kfree(odi);
++}
++
++/*
++ * Initialize a mountpoint by retrieving the list of
++ * available devices for it.
++ * Return the pnfs_mount_type structure so the
++ * pNFS_client can refer to the mount point later on.
++ */
++static int
++objlayout_initialize_mountpoint(struct nfs_server *server,
++				const struct nfs_fh *mntfh)
++{	/* As written: stores the objio_init_mt() result in server->pnfs_ld_data and returns 0 or its error. */
++	void *data;
++
++	data = objio_init_mt();
++	if (IS_ERR(data)) {
++		printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n",
++		       __func__, PTR_ERR(data));
++		return PTR_ERR(data);
++	}
++	server->pnfs_ld_data = data;
++
++	dprintk("%s: Return data=%p\n", __func__, data);
++	return 0;
++}
++
++/*
++ * Uninitialize a mountpoint
++ */
++static int
++objlayout_uninitialize_mountpoint(struct nfs_server *server)
++{
++	dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data);
++	objio_fini_mt(server->pnfs_ld_data);
++	return 0;
++}
++
++struct layoutdriver_io_operations objlayout_io_operations = {
++	.commit = objlayout_commit,
++	.read_pagelist = objlayout_read_pagelist,
++	.write_pagelist = objlayout_write_pagelist,
++	.alloc_layout = objlayout_alloc_layout,
++	.free_layout = objlayout_free_layout,
++	.alloc_lseg = objlayout_alloc_lseg,
++	.free_lseg = objlayout_free_lseg,
++	.encode_layoutcommit = objlayout_encode_layoutcommit,
++	.encode_layoutreturn = objlayout_encode_layoutreturn,
++	.initialize_mountpoint = objlayout_initialize_mountpoint,
++	.uninitialize_mountpoint = objlayout_uninitialize_mountpoint,
++};
+diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h
+--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-23 12:09:03.351434439 -0400
++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-23 12:09:03.351434439 -0400
+@@ -0,0 +1,171 @@
++/*
++ *
objlayout.h
++ *
++ * Data types and function declarations for interfacing with the
++ * pNFS standard object layout driver.
++ *
++ * Copyright (C) 2007-2009 Panasas Inc.
++ * All rights reserved.
++ *
++ * Benny Halevy
++ * Boaz Harrosh
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2
++ * See the file COPYING included with this distribution for more details.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ *    notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ *    notice, this list of conditions and the following disclaimer in the
++ *    documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the Panasas company nor the names of its
++ *    contributors may be used to endorse or promote products derived
++ *    from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#ifndef _OBJLAYOUT_H
++#define _OBJLAYOUT_H
++
++#include
++#include
++#include
++
++/*
++ * in-core layout segment: an I/O-engine handle followed by the decoded pnfs_osd_layout bytes
++ */
++struct objlayout_segment {
++	void *internal;    /* for provider internal use */
++	u8 pnfs_osd_layout[]; /* flexible array; users cast it to struct pnfs_osd_layout * */
++};
++
++/*
++ * per-inode layout
++ */
++struct objlayout {
++	struct pnfs_layout_type pnfs_layout; /* embedded generic layout; OBJLAYOUT() maps back from it */
++
++	 /* for layout_commit */
++	enum osd_delta_space_valid_enum {
++		OBJ_DSU_INIT = 0,
++		OBJ_DSU_VALID,
++		OBJ_DSU_INVALID,
++	} delta_space_valid;
++	s64 delta_space_used;  /* consumed by write ops */
++
++	 /* for layout_return */
++	spinlock_t lock; /* taken around err_list and delta_space_* updates */
++	struct list_head err_list; /* failed objlayout_io_state entries awaiting layout-return */
++};
++
++static inline struct objlayout *
++OBJLAYOUT(struct pnfs_layout_type *lo)
++{
++	return container_of(lo, struct objlayout, pnfs_layout);
++}
++
++/*
++ * per-I/O operation state
++ * embedded in objects provider io_state data structure
++ */
++struct objlayout_io_state {
++	struct pnfs_layout_segment *lseg;
++
++	struct page **pages;
++	unsigned pgbase;
++	unsigned nr_pages;
++	unsigned long count;
++	loff_t offset;
++	bool sync; /* when set, the *_done() handlers call the completion directly instead of deferring it */
++
++	void *rpcdata; /* the nfs_read_data / nfs_write_data this I/O belongs to */
++	int status;     /* res */
++	int eof;        /* res */
++	int committed;  /* res */
++
++	/* Error reporting (layout_return) */
++	struct list_head err_list;
++	unsigned num_comps;
++	/* Pointer to array of error descriptors of size num_comps.
++	 * It should contain as many entries as devices in the osd_layout
++	 * that participate in the I/O. It is up to the io_engine to allocate
++	 * needed space and set num_comps.
++	 */
++	struct pnfs_osd_ioerr *ioerrs;
++};
++
++/*
++ * Raid engine I/O API
++ */
++extern void *objio_init_mt(void);
++extern void objio_fini_mt(void *mt);
++
++extern int objio_alloc_lseg(void **outp,
++	struct pnfs_layout_type *pnfslay,
++	struct pnfs_layout_segment *lseg,
++	struct pnfs_osd_layout *layout);
++extern void objio_free_lseg(void *p);
++
++extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp);
++extern void objio_free_io_state(struct objlayout_io_state *state);
++
++extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
++extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
++				    bool stable);
++
++/*
++ * callback API
++ */
++extern void objlayout_io_set_result(struct objlayout_io_state *state,
++			unsigned index, int osd_error,
++			u64 offset, u64 length, bool is_write);
++
++static inline void
++objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
++{
++	struct objlayout *objlay = OBJLAYOUT(state->lseg->layout);
++
++	/* If one of the I/Os errored out and the delta_space_used was
++	 * invalid we render the complete report as invalid. Protocol mandate
++	 * the DSU be accurate or not reported.
++	 */
++	spin_lock(&objlay->lock);
++	if (objlay->delta_space_valid != OBJ_DSU_INVALID) { /* once INVALID, further deltas are ignored here */
++		objlay->delta_space_valid = OBJ_DSU_VALID;
++		objlay->delta_space_used += space_used;
++	}
++	spin_unlock(&objlay->lock);
++}
++
++extern void objlayout_read_done(struct objlayout_io_state *state,
++				ssize_t status, bool sync);
++extern void objlayout_write_done(struct objlayout_io_state *state,
++				 ssize_t status, bool sync);
++
++extern int objlayout_get_deviceinfo(struct pnfs_layout_type *pnfslay,
++	struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr);
++extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
++
++/*
++ * exported generic objects function vectors
++ */
++extern struct layoutdriver_io_operations objlayout_io_operations;
++extern struct pnfs_client_operations *pnfs_client_ops;
++
++#endif /* _OBJLAYOUT_H */
+diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c
+--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-23 12:09:03.352501716 -0400
++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-23 12:09:03.352501716 -0400
+@@ -0,0 +1,734 @@
++/*
++ * panfs_shim.c
++ *
++ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack
++ *
++ * Copyright (C) 2007-2009 Panasas Inc.
++ * All rights reserved.
++ *
++ * Benny Halevy
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ *    notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ *    notice, this list of conditions and the following disclaimer in the
++ *    documentation and/or other materials provided with the distribution.
++ * 3.
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "objlayout.h" ++#include "panfs_shim.h" ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct panfs_export_operations *panfs_export_ops; ++ ++void * ++objio_init_mt(void) ++{ ++ return panfs_export_ops == NULL ? 
ERR_PTR(-EAGAIN) : NULL; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++} ++ ++static int ++panfs_shim_conv_raid01(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) { ++ hdr->type = PAN_AGG_RAID1; ++ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1; ++ } else if (layout->olo_num_comps > 1) { ++ hdr->type = PAN_AGG_RAID0; ++ hdr->hdr.raid0.num_comps = layout->olo_num_comps; ++ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit; ++ } else ++ hdr->type = PAN_AGG_SIMPLE; ++ return 0; ++} ++ ++static int ++panfs_shim_conv_raid5(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) ++ goto err; ++ ++ if (lo_map->odm_group_width || lo_map->odm_group_depth) { ++ if (!lo_map->odm_group_width || !lo_map->odm_group_depth) ++ goto err; ++ ++ hdr->type = PAN_AGG_GRP_RAID5_LEFT; ++ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit; ++ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width; ++ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth; ++ /* this is a guess, panasas server is not supposed to ++ hand out layout otherwise */ ++ hdr->hdr.grp_raid5_left.group_layout_policy = ++ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN; ++ } else { ++ hdr->type = PAN_AGG_RAID5_LEFT; ++ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.raid5_left.stripe_unit2 = ++ hdr->hdr.raid5_left.stripe_unit1 = ++ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit; ++ } ++ ++ return 0; ++err: ++ return -EINVAL; ++} ++ ++/* ++ * Convert a pnfs_osd data map into Panasas aggregation layout header ++ */ ++static int ++panfs_shim_conv_pnfs_osd_data_map( ++ struct pnfs_osd_layout 
*layout, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ int status = -EINVAL; ++ struct pnfs_osd_data_map *lo_map = &layout->olo_map; ++ ++ if (!layout->olo_num_comps) { ++ dprintk("%s: !!layout.n_comps(%u)\n", __func__, ++ layout->olo_num_comps); ++ goto err; ++ } ++ ++ switch (lo_map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_0 " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid01(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_5: ++ if (!lo_map->odm_group_width) { ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width " ++ "layout.n_comps(%u)!=map.n_comps(%u) " ++ "|| comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ } else if ((layout->olo_num_comps != lo_map->odm_num_comps && ++ layout->olo_num_comps > lo_map->odm_group_width) || ++ (layout->olo_comps_index % lo_map->odm_group_width)){ ++ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ lo_map->odm_group_width, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid5(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_PQ: ++ default: ++ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__, ++ lo_map->odm_raid_algorithm); ++ goto err; ++ } ++ ++ return status; /* 0 on success, else the converter's error */ ++ ++err: ++ return status; ++} ++ ++/* ++ * Convert pnfs_osd layout into Panasas map and caps type ++ */ ++int ++objio_alloc_lseg(void **outp, ++ struct pnfs_layout_type *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) 
++{ ++ int i, total_comps; ++ int status; ++ struct pnfs_osd_object_cred *lo_comp; ++ pan_size_t alloc_sz, local_sz; ++ pan_sm_map_cap_t *mcs = NULL; ++ u8 *buf; ++ pan_agg_comp_obj_t *pan_comp; ++ pan_sm_sec_t *pan_sec; ++ ++ status = -EINVAL; ++ if (layout->olo_num_comps < layout->olo_map.odm_group_width) { ++ total_comps = layout->olo_comps_index + layout->olo_num_comps; ++ } else { ++ /* allocate full map, otherwise SAM gets confused */ ++ total_comps = layout->olo_map.odm_num_comps; ++ } ++ alloc_sz = total_comps * ++ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t)); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p = layout->olo_comps[i].oc_cap.cred; ++ if (panfs_export_ops->sm_sec_t_get_size_otw( ++ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL)) ++ goto err; ++ alloc_sz += local_sz; ++ } ++ ++ status = -ENOMEM; ++ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL); ++ if (!mcs) ++ goto err; ++ buf = (u8 *)&mcs[1]; ++ ++ mcs->offset = lseg->range.offset; ++ mcs->length = lseg->range.length; ++#if 0 ++ /* FIXME: for now */ ++ mcs->expiration_time.ts_sec = 0; ++ mcs->expiration_time.ts_nsec = 0; ++#endif ++ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL; ++ status = panfs_shim_conv_pnfs_osd_data_map(layout, ++ &mcs->full_map.layout_hdr); ++ if (status) ++ goto err; ++ ++ mcs->full_map.components.size = total_comps; ++ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf; ++ buf += total_comps * sizeof(pan_agg_comp_obj_t); ++ ++ mcs->secs.size = total_comps; ++ mcs->secs.data = (pan_sm_sec_t *)buf; ++ buf += total_comps * sizeof(pan_sm_sec_t); ++ ++ lo_comp = layout->olo_comps; ++ pan_comp = mcs->full_map.components.data + layout->olo_comps_index; ++ pan_sec = mcs->secs.data + layout->olo_comps_index; ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p; ++ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id; ++ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id; ++ u64 dev_id = __be64_to_cpup( ++ (__be64 
*)oc_obj_id->oid_device_id.data + 1); ++ ++ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n", ++ __func__, i, ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data), ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1), ++ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id); ++ ++ if (i == 0) { ++ /* make up mgr_id to calm sam down */ ++ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0, ++ &obj_id->dev_id); ++ obj_id->grp_id = oc_obj_id->oid_partition_id; ++ obj_id->obj_id = oc_obj_id->oid_object_id; ++ } ++ ++ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) { ++ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n", ++ __func__, i, (u64)obj_id->grp_id, ++ lo_comp->oc_object_id.oid_partition_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) { ++ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n", ++ __func__, i, obj_id->obj_id, ++ lo_comp->oc_object_id.oid_object_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ pan_comp->dev_id = dev_id; ++ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) { ++ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n", ++ __func__, i, pan_comp->dev_id); ++ status = -EINVAL; ++ goto err; ++ } ++ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) { ++ dprintk("%s: degraded maps not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL; ++ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) { ++ dprintk("%s: cap key security not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ ++ p = lo_comp->oc_cap.cred; ++ panfs_export_ops->sm_sec_t_unmarshall( ++ (pan_sm_sec_otw_t *)&p, ++ pan_sec, ++ buf, ++ alloc_sz, ++ NULL, ++ &local_sz); ++ buf += local_sz; ++ alloc_sz -= local_sz; ++ ++ lo_comp++; ++ pan_comp++; ++ pan_sec++; ++ } ++ ++ *outp = mcs; ++ dprintk("%s:Return mcs=%p\n", __func__, mcs); ++ return 0; ++ ++err: ++ 
objio_free_lseg(mcs); ++ dprintk("%s:Error %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Free a Panasas map and caps type ++ */ ++void ++objio_free_lseg(void *p) ++{ ++ kfree(p); ++} ++ ++/* ++ * I/O routines ++ */ ++int ++objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct panfs_shim_io_state *p; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ p = kzalloc(sizeof(*p), GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ *outp = &p->ol_state; ++ return 0; ++} ++ ++/* ++ * Free an I/O state ++ */ ++void ++objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ int i; ++ ++ dprintk("%s: freeing io_state\n", __func__); ++ for (i = 0; i < state->ol_state.nr_pages; i++) ++ kunmap(state->ol_state.pages[i]); ++ ++ if (state->ucreds) ++ panfs_export_ops->ucreds_put(state->ucreds); ++ kfree(state->sg_list); ++ kfree(state); ++} ++ ++static int ++panfs_shim_pages_to_sg( ++ struct panfs_shim_io_state *state, ++ struct page **pages, ++ unsigned int pgbase, ++ unsigned nr_pages, ++ size_t count) ++{ ++ unsigned i, n; ++ pan_sg_entry_t *sg; ++ ++ dprintk("%s pgbase %u nr_pages %u count %d " ++ "pg0 %p flags 0x%x index %llu\n", ++ __func__, pgbase, nr_pages, (int)count, pages[0], ++ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index); ++ ++ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL); ++ if (sg == NULL) ++ return -ENOMEM; ++ ++ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n", ++ __func__, sg, pages, pgbase, nr_pages); ++ ++ for (i = 0; i < nr_pages; i++) { ++ sg[i].buffer = (char *)kmap(pages[i]) + pgbase; ++ n = PAGE_SIZE - pgbase; ++ pgbase = 0; ++ if (n > count) ++ n = count; ++ sg[i].chunk_size = n; ++ count -= n; ++ if (likely(count)) { ++ sg[i].next = &sg[i+1]; ++ } else { ++ /* we're done */ ++ sg[i].next = NULL; ++ break; ++ } ++ } ++ BUG_ON(count); ++ ++ state->sg_list = sg; ++ return 0; 
++} ++ ++/* ++ * Callback function for async reads ++ */ ++static void ++panfs_shim_read_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.read.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++ status = res_p->length; ++ WARN_ON(status < 0); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_read rc %d: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_read_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.read.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP, ++ &state->u.read.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? 
++ NULL : panfs_shim_read_done, ++ state, NULL, ++ &state->u.read.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_read_done(state, NULL, &state->u.read.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Callback function for async writes ++ */ ++static void ++panfs_shim_write_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.write.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++/* state->ol_state.committed = NFS_FILE_SYNC;*/ ++ state->ol_state.committed = NFS_UNSTABLE; ++ status = res_p->length; ++ WARN_ON(status < 0); ++ ++ objlayout_add_delta_space_used(&state->ol_state, ++ res_p->delta_capacity_used); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_write rc %u: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_write_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable /* unused, PanOSD writes are stable */) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ 
state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.write.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE, ++ &state->u.write.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? ++ NULL : panfs_shim_write_done, ++ state, ++ NULL, ++ &state->u.write.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_write_done(state, NULL, &state->u.write.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++int ++panfs_shim_register(struct panfs_export_operations *ops) ++{ ++ if (panfs_export_ops) { ++ printk(KERN_INFO ++ "%s: panfs already registered (panfs ops %p)\n", ++ __func__, panfs_export_ops); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: registering panfs ops %p\n", ++ __func__, ops); ++ ++ panfs_export_ops = ops; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_register); ++ ++int ++panfs_shim_unregister(void) ++{ ++ if (!panfs_export_ops) { ++ printk(KERN_INFO "%s: panfs is not registered\n", __func__); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: unregistering panfs ops %p\n", ++ __func__, panfs_export_ops); ++ ++ panfs_export_ops = NULL; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_unregister); ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++panlayout_get_stripesize(struct pnfs_layout_type *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *panlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)panlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: 
++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ n *= 8; /* FIXME: until we have 2-D coalescing */ ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zd\n", __func__, maxsz); ++ return maxsz; ++} ++ ++#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024) ++#define PANLAYOUT_DEF_STRIPE_WIDTH 9 ++#define PANLAYOUT_MAX_STRIPE_WIDTH 11 ++#define PANLAYOUT_MAX_GATHER_STRIPES 8 ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++panlayout_get_blocksize(void) ++{ ++ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) * ++ PANLAYOUT_DEF_STRIPE_UNIT * ++ PANLAYOUT_MAX_GATHER_STRIPES; ++ dprintk("%s: Return %Zd\n", __func__, sz); ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations panlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. ++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = panlayout_get_stripesize, ++ .get_blocksize = panlayout_get_blocksize, ++}; ++ ++#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS) ++ ++static struct pnfs_layoutdriver_type panlayout_type = { ++ .id = PNFS_LAYOUT_PANOSD, ++ .name = "PNFS_LAYOUT_PANOSD", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &panlayout_policy_operations, ++}; ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++panlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++panlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++} ++ 
++module_init(panlayout_init); ++module_exit(panlayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-23 12:09:03.353501685 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-23 12:09:03.353501685 -0400 +@@ -0,0 +1,482 @@ ++/* ++ * panfs_shim.h ++ * ++ * Data types and external function declarations for interfacing with ++ * panfs (Panasas DirectFlow) I/O stack ++ * ++ * Copyright (C) 2007 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#ifndef _PANLAYOUT_PANFS_SHIM_H ++#define _PANLAYOUT_PANFS_SHIM_H ++ ++typedef s8 pan_int8_t; ++typedef u8 pan_uint8_t; ++typedef s16 pan_int16_t; ++typedef u16 pan_uint16_t; ++typedef s32 pan_int32_t; ++typedef u32 pan_uint32_t; ++typedef s64 pan_int64_t; ++typedef u64 pan_uint64_t; ++ ++/* ++ * from pan_base_types.h ++ */ ++typedef pan_uint64_t pan_rpc_none_t; ++typedef pan_uint32_t pan_rpc_arrdim_t; ++typedef pan_uint32_t pan_status_t; ++typedef pan_uint8_t pan_otw_t; ++typedef pan_uint8_t pan_pad_t; ++ ++typedef pan_uint32_t pan_timespec_sec_t; ++typedef pan_uint32_t pan_timespec_nsec_t; ++ ++typedef struct pan_timespec_s pan_timespec_t; ++struct pan_timespec_s { ++ pan_timespec_sec_t ts_sec; ++ pan_timespec_nsec_t ts_nsec; ++}; ++ ++/* ++ * from pan_std_types.h ++ */ ++typedef pan_uint32_t pan_size_t; ++typedef int pan_bool_t; ++ ++/* ++ * from pan_common_error.h ++ */ ++#define PAN_SUCCESS ((pan_status_t)0) ++#define PAN_ERR_IN_PROGRESS ((pan_status_t)55) ++ ++/* ++ * from pan_sg.h ++ */ ++typedef struct pan_sg_entry_s pan_sg_entry_t; ++struct pan_sg_entry_s { ++ void *buffer; /* pointer to memory */ ++ pan_uint32_t chunk_size; /* size of each chunk (bytes) */ ++ pan_sg_entry_t *next; ++}; ++ ++/* ++ * from pan_storage.h ++ */ ++typedef pan_uint64_t pan_stor_dev_id_t; ++typedef pan_uint32_t pan_stor_obj_grp_id_t; ++typedef 
pan_uint64_t pan_stor_obj_uniq_t; ++typedef pan_uint32_t pan_stor_action_t; ++typedef pan_uint8_t pan_stor_cap_key_t[20]; ++ ++typedef pan_uint8_t pan_stor_key_type_t; ++typedef pan_uint64_t pan_stor_len_t; ++typedef pan_int64_t pan_stor_delta_len_t; ++typedef pan_uint64_t pan_stor_offset_t; ++typedef pan_uint16_t pan_stor_op_t; ++ ++typedef pan_uint16_t pan_stor_sec_level_t; ++ ++struct pan_stor_obj_id_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_obj_uniq_t obj_id; ++ pan_stor_obj_grp_id_t grp_id; ++}; ++ ++typedef struct pan_stor_obj_id_s pan_stor_obj_id_t; ++ ++#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U) ++#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U) ++#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U) ++#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U) ++#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U) ++#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U) ++#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U) ++#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U) ++ ++/* ++ * from pan_aggregation_map.h ++ */ ++typedef pan_uint8_t pan_agg_type_t; ++typedef pan_uint64_t pan_agg_map_version_t; ++typedef pan_uint8_t pan_agg_obj_state_t; ++typedef pan_uint8_t pan_agg_comp_state_t; ++typedef pan_uint8_t pan_agg_comp_flag_t; ++ ++#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00) ++#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01) ++#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02) ++#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03) ++#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04) ++#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05) ++#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06) ++#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07) ++#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00) ++#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01) ++#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02) ++#define 
PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03) ++#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00) ++#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01) ++#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02) ++#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04) ++ ++struct pan_aggregation_map_s { ++ pan_agg_map_version_t version; ++ pan_agg_obj_state_t avail_state; ++ pan_stor_obj_id_t obj_id; ++}; ++ ++typedef struct pan_aggregation_map_s pan_aggregation_map_t; ++ ++struct pan_agg_comp_obj_s { ++ pan_stor_dev_id_t dev_id; ++ pan_agg_comp_state_t avail_state; ++ pan_agg_comp_flag_t comp_flags; ++}; ++ ++typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t; ++ ++struct pan_agg_simple_header_s { ++ pan_uint8_t unused; ++}; ++ ++typedef struct pan_agg_simple_header_s pan_agg_simple_header_t; ++ ++struct pan_agg_raid1_header_s { ++ pan_uint16_t num_comps; ++}; ++ ++typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t; ++ ++struct pan_agg_raid0_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++}; ++ ++typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t; ++ ++struct pan_agg_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit0; ++ pan_uint32_t stripe_unit1; ++ pan_uint32_t stripe_unit2; ++}; ++ ++typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t; ++ ++typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t; ++ ++struct pan_agg_grp_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++ pan_uint16_t rg_width; ++ pan_uint16_t rg_depth; ++ pan_uint8_t group_layout_policy; ++}; ++ ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00) ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01) ++ ++#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00) ++#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02) ++#define PAN_AGG_RAID0 
((pan_agg_type_t) 0x03) ++#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04) ++#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06) ++#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06) ++ ++struct pan_agg_layout_hdr_s { ++ pan_agg_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_uint64_t null; ++ pan_agg_simple_header_t simple; ++ pan_agg_raid1_header_t raid1; ++ pan_agg_raid0_header_t raid0; ++ pan_agg_raid5_left_header_t raid5_left; ++ pan_agg_grp_raid5_left_header_t grp_raid5_left; ++ } hdr; ++}; ++ ++typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t; ++ ++struct pan_agg_comp_obj_a_s { ++ pan_rpc_arrdim_t size; ++ pan_agg_comp_obj_t *data; ++}; ++typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a; ++ ++struct pan_agg_full_map_s { ++ pan_aggregation_map_t map_hdr; ++ pan_agg_layout_hdr_t layout_hdr; ++ pan_agg_comp_obj_a components; ++}; ++ ++typedef struct pan_agg_full_map_s pan_agg_full_map_t; ++ ++/* ++ * from pan_obsd_rpc_types.h ++ */ ++typedef pan_uint8_t pan_obsd_security_key_a[16]; ++ ++typedef pan_uint8_t pan_obsd_capability_key_a[20]; ++ ++typedef pan_uint8_t pan_obsd_key_holder_id_t; ++ ++#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01) ++#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02) ++ ++struct pan_obsd_key_holder_s { ++ pan_obsd_key_holder_id_t select; ++ pan_pad_t pad[3]; ++ union { ++ pan_obsd_security_key_a basis_key; ++ pan_obsd_capability_key_a cap_key; ++ } key; ++}; ++ ++typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t; ++ ++/* ++ * from pan_sm_sec.h ++ */ ++typedef pan_uint8_t pan_sm_sec_type_t; ++typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t; ++ ++struct pan_obsd_capability_generic_otw_t_s { ++ pan_rpc_arrdim_t size; ++ pan_uint8_t *data; ++}; ++typedef struct pan_obsd_capability_generic_otw_t_s ++ pan_obsd_capability_generic_otw_t; ++ ++struct pan_sm_sec_obsd_s { ++ pan_obsd_key_holder_t key; ++ 
pan_obsd_capability_generic_otw_t cap_otw; ++ pan_sm_sec_otw_allo_mode_t allo_mode; ++}; ++ ++typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t; ++ ++struct pan_sm_sec_s { ++ pan_sm_sec_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_rpc_none_t none; ++ pan_sm_sec_obsd_t obsd; ++ } variant; ++}; ++ ++typedef struct pan_sm_sec_s pan_sm_sec_t; ++ ++struct pan_sm_sec_a_s { ++ pan_rpc_arrdim_t size; ++ pan_sm_sec_t *data; ++}; ++typedef struct pan_sm_sec_a_s pan_sm_sec_a; ++typedef pan_otw_t *pan_sm_sec_otw_t; ++ ++/* ++ * from pan_sm_types.h ++ */ ++typedef pan_uint64_t pan_sm_cap_handle_t; ++ ++struct pan_sm_map_cap_s { ++ pan_agg_full_map_t full_map; ++ pan_stor_offset_t offset; ++ pan_stor_len_t length; ++ pan_sm_sec_a secs; ++ pan_sm_cap_handle_t handle; ++ pan_timespec_t expiration_time; ++ pan_stor_action_t action_mask; ++ pan_uint32_t flags; ++}; ++ ++typedef struct pan_sm_map_cap_s pan_sm_map_cap_t; ++ ++/* ++ * from pan_sm_ops.h ++ */ ++typedef pan_rpc_none_t pan_sm_cache_ptr_t; ++ ++/* ++ * from pan_sam_api.h ++ */ ++typedef pan_uint32_t pan_sam_access_flags_t; ++ ++typedef struct pan_sam_dev_error_s pan_sam_dev_error_t; ++struct pan_sam_dev_error_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_op_t stor_op; ++ pan_status_t error; ++}; ++ ++typedef struct pan_sam_ext_status_s pan_sam_ext_status_t; ++struct pan_sam_ext_status_s { ++ pan_uint32_t available; ++ pan_uint32_t size; ++ pan_sam_dev_error_t *errors; ++}; ++ ++enum pan_sam_rpc_sec_sel_e { ++ PAN_SAM_RPC_SEC_DEFAULT, ++ PAN_SAM_RPC_SEC_ATLEAST, ++ PAN_SAM_RPC_SEC_EXACTLY ++}; ++typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t; ++ ++typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t; ++struct pan_sam_obj_sec_s { ++ pan_stor_sec_level_t min_security; ++ pan_sm_map_cap_t *map_ccaps; ++}; ++ ++typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t; ++struct pan_sam_rpc_sec_s { ++ pan_sam_rpc_sec_sel_t selector; ++}; ++ ++typedef struct pan_sam_read_args_s pan_sam_read_args_t; ++struct 
pan_sam_read_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ void *return_attr; ++ void *checksum; ++ pan_stor_offset_t offset; ++ pan_uint16_t sm_options; ++ void *callout; ++ void *callout_arg; ++}; ++ ++typedef struct pan_sam_read_res_s pan_sam_read_res_t; ++struct pan_sam_read_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ void *attr; ++ void *checksum; ++}; ++ ++typedef void (*pan_sam_read_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t status); ++ ++#define PAN_SAM_ACCESS_NONE 0x0000 ++#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020 ++ ++typedef struct pan_sam_write_args_s pan_sam_write_args_t; ++struct pan_sam_write_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ pan_stor_offset_t offset; ++ void *attr; ++ void *return_attr; ++}; ++ ++typedef struct pan_sam_write_res_s pan_sam_write_res_t; ++struct pan_sam_write_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ pan_stor_delta_len_t delta_capacity_used; ++ pan_bool_t parity_dirty; ++ void *attr; ++}; ++ ++typedef void (*pan_sam_write_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t status); ++ ++/* ++ * from pan_mgr_types.h ++ */ ++#define PAN_MGR_ID_TYPE_SHIFT 56 ++#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL) ++#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL) ++ ++typedef pan_uint16_t pan_mgr_type_t; ++typedef pan_uint64_t pan_mgr_id_t; ++ ++#define PAN_MGR_SM ((pan_mgr_type_t) 2U) ++#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U) ++ ++/* ++ * from pan_mgr_types_c.h ++ */ ++#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \ ++ pan_mgr_id_t _id1, _id2; \ ++\ ++ _id1 = (_mgr_type_); \ ++ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \ ++ _id1 &= PAN_MGR_ID_TYPE_MASK; \ ++ _id2 = (_mgr_uniq_); \ ++ _id2 &= PAN_MGR_ID_UNIQ_MASK; \ 
++ _id1 |= _id2; \ ++ *(_mgr_id_p_) = _id1; \ ++} ++ ++/* ++ * from pan_storage_c.h ++ */ ++#define pan_stor_is_device_id_an_obsd_id(_device_id_) \ ++ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \ ++ == PAN_MGR_OBSD) ++ ++/* ++ * pnfs_shim internal definitions ++ */ ++ ++struct panfs_shim_io_state { ++ struct objlayout_io_state ol_state; ++ ++ pan_sg_entry_t *sg_list; ++ pan_sam_obj_sec_t obj_sec; ++ void *ucreds; ++ union { ++ struct { ++ pan_sam_read_args_t args; ++ pan_sam_read_res_t res; ++ } read; ++ struct { ++ pan_sam_write_args_t args; ++ pan_sam_write_res_t res; ++ } write; ++ } u; ++}; ++ ++#endif /* _PANLAYOUT_PANFS_SHIM_H */ +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-23 12:09:03.354501721 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-23 12:09:03.354501721 -0400 +@@ -0,0 +1,435 @@ ++/* ++ * pnfs_osd_xdr.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* ++ * The following implementation is based on these Internet Drafts: ++ * ++ * draft-ietf-nfsv4-minorversion-21 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid) ++{ ++ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data)); ++ READ64(objid->oid_partition_id); ++ READ64(objid->oid_object_id); ++ return p; ++} ++ ++static inline u32 * ++pnfs_osd_xdr_decode_opaque_cred(u32 *p, ++ struct pnfs_osd_opaque_cred *opaque_cred) ++{ ++ READ32(opaque_cred->cred_len); ++ COPYMEM(opaque_cred->cred, opaque_cred->cred_len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct 
pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp, ++ u8 **credp) ++{ ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id); ++ READ32(comp->oc_osd_version); ++ READ32(comp->oc_cap_key_sec); ++ ++ cred = *credp; ++ comp->oc_cap_key.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len)); ++ comp->oc_cap.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len)); ++ *credp = cred; ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map) ++{ ++ READ32(data_map->odm_num_comps); ++ READ64(data_map->odm_stripe_unit); ++ READ32(data_map->odm_group_width); ++ READ32(data_map->odm_group_depth); ++ READ32(data_map->odm_mirror_cnt); ++ READ32(data_map->odm_raid_algorithm); ++ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " ++ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", ++ __func__, ++ data_map->odm_num_comps, ++ (unsigned long long)data_map->odm_stripe_unit, ++ data_map->odm_group_width, ++ data_map->odm_group_depth, ++ data_map->odm_mirror_cnt, ++ data_map->odm_raid_algorithm); ++ return p; ++} ++ ++struct pnfs_osd_layout * ++pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p) ++{ ++ int i; ++ u32 *start = p; ++ struct pnfs_osd_object_cred *comp; ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map); ++ READ32(layout->olo_comps_index); ++ READ32(layout->olo_num_comps); ++ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1); ++ comp = 
layout->olo_comps; ++ cred = (u8 *)(comp + layout->olo_num_comps); ++ dprintk("%s: comps_index=%u num_comps=%u\n", ++ __func__, layout->olo_comps_index, layout->olo_num_comps); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred); ++ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "key_len=%u cap_len=%u\n", ++ __func__, i, ++ _DEVID_LO(&comp->oc_object_id.oid_device_id), ++ _DEVID_HI(&comp->oc_object_id.oid_device_id), ++ comp->oc_object_id.oid_partition_id, ++ comp->oc_object_id.oid_object_id, ++ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); ++ comp++; ++ } ++ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__, ++ (char *)p - (char *)start, cred, (char *)cred - (char *)layout); ++ return layout; ++} ++ ++/* ++ * Get Device Information Decoding ++ * ++ * Note: since Device Information is currently done synchronously, most ++ * of the actual fields are left inside the rpc buffer and are only ++ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer ++ * should not be freed while the returned information is in use. 
++ */ ++ ++u32 *__xdr_read_calc_nfs4_string( ++ u32 *p, struct nfs4_string *str, u8 **freespace) ++{ ++ u32 len; ++ char *data; ++ bool need_copy; ++ ++ READ32(len); ++ data = (char *)p; ++ ++ if (data[len]) { /* Not null terminated we'll need extra space */ ++ data = *freespace; ++ *freespace += len + 1; ++ need_copy = true; ++ } else { ++ need_copy = false; ++ } ++ ++ if (str) { ++ str->len = len; ++ str->data = data; ++ if (need_copy) { ++ memcpy(data, p, len); ++ data[len] = 0; ++ } ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++u32 *__xdr_read_calc_u8_opaque( ++ u32 *p, struct nfs4_string *str) ++{ ++ u32 len; ++ ++ READ32(len); ++ ++ if (str) { ++ str->len = len; ++ str->data = (char *)p; ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetid { ++ * u32 oti_type; ++ * struct nfs4_string oti_scsi_device_id; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetid( ++ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace) ++{ ++ u32 oti_type; ++ ++ READ32(oti_type); ++ if (targetid) ++ targetid->oti_type = oti_type; ++ ++ switch (oti_type) { ++ case OBJ_TARGET_SCSI_NAME: ++ case OBJ_TARGET_SCSI_DEVICE_ID: ++ p = __xdr_read_calc_u8_opaque(p, ++ targetid ? &targetid->oti_scsi_device_id : NULL); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_net_addr { ++ * struct nfs4_string r_netid; ++ * struct nfs4_string r_addr; ++ * }; ++ */ ++u32 *__xdr_read_calc_net_addr( ++ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace) ++{ ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_netid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? 
&netaddr->r_addr : NULL, ++ freespace); ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetaddr { ++ * u32 ota_available; ++ * struct pnfs_osd_net_addr ota_netaddr; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetaddr( ++ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace) ++{ ++ u32 ota_available; ++ ++ READ32(ota_available); ++ if (targetaddr) ++ targetaddr->ota_available = ota_available; ++ ++ if (ota_available) { ++ p = __xdr_read_calc_net_addr(p, ++ targetaddr ? &targetaddr->ota_netaddr : NULL, ++ freespace); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++u32 *__xdr_read_calc_deviceaddr( ++ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace) ++{ ++ p = __xdr_read_calc_targetid(p, ++ deviceaddr ? &deviceaddr->oda_targetid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_targetaddr(p, ++ deviceaddr ? &deviceaddr->oda_targetaddr : NULL, ++ freespace); ++ ++ if (deviceaddr) ++ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun)); ++ else ++ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun)); ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? &deviceaddr->oda_systemid : NULL); ++ ++ if (deviceaddr) { ++ p = pnfs_osd_xdr_decode_object_cred(p, ++ &deviceaddr->oda_root_obj_cred, freespace); ++ } else { ++ *freespace += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? 
&deviceaddr->oda_osdname : NULL);
++
++	return p;
++}
++
++size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p)
++{
++	u8 *null_freespace = NULL;
++	size_t sz;
++
++	__xdr_read_calc_deviceaddr(p, NULL, &null_freespace);
++	sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace;
++
++	return sz;
++}
++
++void pnfs_osd_xdr_decode_deviceaddr(
++	struct pnfs_osd_deviceaddr *deviceaddr, u32 *p)
++{
++	u8 *freespace = (u8 *)(deviceaddr + 1);
++
++	__xdr_read_calc_deviceaddr(p, deviceaddr, &freespace);
++}
++
++/*
++ * struct pnfs_osd_layoutupdate {
++ *     u32 dsu_valid;
++ *     s64 dsu_delta;
++ *     u32 olu_ioerr_flag;
++ * };
++ *
++ * Encode @lou into @xdr.  dsu_delta is only written when dsu_valid is
++ * set, so the amount of stream space reserved has to follow the same
++ * rule; reserving a fixed 16 bytes would leave 8 unwritten bytes in the
++ * stream whenever dsu_valid is zero.
++ *
++ * Returns 0 on success, or -E2BIG when xdr_reserve_space() fails.
++ */
++int
++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
++				 struct pnfs_osd_layoutupdate *lou)
++{
++	/* dsu_valid (4) + optional dsu_delta (8) + olu_ioerr_flag (4) */
++	size_t nbytes = 4 + (lou->dsu_valid ? 8 : 0) + 4;
++	__be32 *p = xdr_reserve_space(xdr, nbytes);
++
++	if (!p)
++		return -E2BIG;
++
++	*p++ = cpu_to_be32(lou->dsu_valid);
++	if (lou->dsu_valid)
++		p = xdr_encode_hyper(p, lou->dsu_delta);
++	*p = cpu_to_be32(lou->olu_ioerr_flag);
++	return 0;
++}
++
++/*
++ * struct pnfs_osd_objid {
++ *     struct pnfs_deviceid oid_device_id;
++ *     u64 oid_partition_id;
++ *     u64 oid_object_id;
++ */
++static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr,
++					    struct pnfs_osd_objid *object_id)
++{
++	__be32 *p;
++
++	p = xdr_reserve_space(xdr, 32);
++	if (!p)
++		return -E2BIG;
++
++	p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
++				    sizeof(object_id->oid_device_id.data));
++	p = xdr_encode_hyper(p, object_id->oid_partition_id);
++	p = xdr_encode_hyper(p, object_id->oid_object_id);
++
++	return 0;
++}
++
++/*
++ * struct pnfs_osd_ioerr {
++ *     struct pnfs_osd_objid oer_component;
++ *     u64 oer_comp_offset;
++ *     u64 oer_comp_length;
++ *     u32 oer_iswrite;
++ *     u32 oer_errno;
++ * };
++ */
++int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr,
++			      struct pnfs_osd_ioerr *ioerr)
++{
++	__be32 *p;
++	int ret;
++
++	ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component);
++	if (ret)
++		return ret;
++
++	p = xdr_reserve_space(xdr, 24);
++	if
(!p) ++ return -E2BIG; ++ ++ p = xdr_encode_hyper(p, ioerr->oer_comp_offset); ++ p = xdr_encode_hyper(p, ioerr->oer_comp_length); ++ *p++ = cpu_to_be32(ioerr->oer_iswrite); ++ *p = cpu_to_be32(ioerr->oer_errno); ++ ++ return 0; ++} +diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-23 12:08:29.056411363 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-23 12:09:03.355511659 -0400 +@@ -20,6 +20,7 @@ + #include + + #include "internal.h" ++#include "pnfs.h" + + static struct kmem_cache *nfs_page_cachep; + +@@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p) + struct nfs_page * + nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + +@@ -79,7 +81,11 @@ nfs_create_request(struct nfs_open_conte + req->wb_pgbase = offset; + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); ++ req->wb_lock_context = nfs_get_lock_context(ctx); + kref_init(&req->wb_kref); ++ req->wb_lseg = lseg; ++ if (lseg) ++ get_lseg(lseg); + return req; + } + +@@ -141,18 +147,26 @@ void nfs_clear_request(struct nfs_page * + { + struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; ++ struct nfs_lock_context *l_ctx = req->wb_lock_context; + + if (page != NULL) { + page_cache_release(page); + req->wb_page = NULL; + } ++ if (l_ctx != NULL) { ++ nfs_put_lock_context(l_ctx); ++ req->wb_lock_context = NULL; ++ } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } ++ if (req->wb_lseg != NULL) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } + } + +- + /** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release +@@ -231,11 +245,12 @@ void nfs_pageio_init(struct nfs_pageio_d + * Return 'true' 
if this is the case, else return 'false'. + */ + static int nfs_can_coalesce_requests(struct nfs_page *prev, +- struct nfs_page *req) ++ struct nfs_page *req, ++ struct nfs_pageio_descriptor *pgio) + { + if (req->wb_context->cred != prev->wb_context->cred) + return 0; +- if (req->wb_context->lockowner != prev->wb_context->lockowner) ++ if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) + return 0; + if (req->wb_context->state != prev->wb_context->state) + return 0; +@@ -245,6 +260,12 @@ static int nfs_can_coalesce_requests(str + return 0; + if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return 0; ++ if (req->wb_lseg != prev->wb_lseg) ++ return 0; ++#ifdef CONFIG_NFS_V4_1 ++ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) ++ return 0; ++#endif /* CONFIG_NFS_V4_1 */ + return 1; + } + +@@ -277,7 +298,7 @@ static int nfs_pageio_do_add_request(str + if (newlen > desc->pg_bsize) + return 0; + prev = nfs_list_entry(desc->pg_list.prev); +- if (!nfs_can_coalesce_requests(prev, req)) ++ if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + } else + desc->pg_base = req->wb_pgbase; +@@ -366,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for ++ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver + * + * Moves elements from one of the inode request lists. 
+ * If the number of requests is set to 0, the entire address_space +@@ -375,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs + */ + int nfs_scan_list(struct nfs_inode *nfsi, + struct list_head *dst, pgoff_t idx_start, +- unsigned int npages, int tag) ++ unsigned int npages, int tag, int *use_pnfs) + { + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; +@@ -406,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); + nfs_list_add_request(req, dst); ++ if (req->wb_lseg) ++ *use_pnfs = 1; + res++; + if (res == INT_MAX) + goto out; +diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-23 12:09:03.356501413 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-23 12:09:03.357481204 -0400 +@@ -0,0 +1,2027 @@ ++/* ++ * linux/fs/nfs/pnfs.c ++ * ++ * pNFS functions to call and manage layout drivers. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "internal.h" ++#include "nfs4_fs.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS ++ ++#define MIN_POOL_LC (4) ++ ++static int pnfs_initialized; ++ ++static void pnfs_free_layout(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range); ++static inline void get_layout(struct pnfs_layout_type *lo); ++ ++/* Locking: ++ * ++ * pnfs_spinlock: ++ * protects pnfs_modules_tbl. 
++ */ ++static spinlock_t pnfs_spinlock = __SPIN_LOCK_UNLOCKED(pnfs_spinlock); ++ ++/* ++ * pnfs_modules_tbl holds all pnfs modules ++ */ ++static struct list_head pnfs_modules_tbl; ++static struct kmem_cache *pnfs_cachep; ++static mempool_t *pnfs_layoutcommit_mempool; ++ ++static inline struct pnfs_layoutcommit_data *pnfs_layoutcommit_alloc(void) ++{ ++ struct pnfs_layoutcommit_data *p = ++ mempool_alloc(pnfs_layoutcommit_mempool, GFP_NOFS); ++ if (p) ++ memset(p, 0, sizeof(*p)); ++ ++ return p; ++} ++ ++void pnfs_layoutcommit_free(struct pnfs_layoutcommit_data *p) ++{ ++ mempool_free(p, pnfs_layoutcommit_mempool); ++} ++ ++/* ++ * struct pnfs_module - One per pNFS device module. ++ */ ++struct pnfs_module { ++ struct pnfs_layoutdriver_type *pnfs_ld_type; ++ struct list_head pnfs_tblid; ++}; ++ ++int ++pnfs_initialize(void) ++{ ++ INIT_LIST_HEAD(&pnfs_modules_tbl); ++ ++ pnfs_cachep = kmem_cache_create("pnfs_layoutcommit_data", ++ sizeof(struct pnfs_layoutcommit_data), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (pnfs_cachep == NULL) ++ return -ENOMEM; ++ ++ pnfs_layoutcommit_mempool = mempool_create(MIN_POOL_LC, ++ mempool_alloc_slab, ++ mempool_free_slab, ++ pnfs_cachep); ++ if (pnfs_layoutcommit_mempool == NULL) { ++ kmem_cache_destroy(pnfs_cachep); ++ return -ENOMEM; ++ } ++ ++ pnfs_initialized = 1; ++ return 0; ++} ++ ++void pnfs_uninitialize(void) ++{ ++ mempool_destroy(pnfs_layoutcommit_mempool); ++ kmem_cache_destroy(pnfs_cachep); ++} ++ ++/* search pnfs_modules_tbl for right pnfs module */ ++static int ++find_pnfs(u32 id, struct pnfs_module **module) { ++ struct pnfs_module *local = NULL; ++ ++ dprintk("PNFS: %s: Searching for %u\n", __func__, id); ++ list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) { ++ if (local->pnfs_ld_type->id == id) { ++ *module = local; ++ return(1); ++ } ++ } ++ return 0; ++} ++ ++/* Set lo_cred to indicate we require a layoutcommit ++ * If we don't even have a layout, we don't need to commit it. 
++ */ ++void ++pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx) ++{ ++ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (has_layout(nfsi) && ++ !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->pnfs_layout_state)) { ++ nfsi->layout->lo_cred = get_rpccred(ctx->state->owner->so_cred); ++ __set_bit(NFS_INO_LAYOUTCOMMIT, ++ &nfsi->layout->pnfs_layout_state); ++ nfsi->change_attr++; ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s: Set layoutcommit\n", __func__); ++ return; ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Update last_write_offset for layoutcommit. ++ * TODO: We should only use commited extents, but the current nfs ++ * implementation does not calculate the written range in nfs_commit_done. ++ * We therefore update this field in writeback_done. ++ */ ++void ++pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent) ++{ ++ loff_t end_pos; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (offset < nfsi->layout->pnfs_write_begin_pos) ++ nfsi->layout->pnfs_write_begin_pos = offset; ++ end_pos = offset + extent - 1; /* I'm being inclusive */ ++ if (end_pos > nfsi->layout->pnfs_write_end_pos) ++ nfsi->layout->pnfs_write_end_pos = end_pos; ++ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n", ++ __func__, ++ (unsigned long) extent, ++ (unsigned long) offset , ++ (unsigned long) nfsi->layout->pnfs_write_begin_pos, ++ (unsigned long) nfsi->layout->pnfs_write_end_pos); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Unitialize a mountpoint in a layout driver */ ++void ++unmount_pnfs_layoutdriver(struct nfs_server *nfss) ++{ ++ if (PNFS_EXISTS_LDIO_OP(nfss, uninitialize_mountpoint)) ++ nfss->pnfs_curr_ld->ld_io_ops->uninitialize_mountpoint(nfss); ++} ++ ++/* ++ * Set the server pnfs module to the first registered pnfs_type. ++ * Only one pNFS layout driver is supported. 
++ */
++void
++set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
++		      u32 id)
++{
++	struct pnfs_module *mod = NULL;
++
++	if (server->pnfs_curr_ld)
++		return;
++
++	if (!find_pnfs(id, &mod)) {
++		request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
++		find_pnfs(id, &mod);
++	}
++
++	if (!mod) {
++		dprintk("%s: No pNFS module found for %u. ", __func__, id);
++		goto out_err;
++	}
++
++	server->pnfs_curr_ld = mod->pnfs_ld_type;
++	if (mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint(
++							server, mntfh)) {
++		printk(KERN_ERR "%s: Error initializing mount point "
++		       "for layout driver %u. ", __func__, id);
++		goto out_err;
++	}
++
++	dprintk("%s: pNFS module for %u set\n", __func__, id);
++	return;
++
++out_err:
++	dprintk("Using NFSv4 I/O\n");
++	server->pnfs_curr_ld = NULL;
++}
++
++/*
++ * Allow I/O module to set its functions structure.
++ *
++ * Validates that @ld_type supplies every mandatory io operation, then
++ * adds it to pnfs_modules_tbl under pnfs_spinlock.
++ *
++ * Returns &pnfs_ops on success.  Returns NULL when pNFS is not
++ * initialized, when a mandatory operation is missing, or when the table
++ * entry cannot be allocated -- in all of those cases the driver has NOT
++ * been added to the table.
++ */
++struct pnfs_client_operations*
++pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
++{
++	struct pnfs_module *pnfs_mod;
++	struct layoutdriver_io_operations *io_ops = ld_type->ld_io_ops;
++
++	if (!pnfs_initialized) {
++		printk(KERN_ERR "%s Registration failure. "
++		       "pNFS not initialized.\n", __func__);
++		return NULL;
++	}
++
++	if (!io_ops || !io_ops->alloc_layout || !io_ops->free_layout) {
++		printk(KERN_ERR "%s Layout driver must provide "
++		       "alloc_layout and free_layout.\n", __func__);
++		return NULL;
++	}
++
++	if (!io_ops->alloc_lseg || !io_ops->free_lseg) {
++		printk(KERN_ERR "%s Layout driver must provide "
++		       "alloc_lseg and free_lseg.\n", __func__);
++		return NULL;
++	}
++
++	if (!io_ops->read_pagelist || !io_ops->write_pagelist ||
++	    !io_ops->commit) {
++		printk(KERN_ERR "%s Layout driver must provide "
++		       "read_pagelist, write_pagelist, and commit.\n",
++		       __func__);
++		return NULL;
++	}
++
++	pnfs_mod = kmalloc(sizeof(*pnfs_mod), GFP_KERNEL);
++	if (pnfs_mod == NULL) {
++		/* Nothing was added to the table: do not report success. */
++		printk(KERN_ERR "%s Registration failure. "
++		       "Out of memory.\n", __func__);
++		return NULL;
++	}
++
++	dprintk("%s Registering id:%u name:%s\n",
++		__func__,
++		ld_type->id,
++		ld_type->name);
++	pnfs_mod->pnfs_ld_type = ld_type;
++	INIT_LIST_HEAD(&pnfs_mod->pnfs_tblid);
++
++	spin_lock(&pnfs_spinlock);
++	list_add(&pnfs_mod->pnfs_tblid, &pnfs_modules_tbl);
++	spin_unlock(&pnfs_spinlock);
++
++	return &pnfs_ops;
++}
++
++/*
++ * Remove the table entry registered for @ld_type, if there is one.
++ *
++ * pnfs_spinlock protects pnfs_modules_tbl, so the lookup and the unlink
++ * are done together under it; the entry is freed after the lock is
++ * dropped.
++ */
++void
++pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
++{
++	/* find_pnfs() only writes its out-parameter on a match */
++	struct pnfs_module *pnfs_mod = NULL;
++
++	spin_lock(&pnfs_spinlock);
++	if (find_pnfs(ld_type->id, &pnfs_mod))
++		list_del(&pnfs_mod->pnfs_tblid);
++	spin_unlock(&pnfs_spinlock);
++
++	if (pnfs_mod) {
++		dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
++		kfree(pnfs_mod);
++	}
++}
++
++/*
++ * pNFS client layout cache
++ */
++#if defined(CONFIG_SMP)
++#define BUG_ON_UNLOCKED_INO(ino) \
++	BUG_ON(!spin_is_locked(&ino->i_lock))
++#define BUG_ON_UNLOCKED_LO(lo) \
++	BUG_ON_UNLOCKED_INO(PNFS_INODE(lo))
++#else /* CONFIG_SMP */
++#define BUG_ON_UNLOCKED_INO(lo) do {} while (0)
++#define BUG_ON_UNLOCKED_LO(lo) do {} while (0)
++#endif /* CONFIG_SMP */
++
++static inline void
++get_layout(struct pnfs_layout_type *lo)
++{
++	BUG_ON_UNLOCKED_LO(lo);
++	lo->refcount++;
++}
++
++static inline void
++put_layout_locked(struct pnfs_layout_type *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ BUG_ON(lo->refcount <= 0); ++ ++ lo->refcount--; ++ if (!lo->refcount) { ++ struct layoutdriver_io_operations *io_ops = PNFS_LD_IO_OPS(lo); ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ dprintk("%s: freeing layout cache %p\n", __func__, lo); ++ WARN_ON(!list_empty(&lo->lo_layouts)); ++ io_ops->free_layout(lo); ++ nfsi->layout = NULL; ++ } ++} ++ ++void ++put_layout(struct inode *inode) ++{ ++ spin_lock(&inode->i_lock); ++ put_layout_locked(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++ ++} ++ ++void ++pnfs_layout_release(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (range) ++ pnfs_free_layout(lo, range); ++ /* ++ * Matched in _pnfs_update_layout for layoutget ++ * and by get_layout in _pnfs_return_layout for layoutreturn ++ */ ++ put_layout_locked(lo); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ wake_up_all(&nfsi->lo_waitq); ++} ++ ++void ++pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++ struct pnfs_layout_type *lo; ++ struct nfs4_pnfs_layout_segment range = { ++ .iomode = IOMODE_ANY, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ lo = nfsi->layout; ++ if (lo) { ++ pnfs_free_layout(lo, &range); ++ WARN_ON(!list_empty(&nfsi->layout->segs)); ++ WARN_ON(!list_empty(&nfsi->layout->lo_layouts)); ++ ++ if (nfsi->layout->refcount != 1) ++ printk(KERN_WARNING "%s: layout refcount not=1 %d\n", ++ __func__, nfsi->layout->refcount); ++ WARN_ON(nfsi->layout->refcount != 1); ++ ++ /* Matched by refcount set to 1 in alloc_init_layout */ ++ put_layout_locked(lo); ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* ++ * Called by the state manger to remove all layouts established under an ++ * expired lease. 
++ */
++void
++pnfs_destroy_all_layouts(struct nfs_client *clp)
++{
++	struct pnfs_layout_type *lo;
++
++	while (!list_empty(&clp->cl_layouts)) {
++		lo = list_entry(clp->cl_layouts.next, struct pnfs_layout_type,
++				lo_layouts);
++		dprintk("%s freeing layout for inode %lu\n", __func__,
++			lo->lo_inode->i_ino);
++		pnfs_destroy_layout(NFS_I(lo->lo_inode));
++	}
++}
++
++static inline void
++init_lseg(struct pnfs_layout_type *lo, struct pnfs_layout_segment *lseg)
++{
++	INIT_LIST_HEAD(&lseg->fi_list);
++	kref_init(&lseg->kref);
++	lseg->valid = true;
++	lseg->layout = lo;
++}
++
++/*
++ * kref release callback for a layout segment: hands the segment back to
++ * the layout driver and drops the reference the segment held on its
++ * layout.  Runs with the inode's i_lock held (put_layout_locked()
++ * asserts it).
++ */
++static void
++destroy_lseg(struct kref *kref)
++{
++	struct pnfs_layout_segment *lseg =
++		container_of(kref, struct pnfs_layout_segment, kref);
++	struct pnfs_layout_type *lo = lseg->layout;
++
++	dprintk("--> %s\n", __func__);
++	/*
++	 * Free the segment while its layout reference is still held:
++	 * put_layout_locked() may drop the last reference and free lo,
++	 * after which lo must not be used to look up the io_ops.
++	 */
++	PNFS_LD_IO_OPS(lo)->free_lseg(lseg);
++	/* Matched by get_layout in pnfs_insert_layout */
++	put_layout_locked(lo);
++}
++
++static void
++put_lseg_locked(struct pnfs_layout_segment *lseg)
++{
++	bool do_wake_up;
++	struct nfs_inode *nfsi;
++
++	if (!lseg)
++		return;
++
++	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
++		atomic_read(&lseg->kref.refcount), lseg->valid);
++	do_wake_up = !lseg->valid;
++	nfsi = PNFS_NFS_INODE(lseg->layout);
++	kref_put(&lseg->kref, destroy_lseg);
++	if (do_wake_up)
++		wake_up(&nfsi->lo_waitq);
++}
++
++void
++put_lseg(struct pnfs_layout_segment *lseg)
++{
++	bool do_wake_up;
++	struct nfs_inode *nfsi;
++
++	if (!lseg)
++		return;
++
++	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
++		atomic_read(&lseg->kref.refcount), lseg->valid);
++	do_wake_up = !lseg->valid;
++	nfsi = PNFS_NFS_INODE(lseg->layout);
++	spin_lock(&nfsi->vfs_inode.i_lock);
++	kref_put(&lseg->kref, destroy_lseg);
++	spin_unlock(&nfsi->vfs_inode.i_lock);
++	if (do_wake_up)
++		wake_up(&nfsi->lo_waitq);
++}
++EXPORT_SYMBOL(put_lseg);
++
++void get_lseg(struct pnfs_layout_segment *lseg)
++{
++	kref_get(&lseg->kref);
++}
++EXPORT_SYMBOL(get_lseg);
++
++static inline u64
++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end: NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1: NFS4_MAX_UINT64; ++} ++ ++/* ++ * is l2 fully contained in l1? ++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_contained(struct nfs4_pnfs_layout_segment *l1, ++ struct nfs4_pnfs_layout_segment *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (start1 <= start2) && (end1 >= end2); ++} ++ ++/* ++ * is l1 and l2 intersecting? ++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_intersecting(struct nfs4_pnfs_layout_segment *l1, ++ struct nfs4_pnfs_layout_segment *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (end1 == NFS4_MAX_UINT64 || end1 > start2) && ++ (end2 == NFS4_MAX_UINT64 || end2 > start1); ++} ++ ++void ++pnfs_set_layout_stateid(struct pnfs_layout_type *lo, ++ const nfs4_stateid *stateid) ++{ ++ write_seqlock(&lo->seqlock); ++ memcpy(lo->stateid.u.data, stateid->u.data, sizeof(lo->stateid.u.data)); ++ write_sequnlock(&lo->seqlock); ++} ++ ++void ++pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_type *lo) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ do { ++ seq = read_seqbegin(&lo->seqlock); ++ memcpy(dst->u.data, lo->stateid.u.data, ++ sizeof(lo->stateid.u.data)); ++ } while (read_seqretry(&lo->seqlock, seq)); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void ++pnfs_layout_from_open_stateid(struct pnfs_layout_type 
*lo, ++ struct nfs4_state *state) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ write_seqlock(&lo->seqlock); ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) ++ do { ++ seq = read_seqbegin(&state->seqlock); ++ memcpy(lo->stateid.u.data, state->stateid.u.data, ++ sizeof(state->stateid.u.data)); ++ } while (read_seqretry(&state->seqlock, seq)); ++ write_sequnlock(&lo->seqlock); ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++* Get layout from server. ++* for now, assume that whole file layouts are requested. ++* arg->offset: 0 ++* arg->length: all ones ++*/ ++static int ++send_layoutget(struct inode *ino, ++ struct nfs_open_context *ctx, ++ struct nfs4_pnfs_layout_segment *range, ++ struct pnfs_layout_segment **lsegpp, ++ struct pnfs_layout_type *lo) ++{ ++ int status; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct nfs4_pnfs_layoutget *lgp; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); ++ if (lgp == NULL) { ++ pnfs_layout_release(lo, NULL); ++ return -ENOMEM; ++ } ++ lgp->args.minlength = NFS4_MAX_UINT64; ++ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; ++ lgp->args.lseg.iomode = range->iomode; ++ lgp->args.lseg.offset = 0; ++ lgp->args.lseg.length = NFS4_MAX_UINT64; ++ lgp->args.type = server->pnfs_curr_ld->id; ++ lgp->args.inode = ino; ++ lgp->lsegpp = lsegpp; ++ ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) { ++ struct nfs_open_context *oldctx = ctx; ++ ++ if (!oldctx) { ++ ctx = nfs_find_open_context(ino, NULL, ++ (range->iomode == IOMODE_READ) ? 
++ FMODE_READ: FMODE_WRITE); ++ BUG_ON(!ctx); ++ } ++ /* Set the layout stateid from the open stateid */ ++ pnfs_layout_from_open_stateid(NFS_I(ino)->layout, ctx->state); ++ if (!oldctx) ++ put_nfs_open_context(ctx); ++ } ++ ++ /* Retrieve layout information from server */ ++ status = pnfs4_proc_layoutget(lgp); ++ ++ dprintk("<-- %s status %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW false ++ */ ++static inline int ++should_free_lseg(struct pnfs_layout_segment *lseg, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ return (range->iomode == IOMODE_ANY || ++ lseg->range.iomode == range->iomode) && ++ lo_seg_intersecting(&lseg->range, range); ++} ++ ++static struct pnfs_layout_segment * ++has_layout_to_return(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct pnfs_layout_segment *out = NULL, *lseg; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) ++ if (should_free_lseg(lseg, range)) { ++ out = lseg; ++ break; ++ } ++ ++ dprintk("%s:Return lseg=%p\n", __func__, out); ++ return out; ++} ++ ++static inline bool ++_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg) ++{ ++ return atomic_read(&lseg->kref.refcount) == 1; ++} ++ ++ ++static void ++pnfs_free_layout(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct pnfs_layout_segment *lseg, *next; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry_safe (lseg, next, &lo->segs, fi_list) { ++ if (!should_free_lseg(lseg, range) || ++ !_pnfs_can_return_lseg(lseg)) ++ continue; ++ 
dprintk("%s: freeing lseg %p iomode %d " ++ "offset %llu length %llu\n", __func__, ++ lseg, lseg->range.iomode, lseg->range.offset, ++ lseg->range.length); ++ list_del(&lseg->fi_list); ++ put_lseg_locked(lseg); ++ } ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp; ++ ++ clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ spin_lock(&clp->cl_lock); ++ list_del_init(&lo->lo_layouts); ++ spin_unlock(&clp->cl_lock); ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ } ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++static bool ++pnfs_return_layout_barrier(struct nfs_inode *nfsi, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct pnfs_layout_segment *lseg; ++ bool ret = false; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) { ++ if (!should_free_lseg(lseg, range)) ++ continue; ++ lseg->valid = false; ++ if (!_pnfs_can_return_lseg(lseg)) { ++ dprintk("%s: wait on lseg %p refcount %d\n", ++ __func__, lseg, ++ atomic_read(&lseg->kref.refcount)); ++ ret = true; ++ } ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s:Return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, ++ enum pnfs_layoutreturn_type type, struct pnfs_layout_type *lo, ++ bool wait) ++{ ++ struct nfs4_pnfs_layoutreturn *lrp; ++ struct nfs_server *server = NFS_SERVER(ino); ++ int status = -ENOMEM; ++ ++ dprintk("--> %s\n", __func__); ++ ++ BUG_ON(type != RETURN_FILE); ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (lrp == NULL) { ++ if (lo && (type == RETURN_FILE)) ++ pnfs_layout_release(lo, NULL); ++ goto out; ++ } ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = server->pnfs_curr_ld->id; ++ lrp->args.return_type = type; ++ lrp->args.lseg = *range; ++ lrp->args.inode = ino; ++ ++ status = pnfs4_proc_layoutreturn(lrp, wait); ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++} ++ ++int ++_pnfs_return_layout(struct inode 
*ino, struct nfs4_pnfs_layout_segment *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct pnfs_layout_type *lo = NULL; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs4_pnfs_layout_segment arg; ++ int status = 0; ++ ++ dprintk("--> %s type %d\n", __func__, type); ++ ++ ++ arg.iomode = range ? range->iomode : IOMODE_ANY; ++ arg.offset = 0; ++ arg.length = NFS4_MAX_UINT64; ++ ++ if (type == RETURN_FILE) { ++ spin_lock(&ino->i_lock); ++ lo = nfsi->layout; ++ if (lo && !has_layout_to_return(lo, &arg)) { ++ lo = NULL; ++ } ++ if (!lo) { ++ spin_unlock(&ino->i_lock); ++ dprintk("%s: no layout segments to return\n", __func__); ++ goto out; ++ } ++ ++ /* Reference for layoutreturn matched in pnfs_layout_release */ ++ get_layout(lo); ++ ++ spin_unlock(&ino->i_lock); ++ ++ if (pnfs_return_layout_barrier(nfsi, &arg)) { ++ if (stateid) { /* callback */ ++ status = -EAGAIN; ++ goto out_put; ++ } ++ dprintk("%s: waiting\n", __func__); ++ wait_event(nfsi->lo_waitq, ++ !pnfs_return_layout_barrier(nfsi, &arg)); ++ } ++ ++ if (layoutcommit_needed(nfsi)) { ++ if (stateid && !wait) { /* callback */ ++ dprintk("%s: layoutcommit pending\n", __func__); ++ status = -EAGAIN; ++ goto out_put; ++ } ++ status = pnfs_layoutcommit_inode(ino, wait); ++ if (status) { ++ /* Return layout even if layoutcommit fails */ ++ dprintk("%s: layoutcommit failed, status=%d. 
" ++ "Returning layout anyway\n", ++ __func__, status); ++ } ++ } ++ ++ if (!stateid) ++ status = return_layout(ino, &arg, type, lo, wait); ++ else ++ pnfs_layout_release(lo, &arg); ++ } ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++out_put: ++ put_layout(ino); ++ goto out; ++} ++ ++/* ++ * cmp two layout segments for sorting into layout cache ++ */ ++static inline s64 ++cmp_layout(struct nfs4_pnfs_layout_segment *l1, ++ struct nfs4_pnfs_layout_segment *l2) ++{ ++ s64 d; ++ ++ /* higher offset > lower offset */ ++ d = l1->offset - l2->offset; ++ if (d) ++ return d; ++ ++ /* longer length > shorter length */ ++ d = l1->length - l2->length; ++ if (d) ++ return d; ++ ++ /* read > read/write */ ++ return (int)(l1->iomode == IOMODE_READ) - ++ (int)(l2->iomode == IOMODE_READ); ++} ++ ++static void ++pnfs_insert_layout(struct pnfs_layout_type *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct pnfs_layout_segment *lp; ++ int found = 0; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ ++ spin_lock(&clp->cl_lock); ++ BUG_ON(!list_empty(&lo->lo_layouts)); ++ list_add_tail(&lo->lo_layouts, &clp->cl_layouts); ++ spin_unlock(&clp->cl_lock); ++ } ++ list_for_each_entry (lp, &lo->segs, fi_list) { ++ if (cmp_layout(&lp->range, &lseg->range) > 0) ++ continue; ++ list_add_tail(&lseg->fi_list, &lp->fi_list); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu before " ++ "lp %p iomode %d offset %llu length %llu\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length, ++ lp, lp->range.iomode, lp->range.offset, ++ lp->range.length); ++ found = 1; ++ break; ++ } ++ if (!found) { ++ list_add_tail(&lseg->fi_list, &lo->segs); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu at tail\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, 
lseg->range.length); ++ } ++ get_layout(lo); ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++/* ++ * Each layoutdriver embeds pnfs_layout_type as the first field in it's ++ * per-layout type layout cache structure and returns it ZEROed ++ * from layoutdriver_io_ops->alloc_layout ++ */ ++static struct pnfs_layout_type * ++alloc_init_layout(struct inode *ino) ++{ ++ struct pnfs_layout_type *lo; ++ struct layoutdriver_io_operations *io_ops; ++ ++ io_ops = NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops; ++ lo = io_ops->alloc_layout(ino); ++ if (!lo) { ++ printk(KERN_ERR ++ "%s: out of memory: io_ops->alloc_layout failed\n", ++ __func__); ++ return NULL; ++ } ++ lo->refcount = 1; ++ INIT_LIST_HEAD(&lo->lo_layouts); ++ INIT_LIST_HEAD(&lo->segs); ++ seqlock_init(&lo->seqlock); ++ lo->lo_inode = ino; ++ return lo; ++} ++ ++/* ++ * Retrieve and possibly allocate the inode layout ++ * ++ * ino->i_lock must be taken by the caller. ++ */ ++static struct pnfs_layout_type * ++pnfs_alloc_layout(struct inode *ino) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_type *new = NULL; ++ ++ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); ++ ++ BUG_ON_UNLOCKED_INO(ino); ++ if (likely(nfsi->layout)) ++ return nfsi->layout; ++ ++ spin_unlock(&ino->i_lock); ++ new = alloc_init_layout(ino); ++ spin_lock(&ino->i_lock); ++ ++ if (likely(nfsi->layout == NULL)) { /* Won the race? 
*/ ++ nfsi->layout = new; ++ } else if (new) { ++ /* Reference the layout accross i_lock release and grab */ ++ get_layout(nfsi->layout); ++ spin_unlock(&ino->i_lock); ++ NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops->free_layout(new); ++ spin_lock(&ino->i_lock); ++ put_layout_locked(nfsi->layout); ++ } ++ return nfsi->layout; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW true ++ */ ++static inline int ++has_matching_lseg(struct pnfs_layout_segment *lseg, ++ struct nfs4_pnfs_layout_segment *range) ++{ ++ struct nfs4_pnfs_layout_segment range1; ++ ++ if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) || ++ !lo_seg_intersecting(&lseg->range, range)) ++ return 0; ++ ++ /* range1 covers only the first byte in the range */ ++ range1 = *range; ++ range1.length = 1; ++ return lo_seg_contained(&lseg->range, &range1); ++} ++ ++/* ++ * lookup range in layout ++ */ ++static struct pnfs_layout_segment * ++pnfs_has_layout(struct pnfs_layout_type *lo, ++ struct nfs4_pnfs_layout_segment *range, ++ bool take_ref, ++ bool only_valid) ++{ ++ struct pnfs_layout_segment *lseg, *ret = NULL; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) { ++ if (has_matching_lseg(lseg, range) && ++ (lseg->valid || !only_valid)) { ++ ret = lseg; ++ if (take_ref) ++ get_lseg(ret); ++ break; ++ } ++ if (cmp_layout(range, &lseg->range) > 0) ++ break; ++ } ++ ++ dprintk("%s:Return lseg %p take_ref %d ref %d valid %d\n", ++ __func__, ret, take_ref, ++ ret ? atomic_read(&ret->kref.refcount) : 0, ++ ret ? ret->valid : 0); ++ return ret; ++} ++ ++/* Update the file's layout for the given range and iomode. ++ * Layout is retreived from the server if needed. ++ * If lsegpp is given, the appropriate layout segment is referenced and ++ * returned to the caller. 
++ */ ++void ++_pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, ++ u64 count, ++ enum pnfs_iomode iomode, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct nfs4_pnfs_layout_segment arg = { ++ .iomode = iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_type *lo; ++ struct pnfs_layout_segment *lseg = NULL; ++ bool take_ref = (lsegpp != NULL); ++ ++ if (take_ref) ++ *lsegpp = NULL; ++ spin_lock(&ino->i_lock); ++ lo = pnfs_alloc_layout(ino); ++ if (lo == NULL) { ++ dprintk("%s ERROR: can't get pnfs_layout_type\n", __func__); ++ goto out_unlock; ++ } ++ ++ /* Check to see if the layout for the given range already exists */ ++ lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref); ++ if (lseg && !lseg->valid) { ++ if (take_ref) ++ put_lseg_locked(lseg); ++ /* someone is cleaning the layout */ ++ lseg = NULL; ++ goto out_unlock; ++ } ++ ++ if (lseg) { ++ dprintk("%s: Using cached lseg %p for %llu@%llu iomode %d)\n", ++ __func__, ++ lseg, ++ arg.length, ++ arg.offset, ++ arg.iomode); ++ ++ goto out_unlock; ++ } ++ ++ /* if get layout already failed once goto out */ ++ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->pnfs_layout_state)) { ++ if (unlikely(nfsi->pnfs_layout_suspend && ++ get_seconds() >= nfsi->pnfs_layout_suspend)) { ++ dprintk("%s: layout_get resumed\n", __func__); ++ clear_bit(lo_fail_bit(iomode), ++ &nfsi->layout->pnfs_layout_state); ++ nfsi->pnfs_layout_suspend = 0; ++ } else ++ goto out_unlock; ++ } ++ ++ /* Reference the layout for layoutget matched in pnfs_layout_release */ ++ get_layout(lo); ++ spin_unlock(&ino->i_lock); ++ ++ send_layoutget(ino, ctx, &arg, lsegpp, lo); ++out: ++ dprintk("%s end, state 0x%lx lseg %p\n", __func__, ++ nfsi->layout->pnfs_layout_state, lseg); ++ return; ++out_unlock: ++ if (lsegpp) ++ *lsegpp = lseg; ++ spin_unlock(&ino->i_lock); ++ goto out; ++} ++ ++void ++pnfs_get_layout_done(struct nfs4_pnfs_layoutget 
*lgp, int rpc_status) ++{ ++ struct pnfs_layout_segment *lseg = NULL; ++ struct nfs_inode *nfsi = NFS_I(lgp->args.inode); ++ time_t suspend = 0; ++ ++ dprintk("-->%s\n", __func__); ++ ++ lgp->status = rpc_status; ++ if (likely(!rpc_status)) { ++ if (unlikely(lgp->res.layout.len < 0)) { ++ printk(KERN_ERR ++ "%s: ERROR Returned layout size is ZERO\n", __func__); ++ lgp->status = -EIO; ++ } ++ goto out; ++ } ++ ++ dprintk("%s: ERROR retrieving layout %d\n", __func__, rpc_status); ++ switch (rpc_status) { ++ case -NFS4ERR_BADLAYOUT: ++ lgp->status = -ENOENT; ++ /* FALLTHROUGH */ ++ case -EACCES: /* NFS4ERR_ACCESS */ ++ /* transient error, don't mark with NFS_INO_LAYOUT_FAILED */ ++ goto out; ++ ++ case -NFS4ERR_LAYOUTTRYLATER: ++ case -NFS4ERR_RECALLCONFLICT: ++ case -NFS4ERR_OLD_STATEID: ++ case -EAGAIN: /* NFS4ERR_LOCKED */ ++ lgp->status = -NFS4ERR_DELAY; /* for nfs4_handle_exception */ ++ /* FALLTHROUGH */ ++ case -NFS4ERR_GRACE: ++ case -NFS4ERR_DELAY: ++ goto out; ++ ++ case -NFS4ERR_ADMIN_REVOKED: ++ case -NFS4ERR_DELEG_REVOKED: ++ /* The layout is expected to be returned at this point. 
++ * This should clear the layout stateid as well */ ++ suspend = get_seconds() + 1; ++ break; ++ ++ case -NFS4ERR_LAYOUTUNAVAILABLE: ++ lgp->status = -ENOTSUPP; ++ break; ++ ++ case -NFS4ERR_REP_TOO_BIG: ++ case -NFS4ERR_REP_TOO_BIG_TO_CACHE: ++ lgp->status = -E2BIG; ++ break; ++ ++ /* Leave the following errors untranslated */ ++ case -NFS4ERR_DEADSESSION: ++ case -NFS4ERR_DQUOT: ++ case -EINVAL: /* NFS4ERR_INVAL */ ++ case -EIO: /* NFS4ERR_IO */ ++ case -NFS4ERR_FHEXPIRED: ++ case -NFS4ERR_MOVED: ++ case -NFS4ERR_NOSPC: ++ case -ESERVERFAULT: /* NFS4ERR_SERVERFAULT */ ++ case -ESTALE: /* NFS4ERR_STALE */ ++ case -ETOOSMALL: /* NFS4ERR_TOOSMALL */ ++ break; ++ ++ /* The following errors are our fault and should never happen */ ++ case -NFS4ERR_BADIOMODE: ++ case -NFS4ERR_BADXDR: ++ case -NFS4ERR_REQ_TOO_BIG: ++ case -NFS4ERR_UNKNOWN_LAYOUTTYPE: ++ case -NFS4ERR_WRONG_TYPE: ++ lgp->status = -EINVAL; ++ /* FALLTHROUGH */ ++ case -NFS4ERR_BAD_STATEID: ++ case -NFS4ERR_NOFILEHANDLE: ++ case -ENOTSUPP: /* NFS4ERR_NOTSUPP */ ++ case -NFS4ERR_OPENMODE: ++ case -NFS4ERR_OP_NOT_IN_SESSION: ++ case -NFS4ERR_TOO_MANY_OPS: ++ dprintk("%s: error %d: should never happen\n", __func__, ++ rpc_status); ++ break; ++ ++ /* The following errors are the server's fault */ ++ default: ++ dprintk("%s: illegal error %d\n", __func__, rpc_status); ++ lgp->status = -EIO; ++ break; ++ } ++ ++ /* remember that get layout failed and suspend trying */ ++ nfsi->pnfs_layout_suspend = suspend; ++ set_bit(lo_fail_bit(lgp->args.lseg.iomode), ++ &nfsi->layout->pnfs_layout_state); ++ dprintk("%s: layout_get suspended until %ld\n", ++ __func__, suspend); ++out: ++ dprintk("%s end (err:%d) state 0x%lx lseg %p\n", ++ __func__, lgp->status, nfsi->layout->pnfs_layout_state, lseg); ++ return; ++} ++ ++int ++pnfs_layout_process(struct nfs4_pnfs_layoutget *lgp) ++{ ++ struct pnfs_layout_type *lo = NFS_I(lgp->args.inode)->layout; ++ struct nfs4_pnfs_layoutget_res *res = &lgp->res; ++ struct pnfs_layout_segment 
*lseg; ++ struct inode *ino = PNFS_INODE(lo); ++ int status = 0; ++ ++ /* Inject layout blob into I/O device driver */ ++ lseg = PNFS_LD_IO_OPS(lo)->alloc_lseg(lo, res); ++ if (!lseg || IS_ERR(lseg)) { ++ if (!lseg) ++ status = -ENOMEM; ++ else ++ status = PTR_ERR(lseg); ++ dprintk("%s: Could not allocate layout: error %d\n", ++ __func__, status); ++ goto out; ++ } ++ ++ spin_lock(&ino->i_lock); ++ init_lseg(lo, lseg); ++ lseg->range = res->lseg; ++ if (lgp->lsegpp) { ++ get_lseg(lseg); ++ *lgp->lsegpp = lseg; ++ } ++ pnfs_insert_layout(lo, lseg); ++ ++ if (res->return_on_close) { ++ lo->roc_iomode |= res->lseg.iomode; ++ if (!lo->roc_iomode) ++ lo->roc_iomode = IOMODE_ANY; ++ } ++ ++ /* Done processing layoutget. Set the layout stateid */ ++ pnfs_set_layout_stateid(lo, &res->stateid); ++ spin_unlock(&ino->i_lock); ++out: ++ return status; ++} ++ ++void ++readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset, ++ size_t *count) ++{ ++ struct page *first, *last; ++ loff_t foff, i_size = i_size_read(inode); ++ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; ++ size_t range; ++ ++ ++ first = list_entry((pages)->prev, struct page, lru); ++ last = list_entry((pages)->next, struct page, lru); ++ ++ foff = (loff_t)first->index << PAGE_CACHE_SHIFT; ++ ++ range = (last->index - first->index) * PAGE_CACHE_SIZE; ++ if (last->index == end_index) ++ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; ++ else ++ range += PAGE_CACHE_SIZE; ++ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff, ++ range); ++ *offset = foff; ++ *count = range; ++} ++ ++void ++pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) ++{ ++ struct pnfs_layout_type *laytype; ++ struct pnfs_layoutdriver_type *ld; ++ ++ pgio->pg_test = NULL; ++ ++ laytype = NFS_I(inode)->layout; ++ ld = NFS_SERVER(inode)->pnfs_curr_ld; ++ if (!pnfs_enabled_sb(NFS_SERVER(inode)) || !laytype) ++ return; ++ ++ if (ld->ld_policy_ops) ++ pgio->pg_test = 
ld->ld_policy_ops->pg_test; ++} ++ ++static u32 ++pnfs_getboundary(struct inode *inode) ++{ ++ u32 stripe_size = 0; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct layoutdriver_policy_operations *policy_ops; ++ ++ if (!nfss->pnfs_curr_ld) ++ goto out; ++ ++ policy_ops = nfss->pnfs_curr_ld->ld_policy_ops; ++ if (!policy_ops || !policy_ops->get_stripesize) ++ goto out; ++ ++ /* The default is to not gather across stripes */ ++ if (pnfs_ld_gather_across_stripes(nfss->pnfs_curr_ld)) ++ goto out; ++ ++ spin_lock(&inode->i_lock); ++ if (NFS_I(inode)->layout) ++ stripe_size = policy_ops->get_stripesize(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++out: ++ return stripe_size; ++} ++ ++/* ++ * rsize is already set by caller to MDS rsize. ++ */ ++void ++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, ++ struct inode *inode, ++ struct nfs_open_context *ctx, ++ struct list_head *pages, ++ size_t *rsize) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ size_t count = 0; ++ loff_t loff; ++ ++ pgio->pg_iswrite = 0; ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ pgio->pg_lseg = NULL; ++ ++ if (!pnfs_enabled_sb(nfss)) ++ return; ++ ++ /* Calculate the total read-ahead count */ ++ readahead_range(inode, pages, &loff, &count); ++ ++ if (count > 0) { ++ _pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ, ++ &pgio->pg_lseg); ++ if (!pgio->pg_lseg) ++ return; ++ ++ *rsize = NFS_SERVER(inode)->ds_rsize; ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ if (pgio->pg_boundary) ++ pnfs_set_pg_test(inode, pgio); ++ } ++} ++ ++void ++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, ++ size_t *wsize) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ pgio->pg_iswrite = 1; ++ if (!pnfs_enabled_sb(server)) { ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ return; ++ } ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ pnfs_set_pg_test(inode, pgio); ++ *wsize = server->ds_wsize; ++} ++ ++/* Return I/O 
buffer size for a layout driver ++ * This value will determine what size reads and writes ++ * will be gathered into and sent to the data servers. ++ * blocksize must be a multiple of the page cache size. ++ */ ++unsigned int ++pnfs_getiosize(struct nfs_server *server) ++{ ++ if (!PNFS_EXISTS_LDPOLICY_OP(server, get_blocksize)) ++ return 0; ++ return server->pnfs_curr_ld->ld_policy_ops->get_blocksize(); ++} ++ ++void ++pnfs_set_ds_iosize(struct nfs_server *server) ++{ ++ unsigned dssize = pnfs_getiosize(server); ++ ++ /* Set buffer size for data servers */ ++ if (dssize > 0) { ++ server->ds_rsize = server->ds_wsize = ++ nfs_block_size(dssize, NULL); ++ } else { ++ server->ds_wsize = server->wsize; ++ server->ds_rsize = server->rsize; ++ } ++} ++ ++static int ++pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data) ++{ ++ put_lseg(pdata->lseg); ++ pdata->lseg = NULL; ++ pdata->call_ops->rpc_call_done(task, data); ++ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN) ++ return -EAGAIN; ++ if (pdata->pnfsflags & PNFS_NO_RPC) { ++ pdata->call_ops->rpc_release(data); ++ } else { ++ /* ++ * just restore original rpc call ops ++ * rpc_release will be called later by the rpc scheduling layer. ++ */ ++ task->tk_ops = pdata->call_ops; ++ } ++ return 0; ++} ++ ++/* Post-write completion function ++ * Invoked by all layout drivers when write_pagelist is done. ++ * ++ * NOTE: callers set data->pnfsflags PNFS_NO_RPC ++ * so that the NFS cleanup routines perform only the page cache ++ * cleanup. 
++ */ ++static void ++pnfs_write_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ struct nfs4_pnfs_layout_segment range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = wdata->args.offset; ++ range.length = wdata->args.count; ++ _pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true); ++ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode), ++ wdata->pdata.call_ops, wdata->pdata.how); ++} ++ ++static void ++pnfs_writeback_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ /* update last write offset and need layout commit ++ * for non-files layout types (files layout calls ++ * pnfs4_write_done for this) ++ */ ++ if ((pdata->pnfsflags & PNFS_NO_RPC) && ++ data->task.tk_status >= 0 && data->res.count > 0) { ++ struct nfs_inode *nfsi = NFS_I(data->inode); ++ ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++ } ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++ ++static void _pnfs_clear_lseg_from_pages(struct list_head *head) ++{ ++ struct nfs_page *req; ++ ++ list_for_each_entry(req, head, wb_list) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem write function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. ++ * ++ * TODO: Is wdata->how and wdata->args.stable always the same value? 
++ * TODO: It seems in NFS, the server may not do a stable write even ++ * though it was requested (and vice-versa?). To check, it looks ++ * in data->res.verf->committed. Do we need this ability ++ * for non-file layout drivers? ++ */ ++enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *wdata, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ struct inode *inode = wdata->inode; ++ enum pnfs_try_status trypnfs; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg; ++ ++ wdata->pdata.call_ops = call_ops; ++ wdata->pdata.pnfs_error = 0; ++ wdata->pdata.how = how; ++ ++ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, ++ inode->i_ino, wdata->args.count, wdata->args.offset, how); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ wdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ wdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->write_pagelist(wdata, ++ nfs_page_array_len(wdata->args.pgbase, wdata->args.count), ++ how); ++ ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ wdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&wdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* Post-read completion function. 
Invoked by all layout drivers when ++ * read_pagelist is done ++ */ ++static void ++pnfs_read_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ struct nfs4_pnfs_layout_segment range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = rdata->args.offset; ++ range.length = rdata->args.count; ++ _pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true); ++ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode), ++ rdata->pdata.call_ops); ++} ++ ++static void ++pnfs_read_done(struct nfs_read_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem read function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. 
++ */ ++enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *rdata, ++ const struct rpc_call_ops *call_ops) ++{ ++ struct inode *inode = rdata->inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg; ++ enum pnfs_try_status trypnfs; ++ ++ rdata->pdata.call_ops = call_ops; ++ rdata->pdata.pnfs_error = 0; ++ ++ dprintk("%s: Reading ino:%lu %u@%llu\n", ++ __func__, inode->i_ino, rdata->args.count, rdata->args.offset); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ rdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ rdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->read_pagelist(rdata, ++ nfs_page_array_len(rdata->args.pgbase, rdata->args.count)); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ rdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&rdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_READ); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* ++ * This gives the layout driver an opportunity to read in page "around" ++ * the data to be written. It returns 0 on success, otherwise an error code ++ * which will either be passed up to user, or ignored if ++ * some previous part of write succeeded. ++ * Note the range [pos, pos+len-1] is entirely within the page. 
++ */ ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata) ++{ ++ struct pnfs_fsdata *data; ++ int status = 0; ++ ++ dprintk("--> %s: pos=%llu len=%u\n", ++ __func__, (unsigned long long)pos, len); ++ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL); ++ if (!data) { ++ status = -ENOMEM; ++ goto out; ++ } ++ data->lseg = lseg; /* refcount passed into data to be managed there */ ++ status = NFS_SERVER(inode)->pnfs_curr_ld->ld_io_ops->write_begin( ++ lseg, page, pos, len, data); ++ if (status) { ++ kfree(data); ++ data = NULL; ++ } ++out: ++ *fsdata = data; ++ dprintk("<-- %s: status=%d\n", __func__, status); ++ return status; ++} ++ ++/* Return 0 on succes, negative on failure */ ++/* CAREFUL - what happens if copied < len??? */ ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status; ++ ++ status = nfss->pnfs_curr_ld->ld_io_ops->write_end(inode, page, ++ pos, len, copied, lseg); ++ return status; ++} ++ ++/* pNFS Commit callback function for all layout drivers */ ++static void ++pnfs_commit_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ struct nfs4_pnfs_layout_segment range = { ++ .iomode = IOMODE_RW, ++ .offset = data->args.offset, ++ .length = data->args.count, ++ }; ++ dprintk("%s: retrying\n", __func__); ++ _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, ++ true); ++ pnfs_initiate_commit(data, NFS_CLIENT(data->inode), ++ pdata->call_ops, pdata->how, 1); ++ } ++} ++ ++enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int sync) ++{ ++ struct inode 
*inode = data->inode; ++ struct nfs_server *nfss = NFS_SERVER(data->inode); ++ enum pnfs_try_status trypnfs; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ /* We need to account for possibility that ++ * each nfs_page can point to a different lseg (or be NULL). ++ * For the immediate case of whole-file-only layouts, we at ++ * least know there can be only a single lseg. ++ * We still have to account for the possibility of some being NULL. ++ * This will be done by passing the buck to the layout driver. ++ */ ++ data->pdata.call_ops = call_ops; ++ data->pdata.pnfs_error = 0; ++ data->pdata.how = sync; ++ data->pdata.lseg = NULL; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->commit(data, sync); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ data->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ _pnfs_clear_lseg_from_pages(&data->pages); ++ } else ++ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT); ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++void pnfs_cleanup_layoutcommit(struct pnfs_layoutcommit_data *data) ++{ ++ struct nfs_server *nfss = NFS_SERVER(data->args.inode); ++ ++ /* TODO: Maybe we should avoid this by allowing the layout driver ++ * to directly xdr its layout on the wire. ++ */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit) ++ nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit( ++ NFS_I(data->args.inode)->layout, ++ &data->args, data->status); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int ++pnfs_layoutcommit_setup(struct inode *inode, ++ struct pnfs_layoutcommit_data *data, ++ loff_t write_begin_pos, loff_t write_end_pos) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int result = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ data->args.inode = inode; ++ data->args.fh = NFS_FH(inode); ++ data->args.layout_type = nfss->pnfs_curr_ld->id; ++ data->res.fattr = &data->fattr; ++ nfs_fattr_init(&data->fattr); ++ ++ /* TODO: Need to determine the correct values */ ++ data->args.time_modify_changed = 0; ++ ++ /* Set values from inode so it can be reset ++ */ ++ data->args.lseg.iomode = IOMODE_RW; ++ data->args.lseg.offset = write_begin_pos; ++ data->args.lseg.length = write_end_pos - write_begin_pos + 1; ++ data->args.lastbytewritten = min(write_end_pos, ++ i_size_read(inode) - 1); ++ data->args.bitmask = nfss->attr_bitmask; ++ data->res.server = nfss; ++ ++ /* Call layout driver to set the arguments */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit) ++ result = nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit( ++ NFS_I(inode)->layout, &data->args); ++ ++ dprintk("<-- %s Status %d\n", __func__, result); ++ return result; ++} ++ ++/* Issue a async layoutcommit for an inode. 
++ */ ++int ++pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ struct pnfs_layoutcommit_data *data; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ ++ int status = 0; ++ ++ dprintk("%s Begin (sync:%d)\n", __func__, sync); ++ ++ BUG_ON(!has_layout(nfsi)); ++ ++ data = pnfs_layoutcommit_alloc(); ++ if (!data) ++ return -ENOMEM; ++ ++ spin_lock(&inode->i_lock); ++ if (!layoutcommit_needed(nfsi)) { ++ spin_unlock(&inode->i_lock); ++ goto out_free; ++ } ++ ++ /* Clear layoutcommit properties in the inode so ++ * new lc info can be generated ++ */ ++ write_begin_pos = nfsi->layout->pnfs_write_begin_pos; ++ write_end_pos = nfsi->layout->pnfs_write_end_pos; ++ data->cred = nfsi->layout->lo_cred; ++ nfsi->layout->pnfs_write_begin_pos = 0; ++ nfsi->layout->pnfs_write_end_pos = 0; ++ nfsi->layout->lo_cred = NULL; ++ __clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->pnfs_layout_state); ++ pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout); ++ ++ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */ ++ get_layout(NFS_I(inode)->layout); ++ ++ spin_unlock(&inode->i_lock); ++ ++ /* Set up layout commit args */ ++ status = pnfs_layoutcommit_setup(inode, data, write_begin_pos, ++ write_end_pos); ++ if (status) { ++ /* The layout driver failed to set up the layoutcommit */ ++ put_rpccred(data->cred); ++ put_layout(inode); ++ goto out_free; ++ } ++ status = pnfs4_proc_layoutcommit(data, sync); ++out: ++ dprintk("%s end (err:%d)\n", __func__, status); ++ return status; ++out_free: ++ pnfs_layoutcommit_free(data); ++ goto out; ++} ++ ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata) ++{ ++ if (fsdata) { ++ /* lseg refcounting handled directly in nfs_write_end */ ++ kfree(fsdata); ++ } ++} ++ ++/* Callback operations for layout drivers.
++ */ ++struct pnfs_client_operations pnfs_ops = { ++ .nfs_getdevicelist = nfs4_pnfs_getdevicelist, ++ .nfs_getdeviceinfo = nfs4_pnfs_getdeviceinfo, ++ .nfs_readlist_complete = pnfs_read_done, ++ .nfs_writelist_complete = pnfs_writeback_done, ++ .nfs_commit_complete = pnfs_commit_done, ++}; ++ ++EXPORT_SYMBOL(pnfs_unregister_layoutdriver); ++EXPORT_SYMBOL(pnfs_register_layoutdriver); ++ ++ ++/* Device ID cache. Supports one layout type per struct nfs_client */ ++int ++nfs4_alloc_init_deviceid_cache(struct nfs_client *clp, ++ void (*free_callback)(struct kref *)) ++{ ++ struct nfs4_deviceid_cache *c; ++ ++ c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL); ++ if (!c) ++ return -ENOMEM; ++ spin_lock(&clp->cl_lock); ++ if (clp->cl_devid_cache != NULL) { ++ kref_get(&clp->cl_devid_cache->dc_kref); ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [kref [%d]]\n", __func__, ++ atomic_read(&clp->cl_devid_cache->dc_kref.refcount)); ++ kfree(c); ++ } else { ++ int i; ++ ++ spin_lock_init(&c->dc_lock); ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++) ++ INIT_HLIST_HEAD(&c->dc_deviceids[i]); ++ kref_init(&c->dc_kref); ++ c->dc_free_callback = free_callback; ++ clp->cl_devid_cache = c; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [new]\n", __func__); ++ } ++ return 0; ++} ++EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache); ++ ++void ++nfs4_init_deviceid_node(struct nfs4_deviceid *d) ++{ ++ INIT_HLIST_NODE(&d->de_node); ++ kref_init(&d->de_kref); ++} ++EXPORT_SYMBOL(nfs4_init_deviceid_node); ++ ++/* Called from layoutdriver_io_operations->alloc_lseg */ ++void ++nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = d; ++ kref_get(&d->de_kref); ++} ++EXPORT_SYMBOL(nfs4_set_layout_deviceid); ++ ++/* Called from layoutdriver_io_operations->free_lseg */ ++void ++nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l, ++ struct nfs4_deviceid *d, ++ void 
(*free_callback)(struct kref *)) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = NULL; ++ kref_put(&d->de_kref, free_callback); ++} ++EXPORT_SYMBOL(nfs4_unset_layout_deviceid); ++ ++struct nfs4_deviceid * ++nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ rcu_read_unlock(); ++ return d; ++ } ++ } ++ rcu_read_unlock(); ++ return NULL; ++} ++EXPORT_SYMBOL(nfs4_find_deviceid); ++ ++/* ++ * Add or kref_get a deviceid. ++ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new ++ */ ++struct nfs4_deviceid * ++nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(&new->de_id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [discard]\n", __func__); ++ c->dc_free_callback(&new->de_kref); ++ return d; ++ } ++ } ++ hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [new]\n", __func__); ++ return new; ++} ++EXPORT_SYMBOL(nfs4_add_deviceid); ++ ++static int ++nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash, ++ struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (id && memcmp(id, &d->de_id, NFS4_PNFS_DEVICEID4_SIZE)) ++ 
continue; ++ hlist_del_rcu(&d->de_node); ++ spin_unlock(&c->dc_lock); ++ synchronize_rcu(); ++ dprintk("%s [%d]\n", __func__, ++ atomic_read(&d->de_kref.refcount)); ++ kref_put(&d->de_kref, c->dc_free_callback); ++ return 1; ++ } ++ spin_unlock(&c->dc_lock); ++ return 0; ++} ++ ++void ++nfs4_delete_device(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ long hash = nfs4_deviceid_hash(id); ++ ++ nfs4_remove_deviceid(c, hash, id); ++} ++EXPORT_SYMBOL(nfs4_delete_device); ++ ++static void ++nfs4_free_deviceid_cache(struct kref *kref) ++{ ++ struct nfs4_deviceid_cache *cache = ++ container_of(kref, struct nfs4_deviceid_cache, dc_kref); ++ long i; ++ ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) ++ while (nfs4_remove_deviceid(cache, i, NULL)) ++ ; ++ kfree(cache); ++} ++ ++void ++nfs4_put_deviceid_cache(struct nfs_client *clp) ++{ ++ struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache; ++ int refcount; ++ ++ dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); ++ spin_lock(&clp->cl_lock); ++ refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount); ++ if (refcount == 1) ++ clp->cl_devid_cache = NULL; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [%d]\n", __func__, refcount); ++ kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache); ++} ++EXPORT_SYMBOL(nfs4_put_deviceid_cache); +diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h +--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-23 12:09:03.358501440 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-23 12:09:03.358501440 -0400 +@@ -0,0 +1,355 @@ ++/* ++ * fs/nfs/pnfs.h ++ * ++ * pNFS client data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_PNFS_H ++#define FS_NFS_PNFS_H ++ ++#include ++ ++#ifdef CONFIG_NFS_V4_1 ++ ++#include ++#include ++#include ++#include "iostat.h" ++ ++/* nfs4proc.c */ ++extern int nfs4_pnfs_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++extern int nfs4_pnfs_getdeviceinfo(struct nfs_server *server, ++ struct pnfs_device *dev); ++extern int pnfs4_proc_layoutget(struct nfs4_pnfs_layoutget *lgp); ++extern int pnfs4_proc_layoutcommit(struct pnfs_layoutcommit_data *data, ++ int issync); ++extern int pnfs4_proc_layoutreturn(struct nfs4_pnfs_layoutreturn *lrp, bool wait); ++ ++/* pnfs.c */ ++extern const nfs4_stateid zero_stateid; ++ ++void _pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp); ++ ++int _pnfs_return_layout(struct inode *, struct nfs4_pnfs_layout_segment *, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type, bool wait); ++void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); ++void unmount_pnfs_layoutdriver(struct nfs_server *); ++enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, ++ const struct rpc_call_ops *); ++int pnfs_initialize(void); ++void pnfs_uninitialize(void); ++void pnfs_layoutcommit_free(struct pnfs_layoutcommit_data *data); ++void pnfs_cleanup_layoutcommit(struct pnfs_layoutcommit_data *data); ++int pnfs_layoutcommit_inode(struct inode *inode, int sync); ++void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent); ++void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx); ++unsigned int pnfs_getiosize(struct nfs_server *server); ++void pnfs_set_ds_iosize(struct nfs_server *server); ++enum pnfs_try_status 
pnfs_try_to_commit(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, ++ struct nfs_open_context *, struct list_head *, ++ size_t *); ++void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, ++ size_t *); ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); ++void pnfs_get_layout_done(struct nfs4_pnfs_layoutget *, int rpc_status); ++int pnfs_layout_process(struct nfs4_pnfs_layoutget *lgp); ++void pnfs_layout_release(struct pnfs_layout_type *, struct nfs4_pnfs_layout_segment *range); ++void pnfs_set_layout_stateid(struct pnfs_layout_type *lo, ++ const nfs4_stateid *stateid); ++void pnfs_destroy_layout(struct nfs_inode *); ++void pnfs_destroy_all_layouts(struct nfs_client *); ++void put_layout(struct inode *inode); ++void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_type *lo); ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata); ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ ++#define PNFS_EXISTS_LDIO_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_io_ops && \ ++ (srv)->pnfs_curr_ld->ld_io_ops->opname) ++#define PNFS_EXISTS_LDPOLICY_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops->opname) ++ ++#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" ++ ++static inline int lo_fail_bit(u32 iomode) ++{ ++ return iomode == IOMODE_RW ? 
++ NFS_INO_RW_LAYOUT_FAILED : NFS_INO_RO_LAYOUT_FAILED; ++} ++ ++/* Return true if a layout driver is being used for this mountpoint */ ++static inline int pnfs_enabled_sb(struct nfs_server *nfss) ++{ ++ return nfss->pnfs_curr_ld != NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || ++ !fsdata->bypass_eof; ++} ++ ++/* Should the pNFS client commit and return the layout upon a setattr */ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ if (!pnfs_enabled_sb(NFS_SERVER(inode))) ++ return false; ++ return NFS_SERVER(inode)->pnfs_curr_ld->ld_policy_ops->flags & ++ PNFS_LAYOUTRET_ON_SETATTR; ++} ++ ++/* Should the pNFS client commit and return the layout on close ++ */ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout->roc_iomode; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status = 0; ++ ++ *fsdata = lseg; ++ if (lseg && PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ status = _pnfs_write_begin(inode, page, pos, len, lseg, ++ (struct pnfs_fsdata **) fsdata); ++ return status; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_end)) ++ return _pnfs_write_end(inode, page, pos, len, copied, lseg); ++ else ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ 
if (PNFS_EXISTS_LDIO_OP(nfss, write_end_cleanup)) ++ nfss->pnfs_curr_ld->ld_io_ops->write_end_cleanup(filp, fsdata); ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ pnfs_free_fsdata(fsdata); ++ } ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct nfs4_pnfs_layout_segment *lseg, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss) && ++ (type != RETURN_FILE || has_layout(nfsi))) ++ return _pnfs_return_layout(ino, lseg, stateid, type, wait); ++ ++ return 0; ++} ++ ++static inline void pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss)) ++ _pnfs_update_layout(ino, ctx, pos, count, access_type, lsegpp); ++ else { ++ if (lsegpp) ++ *lsegpp = NULL; ++ } ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ if (pnfs_enabled_sb(nfss)) ++ return pnfs_ld_use_rpc_code(nfss->pnfs_curr_ld); ++ ++ return 1; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ return ((struct pnfs_fsdata *) fsdata)->lseg; ++ } ++ return fsdata; ++} ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++} ++ ++static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++} ++ ++static inline void get_lseg(struct 
pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void put_lseg(struct pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void ++pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ if (lsegpp) ++ *lsegpp = NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return 1; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ *fsdata = NULL; ++ return 0; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ return 1; ++} ++ ++static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ return 0; ++} ++ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ return false; ++} ++ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode 
*nfsi) ++{ ++ return 0; ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct nfs4_pnfs_layout_segment *lseg, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ return 0; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ return NULL; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++#endif /* FS_NFS_PNFS_H */ +diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-23 12:08:29.057511533 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-23 12:09:03.359501471 -0400 +@@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) +- goto out; ++ goto out_free; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); +@@ -455,7 +455,7 @@ nfs_proc_symlink(struct inode *dir, stru + */ + if (status == 0) + status = nfs_instantiate(dentry, fh, fattr); +- ++out_free: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); + out: +@@ -694,6 +694,7 @@ const struct nfs_rpc_ops nfs_v2_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, + .file_inode_ops = &nfs_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-23 12:08:29.057511533 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-23 12:09:03.359501471 -0400 +@@ -18,8 +18,12 @@ + #include + #include + #include ++#include ++#include + + #include ++#include ++#include "pnfs.h" + + #include "nfs4_fs.h" + #include "internal.h" +@@ -117,11 +121,14 @@ int nfs_readpage_async(struct nfs_open_c + LIST_HEAD(one_request); + struct nfs_page 
*new; + unsigned int len; ++ struct pnfs_layout_segment *lseg; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); +- new = nfs_create_request(ctx, inode, page, 0, len); ++ pnfs_update_layout(inode, ctx, 0, NFS4_MAX_UINT64, IOMODE_READ, &lseg); ++ new = nfs_create_request(ctx, inode, page, 0, len, lseg); ++ put_lseg(lseg); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); +@@ -155,24 +162,20 @@ static void nfs_readpage_release(struct + nfs_release_request(req); + } + +-/* +- * Set up the NFS read request struct +- */ +-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset) ++int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, +@@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_ + .flags = RPC_TASK_ASYNC | swap_flags, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->read_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_read); ++ ++int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) ++ return pnfs_get_read_status(data); ++ ++ return nfs_initiate_read(data, clnt, call_ops); ++} ++ ++/* ++ * Set up the NFS read request struct ++ */ ++static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + data->req = req; + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -190,27 +230,14 @@ static int nfs_read_rpcsetup(struct nfs_ + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops); + } + + static void +@@ -354,7 +381,14 @@ static void nfs_readpage_retry(struct rp + { + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; ++ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client; + ++#ifdef CONFIG_NFS_V4_1 ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + if (resp->eof || resp->count == argp->count) + return; + +@@ -368,7 +402,10 @@ static void nfs_readpage_retry(struct rp + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; +- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + } + + /* +@@ -409,13 +446,19 @@ static void nfs_readpage_release_partial + void nfs_read_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_read_data *data = calldata; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, + &data->args.seq_args, &data->res.seq_res, + 0, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_read_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_read_partial_ops = { +@@ -568,7 +611,8 
@@ readpage_async_filler(void *data, struct + if (len == 0) + return nfs_return_empty_page(page); + +- new = nfs_create_request(desc->ctx, inode, page, 0, len); ++ new = nfs_create_request(desc->ctx, inode, page, 0, len, ++ desc->pgio->pg_lseg); + if (IS_ERR(new)) + goto out_error; + +@@ -624,6 +668,9 @@ int nfs_readpages(struct file *filp, str + if (ret == 0) + goto read_complete; /* all pages were read */ + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize); ++#endif /* CONFIG_NFS_V4_1 */ + if (rsize < PAGE_CACHE_SIZE) + nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); + else +@@ -632,6 +679,7 @@ int nfs_readpages(struct file *filp, str + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + + nfs_pageio_complete(&pgio); ++ put_lseg(pgio.pg_lseg); + npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); + read_complete: +diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-23 12:08:29.059491391 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-23 12:09:03.361501458 -0400 +@@ -64,6 +64,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -669,6 +670,28 @@ static int nfs_show_options(struct seq_f + + return 0; + } ++#ifdef CONFIG_NFS_V4_1 ++void show_sessions(struct seq_file *m, struct nfs_server *server) ++{ ++ if (nfs4_has_session(server->nfs_client)) ++ seq_printf(m, ",sessions"); ++} ++#else ++void show_sessions(struct seq_file *m, struct nfs_server *server) {} ++#endif ++ ++#ifdef CONFIG_NFS_V4_1 ++void show_pnfs(struct seq_file *m, struct nfs_server *server) ++{ ++ seq_printf(m, ",pnfs="); ++ if (server->pnfs_curr_ld) ++ seq_printf(m, "%s", server->pnfs_curr_ld->name); ++ else ++ seq_printf(m, "not configured"); ++} ++#else /* CONFIG_NFS_V4_1 */ ++void 
show_pnfs(struct seq_file *m, struct nfs_server *server) {} ++#endif /* CONFIG_NFS_V4_1 */ + + /* + * Present statistical information for this VFS mountpoint +@@ -707,6 +730,8 @@ static int nfs_show_stats(struct seq_fil + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); ++ show_sessions(m, nfss); ++ show_pnfs(m, nfss); + } + #endif + +diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-23 12:08:29.060501485 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-23 12:09:03.362419975 -0400 +@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task + struct nfs_unlinkdata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->dir); + +- if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-23 12:08:27.630563929 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-23 12:09:03.364491337 -0400 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + +@@ -28,6 +29,7 @@ + #include "iostat.h" + #include "nfs4_fs.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PAGECACHE + +@@ -59,6 +61,7 @@ struct nfs_write_data *nfs_commitdata_al + } + return p; + } ++EXPORT_SYMBOL(nfs_commitdata_alloc); + + void nfs_commit_free(struct nfs_write_data *p) + { +@@ -66,6 +69,7 @@ void nfs_commit_free(struct nfs_write_da + kfree(p->pagevec); + mempool_free(p, nfs_commit_mempool); + } ++EXPORT_SYMBOL(nfs_commit_free); + + struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) + { +@@ -418,6 +422,17 @@ static void nfs_inode_remove_request(str + 
nfs_clear_request(req); + nfs_release_request(req); + } ++static void ++nfs_mark_request_nopnfs(struct nfs_page *req) ++{ ++ struct pnfs_layout_segment *lseg = req->wb_lseg; ++ ++ if (req->wb_lseg == NULL) ++ return; ++ req->wb_lseg = NULL; ++ put_lseg(lseg); ++ dprintk(" retry through MDS\n"); ++} + + static void + nfs_mark_request_dirty(struct nfs_page *req) +@@ -523,7 +538,7 @@ nfs_need_commit(struct nfs_inode *nfsi) + * The requests are *not* checked to ensure that they form a contiguous set. + */ + static int +-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) ++nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs) + { + struct nfs_inode *nfsi = NFS_I(inode); + int ret; +@@ -531,7 +546,8 @@ nfs_scan_commit(struct inode *inode, str + if (!nfs_need_commit(nfsi)) + return 0; + +- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); ++ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT, ++ use_pnfs); + if (ret > 0) + nfsi->ncommit -= ret; + if (nfs_need_commit(NFS_I(inode))) +@@ -560,7 +576,8 @@ static inline int nfs_scan_commit(struct + static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int bytes) ++ unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + unsigned int rqend; +@@ -585,8 +602,8 @@ static struct nfs_page *nfs_try_to_updat + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ +- if (offset > rqend +- || end < req->wb_offset) ++ if (offset > rqend || end < req->wb_offset || ++ req->wb_lseg != lseg) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) +@@ -634,16 +651,17 @@ out_err: + * already called nfs_flush_incompatible() if necessary. 
+ */ + static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, +- struct page *page, unsigned int offset, unsigned int bytes) ++ struct page *page, unsigned int offset, unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + +- req = nfs_try_to_update_request(inode, page, offset, bytes); ++ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg); + if (req != NULL) + goto out; +- req = nfs_create_request(ctx, inode, page, offset, bytes); ++ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); +@@ -656,23 +674,27 @@ out: + } + + static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, ++ void *fsdata) + { + struct nfs_page *req; + +- req = nfs_setup_write_request(ctx, page, offset, count); ++ req = nfs_setup_write_request(ctx, page, offset, count, lseg); + if (IS_ERR(req)) + return PTR_ERR(req); + nfs_mark_request_dirty(req); + /* Update file length */ +- nfs_grow_file(page, offset, count); ++ if (pnfs_grow_ok(lseg, fsdata)) ++ nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + return 0; + } + +-int nfs_flush_incompatible(struct file *file, struct page *page) ++int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_page *req; +@@ -689,7 +711,10 @@ int nfs_flush_incompatible(struct file * + req = nfs_page_find_request(page); + if (req == NULL) + return 0; +- do_flush = req->wb_page != page || req->wb_context != ctx; ++ do_flush = req->wb_page != page || req->wb_context != ctx || ++ 
req->wb_lock_context->lockowner != current->files || ++ req->wb_lock_context->pid != current->tgid || ++ req->wb_lseg != lseg; + nfs_release_request(req); + if (!do_flush) + return 0; +@@ -716,7 +741,8 @@ static int nfs_write_pageuptodate(struct + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ + int nfs_updatepage(struct file *file, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page->mapping->host; +@@ -741,7 +767,7 @@ int nfs_updatepage(struct file *file, st + offset = 0; + } + +- status = nfs_writepage_setup(ctx, page, offset, count); ++ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); + if (status < 0) + nfs_set_pageerror(page); + +@@ -771,25 +797,21 @@ static int flush_task_priority(int how) + return RPC_PRIORITY_NORMAL; + } + +-/* +- * Set up the argument/result storage required for the RPC call. 
+- */ +-static int nfs_write_rpcsetup(struct nfs_page *req, +- struct nfs_write_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset, +- int how) ++int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, +@@ -800,12 +822,62 @@ static int nfs_write_rpcsetup(struct nfs + }; + int ret = 0; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->write_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ if (how & FLUSH_SYNC) { ++ ret = rpc_wait_for_completion_task(task); ++ if (ret == 0) ++ ret = task->tk_status; ++ } ++ rpc_put_task(task); ++out: ++ return ret; ++} ++EXPORT_SYMBOL(nfs_initiate_write); ++ ++int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_write(data, clnt, call_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int nfs_write_rpcsetup(struct nfs_page *req, ++ struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset, ++ int how) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + data->req = req; + data->inode = inode = req->wb_context->path.dentry->d_inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -813,6 +885,7 @@ static int nfs_write_rpcsetup(struct nfs + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + if (how & FLUSH_STABLE) { + data->args.stable = NFS_DATA_SYNC; +@@ -825,30 +898,7 @@ static int nfs_write_rpcsetup(struct nfs + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- dprintk("NFS: %5u initiated write call " +- "(req %s/%lld, %u bytes @ offset %llu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) { +- ret = PTR_ERR(task); +- goto out; +- } +- if (how & FLUSH_SYNC) { +- ret = rpc_wait_for_completion_task(task); +- if (ret == 0) +- ret = task->tk_status; +- } +- rpc_put_task(task); +-out: +- return ret; ++ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); + } + + /* If a nfs_flush_* function fails, it should remove reqs from @head and +@@ -859,6 +909,7 @@ static void nfs_redirty_request(struct n + { + struct page *page = req->wb_page; + ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +@@ -971,6 +1022,10 @@ static void nfs_pageio_init_write(struct + { + size_t wsize = NFS_SERVER(inode)->wsize; + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_write(pgio, inode, &wsize); ++#endif /* CONFIG_NFS_V4_1 */ ++ + if (wsize < PAGE_CACHE_SIZE) + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); + else +@@ -1036,13 +1091,27 @@ out: + void nfs_write_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_write_data *data = calldata; +- struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(clp, &data->args.seq_args, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) { ++ /* retrying via MDS? 
*/ ++ data->pdata.orig_count = data->args.count; ++ data->args.count = NFS_SERVER(data->inode)->wsize; ++ dprintk("%s: trimmed count %u to wsize %u\n", __func__, ++ data->pdata.orig_count, data->args.count); ++ } else ++ data->pdata.orig_count = 0; ++ ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, ++ &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_write_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_write_partial_ops = { +@@ -1126,10 +1195,11 @@ int nfs_writeback_done(struct rpc_task * + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *clp = server->nfs_client; + int status; + +- dprintk("NFS: %5u nfs_writeback_done (status %d)\n", +- task->tk_pid, task->tk_status); ++ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n", ++ task->tk_pid, task->tk_status, resp->count); + + /* + * ->write_done will attempt to use post-op attributes to detect +@@ -1142,6 +1212,13 @@ int nfs_writeback_done(struct rpc_task * + if (status != 0) + return status; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); ++#ifdef CONFIG_NFS_V4_1 ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && task->tk_status >= 0) { +@@ -1158,7 +1235,7 @@ int nfs_writeback_done(struct rpc_task * + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", +- server->nfs_client->cl_hostname, ++ clp->cl_hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } +@@ -1168,6 +1245,9 @@ int nfs_writeback_done(struct rpc_task * + if (task->tk_status >= 
0 && resp->count < argp->count) { + static unsigned long complain; + ++ dprintk("NFS: short write:" ++ " (resp->count %u) < (argp->count = %u)\n", ++ resp->count, argp->count); + nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + + /* Has the server at least made some progress? */ +@@ -1184,7 +1264,10 @@ int nfs_writeback_done(struct rpc_task * + */ + argp->stable = NFS_FILE_SYNC; + } +- nfs_restart_rpc(task, server->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + return -EAGAIN; + } + if (time_before(complain, jiffies)) { +@@ -1228,40 +1311,73 @@ static void nfs_commitdata_release(void + nfs_commit_free(wdata); + } + +-/* +- * Set up the argument/result storage required for the RPC call. +- */ +-static int nfs_commit_rpcsetup(struct list_head *head, +- struct nfs_write_data *data, +- int how) ++int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct nfs_page *first = nfs_list_entry(head->next); +- struct inode *inode = first->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = first->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, +- .callback_ops = &nfs_commit_ops, ++ .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->commit_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_commit); ++ ++ ++int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs) ++{ ++ if (pnfs && ++ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. ++ */ ++static int nfs_commit_rpcsetup(struct list_head *head, ++ struct nfs_write_data *data, ++ int how, int pnfs) ++{ ++ struct nfs_page *first = nfs_list_entry(head->next); ++ struct inode *inode = first->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + list_splice_init(head, &data->pages); + + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = first->wb_context->cred; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ +@@ -1272,45 +1388,47 @@ static int nfs_commit_rpcsetup(struct li + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); ++ kref_init(&data->refcount); ++ data->parent = NULL; ++ data->args.context = first->wb_context; /* used by commit done */ + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->commit_setup(data, &msg); ++ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops, ++ how, pnfs); ++} + +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++/* Handle memory error during commit */ ++void nfs_mark_list_commit(struct list_head *head) ++{ ++ struct nfs_page *req; + +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ while (!list_empty(head)) { ++ req = nfs_list_entry(head->next); ++ nfs_list_remove_request(req); ++ nfs_mark_request_commit(req); ++ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); ++ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, ++ BDI_RECLAIMABLE); ++ nfs_clear_page_tag_locked(req); ++ } + } ++EXPORT_SYMBOL(nfs_mark_list_commit); + + /* + * Commit dirty pages + */ + static int +-nfs_commit_list(struct inode *inode, struct list_head *head, int how) ++nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs) + { + struct nfs_write_data *data; +- struct nfs_page *req; + + data = nfs_commitdata_alloc(); +- + if (!data) + goto out_bad; + + /* Set up the argument struct */ +- return nfs_commit_rpcsetup(head, data, how); ++ return nfs_commit_rpcsetup(head, data, how, pnfs); + out_bad: +- while (!list_empty(head)) { +- req = nfs_list_entry(head->next); +- nfs_list_remove_request(req); +- nfs_mark_request_commit(req); +- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); +- dec_bdi_stat(req->wb_page->mapping->backing_dev_info, +- BDI_RECLAIMABLE); +- nfs_clear_page_tag_locked(req); +- } ++ nfs_mark_list_commit(head); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; + } +@@ -1330,6 +1448,19 @@ static void nfs_commit_done(struct rpc_t + return; + } + ++static inline void nfs_commit_cleanup(struct kref *kref) ++{ ++ struct nfs_write_data *data; ++ ++ data = container_of(kref, struct nfs_write_data, refcount); ++ /* Clear lock only when all cloned commits are finished */ ++ if 
(data->parent) ++ kref_put(&data->parent->refcount, nfs_commit_cleanup); ++ else ++ nfs_commit_clear_lock(NFS_I(data->inode)); ++ nfs_commitdata_release(data); ++} ++ + static void nfs_commit_release(void *calldata) + { + struct nfs_write_data *data = calldata; +@@ -1347,6 +1478,11 @@ static void nfs_commit_release(void *cal + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { ++ if (req->wb_lseg) { ++ nfs_mark_request_nopnfs(req); ++ nfs_mark_request_dirty(req); ++ goto next; ++ } + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); +@@ -1363,12 +1499,12 @@ static void nfs_commit_release(void *cal + } + /* We have a mismatch. Write the page again */ + dprintk(" mismatch\n"); ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + next: + nfs_clear_page_tag_locked(req); + } +- nfs_commit_clear_lock(NFS_I(data->inode)); +- nfs_commitdata_release(calldata); ++ kref_put(&data->refcount, nfs_commit_cleanup); + } + + static const struct rpc_call_ops nfs_commit_ops = { +@@ -1384,21 +1520,22 @@ int nfs_commit_inode(struct inode *inode + LIST_HEAD(head); + int may_wait = how & FLUSH_SYNC; + int res = 0; ++ int use_pnfs = 0; + + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out_mark_dirty; + spin_lock(&inode->i_lock); +- res = nfs_scan_commit(inode, &head, 0, 0); ++ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs); + spin_unlock(&inode->i_lock); + if (res) { +- int error = nfs_commit_list(inode, &head, how); ++ int error = nfs_commit_list(inode, &head, how, use_pnfs); + if (error < 0) + return error; +- if (may_wait) ++ if (may_wait) { + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); +- else ++ } else + goto out_mark_dirty; + } else + nfs_commit_clear_lock(NFS_I(inode)); +@@ -1451,7 +1588,18 @@ static int nfs_commit_unstable_pages(str + + int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) + { +- 
return nfs_commit_unstable_pages(inode, wbc); ++ int ret; ++ ret = nfs_commit_unstable_pages(inode, wbc); ++ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) { ++ int err, sync = wbc->sync_mode; ++ ++ if (wbc->nonblocking || wbc->for_background) ++ sync = 0; ++ err = pnfs_layoutcommit_inode(inode, sync); ++ if (err < 0) ++ ret = err; ++ } ++ return ret; + } + + /* +@@ -1459,6 +1607,7 @@ int nfs_write_inode(struct inode *inode, + */ + int nfs_wb_all(struct inode *inode) + { ++ int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, +@@ -1466,7 +1615,8 @@ int nfs_wb_all(struct inode *inode) + .range_end = LLONG_MAX, + }; + +- return sync_inode(inode, &wbc); ++ ret = sync_inode(inode, &wbc); ++ return ret; + } + + int nfs_wb_page_cancel(struct inode *inode, struct page *page) +diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/include/linux/exportfs.h +--- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-23 12:09:03.365501459 -0400 +@@ -2,6 +2,7 @@ + #define LINUX_EXPORTFS_H 1 + + #include ++#include + + struct dentry; + struct inode; +@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_pare + struct fid *fid, int fh_len, int fh_type, + struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); + ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct pnfs_filelayout_device; ++struct pnfs_filelayout_layout; ++ ++extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev); ++extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct list_head; ++ ++extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes); ++ ++extern enum nfsstat4 
blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *layouts); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_PNFSD) ++#include ++ ++struct pnfsd_cb_operations; ++ ++struct pnfsd_cb_ctl { ++ spinlock_t lock; ++ struct module *module; ++ const struct pnfsd_cb_operations *cb_op; ++}; ++ ++/* in expfs.c so that file systems can depend on it */ ++extern struct pnfsd_cb_ctl pnfsd_cb_ctl; ++ ++static inline int ++pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ int ret = -ENOENT; ++ ++ spin_lock(&pnfsd_cb_ctl.lock); ++ if (!pnfsd_cb_ctl.cb_op) ++ goto out; ++ if (!try_module_get(pnfsd_cb_ctl.module)) ++ goto out; ++ ctl->cb_op = pnfsd_cb_ctl.cb_op; ++ ctl->module = pnfsd_cb_ctl.module; ++ ret = 0; ++out: ++ spin_unlock(&pnfsd_cb_ctl.lock); ++ return ret; ++} ++ ++static inline void ++pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ module_put(ctl->module); ++} ++#endif /* CONFIG_PNFSD */ + #endif /* LINUX_EXPORTFS_H */ +diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-23 12:09:03.367491365 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-23 12:09:03.367491365 -0400 +@@ -0,0 +1,141 @@ ++#ifndef _LINUX_EXP_XDR_H ++#define _LINUX_EXP_XDR_H ++ ++#include ++#include ++#include ++ ++struct exp_xdr_stream { ++ __be32 *p; ++ __be32 *end; ++}; ++ ++/** ++ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline size_t ++exp_xdr_qwords(__u32 nbytes) ++{ ++ return DIV_ROUND_UP(nbytes, 4); ++} ++ ++/** ++ * exp_xdr_qbytes - Calculate the number of bytes holding qwords ++ * @qwords: number of quad-words to encode ++ */ ++static inline size_t ++exp_xdr_qbytes(size_t qwords) ++{ ++ return qwords << 2; ++} ++ ++/** ++ * exp_xdr_reserve_space - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nbytes: 
number of bytes to reserve ++ * ++ * Checks that we have enough buffer space to encode 'nbytes' more ++ * bytes of data. If so, update the xdr stream. ++ */ ++static inline __be32 * ++exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes) ++{ ++ __be32 *p = xdr->p; ++ __be32 *q; ++ ++ /* align nbytes on the next 32-bit boundary */ ++ q = p + exp_xdr_qwords(nbytes); ++ if (unlikely(q > xdr->end || q < p)) ++ return NULL; ++ xdr->p = q; ++ return p; ++} ++ ++/** ++ * exp_xdr_reserve_qwords - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @qwords: number of quad words (u32's) to reserve ++ */ ++static inline __be32 * ++exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords) ++{ ++ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords)); ++} ++ ++/** ++ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u32(__be32 *p, __u32 val) ++{ ++ *p = cpu_to_be32(val); ++ return p + 1; ++} ++ ++/** ++ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u64(__be32 *p, __u64 val) ++{ ++ put_unaligned_be64(val, p); ++ return p + 2; ++} ++ ++/** ++ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the array of bytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ if (likely(nbytes != 0)) { ++ unsigned int qwords = exp_xdr_qwords(nbytes); ++ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes; ++ ++ memcpy(p, ptr, nbytes); ++ if (padding != 0) ++ memset((char *)p + nbytes, 0, padding); ++ p += qwords; ++ } ++ return p; ++} ++ ++/** ++ * exp_xdr_encode_opaque - Encode an opaque type onto a
xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the opaque array ++ * @nbytes: number of bytes to encode ++ * ++ * Encodes the 32-bit opaque size in bytes followed by the opaque value. ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ p = exp_xdr_encode_u32(p, nbytes); ++ return exp_xdr_encode_bytes(p, ptr, nbytes); ++} ++ ++/** ++ * exp_xdr_encode_opaque_len - Encode the opaque length onto a xdr stream ++ * @lenp: pointer to the opaque length destination ++ * @endp: pointer to the end of the opaque array ++ * ++ * Encodes the 32-bit opaque size in bytes given the start and end pointers ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp) ++{ ++ size_t nbytes = (char *)endp - (char *)(lenp + 1); ++ ++ exp_xdr_encode_u32(lenp, nbytes); ++ return lenp + 1 + exp_xdr_qwords(nbytes); ++} ++#endif /* _LINUX_EXP_XDR_H */ +diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-23 12:08:29.021511898 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-23 12:09:03.369481147 -0400 +@@ -387,6 +387,7 @@ struct inodes_stat_t { + #include + + struct export_operations; ++struct pnfs_export_operations; + struct hd_geometry; + struct iovec; + struct nameidata; +@@ -1329,6 +1330,7 @@ struct super_block { + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; ++ const struct pnfs_export_operations *s_pnfs_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; +diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h +--- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-23 12:09:03.371491472 -0400 +@@ -17,7 +17,10 @@ + + #define NFS4_BITMAP_SIZE 2 + #define
NFS4_VERIFIER_SIZE 8 +-#define NFS4_STATEID_SIZE 16 ++#define NFS4_CLIENTID_SIZE 8 ++#define NFS4_STATEID_SEQID_SIZE 4 ++#define NFS4_STATEID_OTHER_SIZE 12 ++#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE) + #define NFS4_FHSIZE 128 + #define NFS4_MAXPATHLEN PATH_MAX + #define NFS4_MAXNAMLEN NAME_MAX +@@ -119,6 +122,13 @@ + #define EXCHGID4_FLAG_MASK_A 0x40070003 + #define EXCHGID4_FLAG_MASK_R 0x80070003 + ++static inline bool ++is_ds_only_session(u32 exchange_flags) ++{ ++ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS; ++ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS; ++} ++ + #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 +@@ -166,8 +176,25 @@ struct nfs4_acl { + struct nfs4_ace aces[0]; + }; + ++struct nfs4_fsid { ++ u64 major; ++ u64 minor; ++}; ++ + typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; +-typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid; ++typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid; ++ ++struct nfs41_stateid { ++ __be32 seqid; ++ char other[NFS4_STATEID_OTHER_SIZE]; ++} __attribute__ ((packed)); ++ ++typedef struct { ++ union { ++ char data[NFS4_STATEID_SIZE]; ++ struct nfs41_stateid stateid; ++ } u; ++} nfs4_stateid; + + enum nfs_opnum4 { + OP_ACCESS = 3, +@@ -471,6 +498,8 @@ enum lock_type4 { + #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) + #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) + #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) ++#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) ++#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) + + #define NFSPROC4_NULL 0 + #define NFSPROC4_COMPOUND 1 +@@ -523,6 +552,7 @@ enum { + NFSPROC4_CLNT_GETACL, + NFSPROC4_CLNT_SETACL, + NFSPROC4_CLNT_FS_LOCATIONS, ++ NFSPROC4_CLNT_RELEASE_LOCKOWNER, + + /* nfs41 */ + NFSPROC4_CLNT_EXCHANGE_ID, +@@ -531,6 +561,13 @@ enum { + 
NFSPROC4_CLNT_SEQUENCE, + NFSPROC4_CLNT_GET_LEASE_TIME, + NFSPROC4_CLNT_RECLAIM_COMPLETE, ++ NFSPROC4_CLNT_PNFS_LAYOUTGET, ++ NFSPROC4_CLNT_PNFS_LAYOUTCOMMIT, ++ NFSPROC4_CLNT_PNFS_LAYOUTRETURN, ++ NFSPROC4_CLNT_PNFS_GETDEVICELIST, ++ NFSPROC4_CLNT_PNFS_GETDEVICEINFO, ++ NFSPROC4_CLNT_PNFS_WRITE, ++ NFSPROC4_CLNT_PNFS_COMMIT, + }; + + /* nfs41 types */ +@@ -549,6 +586,43 @@ enum state_protect_how4 { + SP4_SSV = 2 + }; + ++enum pnfs_layouttype { ++ LAYOUT_NFSV4_1_FILES = 1, ++ LAYOUT_OSD2_OBJECTS = 2, ++ LAYOUT_BLOCK_VOLUME = 3, ++}; ++ ++/* used for both layout return and recall */ ++enum pnfs_layoutreturn_type { ++ RETURN_FILE = 1, ++ RETURN_FSID = 2, ++ RETURN_ALL = 3 ++}; ++ ++enum pnfs_iomode { ++ IOMODE_READ = 1, ++ IOMODE_RW = 2, ++ IOMODE_ANY = 3, ++}; ++ ++enum pnfs_notify_deviceid_type4 { ++ NOTIFY_DEVICEID4_CHANGE = 1 << 1, ++ NOTIFY_DEVICEID4_DELETE = 1 << 2, ++}; ++ ++#define NFL4_UFLG_MASK 0x0000003F ++#define NFL4_UFLG_DENSE 0x00000001 ++#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 ++#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0 ++ ++/* Encoded in the loh_body field of type layouthint4 */ ++enum filelayout_hint_care4 { ++ NFLH4_CARE_DENSE = NFL4_UFLG_DENSE, ++ NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS, ++ NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040, ++ NFLH4_CARE_STRIPE_COUNT = 0x00000080 ++}; ++ + #endif + #endif + +diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-23 12:09:03.372501550 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-23 12:09:03.372501550 -0400 +@@ -0,0 +1,330 @@ ++/* ++ * include/linux/nfs4_pnfs.h ++ * ++ * Common data structures needed by the pnfs client and pnfs layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef LINUX_NFS4_PNFS_H ++#define LINUX_NFS4_PNFS_H ++ ++#include ++#include ++ ++enum pnfs_try_status { ++ PNFS_ATTEMPTED = 0, ++ PNFS_NOT_ATTEMPTED = 1, ++}; ++ ++#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 ++ ++/* Per-layout driver specific registration structure */ ++struct pnfs_layoutdriver_type { ++ const u32 id; ++ const char *name; ++ struct layoutdriver_io_operations *ld_io_ops; ++ struct layoutdriver_policy_operations *ld_policy_ops; ++}; ++ ++struct pnfs_fsdata { ++ int bypass_eof; ++ struct pnfs_layout_segment *lseg; ++ void *private; ++}; ++ ++#if defined(CONFIG_NFS_V4_1) ++ ++static inline struct nfs_inode * ++PNFS_NFS_INODE(struct pnfs_layout_type *lo) ++{ ++ return NFS_I(lo->lo_inode); ++} ++ ++static inline struct inode * ++PNFS_INODE(struct pnfs_layout_type *lo) ++{ ++ return lo->lo_inode; ++} ++ ++static inline struct nfs_server * ++PNFS_NFS_SERVER(struct pnfs_layout_type *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo)); ++} ++ ++static inline struct pnfs_layoutdriver_type * ++PNFS_LD(struct pnfs_layout_type *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo))->pnfs_curr_ld; ++} ++ ++static inline struct layoutdriver_io_operations * ++PNFS_LD_IO_OPS(struct pnfs_layout_type *lo) ++{ ++ return PNFS_LD(lo)->ld_io_ops; ++} ++ ++static inline struct layoutdriver_policy_operations * ++PNFS_LD_POLICY_OPS(struct pnfs_layout_type *lo) ++{ ++ return PNFS_LD(lo)->ld_policy_ops; ++} ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout != NULL; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return has_layout(nfsi) && ++ test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->pnfs_layout_state); ++} ++ ++extern void put_lseg(struct pnfs_layout_segment *lseg); ++extern void get_lseg(struct pnfs_layout_segment *lseg); ++ ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return false; ++} ++ ++static inline bool 
++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++struct pnfs_layout_segment { ++ struct list_head fi_list; ++ struct nfs4_pnfs_layout_segment range; ++ struct kref kref; ++ bool valid; ++ struct pnfs_layout_type *layout; ++ struct nfs4_deviceid *deviceid; ++ u8 ld_data[]; /* layout driver private data */ ++}; ++ ++static inline void * ++LSEG_LD_DATA(struct pnfs_layout_segment *lseg) ++{ ++ return lseg->ld_data; ++} ++ ++/* Layout driver I/O operations. ++ * Either the pagecache or non-pagecache read/write operations must be implemented ++ */ ++struct layoutdriver_io_operations { ++ /* Functions that use the pagecache. ++ * If use_pagecache == 1, then these functions must be implemented. ++ */ ++ /* read and write pagelist should return just 0 (to indicate that ++ * the layout code has taken control) or 1 (to indicate that the ++ * layout code wishes to fall back to normal nfs.) If 0 is returned, ++ * information can be passed back through nfs_data->res and ++ * nfs_data->task.tk_status, and the appropriate pnfs done function ++ * MUST be called. ++ */ ++ enum pnfs_try_status ++ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages); ++ enum pnfs_try_status ++ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how); ++ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page, ++ loff_t pos, unsigned count, ++ struct pnfs_fsdata *fsdata); ++ int (*write_end)(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ void (*write_end_cleanup)(struct file *filp, ++ struct pnfs_fsdata *fsdata); ++ ++ /* Consistency ops */ ++ /* 2 problems: ++ * 1) the page list contains nfs_pages, NOT pages ++ * 2) currently the NFS code doesn't create a page array (as it does with read/write) ++ */ ++ enum pnfs_try_status ++ (*commit) (struct nfs_write_data *nfs_data, int how); ++ ++ /* Layout information. 
For each inode, alloc_layout is executed once to retrieve an ++ * inode specific layout structure. Each subsequent layoutget operation results in ++ * a set_layout call to set the opaque layout in the layout driver.*/ ++ struct pnfs_layout_type * (*alloc_layout) (struct inode *inode); ++ void (*free_layout) (struct pnfs_layout_type *); ++ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_type *layoutid, struct nfs4_pnfs_layoutget_res *lgr); ++ void (*free_lseg) (struct pnfs_layout_segment *lseg); ++ ++ int (*setup_layoutcommit) (struct pnfs_layout_type *layoutid, ++ struct pnfs_layoutcommit_arg *args); ++ ++ void (*encode_layoutcommit) (struct pnfs_layout_type *layoutid, ++ struct xdr_stream *xdr, ++ const struct pnfs_layoutcommit_arg *args); ++ void (*cleanup_layoutcommit) (struct pnfs_layout_type *layoutid, ++ struct pnfs_layoutcommit_arg *args, ++ int status); ++ void (*encode_layoutreturn) (struct pnfs_layout_type *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_pnfs_layoutreturn_arg *args); ++ ++ /* Registration information for a new mounted file system ++ */ ++ int (*initialize_mountpoint) (struct nfs_server *, ++ const struct nfs_fh * mntfh); ++ int (*uninitialize_mountpoint) (struct nfs_server *server); ++}; ++ ++enum layoutdriver_policy_flags { ++ /* Should the full nfs rpc cleanup code be used after io */ ++ PNFS_USE_RPC_CODE = 1 << 0, ++ ++ /* Should the NFS req. gather algorithm cross stripe boundaries? */ ++ PNFS_GATHER_ACROSS_STRIPES = 1 << 1, ++ ++ /* Should the pNFS client commit and return the layout upon a setattr */ ++ PNFS_LAYOUTRET_ON_SETATTR = 1 << 3, ++}; ++ ++struct layoutdriver_policy_operations { ++ unsigned flags; ++ ++ /* The stripe size of the file system */ ++ ssize_t (*get_stripesize) (struct pnfs_layout_type *layoutid); ++ ++ /* test for nfs page cache coalescing */ ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++ ++ /* Retrieve the block size of the file system.
++ * If gather_across_stripes == 1, then the file system will gather ++ * requests into the block size. ++ * TODO: Where will the layout driver get this info? It is hard ++ * coded in PVFS2. ++ */ ++ ssize_t (*get_blocksize) (void); ++}; ++ ++/* Should the full nfs rpc cleanup code be used after io */ ++static inline int ++pnfs_ld_use_rpc_code(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_USE_RPC_CODE; ++} ++ ++/* Should the NFS req. gather algorithm cross stripe boundaries? */ ++static inline int ++pnfs_ld_gather_across_stripes(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_GATHER_ACROSS_STRIPES; ++} ++ ++struct pnfs_device { ++ struct pnfs_deviceid dev_id; ++ unsigned int layout_type; ++ unsigned int mincount; ++ struct page **pages; ++ void *area; ++ unsigned int pgbase; ++ unsigned int pglen; ++ unsigned int dev_notify_types; ++}; ++ ++struct pnfs_devicelist { ++ unsigned int eof; ++ unsigned int num_devs; ++ struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; ++}; ++ ++/* ++ * Device ID RCU cache. A device ID is unique per client ID and layout type. 
++ */ ++#define NFS4_DEVICE_ID_HASH_BITS 5 ++#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) ++#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) ++ ++static inline u32 ++nfs4_deviceid_hash(struct pnfs_deviceid *id) ++{ ++ unsigned char *cptr = (unsigned char *)id->data; ++ unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE; ++ u32 x = 0; ++ ++ while (nbytes--) { ++ x *= 37; ++ x += *cptr++; ++ } ++ return x & NFS4_DEVICE_ID_HASH_MASK; ++} ++ ++struct nfs4_deviceid_cache { ++ spinlock_t dc_lock; ++ struct kref dc_kref; ++ void (*dc_free_callback)(struct kref *); ++ struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; ++}; ++ ++/* Device ID cache node */ ++struct nfs4_deviceid { ++ struct hlist_node de_node; ++ struct pnfs_deviceid de_id; ++ struct kref de_kref; ++}; ++ ++extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_put_deviceid_cache(struct nfs_client *); ++extern void nfs4_init_deviceid_node(struct nfs4_deviceid *); ++extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid_cache *, ++ struct nfs4_deviceid *); ++extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *); ++extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_delete_device(struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++ ++/* pNFS client callback functions. ++ * These operations allow the layout driver to access pNFS client ++ * specific information or call pNFS client->server operations. 
++ * E.g., getdeviceinfo, I/O callbacks, etc ++ */ ++struct pnfs_client_operations { ++ int (*nfs_getdevicelist) (struct nfs_server *, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++ int (*nfs_getdeviceinfo) (struct nfs_server *, ++ struct pnfs_device *dev); ++ ++ /* Post read callback. */ ++ void (*nfs_readlist_complete) (struct nfs_read_data *nfs_data); ++ ++ /* Post write callback. */ ++ void (*nfs_writelist_complete) (struct nfs_write_data *nfs_data); ++ ++ /* Post commit callback. */ ++ void (*nfs_commit_complete) (struct nfs_write_data *nfs_data); ++ void (*nfs_return_layout) (struct inode *); ++}; ++ ++extern struct pnfs_client_operations pnfs_ops; ++ ++extern struct pnfs_client_operations *pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); ++extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); ++ ++#define NFS4_PNFS_MAX_LAYOUTS 4 ++#define NFS4_PNFS_PRIVATE_LAYOUT 0x80000000 ++ ++#endif /* LINUX_NFS4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-23 12:09:03.373491892 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-23 12:09:03.374491393 -0400 +@@ -0,0 +1,101 @@ ++#ifndef NFSD4_BLOCK ++#define NFSD4_BLOCK ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_BLOCK_SUCCESS 1 ++#define PNFS_BLOCK_FAILURE 0 ++ ++#define PNFS_BLOCK_CTL_START 1 ++#define PNFS_BLOCK_CTL_STOP 2 ++#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current ++ * version from kernel via an upcall. 
++ */ ++ ++#define PNFS_UPCALL_MSG_STOP 0 ++#define PNFS_UPCALL_MSG_GETSIG 1 ++#define PNFS_UPCALL_MSG_GETSLICE 2 ++#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume ++#define PNFS_UPCALL_MSG_DMGET 4 ++#define PNFS_UPCALL_MSG_VERS 5 ++ ++#define PNFS_UPCALL_VERS 8 ++ ++typedef struct stripe_dev { ++ int major, ++ minor, ++ offset; ++} stripe_dev_t; ++ ++typedef struct bl_comm_res { ++ int res_status; ++ union { ++ struct { ++ long long start, ++ length; ++ } slice; ++ struct { ++ int num_stripes, ++ stripe_size; ++ stripe_dev_t devs[]; ++ } stripe; ++ struct { ++ long long sector; ++ int offset, ++ len; ++ char sig[]; ++ } sig; ++ int vers, ++ dm_vol; ++ } u; ++} bl_comm_res_t; ++ ++typedef struct bl_comm_msg { ++ int msg_type, ++ msg_status; ++ union { ++ dev_t msg_dev; ++ int msg_vers; ++ } u; ++ bl_comm_res_t *msg_res; ++} bl_comm_msg_t; ++ ++#ifdef __KERNEL__ ++ ++typedef struct bl_comm { ++ /* ---- protects access to this structure ---- */ ++ struct mutex lock; ++ /* ---- protects access to rpc pipe ---- */ ++ struct mutex pipe_lock; ++ struct dentry *pipe_dentry; ++ wait_queue_head_t pipe_wq; ++ bl_comm_msg_t msg; ++} bl_comm_t; ++ ++int pnfs_block_enabled(struct inode *, int); ++int bl_layout_type(struct super_block *sb); ++int bl_getdeviceiter(struct super_block *, u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int bl_layoutcommit(struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++int bl_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len); ++int bl_init_proc(void); ++int bl_upcall(bl_comm_t *, 
bl_comm_msg_t *, bl_comm_res_t **); ++ ++extern bl_comm_t *bl_comm_global; // Ugly... ++#endif /* __KERNEL__ */ ++ ++#endif /* NFSD4_BLOCK */ ++ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-23 12:09:03.375501481 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-23 12:09:03.375501481 -0400 +@@ -0,0 +1,345 @@ ++/* ++ * include/linux/nfsd4_spnfs.h ++ * ++ * spNFS - simple pNFS implementation with userspace daemon ++ * ++ */ ++ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++******************************************************************************/ ++ ++#ifndef NFS_SPNFS_H ++#define NFS_SPNFS_H ++ ++ ++#ifdef __KERNEL__ ++#include "exportfs.h" ++#include "sunrpc/svc.h" ++#include "nfsd/nfsfh.h" ++#else ++#include ++#endif /* __KERNEL__ */ ++ ++#define SPNFS_STATUS_INVALIDMSG 0x01 ++#define SPNFS_STATUS_AGAIN 0x02 ++#define SPNFS_STATUS_FAIL 0x04 ++#define SPNFS_STATUS_SUCCESS 0x08 ++ ++#define SPNFS_TYPE_LAYOUTGET 0x01 ++#define SPNFS_TYPE_LAYOUTCOMMIT 0x02 ++#define SPNFS_TYPE_LAYOUTRETURN 0x03 ++#define SPNFS_TYPE_GETDEVICEITER 0x04 ++#define SPNFS_TYPE_GETDEVICEINFO 0x05 ++#define SPNFS_TYPE_SETATTR 0x06 ++#define SPNFS_TYPE_OPEN 0x07 ++#define SPNFS_TYPE_CLOSE 0x08 ++#define SPNFS_TYPE_CREATE 0x09 ++#define SPNFS_TYPE_REMOVE 0x0a ++#define SPNFS_TYPE_COMMIT 0x0b ++#define SPNFS_TYPE_READ 0x0c ++#define SPNFS_TYPE_WRITE 0x0d ++ ++#define SPNFS_MAX_DEVICES 1 ++#define SPNFS_MAX_DATA_SERVERS 16 ++#define SPNFS_MAX_IO 512 ++ ++/* layout */ ++struct spnfs_msg_layoutget_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_filelayout_list { ++ u_int32_t fh_len; ++ unsigned char fh_val[128]; /* DMXXX fix this const */ ++}; ++ ++struct spnfs_msg_layoutget_res { ++ int status; ++ u_int64_t devid; ++ u_int64_t stripe_size; ++ u_int32_t stripe_type; ++ u_int32_t stripe_count; ++ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++/* layoutcommit */ ++struct spnfs_msg_layoutcommit_args { ++ unsigned long inode; ++ unsigned long generation; ++ u_int64_t file_size; ++}; ++ ++struct spnfs_msg_layoutcommit_res { ++ int status; ++}; ++ ++/* layoutreturn */ ++/* No op for the daemon */ ++/* ++struct spnfs_msg_layoutreturn_args { ++}; ++ ++struct spnfs_msg_layoutreturn_res { ++}; ++*/ ++ ++/* getdeviceiter */ ++struct spnfs_msg_getdeviceiter_args { ++ unsigned long inode; ++ u_int64_t cookie; ++ u_int64_t verf; ++}; ++ ++struct spnfs_msg_getdeviceiter_res { ++ int status; ++ u_int64_t devid; ++ 
u_int64_t cookie; ++ u_int64_t verf; ++ u_int32_t eof; ++}; ++ ++/* getdeviceinfo */ ++struct spnfs_data_server { ++ u_int32_t dsid; ++ char netid[5]; ++ char addr[29]; ++}; ++ ++struct spnfs_device { ++ u_int64_t devid; ++ int dscount; ++ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++struct spnfs_msg_getdeviceinfo_args { ++ u_int64_t devid; ++}; ++ ++struct spnfs_msg_getdeviceinfo_res { ++ int status; ++ struct spnfs_device devinfo; ++}; ++ ++/* setattr */ ++struct spnfs_msg_setattr_args { ++ unsigned long inode; ++ unsigned long generation; ++ int file_size; ++}; ++ ++struct spnfs_msg_setattr_res { ++ int status; ++}; ++ ++/* open */ ++struct spnfs_msg_open_args { ++ unsigned long inode; ++ unsigned long generation; ++ int create; ++ int createmode; ++ int truncate; ++}; ++ ++struct spnfs_msg_open_res { ++ int status; ++}; ++ ++/* close */ ++/* No op for daemon */ ++struct spnfs_msg_close_args { ++ int x; ++}; ++ ++struct spnfs_msg_close_res { ++ int y; ++}; ++ ++/* create */ ++/* ++struct spnfs_msg_create_args { ++ int x; ++}; ++ ++struct spnfs_msg_create_res { ++ int y; ++}; ++*/ ++ ++/* remove */ ++struct spnfs_msg_remove_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_msg_remove_res { ++ int status; ++}; ++ ++/* commit */ ++/* ++struct spnfs_msg_commit_args { ++ int x; ++}; ++ ++struct spnfs_msg_commit_res { ++ int y; ++}; ++*/ ++ ++/* read */ ++struct spnfs_msg_read_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++}; ++ ++struct spnfs_msg_read_res { ++ int status; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++/* write */ ++struct spnfs_msg_write_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++struct spnfs_msg_write_res { ++ int status; ++}; ++ ++/* bundle args and responses */ ++union spnfs_msg_args { ++ struct spnfs_msg_layoutget_args layoutget_args; ++ struct 
spnfs_msg_layoutcommit_args layoutcommit_args; ++/* ++ struct spnfs_msg_layoutreturn_args layoutreturn_args; ++*/ ++ struct spnfs_msg_getdeviceiter_args getdeviceiter_args; ++ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args; ++ struct spnfs_msg_setattr_args setattr_args; ++ struct spnfs_msg_open_args open_args; ++ struct spnfs_msg_close_args close_args; ++/* ++ struct spnfs_msg_create_args create_args; ++*/ ++ struct spnfs_msg_remove_args remove_args; ++/* ++ struct spnfs_msg_commit_args commit_args; ++*/ ++ struct spnfs_msg_read_args read_args; ++ struct spnfs_msg_write_args write_args; ++}; ++ ++union spnfs_msg_res { ++ struct spnfs_msg_layoutget_res layoutget_res; ++ struct spnfs_msg_layoutcommit_res layoutcommit_res; ++/* ++ struct spnfs_msg_layoutreturn_res layoutreturn_res; ++*/ ++ struct spnfs_msg_getdeviceiter_res getdeviceiter_res; ++ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res; ++ struct spnfs_msg_setattr_res setattr_res; ++ struct spnfs_msg_open_res open_res; ++ struct spnfs_msg_close_res close_res; ++/* ++ struct spnfs_msg_create_res create_res; ++*/ ++ struct spnfs_msg_remove_res remove_res; ++/* ++ struct spnfs_msg_commit_res commit_res; ++*/ ++ struct spnfs_msg_read_res read_res; ++ struct spnfs_msg_write_res write_res; ++}; ++ ++/* a spnfs message, args and response */ ++struct spnfs_msg { ++ unsigned char im_type; ++ unsigned char im_status; ++ union spnfs_msg_args im_args; ++ union spnfs_msg_res im_res; ++}; ++ ++/* spnfs configuration info */ ++struct spnfs_config { ++ unsigned char dense_striping; ++ int stripe_size; ++ int num_ds; ++ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */ ++}; ++ ++#if defined(__KERNEL__) && defined(CONFIG_SPNFS) ++ ++#include ++ ++/* pipe mgmt structure. 
messages flow through here */ ++struct spnfs { ++ struct dentry *spnfs_dentry; /* dentry for pipe */ ++ wait_queue_head_t spnfs_wq; ++ struct spnfs_msg spnfs_im; /* spnfs message */ ++ struct mutex spnfs_lock; /* Serializes upcalls */ ++ struct mutex spnfs_plock; ++}; ++ ++struct nfsd4_open; ++ ++int spnfs_layout_type(struct super_block *); ++enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int spnfs_layoutcommit(void); ++int spnfs_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int spnfs_getdeviceiter(struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++int spnfs_setattr(void); ++int spnfs_open(struct inode *, struct nfsd4_open *); ++int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *); ++int spnfs_remove(unsigned long, unsigned long); ++__be32 spnfs_read(struct inode *, loff_t, unsigned long *, ++ int, struct svc_rqst *); ++__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *); ++int spnfs_getfh(int, struct nfs_fh *); ++int spnfs_test_layoutrecall(char *, u64, u64); ++int spnfs_layoutrecall(struct inode *, int, u64, u64); ++ ++int nfsd_spnfs_new(void); ++void nfsd_spnfs_delete(void); ++int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *); ++int spnfs_enabled(void); ++int spnfs_init_proc(void); ++ ++extern struct spnfs_config *spnfs_config; ++ ++#endif /* __KERNEL__ && CONFIG_SPNFS */ ++ ++#endif /* NFS_SPNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h +--- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-23 12:09:03.376401789 
-0400 +@@ -29,6 +29,7 @@ + #ifdef __KERNEL__ + + #include ++#include + + /* + * Largest number of bytes we need to allocate for an NFS +diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h +--- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-23 12:09:03.376401789 -0400 +@@ -32,6 +32,8 @@ + #define NFSDDBG_REPCACHE 0x0080 + #define NFSDDBG_XDR 0x0100 + #define NFSDDBG_LOCKD 0x0200 ++#define NFSDDBG_PNFS 0x0400 ++#define NFSDDBG_FILELAYOUT 0x0800 + #define NFSDDBG_ALL 0x7FFF + #define NFSDDBG_NOCHANGE 0xFFFF + +diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h +--- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-23 12:09:03.377481954 -0400 +@@ -100,6 +100,7 @@ struct svc_export { + uid_t ex_anon_uid; + gid_t ex_anon_gid; + int ex_fsid; ++ int ex_pnfs; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + int ex_nflavors; +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-23 12:09:03.377481954 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-23 12:09:03.378501747 -0400 +@@ -0,0 +1,132 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef NFSD_NFS4LAYOUTXDR_H ++#define NFSD_NFS4LAYOUTXDR_H ++ ++#include ++#include ++ ++/* the nfsd4_pnfs_devlist dev_addr for the file layout type */ ++struct pnfs_filelayout_devaddr { ++ struct xdr_netobj r_netid; ++ struct xdr_netobj r_addr; ++}; ++ ++/* list of multipath servers */ ++struct pnfs_filelayout_multipath { ++ u32 fl_multipath_length; ++ struct pnfs_filelayout_devaddr *fl_multipath_list; ++}; ++ ++struct pnfs_filelayout_device { ++ u32 fl_stripeindices_length; ++ u32 *fl_stripeindices_list; ++ u32 fl_device_length; ++ struct pnfs_filelayout_multipath *fl_device_list; ++}; ++ ++struct pnfs_filelayout_layout { ++ u32 lg_layout_type; /* response */ ++ u32 lg_stripe_type; /* response */ ++ u32 lg_commit_through_mds; /* response */ ++ u64 lg_stripe_unit; /* response */ ++ u64 lg_pattern_offset; /* response */ ++ u32 lg_first_stripe_index; /* response */ ++ struct nfsd4_pnfs_deviceid device_id; /* response */ ++ u32 lg_fh_length; /* response */ ++ struct knfsd_fh *lg_fh_list; /* response */ ++}; ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++enum pnfs_block_extent_state4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, ++ PNFS_BLOCK_NONE_DATA = 3 ++}; ++ ++enum pnfs_block_volume_type4 { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, ++ PNFS_BLOCK_VOLUME_SLICE = 1, ++ PNFS_BLOCK_VOLUME_CONCAT = 2, ++ PNFS_BLOCK_VOLUME_STRIPE = 3, ++}; ++typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4; ++ ++enum bl_cache_state { ++ BLOCK_LAYOUT_NEW = 0, ++ BLOCK_LAYOUT_CACHE = 1, ++ BLOCK_LAYOUT_UPDATE = 2, ++}; ++ ++typedef struct pnfs_blocklayout_layout { ++ struct list_head bll_list; ++ struct nfsd4_pnfs_deviceid bll_vol_id; ++ u64 bll_foff; // file offset ++ u64 bll_len; ++ u64 bll_soff; // storage offset ++ int bll_recalled; ++ enum pnfs_block_extent_state4 bll_es; ++ enum bl_cache_state bll_cache_state; ++} pnfs_blocklayout_layout_t; ++ ++typedef struct 
pnfs_blocklayout_devinfo { ++ struct list_head bld_list; ++ pnfs_block_volume_type4 bld_type; ++ struct nfsd4_pnfs_deviceid bld_devid; ++ int bld_index_loc; ++ union { ++ struct { ++ u64 bld_offset; ++ u32 bld_sig_len, ++ *bld_sig; ++ } simple; ++ struct { ++ u64 bld_start, ++ bld_len; ++ u32 bld_index; /* Index of Simple Volume */ ++ } slice; ++ struct { ++ u32 bld_stripes; ++ u64 bld_chunk_size; ++ u32 *bld_stripe_indexs; ++ } stripe; ++ } u; ++} pnfs_blocklayout_devinfo_t; ++ ++#endif /* NFSD_NFS4LAYOUTXDR_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-23 12:09:03.378501747 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-23 12:09:03.378501747 -0400 +@@ -0,0 +1,54 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++#include ++ ++/* ++ * Length of comma separated pnfs data server IPv4 addresses. Enough room for ++ * 32 addresses. ++ */ ++#define NFSD_DLM_DS_LIST_MAX 512 ++/* ++ * Length of colon separated pnfs dlm device of the form ++ * disk_name:comma separated data server IPv4 address ++ */ ++#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1) ++ ++#ifdef CONFIG_PNFSD ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++extern const struct pnfs_export_operations pnfs_dlm_export_ops; ++ ++int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len); ++ ++void nfsd4_pnfs_dlm_shutdown(void); ++ ++ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen); ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-23 12:09:03.379487099 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-23 12:09:03.379487099 -0400 +@@ -0,0 +1,271 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef _LINUX_NFSD_NFSD4_PNFS_H ++#define _LINUX_NFSD_NFSD4_PNFS_H ++ ++#include ++#include ++#include ++ ++struct nfsd4_pnfs_deviceid { ++ u64 sbid; /* per-superblock unique ID */ ++ u64 devid; /* filesystem-wide unique device ID */ ++}; ++ ++struct nfsd4_pnfs_dev_iter_res { ++ u64 gd_cookie; /* request/response */ ++ u64 gd_verf; /* request/response */ ++ u64 gd_devid; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++/* Arguments for set_device_notify */ ++struct pnfs_devnotify_arg { ++ struct nfsd4_pnfs_deviceid dn_devid; /* request */ ++ u32 dn_layout_type; /* request */ ++ u32 dn_notify_types; /* request/response */ ++}; ++ ++struct nfsd4_layout_seg { ++ u64 clientid; ++ u32 layout_type; ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++/* Used by layout_get to encode layout (loc_body var in spec) ++ * Args: ++ * minlength - min number of accessible bytes given by layout ++ * fsid - Major part of struct pnfs_deviceid. File system uses this ++ * to build the deviceid returned in the layout.
++ * fh - fs can modify the file handle for use on data servers ++ * seg - layout info requested and layout info returned ++ * xdr - xdr info ++ * return_on_close - true if layout to be returned on file close ++ */ ++ ++struct nfsd4_pnfs_layoutget_arg { ++ u64 lg_minlength; ++ u64 lg_sbid; ++ const struct knfsd_fh *lg_fh; ++}; ++ ++struct nfsd4_pnfs_layoutget_res { ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_return_on_close; ++}; ++ ++struct nfsd4_pnfs_layoutcommit_arg { ++ struct nfsd4_layout_seg lc_seg; /* request */ ++ u32 lc_reclaim; /* request */ ++ u32 lc_newoffset; /* request */ ++ u64 lc_last_wr; /* request */ ++ struct nfstime4 lc_mtime; /* request */ ++ u32 lc_up_len; /* layout length */ ++ void *lc_up_layout; /* decoded by callback */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit_res { ++ u32 lc_size_chg; /* boolean for response */ ++ u64 lc_newsize; /* response */ ++}; ++ ++#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */ ++ ++struct nfsd4_pnfs_layoutreturn_arg { ++ u32 lr_return_type; /* request */ ++ struct nfsd4_layout_seg lr_seg; /* request */ ++ u32 lr_reclaim; /* request */ ++ u32 lrf_body_len; /* request */ ++ void *lrf_body; /* request */ ++ void *lr_cookie; /* fs private */ ++}; ++ ++/* pNFS Metadata to Data server state communication */ ++struct pnfs_get_state { ++ u32 dsid; /* request */ ++ u64 ino; /* request */ ++ nfs4_stateid stid; /* request/response */ ++ nfs4_clientid clid; /* response */ ++ u32 access; /* response */ ++ u32 stid_gen; /* response */ ++ u32 verifier[2]; /* response */ ++}; ++ ++/* ++ * pNFS export operations vector. ++ * ++ * The filesystem must implement the following methods: ++ * layout_type ++ * get_device_info ++ * layout_get ++ * ++ * All other methods are optional and can be set to NULL if not implemented. ++ */ ++struct pnfs_export_operations { ++ /* Returns the supported pnfs_layouttype4.
*/ ++ int (*layout_type) (struct super_block *); ++ ++ /* Encode device info onto the xdr stream. */ ++ int (*get_device_info) (struct super_block *, ++ struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++ ++ /* Retrieve all available devices via an iterator. ++ * arg->cookie == 0 indicates the beginning of the list, ++ * otherwise arg->verf is used to verify that the list hasn't changed ++ * while retrieved. ++ * ++ * On output, the filesystem sets the devid based on the current cookie ++ * and sets res->cookie and res->verf corresponding to the next entry. ++ * When the last entry in the list is retrieved, res->eof is set to 1. ++ */ ++ int (*get_device_iter) (struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++ ++ int (*set_device_notify) (struct super_block *, ++ struct pnfs_devnotify_arg *); ++ ++ /* Retrieve and encode a layout for inode onto the xdr stream. ++ * arg->minlength is the minimum number of accessible bytes required ++ * by the client. ++ * The maximum number of bytes to encode the layout is given by ++ * the xdr stream end pointer. ++ * arg->fsid contains the major part of struct pnfs_deviceid. ++ * The file system uses this to build the deviceid returned ++ * in the layout. ++ * res->seg - layout segment requested and layout info returned. 
++ * res->fh - file handle; the fs can modify it for use on data servers ++ * res->return_on_close - true if layout to be returned on file close ++ * ++ * return one of the following nfs errors: ++ * NFS_OK Success ++ * NFS4ERR_ACCESS Permission error ++ * NFS4ERR_BADIOMODE Server does not support requested iomode ++ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules ++ * NFS4ERR_INVAL Parameter other than layout is invalid ++ * NFS4ERR_IO I/O error ++ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later ++ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file ++ * NFS4ERR_LOCKED Lock conflict ++ * NFS4ERR_NOSPC Out-of-space error occurred ++ * NFS4ERR_RECALLCONFLICT Layout currently unavailable due to ++ * a conflicting CB_LAYOUTRECALL ++ * NFS4ERR_SERVERFAULT Server went berserk ++ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout ++ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file) ++ */ ++ enum nfsstat4 (*layout_get) (struct inode *, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++ ++ /* Commit changes to layout */ ++ int (*layout_commit) (struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++ ++ /* Returns the layout */ ++ int (*layout_return) (struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++ ++ /* Can layout segments be merged for this layout type?
*/ ++ int (*can_merge_layouts) (u32 layout_type); ++ ++ /* pNFS Files layout specific operations */ ++ ++ /* Get the write verifier for DS (called on MDS only) */ ++ void (*get_verifier) (struct super_block *, u32 *p); ++ /* Call fs on DS only */ ++ int (*get_state) (struct inode *, struct knfsd_fh *, ++ struct pnfs_get_state *); ++}; ++ ++struct nfsd4_pnfs_cb_layout { ++ u32 cbl_recall_type; /* request */ ++ struct nfsd4_layout_seg cbl_seg; /* request */ ++ u32 cbl_layoutchanged; /* request */ ++ nfs4_stateid cbl_sid; /* request */ ++ struct nfs4_fsid cbl_fsid; ++ void *cbl_cookie; /* fs private */ ++}; ++ ++/* layoutrecall request (from exported filesystem) */ ++struct nfs4_layoutrecall { ++ struct kref clr_ref; ++ struct nfsd4_pnfs_cb_layout cb; /* request */ ++ struct list_head clr_perclnt; /* on cl_layoutrecalls */ ++ struct nfs4_client *clr_client; ++ struct nfs4_file *clr_file; ++ struct timespec clr_time; /* last activity */ ++ struct super_block *clr_sb; /* We might not have a file */ ++ struct nfs4_layoutrecall *parent; /* The initiating recall */ ++ ++ void *clr_args; /* nfsd internal */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_item { ++ u32 cbd_notify_type; /* request */ ++ u32 cbd_layout_type; /* request */ ++ struct nfsd4_pnfs_deviceid cbd_devid; /* request */ ++ u32 cbd_immediate; /* request */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_list { ++ u32 cbd_len; /* request */ ++ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */ ++}; ++ ++/* ++ * callbacks provided by the nfsd ++ */ ++struct pnfsd_cb_operations { ++ /* Generic callbacks */ ++ int (*cb_layout_recall) (struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++ int (*cb_device_notify) (struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++ ++ /* pNFS Files layout specific callbacks */ ++ ++ /* Callback from fs on MDS only */ ++ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *); ++ /* Callback from fs on DS only */ ++ int (*cb_change_state) (struct 
pnfs_get_state *); ++}; ++ ++#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h +--- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-23 12:09:03.380502500 -0400 +@@ -29,6 +29,7 @@ + /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ + #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ + #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ ++#define NFSCTL_FD2FH 9 /* get a fh from a fd */ + + /* SVC */ + struct nfsctl_svc { +@@ -71,6 +72,11 @@ struct nfsctl_fsparm { + int gd_maxlen; + }; + ++/* FD2FH */ ++struct nfsctl_fd2fh { ++ int fd; ++}; ++ + /* + * This is the argument union. + */ +@@ -82,6 +88,7 @@ struct nfsctl_arg { + struct nfsctl_export u_export; + struct nfsctl_fdparm u_getfd; + struct nfsctl_fsparm u_getfs; ++ struct nfsctl_fd2fh u_fd2fh; + /* + * The following dummy member is needed to preserve binary compatibility + * on platforms where alignof(void*)>alignof(int). 
It's needed because +@@ -95,6 +102,7 @@ struct nfsctl_arg { + #define ca_export u.u_export + #define ca_getfd u.u_getfd + #define ca_getfs u.u_getfs ++#define ca_fd2fh u.u_fd2fh + }; + + union nfsctl_res { +diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-23 12:08:29.061494081 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-23 12:09:03.381511751 -0400 +@@ -72,13 +72,20 @@ struct nfs_access_entry { + int mask; + }; + ++struct nfs_lock_context { ++ atomic_t count; ++ struct list_head list; ++ struct nfs_open_context *open_context; ++ fl_owner_t lockowner; ++ pid_t pid; ++}; ++ + struct nfs4_state; + struct nfs_open_context { +- atomic_t count; ++ struct nfs_lock_context lock_context; + struct path path; + struct rpc_cred *cred; + struct nfs4_state *state; +- fl_owner_t lockowner; + fmode_t mode; + + unsigned long flags; +@@ -97,6 +104,26 @@ struct nfs_delegation; + + struct posix_acl; + ++struct pnfs_layout_type { ++ int refcount; ++ struct list_head lo_layouts; /* other client layouts */ ++ struct list_head segs; /* layout segments list */ ++ int roc_iomode; /* iomode to return on close, 0=none */ ++ seqlock_t seqlock; /* Protects the stateid */ ++ nfs4_stateid stateid; ++ unsigned long pnfs_layout_state; ++ #define NFS_INO_RO_LAYOUT_FAILED 0 /* get ro layout failed stop trying */ ++ #define NFS_INO_RW_LAYOUT_FAILED 1 /* get rw layout failed stop trying */ ++ #define NFS_INO_LAYOUTCOMMIT 3 /* LAYOUTCOMMIT needed */ ++ struct rpc_cred *lo_cred; /* layoutcommit credential */ ++ /* DH: These vars keep track of the maximum write range ++ * so the values can be used for layoutcommit. 
++ */ ++ loff_t pnfs_write_begin_pos; ++ loff_t pnfs_write_end_pos; ++ struct inode *lo_inode; ++}; ++ + /* + * nfs fs inode data in memory + */ +@@ -181,6 +208,13 @@ struct nfs_inode { + struct nfs_delegation *delegation; + fmode_t delegation_state; + struct rw_semaphore rwsem; ++ ++ /* pNFS layout information */ ++#if defined(CONFIG_NFS_V4_1) ++ wait_queue_head_t lo_waitq; ++ struct pnfs_layout_type *layout; ++ time_t pnfs_layout_suspend; ++#endif /* CONFIG_NFS_V4_1 */ + #endif /* CONFIG_NFS_V4*/ + #ifdef CONFIG_NFS_FSCACHE + struct fscache_cookie *fscache; +@@ -353,6 +387,8 @@ extern void nfs_setattr_update_inode(str + extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); + extern void put_nfs_open_context(struct nfs_open_context *ctx); + extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); ++extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); ++extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); + extern u64 nfs_compat_user_ino64(u64 fileid); + extern void nfs_fattr_init(struct nfs_fattr *fattr); + +@@ -481,8 +517,12 @@ extern void nfs_unblock_sillyrename(stru + extern int nfs_congestion_kb; + extern int nfs_writepage(struct page *page, struct writeback_control *wbc); + extern int nfs_writepages(struct address_space *, struct writeback_control *); +-extern int nfs_flush_incompatible(struct file *file, struct page *page); +-extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); ++struct pnfs_layout_segment; ++extern int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg); ++extern int nfs_updatepage(struct file *, struct page *, ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata); + extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); + + /* +@@ -604,6 +644,8 @@ extern void * 
nfs_root_data(void); + #define NFSDBG_CLIENT 0x0200 + #define NFSDBG_MOUNT 0x0400 + #define NFSDBG_FSCACHE 0x0800 ++#define NFSDBG_PNFS 0x1000 ++#define NFSDBG_PNFS_LD 0x2000 + #define NFSDBG_ALL 0xFFFF + + #ifdef __KERNEL__ +diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-23 12:08:29.062501618 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-23 12:09:03.383491395 -0400 +@@ -15,6 +15,7 @@ struct nlm_host; + struct nfs4_sequence_args; + struct nfs4_sequence_res; + struct nfs_server; ++struct nfs4_minor_version_ops; + + /* + * The nfs_client identifies our client state to the server. +@@ -70,11 +71,7 @@ struct nfs_client { + */ + char cl_ipaddr[48]; + unsigned char cl_id_uniquifier; +- int (* cl_call_sync)(struct nfs_server *server, +- struct rpc_message *msg, +- struct nfs4_sequence_args *args, +- struct nfs4_sequence_res *res, +- int cache_reply); ++ const struct nfs4_minor_version_ops *cl_mvops; + #endif /* CONFIG_NFS_V4 */ + + #ifdef CONFIG_NFS_V4_1 +@@ -85,6 +82,8 @@ struct nfs_client { + /* The flags used for obtaining the clientid during EXCHANGE_ID */ + u32 cl_exchange_flags; + struct nfs4_session *cl_session; /* sharred session */ ++ struct list_head cl_layouts; ++ struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ + #endif /* CONFIG_NFS_V4_1 */ + + #ifdef CONFIG_NFS_FSCACHE +@@ -92,6 +91,16 @@ struct nfs_client { + #endif + }; + ++static inline bool ++is_ds_only_client(struct nfs_client *clp) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ return is_ds_only_session(clp->cl_exchange_flags); ++#else ++ return false; ++#endif ++} ++ + /* + * NFS client parameters stored in the superblock. 
+ */ +@@ -136,7 +145,7 @@ struct nfs_server { + #endif + + #ifdef CONFIG_NFS_V4 +- u32 attr_bitmask[2];/* V4 bitmask representing the set ++ u32 attr_bitmask[3];/* V4 bitmask representing the set + of attributes supported on this + filesystem */ + u32 cache_consistency_bitmask[2]; +@@ -148,6 +157,15 @@ struct nfs_server { + that are supported on this + filesystem */ + #endif ++ ++#ifdef CONFIG_NFS_V4_1 ++ u32 pnfs_blksize; /* layout_blksize attr */ ++ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ ++ void *pnfs_ld_data; /* Per-mount data */ ++ unsigned int ds_rsize; /* Data server read size */ ++ unsigned int ds_wsize; /* Data server write size */ ++#endif /* CONFIG_NFS_V4_1 */ ++ + void (*destroy)(struct nfs_server *); + + atomic_t active; /* Keep trace of any activity to this server */ +diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h +--- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-23 12:09:03.384501540 -0400 +@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, ++ NFSIOS_PNFS_READ, ++ NFSIOS_PNFS_WRITE, ++ NFSIOS_PNFS_COMMIT, + __NFSIOS_COUNTSMAX, + }; + +diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h +--- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-23 12:09:03.385491518 -0400 +@@ -39,6 +39,7 @@ struct nfs_page { + struct list_head wb_list; /* Defines state of page: */ + struct page *wb_page; /* page to read in/write out */ + struct nfs_open_context *wb_context; /* File state context info */ ++ struct nfs_lock_context *wb_lock_context; /* lock context info */ + atomic_t wb_complete; /* i/os we're waiting for */ + pgoff_t wb_index; /* Offset >> PAGE_CACHE_SHIFT */ + 
unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ +@@ -47,6 +48,7 @@ struct nfs_page { + struct kref wb_kref; /* reference count */ + unsigned long wb_flags; + struct nfs_writeverf wb_verf; /* Commit cookie */ ++ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */ + }; + + struct nfs_pageio_descriptor { +@@ -60,6 +62,12 @@ struct nfs_pageio_descriptor { + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); + int pg_ioflags; + int pg_error; ++ struct pnfs_layout_segment *pg_lseg; ++#ifdef CONFIG_NFS_V4_1 ++ int pg_iswrite; ++ int pg_boundary; ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++#endif /* CONFIG_NFS_V4_1 */ + }; + + #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) +@@ -68,13 +76,15 @@ extern struct nfs_page *nfs_create_reque + struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int count); ++ unsigned int count, ++ struct pnfs_layout_segment *lseg); + extern void nfs_clear_request(struct nfs_page *req); + extern void nfs_release_request(struct nfs_page *req); + + + extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, +- pgoff_t idx_start, unsigned int npages, int tag); ++ pgoff_t idx_start, unsigned int npages, int tag, ++ int *use_pnfs); + extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), +diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-23 12:08:29.062501618 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-23 12:09:03.387491422 -0400 +@@ -3,6 +3,8 @@ + + #include + #include ++#include ++#include + + /* + * To change the maximum rsize and wsize supported by the NFS client, adjust +@@ -10,7 +12,7 @@ + * support a megabyte or more. 
The default is left at 4096 bytes, which is + * reasonable for NFS over UDP. + */ +-#define NFS_MAX_FILE_IO_SIZE (1048576U) ++#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U) + #define NFS_DEF_FILE_IO_SIZE (4096U) + #define NFS_MIN_FILE_IO_SIZE (1024U) + +@@ -113,6 +115,10 @@ struct nfs_fsinfo { + __u32 dtpref; /* pref. readdir transfer size */ + __u64 maxfilesize; + __u32 lease_time; /* in seconds */ ++#if defined(CONFIG_NFS_V4_1) ++ __u32 layouttype; /* supported pnfs layout driver */ ++ __u32 blksize; /* preferred pnfs io block size */ ++#endif + }; + + struct nfs_fsstat { +@@ -196,8 +202,10 @@ struct nfs_openargs { + __u64 clientid; + __u64 id; + union { +- struct iattr * attrs; /* UNCHECKED, GUARDED */ +- nfs4_verifier verifier; /* EXCLUSIVE */ ++ struct { ++ struct iattr * attrs; /* UNCHECKED, GUARDED */ ++ nfs4_verifier verifier; /* EXCLUSIVE */ ++ }; + nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ + fmode_t delegation_type; /* CLAIM_PREVIOUS */ + } u; +@@ -313,6 +321,10 @@ struct nfs_lockt_res { + struct nfs4_sequence_res seq_res; + }; + ++struct nfs_release_lockowner_args { ++ struct nfs_lowner lock_owner; ++}; ++ + struct nfs4_delegreturnargs { + const struct nfs_fh *fhandle; + const nfs4_stateid *stateid; +@@ -332,6 +344,7 @@ struct nfs4_delegreturnres { + struct nfs_readargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + unsigned int pgbase; +@@ -352,6 +365,7 @@ struct nfs_readres { + struct nfs_writeargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + enum nfs3_stable_how stable; +@@ -846,7 +860,7 @@ struct nfs4_server_caps_arg { + }; + + struct nfs4_server_caps_res { +- u32 attr_bitmask[2]; ++ u32 attr_bitmask[3]; + u32 acl_bitmask; + u32 has_links; + u32 has_symlinks; +@@ -961,6 +975,27 @@ struct nfs_page; + + #define NFS_PAGEVEC_SIZE (8U) + ++#if defined(CONFIG_NFS_V4_1) ++/* 
pnfsflag values */ ++#define PNFS_NO_RPC 0x0001 /* non rpc result callback switch */ ++ ++/* pnfs-specific data needed for read, write, and commit calls */ ++struct pnfs_call_data { ++ struct pnfs_layout_segment *lseg; ++ const struct rpc_call_ops *call_ops; ++ u32 orig_count; /* for retry via MDS */ ++ int pnfs_error; ++ u8 pnfsflags; ++ u8 how; /* for FLUSH_STABLE */ ++}; ++ ++/* files layout-type specific data for read, write, and commit */ ++struct pnfs_fl_call_data { ++ struct nfs_client *ds_nfs_client; ++ __u64 orig_offset; ++}; ++#endif /* CONFIG_NFS_V4_1 */ ++ + struct nfs_read_data { + int flags; + struct rpc_task task; +@@ -976,10 +1011,16 @@ struct nfs_read_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + + struct nfs_write_data { ++ struct kref refcount; /* For pnfs commit splitting */ ++ struct nfs_write_data *parent; /* For pnfs commit splitting */ + int flags; + struct rpc_task task; + struct inode *inode; +@@ -995,6 +1036,10 @@ struct nfs_write_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + +@@ -1008,6 +1053,7 @@ struct nfs_rpc_ops { + const struct dentry_operations *dentry_ops; + const struct inode_operations *dir_inode_ops; + const struct inode_operations *file_inode_ops; ++ const struct file_operations *file_ops; + + int (*getroot) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); +@@ -1072,6 +1118,7 @@ struct nfs_rpc_ops { + extern const struct nfs_rpc_ops nfs_v2_clientops; + extern const struct nfs_rpc_ops nfs_v3_clientops; + extern const struct nfs_rpc_ops nfs_v4_clientops; ++extern const 
struct nfs_rpc_ops pnfs_v4_clientops; + extern struct rpc_version nfs_version2; + extern struct rpc_version nfs_version3; + extern struct rpc_version nfs_version4; +diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-23 12:09:03.388491527 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-23 12:09:03.388491527 -0400 +@@ -0,0 +1,57 @@ ++#ifndef _PANFS_SHIM_API_H ++#define _PANFS_SHIM_API_H ++ ++/* ++ * imported panfs functions ++ */ ++struct panfs_export_operations { ++ int (*convert_rc)(pan_status_t rc); ++ ++ int (*sm_sec_t_get_size_otw)( ++ pan_sm_sec_otw_t *var, ++ pan_size_t *core_sizep, ++ pan_size_t *wire_size, ++ void *buf_end); ++ ++ int (*sm_sec_t_unmarshall)( ++ pan_sm_sec_otw_t *in, ++ pan_sm_sec_t *out, ++ void *buf, ++ pan_size_t size, ++ pan_size_t *otw_consumed, ++ pan_size_t *in_core_consumed); ++ ++ int (*ucreds_get)(void **ucreds_pp); ++ ++ void (*ucreds_put)(void *ucreds); ++ ++ int (*sam_read)( ++ pan_sam_access_flags_t flags, ++ pan_sam_read_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_read_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p); ++ ++ int (*sam_write)( ++ pan_sam_access_flags_t flags, ++ pan_sam_write_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_write_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p); ++}; ++ ++extern int ++panfs_shim_register(struct panfs_export_operations *ops); ++ ++extern int ++panfs_shim_unregister(void); ++ ++#endif /* _PANFS_SHIM_API_H */ +diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-23 12:09:03.390501461 -0400 ++++ 
linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-23 12:09:03.390501461 -0400 +@@ -0,0 +1,440 @@ ++/* ++ * pnfs_osd_xdr.h ++ * ++ * pNFS-osd on-the-wire data structures ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++#ifndef __PNFS_OSD_XDR_H__ ++#define __PNFS_OSD_XDR_H__ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#define PNFS_OSD_OSDNAME_MAXSIZE 256 ++ ++/* ++ * START OF "GENERIC" DECODE ROUTINES. ++ * These may look a little ugly since they are imported from a "generic" ++ * set of XDR encode/decode routines which are intended to be shared by ++ * all of our NFSv4 implementations (OpenBSD, MacOS X...). ++ * ++ * If the pain of reading these is too great, it should be a straightforward ++ * task to translate them into Linux-specific versions which are more ++ * consistent with the style used in NFSv2/v3... ++ */ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (u64)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++ ++/* ++ * draft-ietf-nfsv4-minorversion-22 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* Layout Structure */ ++ ++enum pnfs_osd_raid_algorithm4 { ++ PNFS_OSD_RAID_0 = 1, ++ PNFS_OSD_RAID_4 = 2, ++ PNFS_OSD_RAID_5 = 3, ++ PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ ++}; ++ ++/* struct pnfs_osd_data_map4 { ++ * uint32_t odm_num_comps; ++ * length4 odm_stripe_unit; ++ * uint32_t odm_group_width; ++ * uint32_t odm_group_depth; ++ * uint32_t odm_mirror_cnt; ++ * pnfs_osd_raid_algorithm4 odm_raid_algorithm; ++ * }; ++ */ ++struct pnfs_osd_data_map { ++ u32 odm_num_comps; ++ u64 odm_stripe_unit; ++ u32 odm_group_width; ++ u32 odm_group_depth; ++ u32 odm_mirror_cnt; ++ u32 odm_raid_algorithm; ++}; ++ ++static inline int ++pnfs_osd_data_map_xdr_sz(void) ++{ ++ return 1 + 2 + 1 + 1 + 1 + 1; ++} ++ ++static inline size_t ++pnfs_osd_data_map_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_data_map); ++} ++ ++/* struct pnfs_osd_objid4 { ++ * deviceid4 oid_device_id; ++ * uint64_t oid_partition_id; ++ * uint64_t oid_object_id; ++ * }; ++ */ ++struct pnfs_osd_objid { ++ struct pnfs_deviceid 
oid_device_id; ++ u64 oid_partition_id; ++ u64 oid_object_id; ++}; ++ ++/* For printout. I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */ ++#define _DEVID_LO(oid_device_id) \ ++ (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data) ++ ++#define _DEVID_HI(oid_device_id) \ ++ (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1) ++ ++static inline int ++pnfs_osd_objid_xdr_sz(void) ++{ ++ return (NFS4_PNFS_DEVICEID4_SIZE / 4) + 2 + 2; ++} ++ ++static inline size_t ++pnfs_osd_objid_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_objid); ++} ++ ++enum pnfs_osd_version { ++ PNFS_OSD_MISSING = 0, ++ PNFS_OSD_VERSION_1 = 1, ++ PNFS_OSD_VERSION_2 = 2 ++}; ++ ++struct pnfs_osd_opaque_cred { ++ u32 cred_len; ++ u8 *cred; ++}; ++ ++static inline int ++pnfs_osd_opaque_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ READ32(n); ++ p += XDR_QUADLEN(n); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_opaque_cred_incore_sz(u32 *p) ++{ ++ u32 n; ++ ++ READ32(n); ++ return XDR_QUADLEN(n) * 4; ++} ++ ++enum pnfs_osd_cap_key_sec { ++ PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ PNFS_OSD_CAP_KEY_SEC_SSV = 1, ++}; ++ ++/* struct pnfs_osd_object_cred4 { ++ * pnfs_osd_objid4 oc_object_id; ++ * pnfs_osd_version4 oc_osd_version; ++ * pnfs_osd_cap_key_sec4 oc_cap_key_sec; ++ * opaque oc_capability_key<>; ++ * opaque oc_capability<>; ++ * }; ++ */ ++struct pnfs_osd_object_cred { ++ struct pnfs_osd_objid oc_object_id; ++ u32 oc_osd_version; ++ u32 oc_cap_key_sec; ++ struct pnfs_osd_opaque_cred oc_cap_key; ++ struct pnfs_osd_opaque_cred oc_cap; ++}; ++ ++static inline int ++pnfs_osd_object_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_object_cred_incore_sz(u32 *p) ++{ ++ size_t sz = sizeof(struct pnfs_osd_object_cred); ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ sz += 
pnfs_osd_opaque_cred_incore_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ sz += pnfs_osd_opaque_cred_incore_sz(p); ++ return sz; ++} ++ ++/* struct pnfs_osd_layout4 { ++ * pnfs_osd_data_map4 olo_map; ++ * uint32_t olo_comps_index; ++ * pnfs_osd_object_cred4 olo_components<>; ++ * }; ++ */ ++struct pnfs_osd_layout { ++ struct pnfs_osd_data_map olo_map; ++ u32 olo_comps_index; ++ u32 olo_num_comps; ++ struct pnfs_osd_object_cred *olo_comps; ++}; ++ ++static inline int ++pnfs_osd_layout_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ while ((int)(n--) > 0) ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_layout_incore_sz(u32 *p) ++{ ++ u32 n; ++ size_t sz; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ sz = sizeof(struct pnfs_osd_layout); ++ while ((int)(n--) > 0) { ++ sz += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ return sz; ++} ++ ++/* Device Address */ ++ ++enum pnfs_osd_targetid_type { ++ OBJ_TARGET_ANON = 1, ++ OBJ_TARGET_SCSI_NAME = 2, ++ OBJ_TARGET_SCSI_DEVICE_ID = 3, ++}; ++ ++/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) { ++ * case OBJ_TARGET_SCSI_NAME: ++ * string oti_scsi_name<>; ++ * ++ * case OBJ_TARGET_SCSI_DEVICE_ID: ++ * opaque oti_scsi_device_id<>; ++ * ++ * default: ++ * void; ++ * }; ++ * ++ * union pnfs_osd_targetaddr4 switch (bool ota_available) { ++ * case TRUE: ++ * netaddr4 ota_netaddr; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_deviceaddr4 { ++ * pnfs_osd_targetid4 oda_targetid; ++ * pnfs_osd_targetaddr4 oda_targetaddr; ++ * uint64_t oda_lun; ++ * opaque oda_systemid<>; ++ * pnfs_osd_object_cred4 oda_root_obj_cred; ++ * opaque oda_osdname<>; ++ * }; ++ */ ++struct pnfs_osd_targetid { ++ u32 oti_type; ++ struct nfs4_string oti_scsi_device_id; ++}; ++ ++enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 }; ++ ++/* struct 
netaddr4 { ++ * // see struct rpcb in RFC1833 ++ * string r_netid<>; // network id ++ * string r_addr<>; // universal address ++ * }; ++ */ ++struct pnfs_osd_net_addr { ++ struct nfs4_string r_netid; ++ struct nfs4_string r_addr; ++}; ++ ++struct pnfs_osd_targetaddr { ++ u32 ota_available; ++ struct pnfs_osd_net_addr ota_netaddr; ++}; ++ ++enum { ++ NETWORK_ID_MAX = 16 / 4, ++ UNIVERSAL_ADDRESS_MAX = 64 / 4, ++ PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX, ++}; ++ ++struct pnfs_osd_deviceaddr { ++ struct pnfs_osd_targetid oda_targetid; ++ struct pnfs_osd_targetaddr oda_targetaddr; ++ u8 oda_lun[8]; ++ struct nfs4_string oda_systemid; ++ struct pnfs_osd_object_cred oda_root_obj_cred; ++ struct nfs4_string oda_osdname; ++}; ++ ++enum { ++ ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4, ++ PNFS_OSD_DEVICEADDR_MAX = ++ PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX + ++ 2 /*oda_lun*/ + ++ 1 + OSD_SYSTEMID_LEN + ++ 1 + ODA_OSDNAME_MAX, ++}; ++ ++/* LAYOUTCOMMIT: layoutupdate */ ++ ++/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) { ++ * case TRUE: ++ * int64_t dsu_delta; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_layoutupdate4 { ++ * pnfs_osd_deltaspaceused4 olu_delta_space_used; ++ * bool olu_ioerr_flag; ++ * }; ++ */ ++struct pnfs_osd_layoutupdate { ++ u32 dsu_valid; ++ s64 dsu_delta; ++ u32 olu_ioerr_flag; ++}; ++ ++/* LAYOUTRETURN: I/O Error Report */ ++ ++enum pnfs_osd_errno { ++ PNFS_OSD_ERR_EIO = 1, ++ PNFS_OSD_ERR_NOT_FOUND = 2, ++ PNFS_OSD_ERR_NO_SPACE = 3, ++ PNFS_OSD_ERR_BAD_CRED = 4, ++ PNFS_OSD_ERR_NO_ACCESS = 5, ++ PNFS_OSD_ERR_UNREACHABLE = 6, ++ PNFS_OSD_ERR_RESOURCE = 7 ++}; ++ ++/* struct pnfs_osd_ioerr4 { ++ * pnfs_osd_objid4 oer_component; ++ * length4 oer_comp_offset; ++ * length4 oer_comp_length; ++ * bool oer_iswrite; ++ * pnfs_osd_errno4 oer_errno; ++ * }; ++ */ ++struct pnfs_osd_ioerr { ++ struct pnfs_osd_objid oer_component; ++ u64 oer_comp_offset; ++ u64 oer_comp_length; ++ u32 oer_iswrite; 
++ u32 oer_errno; ++}; ++ ++static inline unsigned ++pnfs_osd_ioerr_xdr_sz(void) ++{ ++ return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1; ++} ++ ++/* OSD XDR API */ ++ ++/* Layout helpers */ ++extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout( ++ struct pnfs_osd_layout *layout, u32 *p); ++ ++extern int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *layout); ++ ++/* Device Info helpers */ ++ ++/* First pass calculate total size for space needed */ ++extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p); ++ ++/* Note: some strings pointed to inside @deviceaddr might point ++ * to space inside @p. @p should stay valid while @deviceaddr ++ * is in use. ++ * It is assumed that @deviceaddr points to bigger memory of size ++ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz() ++ */ ++extern void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p); ++ ++/* For Servers */ ++extern int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr); ++ ++/* layoutupdate (layout_commit) xdr helpers */ ++extern int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou); ++extern __be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p); ++ ++/* osd_ioerror encoding/decoding (layout_return) */ ++extern int ++pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr); ++extern __be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p); ++ ++#endif /* __PNFS_OSD_XDR_H__ */ +diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_xdr.h +--- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-23 12:09:03.391491550 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-23 12:09:03.391491550 -0400 +@@ -0,0 +1,134 @@ ++/* ++ * include/linux/pnfs_xdr.h ++ * ++ * Common xdr data structures 
needed by pnfs client. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef LINUX_PNFS_XDR_H ++#define LINUX_PNFS_XDR_H ++ ++#define PNFS_LAYOUT_MAXSIZE 4096 ++#define NFS4_PNFS_DEVICEID4_SIZE 16 ++ ++struct pnfs_deviceid { ++ char data[NFS4_PNFS_DEVICEID4_SIZE]; ++}; ++ ++struct nfs4_pnfs_layout { ++ __u32 len; ++ void *buf; ++}; ++ ++struct nfs4_pnfs_layout_segment { ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++struct nfs4_pnfs_layoutget_arg { ++ __u32 type; ++ struct nfs4_pnfs_layout_segment lseg; ++ __u64 minlength; ++ __u32 maxcount; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_layoutget_res { ++ __u32 return_on_close; ++ struct nfs4_pnfs_layout_segment lseg; ++ __u32 type; ++ nfs4_stateid stateid; ++ struct nfs4_pnfs_layout layout; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_pnfs_layoutget { ++ struct nfs4_pnfs_layoutget_arg args; ++ struct nfs4_pnfs_layoutget_res res; ++ struct pnfs_layout_segment **lsegpp; ++ int status; ++}; ++ ++struct pnfs_layoutcommit_arg { ++ nfs4_stateid stateid; ++ __u64 lastbytewritten; ++ __u32 time_modify_changed; ++ struct timespec time_modify; ++ const u32 *bitmask; ++ struct nfs_fh *fh; ++ struct inode *inode; ++ ++ /* Values set by layout driver */ ++ struct nfs4_pnfs_layout_segment lseg; ++ __u32 layout_type; ++ void *layoutdriver_data; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct pnfs_layoutcommit_res { ++ __u32 sizechanged; ++ __u64 newsize; ++ struct nfs_fattr *fattr; ++ const struct nfs_server *server; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct pnfs_layoutcommit_data { ++ struct rpc_task task; ++ struct rpc_cred *cred; ++ struct nfs_fattr fattr; ++ struct pnfs_layoutcommit_arg args; ++ struct pnfs_layoutcommit_res res; ++ int status; ++}; ++ ++struct nfs4_pnfs_layoutreturn_arg { ++ __u32 reclaim; ++ __u32 layout_type; ++ __u32 return_type; ++ 
struct nfs4_pnfs_layout_segment lseg; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_layoutreturn_res { ++ struct nfs4_sequence_res seq_res; ++ u32 lrs_present; ++ nfs4_stateid stateid; ++}; ++ ++struct nfs4_pnfs_layoutreturn { ++ struct nfs4_pnfs_layoutreturn_arg args; ++ struct nfs4_pnfs_layoutreturn_res res; ++ struct rpc_cred *cred; ++ int rpc_status; ++}; ++ ++struct nfs4_pnfs_getdevicelist_arg { ++ const struct nfs_fh *fh; ++ u32 layoutclass; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_getdevicelist_res { ++ struct pnfs_devicelist *devlist; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_pnfs_getdeviceinfo_arg { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_pnfs_getdeviceinfo_res { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++#endif /* LINUX_PNFS_XDR_H */ +diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h +--- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-23 12:09:03.393501437 -0400 +@@ -8,6 +8,7 @@ + #ifndef __LINUX_POSIX_ACL_H + #define __LINUX_POSIX_ACL_H + ++#include + #include + + #define ACL_UNDEFINED_ID (-1) +diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h +--- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-23 12:09:03.393501437 -0400 +@@ -14,6 +14,8 @@ + /* size of an XDR encoding unit in bytes, i.e. 
32bit */ + #define XDR_UNIT (4) + ++#include ++ + /* spec defines authentication flavor as an unsigned 32 bit integer */ + typedef u32 rpc_authflavor_t; + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-23 12:09:03.394512138 -0400 +@@ -3,6 +3,7 @@ + + #ifdef __KERNEL__ + ++#include + #include + + struct rpc_pipe_msg { +@@ -11,6 +12,10 @@ struct rpc_pipe_msg { + size_t len; + size_t copied; + int errno; ++#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */ ++#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */ ++#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA ++ u8 flags; + }; + + struct rpc_pipe_ops { +diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-23 12:09:03.394512138 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-23 12:09:03.395501822 -0400 +@@ -0,0 +1,111 @@ ++/* ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#ifndef _SIMPLE_RPC_PIPEFS_H_ ++#define _SIMPLE_RPC_PIPEFS_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#define payload_of(headerp) ((void *)(headerp + 1)) ++ ++/* ++ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs. ++ * Messages may simply be the header itself, although having an optional ++ * data payload follow the header allows much more flexibility. ++ * ++ * Messages are created using pipefs_alloc_init_msg() and ++ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an ++ * (optional) data payload. 
++ * ++ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data ++ * can be accessed using: struct foo *foop = payload_of(msg) ++ */ ++struct pipefs_hdr { ++ u32 msgid; ++ u8 type; ++ u8 flags; ++ u16 totallen; /* length of entire message, including hdr itself */ ++ u32 status; ++}; ++ ++/* ++ * struct pipefs_list -- a type of list used for tracking callers who've made an ++ * upcall and are blocked waiting for a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply(). ++ */ ++struct pipefs_list { ++ struct list_head list; ++ spinlock_t list_lock; ++}; ++ ++ ++/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions. */ ++extern struct dentry *pipefs_mkpipe(const char *name, ++ const struct rpc_pipe_ops *ops, ++ int wait_for_open); ++extern void pipefs_closepipe(struct dentry *pipe); ++extern void pipefs_init_list(struct pipefs_list *list); ++extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen); ++extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, ++ u8 flags, void *data, ++ u16 datalen, u16 padlen); ++extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list ++ *uplist, u8 upflags, ++ u32 timeout); ++extern int pipefs_queue_upcall_noreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, u8 upflags); ++extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist); ++extern struct pipefs_hdr *pipefs_readmsg(struct file *filp, ++ const char __user *src, size_t len); ++extern ssize_t pipefs_generic_upcall(struct file *filp, ++ struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen); ++extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg); ++ ++#endif /* _SIMPLE_RPC_PIPEFS_H_ */ +diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h +--- 
linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-23 12:09:03.395501822 -0400 +@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con + + return buf; + } ++ ++/* ++ * Print a network address in a universal format (see rfc1833 and nfsv4.1) ++ */ ++static inline int __svc_print_netaddr(struct sockaddr *addr, ++ struct xdr_netobj *na) ++{ ++ u16 port; ++ ssize_t len; ++ ++ switch (addr->sa_family) { ++ case AF_INET: { ++ struct sockaddr_in *sin = (struct sockaddr_in *)addr; ++ port = ntohs(sin->sin_port); ++ ++ len = snprintf(na->data, na->len, "%pI4.%u.%u", ++ &sin->sin_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ case AF_INET6: { ++ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; ++ port = ntohs(sin6->sin6_port); ++ ++ len = snprintf(na->data, na->len, "%pI6.%u.%u", ++ &sin6->sin6_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ default: ++ snprintf(na->data, na->len, "unknown address type: %d", ++ addr->sa_family); ++ len = -EINVAL; ++ break; ++ } ++ return len; ++} + #endif /* SUNRPC_SVC_XPRT_H */ +diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-23 12:08:29.066475323 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-23 12:09:03.396464612 -0400 +@@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) + return p + 2; + } + ++static inline __be32 * ++xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len) ++{ ++ memcpy(ptr, p, len); ++ return p + XDR_QUADLEN(len); ++} ++ + /* + * Adjust kvec to reflect end of xdr'ed data (RPC client XDR) + */ +@@ -197,6 +204,7 @@ struct xdr_stream { + + extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); + extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); ++extern __be32 
*xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q); + extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, + unsigned int base, unsigned int len); + extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs +--- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-23 12:09:03.396464612 -0400 ++++ linux-2.6.34.noarch/localversion-pnfs 2010-08-23 12:09:03.396464612 -0400 +@@ -0,0 +1 @@ ++-pnfs +diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile +--- linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-23 12:09:03.397501662 -0400 +@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt + svc.o svcsock.o svcauth.o svcauth_unix.o \ + addr.o rpcb_clnt.o timer.o xdr.o \ + sunrpc_syms.o cache.o rpc_pipe.o \ +- svc_xprt.o ++ svc_xprt.o simple_rpc_pipefs.o + sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o + sunrpc-$(CONFIG_PROC_FS) += stats.o + sunrpc-$(CONFIG_SYSCTL) += sysctl.o +diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c +--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-23 12:09:03.398522348 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-23 12:09:03.398522348 -0400 +@@ -0,0 +1,424 @@ ++/* ++ * net/sunrpc/simple_rpc_pipefs.c ++ * ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. 
Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs ++ * filesystem. ++ * ++ * If @wait_for_open is non-zero and an upcall is later queued but the userland ++ * end of the pipe has not yet been opened, the upcall will remain queued until ++ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE. 
++ */ ++struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops, ++ int wait_for_open) ++{ ++ struct dentry *dir, *pipe; ++ struct vfsmount *mnt; ++ ++ mnt = rpc_get_mount(); ++ if (IS_ERR(mnt)) { ++ pipe = ERR_CAST(mnt); ++ goto out; ++ } ++ dir = mnt->mnt_root; ++ if (!dir) { ++ pipe = ERR_PTR(-ENOENT); ++ goto out; ++ } ++ pipe = rpc_mkpipe(dir, name, NULL, ops, ++ wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0); ++out: ++ return pipe; ++} ++EXPORT_SYMBOL(pipefs_mkpipe); ++ ++/* ++ * Shutdown a pipe made by pipefs_mkpipe(). ++ * XXX: do we need to retain an extra reference on the mount? ++ */ ++void pipefs_closepipe(struct dentry *pipe) ++{ ++ rpc_unlink(pipe); ++ rpc_put_mount(); ++} ++EXPORT_SYMBOL(pipefs_closepipe); ++ ++/* ++ * Initialize a struct pipefs_list -- which is a way to keep track of callers ++ * who're blocked having made an upcall and are awaiting a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply() for ++ * how to use it. ++ */ ++inline void pipefs_init_list(struct pipefs_list *list) ++{ ++ INIT_LIST_HEAD(&list->list); ++ spin_lock_init(&list->list_lock); ++} ++EXPORT_SYMBOL(pipefs_init_list); ++ ++/* ++ * Alloc/init a generic pipefs message header and copy into its message body ++ * an arbitrary data payload. ++ * ++ * struct pipefs_hdr's are meant to serve as generic, general-purpose message ++ * headers for easy rpc_pipefs I/O. When an upcall is made, the ++ * struct pipefs_hdr is assigned to a struct rpc_pipe_msg and delivered ++ * therein. --And yes, the naming can seem a little confusing at first: ++ * ++ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a ++ * struct pipefs_hdr (possibly with an attached message body). A ++ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real" ++ * message is delivered and processed.
++ */ ++struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen, u16 padlen) ++{ ++ u16 totallen; ++ struct pipefs_hdr *msg = NULL; ++ ++ totallen = sizeof(*msg) + datalen + padlen; ++ if (totallen > PAGE_SIZE) { ++ msg = ERR_PTR(-E2BIG); ++ goto out; ++ } ++ ++ msg = kzalloc(totallen, GFP_KERNEL); ++ if (!msg) { ++ msg = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ msg->msgid = msgid; ++ msg->type = type; ++ msg->flags = flags; ++ msg->totallen = totallen; ++ memcpy(payload_of(msg), data, datalen); ++out: ++ return msg; ++} ++EXPORT_SYMBOL(pipefs_alloc_init_msg_padded); ++ ++/* ++ * See the description of pipefs_alloc_init_msg_padded(). ++ */ ++struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen) ++{ ++ return pipefs_alloc_init_msg_padded(msgid, type, flags, data, ++ datalen, 0); ++} ++EXPORT_SYMBOL(pipefs_alloc_init_msg); ++ ++ ++static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ memset(rpcmsg, 0, sizeof(*rpcmsg)); ++ rpcmsg->data = msg; ++ rpcmsg->len = msg->totallen; ++ rpcmsg->flags = upflags; ++} ++ ++static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg, ++ u8 upflags) ++{ ++ struct rpc_pipe_msg *rpcmsg; ++ ++ rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL); ++ if (!rpcmsg) ++ return ERR_PTR(-ENOMEM); ++ ++ pipefs_init_rpcmsg(rpcmsg, msg, upflags); ++ return rpcmsg; ++} ++ ++ ++/* represents an upcall that'll block and wait for a reply */ ++struct pipefs_upcall { ++ u32 msgid; ++ struct rpc_pipe_msg rpcmsg; ++ struct list_head list; ++ wait_queue_head_t waitq; ++ struct pipefs_hdr *reply; ++}; ++ ++ ++static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ upcall->reply = NULL; ++ upcall->msgid = msg->msgid; ++ INIT_LIST_HEAD(&upcall->list); ++ init_waitqueue_head(&upcall->waitq); ++ pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags); 
++} ++ ++static int __pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_upcall *upcall, ++ struct pipefs_list *uplist, ++ u32 timeout) ++{ ++ int err = 0; ++ DECLARE_WAITQUEUE(wq, current); ++ ++ add_wait_queue(&upcall->waitq, &wq); ++ spin_lock(&uplist->list_lock); ++ list_add(&upcall->list, &uplist->list); ++ spin_unlock(&uplist->list_lock); ++ ++ err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg); ++ if (err < 0) ++ goto out; ++ ++ if (timeout) { ++ /* retval of 0 means timer expired */ ++ err = schedule_timeout_uninterruptible(timeout); ++ if (err == 0 && upcall->reply == NULL) ++ err = -ETIMEDOUT; ++ } else { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ } ++ ++out: ++ spin_lock(&uplist->list_lock); ++ list_del_init(&upcall->list); ++ spin_unlock(&uplist->list_lock); ++ remove_wait_queue(&upcall->waitq, &wq); ++ return err; ++} ++ ++/* ++ * Queue a pipefs msg for an upcall to userspace, place the calling thread ++ * on @uplist, and block the thread to wait for a reply. If @timeout is ++ * nonzero, the thread will be blocked for at most @timeout jiffies. ++ * ++ * (To convert time units into jiffies, consider the functions ++ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and ++ * timespec_to_jiffies().) ++ * ++ * Once a reply is received by your downcall handler, call ++ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall, ++ * assign the reply, and wake the waiting thread. ++ * ++ * This function's return value pointer may be an error and should be checked ++ * with IS_ERR() before attempting to access the reply message. ++ * ++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() ++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG ++ * flag is set in @upflags. See also rpc_pipe_fs.h. 
++ */ ++struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list *uplist, ++ u8 upflags, u32 timeout) ++{ ++ int err = 0; ++ struct pipefs_upcall upcall; ++ ++ pipefs_init_upcall_waitreply(&upcall, msg, upflags); ++ err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout); ++ if (err < 0) { ++ kfree(upcall.reply); ++ upcall.reply = ERR_PTR(err); ++ } ++ ++ return upcall.reply; ++} ++EXPORT_SYMBOL(pipefs_queue_upcall_waitreply); ++ ++/* ++ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e., ++ * no reply is expected). ++ * ++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() ++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG ++ * flag is set in @upflags. See also rpc_pipe_fs.h. ++ */ ++int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg, ++ u8 upflags) ++{ ++ int err = 0; ++ struct rpc_pipe_msg *rpcmsg; ++ ++ upflags |= PIPEFS_AUTOFREE_RPCMSG; ++ rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags); ++ if (IS_ERR(rpcmsg)) { ++ err = PTR_ERR(rpcmsg); ++ goto out; ++ } ++ err = rpc_queue_upcall(pipe->d_inode, rpcmsg); ++out: ++ return err; ++} ++EXPORT_SYMBOL(pipefs_queue_upcall_noreply); ++ ++ ++static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid, ++ struct pipefs_list *uplist) ++{ ++ struct pipefs_upcall *upcall; ++ ++ spin_lock(&uplist->list_lock); ++ list_for_each_entry(upcall, &uplist->list, list) ++ if (upcall->msgid == msgid) ++ goto out; ++ upcall = NULL; ++out: ++ spin_unlock(&uplist->list_lock); ++ return upcall; ++} ++ ++/* ++ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall ++ * message and have determined that it is a reply to a waiting upcall, ++ * you can use this function to find the appropriate upcall, assign the result, ++ * and wake the upcall thread. ++ * ++ * The reply message must have the same msgid as the original upcall message's. 
++ * ++ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg(). ++ */ ++int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist) ++{ ++ int err = 0; ++ struct pipefs_upcall *upcall; ++ ++ upcall = pipefs_find_upcall_msgid(reply->msgid, uplist); ++ if (!upcall) { ++ printk(KERN_ERR "%s: ERROR: have reply but no matching upcall " ++ "for msgid %d\n", __func__, reply->msgid); ++ err = -ENOENT; ++ goto out; ++ } ++ upcall->reply = reply; ++ wake_up(&upcall->waitq); ++out: ++ return err; ++} ++EXPORT_SYMBOL(pipefs_assign_upcall_reply); ++ ++/* ++ * Generic method to read-in and return a newly-allocated message which begins ++ * with a struct pipefs_hdr. ++ */ ++struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src, ++ size_t len) ++{ ++ int err = 0, hdrsize; ++ struct pipefs_hdr *msg = NULL; ++ ++ hdrsize = sizeof(*msg); ++ if (len < hdrsize) { ++ printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n", ++ __func__, (int) len, hdrsize); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ msg = kzalloc(len, GFP_KERNEL); ++ if (!msg) { ++ err = -ENOMEM; ++ goto out; ++ } ++ if (copy_from_user(msg, src, len)) ++ err = -EFAULT; ++out: ++ if (err) { ++ kfree(msg); ++ msg = ERR_PTR(err); ++ } ++ return msg; ++} ++EXPORT_SYMBOL(pipefs_readmsg); ++ ++/* ++ * Generic rpc_pipe_ops->upcall() handler implementation. ++ * ++ * Don't call this directly: to make an upcall, use ++ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply(). 
++ */ ++ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen) ++{ ++ char *data; ++ ssize_t len, left; ++ ++ data = (char *)rpcmsg->data + rpcmsg->copied; ++ len = rpcmsg->len - rpcmsg->copied; ++ if (len > buflen) ++ len = buflen; ++ ++ left = copy_to_user(dst, data, len); ++ if (left < 0) { ++ rpcmsg->errno = left; ++ return left; ++ } ++ ++ len -= left; ++ rpcmsg->copied += len; ++ rpcmsg->errno = 0; ++ return len; ++} ++EXPORT_SYMBOL(pipefs_generic_upcall); ++ ++/* ++ * Generic rpc_pipe_ops->destroy_msg() handler implementation. ++ * ++ * Items are only freed if @rpcmsg->flags has been set appropriately. ++ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h. ++ */ ++void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg) ++{ ++ if (rpcmsg->flags & PIPEFS_AUTOFREE_UPCALL_MSG) ++ kfree(rpcmsg->data); ++ if (rpcmsg->flags & PIPEFS_AUTOFREE_RPCMSG) ++ kfree(rpcmsg); ++} ++EXPORT_SYMBOL(pipefs_generic_destroy_msg); +diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-23 12:08:29.081501640 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-23 12:09:03.399443371 -0400 +@@ -403,16 +403,14 @@ xdr_shrink_pagelen(struct xdr_buf *buf, + + /* Shift the tail first */ + if (tail->iov_len != 0) { +- p = (char *)tail->iov_base + len; +- if (tail->iov_len > len) { +- copy = tail->iov_len - len; +- memmove(p, tail->iov_base, copy); +- } else +- buf->buflen -= len; +- /* Copy from the inlined pages into the tail */ + copy = len; +- if (copy > tail->iov_len) ++ if (tail->iov_len > len) { ++ p = (char *)tail->iov_base + len; ++ memmove(p, tail->iov_base, tail->iov_len - len); ++ } else { + copy = tail->iov_len; ++ } ++ /* Copy from the inlined pages into the tail */ + _copy_from_pages((char *)tail->iov_base, + buf->pages, buf->page_base + pglen - len, + copy); +@@ -496,6 +494,27 @@ __be32 * xdr_reserve_space(struct xdr_st 
+ EXPORT_SYMBOL_GPL(xdr_reserve_space); + + /** ++ * xdr_rewind_stream - rewind a stream back to some checkpoint ++ * @xdr: pointer to xdr_stream ++ * @q: some checkpoint at historical place of @xdr ++ * ++ * Restores an xdr stream to some historical point. @q must be ++ * a logical xdr point in the past that was sampled by @q = @xdr->p. ++ */ ++__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q) ++{ ++ size_t nbytes = (xdr->p - q) << 2; ++ ++ BUG_ON(xdr->p < q); ++ BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len); ++ xdr->p = q; ++ xdr->iov->iov_len -= nbytes; ++ xdr->buf->len -= nbytes; ++ return q; ++} ++EXPORT_SYMBOL_GPL(xdr_rewind_stream); ++ ++/** + * xdr_write_pages - Insert a list of pages into an XDR buffer for sending + * @xdr: pointer to xdr_stream + * @pages: list of pages From d38bc48c5fab3bc830566f201c6236a0f18c395e Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Mon, 23 Aug 2010 14:15:46 -0400 Subject: [PATCH 12/20] Fixed a couple compile errors in the server code.
Signed-off-by: Steve Dickson --- nfsd-35-fc.patch | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/nfsd-35-fc.patch b/nfsd-35-fc.patch index ef99b4995..2825464af 100644 --- a/nfsd-35-fc.patch +++ b/nfsd-35-fc.patch @@ -1,6 +1,6 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt --- linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 09:57:18.233564439 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 14:12:24.165356789 -0400 @@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | | READ | REQ | | Section 18.22 | | READDIR | REQ | | Section 18.23 | @@ -12,7 +12,7 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig | RENAME | REQ | | Section 18.26 | diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c --- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 09:57:18.234564075 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 14:12:24.519356675 -0400 @@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_ca .alloc = expkey_alloc, }; @@ -108,7 +108,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e out_put_clp: diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c --- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 10:00:37.257414684 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 14:12:52.625429773 -0400 @@ -79,11 +79,6 @@ enum nfs_cb_opnum4 { cb_sequence_dec_sz + \ op_dec_sz) @@ -211,7 +211,7 @@ diff 
-up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ int status; - status = rpc_call_async(cb->cb_client, &msg, -+ status = rpc_call_async(cb->cl_cb_client, &msg, ++ status = rpc_call_async(clp->cl_cb_client, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, &nfsd4_cb_probe_ops, (void *)clp); - if (status) { @@ -402,7 +402,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c --- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 09:57:18.237376763 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 14:12:25.698356909 -0400 @@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[ static const char *nfsd4_op_name(unsigned opnum); @@ -490,7 +490,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c --- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 09:57:18.240356512 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 14:12:25.700356284 -0400 @@ -45,8 +45,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -1280,9 +1280,21 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs -{ - user_lease_time = leasetime; -} +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-23 14:14:22.882428704 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 14:14:33.418376589 -0400 +@@ -1900,7 +1900,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s + if (bmval0 & FATTR4_WORD0_LEASE_TIME) { + if ((buflen -= 4) < 0) + goto out_resource; +- WRITE32(NFSD_LEASE_TIME); 
++ WRITE32(nfsd4_lease); + } + if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { + if ((buflen -= 4) < 0) diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c --- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 09:57:20.629370282 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 14:12:25.821359224 -0400 @@ -46,6 +46,7 @@ enum { */ #ifdef CONFIG_NFSD_V4 @@ -1403,7 +1415,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n /* last one */ {""} diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h --- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 09:57:20.629370282 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 14:12:25.835418441 -0400 @@ -82,7 +82,6 @@ int nfs4_state_init(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); @@ -1440,7 +1452,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs /* diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h --- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 09:57:21.807501619 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 14:12:25.836366516 -0400 @@ -70,6 +70,16 @@ struct nfsd4_cb_sequence { struct nfs4_client *cbs_clp; }; @@ -1558,7 +1570,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st nfs4_put_stateowner(struct nfs4_stateowner *so) diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h --- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 09:57:23.994379831 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 14:12:25.837387292 -0400 @@ -381,6 
+381,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; @@ -1600,7 +1612,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h --- linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 09:57:23.994379831 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 14:12:25.838377224 -0400 @@ -40,12 +40,12 @@ struct nfs_fhbase_old { * This is the new flexible, extensible style NFSv2/v3 file handle. * by Neil Brown - March 2000 @@ -1619,7 +1631,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch * This might allow a file to be confirmed to be in a writable part of a diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sunrpc/cache.c --- linux-2.6.34.noarch/net/sunrpc/cache.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 09:57:23.995376793 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 14:12:25.839376838 -0400 @@ -49,11 +49,17 @@ static void cache_init(struct cache_head h->last_refresh = now; } @@ -1686,7 +1698,7 @@ diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sun /* entry is valid */ diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/sunrpc/svcsock.c --- linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 09:57:23.997368707 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 14:12:25.840384371 -0400 @@ -547,7 +547,6 @@ static int svc_udp_recvfrom(struct svc_r dprintk("svc: recvfrom returned error %d\n", -err); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); @@ -1753,7 +1765,7 
@@ diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/s error: diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/sunrpc/svc_xprt.c --- linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 09:57:23.996377209 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 14:12:25.841371223 -0400 @@ -744,8 +744,10 @@ int svc_recv(struct svc_rqst *rqstp, lon if (rqstp->rq_deferred) { svc_xprt_received(xprt); @@ -1782,7 +1794,7 @@ diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/ void svc_close_xprt(struct svc_xprt *xprt) diff -up linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c --- linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 09:57:23.998377481 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 14:12:25.842376584 -0400 @@ -566,7 +566,6 @@ static int rdma_read_complete(struct svc ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, rqstp->rq_arg.head[0].iov_len); From 268a34d036fc07cca40dcb828de2ef224502ce8c Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Tue, 24 Aug 2010 14:49:23 -0400 Subject: [PATCH 13/20] Removed the localversion-pnfs file from the pnfs patch Signed-off-by: Steve Dickson --- kernel.spec | 2 +- pnfs-all-2.6.35-2010-08-19-f13.patch | 395 +++++++++++++-------------- 2 files changed, 196 insertions(+), 201 deletions(-) diff --git a/kernel.spec b/kernel.spec index 6e4442efc..2a47977aa 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) 
# -%define buildid .pnfs_all_2.6.35_2010_08_19 +%define buildid .pnfs34.2010.08.19 ################################################################### # The buildid can also be specified on the rpmbuild command line diff --git a/pnfs-all-2.6.35-2010-08-19-f13.patch b/pnfs-all-2.6.35-2010-08-19-f13.patch index a9d78ba0e..10df9b15c 100644 --- a/pnfs-all-2.6.35-2010-08-19-f13.patch +++ b/pnfs-all-2.6.35-2010-08-19-f13.patch @@ -1,6 +1,6 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c ---- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-23 12:08:27.310584826 -0400 -+++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-23 12:09:03.273553977 -0400 +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-24 14:14:03.643355000 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-24 14:17:48.415730000 -0400 @@ -13,6 +13,7 @@ #include #include @@ -11,7 +11,7 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arc #include "os.h" diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c --- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/block/genhd.c 2010-08-23 12:09:03.273553977 -0400 ++++ linux-2.6.34.noarch/block/genhd.c 2010-08-24 14:17:48.421730000 -0400 @@ -1009,6 +1009,7 @@ static void disk_release(struct device * struct class block_class = { .name = "block", @@ -21,8 +21,8 @@ diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd. 
static char *block_devnode(struct device *dev, mode_t *mode) { diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt ---- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-23 12:09:03.274563927 -0400 -+++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-23 12:09:03.274563927 -0400 +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-24 14:17:48.423729000 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-24 14:17:48.425730000 -0400 @@ -0,0 +1,211 @@ +(c) 2007 Network Appliance Inc. + @@ -237,7 +237,7 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6. + diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c --- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-23 12:09:03.275584050 -0400 ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-24 14:17:48.430730000 -0400 @@ -651,6 +651,12 @@ static int dev_create(struct dm_ioctl *p return r; } @@ -292,7 +292,7 @@ diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/driv int r; diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c --- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-23 12:09:03.276563906 -0400 ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-24 14:17:48.435733000 -0400 @@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct put_device(&class_to_shost(dev)->shost_gendev); } @@ -304,7 +304,7 @@ diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drive }; diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h --- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 
-0400 -+++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-23 12:09:03.277563890 -0400 ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-24 14:17:48.440733000 -0400 @@ -36,13 +36,9 @@ #include #include @@ -360,8 +360,8 @@ diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/ + #endif diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c ---- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-23 12:09:03.278386746 -0400 -+++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-23 12:09:03.278386746 -0400 +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-24 14:17:48.444731000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-24 14:17:48.446730000 -0400 @@ -0,0 +1,396 @@ +/* + * export.c - Implementation of the pnfs_export_operations @@ -761,7 +761,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs +} diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c --- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-23 12:09:03.279502002 -0400 ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-24 14:17:48.452730000 -0400 @@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) if (unlikely(wait_obj_created(oi))) goto fail; @@ -781,7 +781,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/ * Fill in an inode read from the OSD and set it up for use diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild --- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-23 12:09:03.279502002 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-24 14:17:48.457733000 -0400 @@ -13,4 +13,5 @@ # @@ -790,7 +790,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/K obj-$(CONFIG_EXOFS_FS) += exofs.o diff -up 
linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig --- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-23 12:09:03.280553663 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-24 14:17:48.462739000 -0400 @@ -1,6 +1,7 @@ config EXOFS_FS tristate "exofs: OSD based file system support" @@ -801,7 +801,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/ as its backing storage. diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c --- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-23 12:09:03.281511951 -0400 ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-24 14:17:48.468730000 -0400 @@ -621,6 +621,7 @@ static int exofs_fill_super(struct super sb->s_fs_info = sbi; sb->s_op = &exofs_sops; @@ -812,7 +812,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/ EXOFS_ERR("ERROR: exofs_iget failed\n"); diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c --- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-23 12:09:03.282511528 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-24 14:17:48.473730000 -0400 @@ -16,6 +16,13 @@ #include #include @@ -829,7 +829,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exp diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile --- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-23 12:09:03.282511528 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-24 14:17:48.478733000 -0400 @@ -3,4 +3,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs.o @@ -840,8 +840,8 @@ diff 
-up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/ex +exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o +exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c ---- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +--- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-24 14:17:48.482731000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-24 14:17:48.484734000 -0400 @@ -0,0 +1,158 @@ +/* + * linux/fs/nfsd/nfs4blocklayoutxdr.c @@ -1002,8 +1002,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34. +} +EXPORT_SYMBOL_GPL(blocklayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c ---- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-24 14:17:48.487733000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-24 14:17:48.489734000 -0400 @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. 
@@ -1224,8 +1224,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.n +} +EXPORT_SYMBOL(filelayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c ---- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-23 12:09:03.284511493 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-23 12:09:03.284511493 -0400 +--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-24 14:17:48.493729000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-24 14:17:48.494735000 -0400 @@ -0,0 +1,289 @@ +/* + * pnfs_osd_xdr_enc.c @@ -1518,7 +1518,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.no +EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c --- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-23 12:09:03.285539075 -0400 ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-24 14:17:48.499730000 -0400 @@ -19,6 +19,7 @@ #include #include @@ -1539,7 +1539,7 @@ diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gf sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig --- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/Kconfig 2010-08-23 12:09:03.286512316 -0400 ++++ linux-2.6.34.noarch/fs/Kconfig 2010-08-24 14:17:48.505733000 -0400 @@ -224,6 +224,31 @@ config LOCKD_V4 config EXPORTFS tristate @@ -1573,8 +1573,8 @@ diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig tristate select FS_POSIX_ACL diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 
---- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-23 12:09:03.287381619 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-23 12:09:03.287381619 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-24 14:17:48.509734000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-24 14:17:48.511732000 -0400 @@ -0,0 +1,66 @@ +#include +#include @@ -1643,8 +1643,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.or + return; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-23 12:09:03.288501648 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-23 12:09:03.288501648 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-24 14:17:48.514733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-24 14:17:48.516731000 -0400 @@ -0,0 +1,1160 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c @@ -2807,8 +2807,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34. 
+module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-23 12:09:03.289501933 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-23 12:09:03.289501933 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-24 14:17:48.519731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-24 14:17:48.521730000 -0400 @@ -0,0 +1,335 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdev.c @@ -3146,8 +3146,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6. + goto out; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-23 12:09:03.290395707 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-23 12:09:03.290395707 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-24 14:17:48.523733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-24 14:17:48.525730000 -0400 @@ -0,0 +1,120 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c @@ -3270,8 +3270,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.3 + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-23 12:09:03.290395707 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-23 12:09:03.291501560 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-24 14:17:48.528729000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-24 14:17:48.529735000 -0400 @@ -0,0 +1,303 
@@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -3577,8 +3577,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34. + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-23 12:09:03.292511531 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-23 12:09:03.292511531 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-24 14:17:48.532731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-24 14:17:48.534734000 -0400 @@ -0,0 +1,948 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -4529,8 +4529,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noar + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile ---- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-23 12:09:03.292511531 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-23 12:09:03.293491476 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-24 14:17:48.537729000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-24 14:17:48.538739000 -0400 @@ -0,0 +1,6 @@ +# +# Makefile for the pNFS block layout driver kernel module @@ -4540,7 +4540,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarc + extents.o block-device-discovery-pipe.o diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h --- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-23 12:09:03.293491476 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-24 14:17:48.544730000 -0400 @@ -8,6 +8,8 @@ #ifndef __LINUX_FS_NFS_CALLBACK_H #define __LINUX_FS_NFS_CALLBACK_H @@ -4613,7 +4613,7 @@ diff -up 
linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/c extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c --- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-23 12:09:03.294522414 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-24 14:17:48.562731000 -0400 @@ -8,10 +8,15 @@ #include #include @@ -5096,7 +5096,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/ return status; diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c --- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-23 12:09:03.295502055 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-24 14:17:48.568730000 -0400 @@ -22,6 +22,8 @@ #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) @@ -5298,8 +5298,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/n .process_op = (callback_process_op_t)nfs4_callback_sequence, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c ---- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-23 12:08:29.037481540 -0400 -+++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-23 12:09:03.297501650 -0400 +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-24 14:14:13.062705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-24 14:17:48.575730000 -0400 @@ -39,6 +39,7 @@ #include #include @@ -5508,8 +5508,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/cli goto error; diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c ---- 
linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-23 12:09:03.297501650 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-23 12:09:03.298501447 -0400 +--- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-24 14:17:48.578729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-24 14:17:48.579735000 -0400 @@ -0,0 +1,292 @@ +#if defined(CONFIG_SPNFS_BLOCK) + @@ -5804,8 +5804,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/b +} +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c ---- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-23 12:09:03.299501445 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-23 12:09:03.299501445 -0400 +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-24 14:17:48.584729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-24 14:17:48.586730000 -0400 @@ -0,0 +1,1672 @@ +/* + * bl_ops.c @@ -7480,8 +7480,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/b + +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c ---- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-23 12:08:29.037481540 -0400 -+++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-23 12:09:03.300491952 -0400 +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-24 14:14:13.068705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-24 14:17:48.592730000 -0400 @@ -104,7 +104,8 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) @@ -7558,7 +7558,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs rcu_read_unlock(); diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h --- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-23 12:09:03.301431797 -0400 ++++ 
linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-24 14:17:48.597733000 -0400 @@ -34,9 +34,7 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -7571,8 +7571,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c ---- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-23 12:08:29.089481525 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 12:09:03.302511603 -0400 +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-24 14:14:13.612707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-24 14:17:48.604730000 -0400 @@ -17,11 +17,19 @@ #include #include @@ -7750,7 +7750,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e svcauth_unix_purge(); diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/direct.c --- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-23 12:09:03.303491500 -0400 ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-24 14:17:48.610730000 -0400 @@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea .rpc_release = nfs_direct_read_release, }; @@ -7996,7 +7996,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/dir user_addr += bytes; diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig --- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-23 12:09:03.304505472 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-24 14:17:48.616730000 -0400 @@ -79,3 +79,52 @@ config NFSD_V4 available from 
http://linux-nfs.org/. @@ -8052,7 +8052,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kc + If unsure, say N. diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile --- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-23 12:09:03.304505472 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-24 14:17:48.621733000 -0400 @@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ @@ -8062,8 +8062,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/M +nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o +nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-23 12:08:29.090501507 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 12:09:03.306491345 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-24 14:14:13.618705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-24 14:17:48.628730000 -0400 @@ -40,7 +40,6 @@ #define NFSPROC4_CB_NULL 0 @@ -8603,8 +8603,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ +} +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-23 12:09:03.307491492 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-23 12:09:03.308491262 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-24 14:17:48.633729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-24 14:17:48.641730000 -0400 @@ -0,0 +1,1679 @@ +/****************************************************************************** + 
* @@ -10286,8 +10286,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfs + return status; +} diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-23 12:09:03.309501439 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-23 12:09:03.309501439 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-24 14:17:48.645731000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-24 14:17:48.647730000 -0400 @@ -0,0 +1,461 @@ +/****************************************************************************** + * @@ -10751,8 +10751,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/n +}; +EXPORT_SYMBOL(pnfs_dlm_export_ops); diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-23 12:09:03.310501390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-23 12:09:03.310501390 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-24 14:17:48.651729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-24 14:17:48.652735000 -0400 @@ -0,0 +1,620 @@ +/* +* linux/fs/nfsd/nfs4pnfsds.c @@ -11375,8 +11375,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nf + +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-23 12:08:29.091491685 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 12:09:03.311501496 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-24 14:14:13.623707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-24 14:17:48.658733000 -0400 @@ -34,10 +34,14 @@ */ #include @@ -11851,8 +11851,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd 
static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-23 12:08:29.093491375 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 12:09:03.313491310 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-24 14:14:13.632707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-24 14:17:48.667732000 -0400 @@ -42,6 +42,8 @@ #include "xdr4.h" #include "vfs.h" @@ -12368,8 +12368,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 12:09:03.315491356 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-24 14:14:13.639707000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-24 14:17:48.675730000 -0400 @@ -47,9 +47,14 @@ #include #include @@ -12988,8 +12988,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/ [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c ---- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-23 12:08:29.094491943 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 12:09:03.317501495 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-24 14:14:13.645705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-24 14:17:48.681730000 -0400 @@ -13,10 +13,15 @@ #include #include @@ -13166,8 +13166,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 
linux-2.6.34.noarch/fs/nfsd/nfsd.h ---- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-23 12:08:29.095491390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 12:09:03.318355741 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-24 14:14:13.651705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-24 14:17:48.687730000 -0400 @@ -285,11 +285,17 @@ extern time_t nfsd4_grace; #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ NFSD4_SUPPORTED_ATTRS_WORD0 @@ -13189,7 +13189,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs { diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c --- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-23 12:09:03.319511586 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-24 14:17:48.693730000 -0400 @@ -10,6 +10,7 @@ #include @@ -13227,7 +13227,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nf __u32 tfh[2]; diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h --- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-23 12:09:03.319511586 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-24 14:17:48.698733000 -0400 @@ -14,6 +14,7 @@ enum nfsd_fsid { FSID_UUID8, FSID_UUID16, @@ -13280,8 +13280,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nf + #endif /* _LINUX_NFSD_FH_INT_H */ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c ---- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-23 12:08:27.631563969 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-23 12:09:03.320416974 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-24 14:14:06.365163000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-24 14:17:48.704731000 -0400 @@ -115,7 +115,7 @@ struct 
svc_program nfsd_program = { }; @@ -13292,8 +13292,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/n int nfsd_vers(int vers, enum vers_op change) { diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h ---- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-23 12:09:03.321376171 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-23 12:09:03.321376171 -0400 +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-24 14:17:48.708729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-24 14:17:48.710730000 -0400 @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2005 The Regents of the University of Michigan. @@ -13439,8 +13439,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pn + +#endif /* LINUX_NFSD_PNFSD_H */ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c ---- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-23 12:09:03.321376171 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-23 12:09:03.322501672 -0400 +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-24 14:17:48.713731000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-24 14:17:48.715730000 -0400 @@ -0,0 +1,225 @@ +/* + * linux/fs/nfsd/pnfs_lexp.c @@ -13668,8 +13668,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nf + inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; +} diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c ---- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-23 12:09:03.322501672 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-23 12:09:03.323511608 -0400 +--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-24 14:17:48.719729000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-24 14:17:48.720735000 -0400 @@ -0,0 +1,535 @@ +/* + * fs/nfsd/spnfs_com.c @@ -14207,8 +14207,8 @@ diff -up 
linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfs +} +#endif /* CONFIG_PROC_FS */ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c ---- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-23 12:09:03.324501390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-23 12:09:03.324501390 -0400 +--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-24 14:17:48.724733000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-24 14:17:48.726730000 -0400 @@ -0,0 +1,878 @@ +/* + * fs/nfsd/spnfs_ops.c @@ -15089,8 +15089,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfs + return 0; +} diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h ---- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-23 12:08:29.096512142 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 12:09:03.325501424 -0400 +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-24 14:14:13.656705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-24 14:17:48.731738000 -0400 @@ -242,6 +242,12 @@ struct nfs4_client { u32 cl_cb_seq_nr; struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ @@ -15207,8 +15207,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st + #endif /* NFSD4_STATE_H */ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c ---- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-23 12:08:27.632564132 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-23 12:09:03.326501490 -0400 +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-24 14:14:06.371160000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-24 14:17:48.737742000 -0400 @@ -37,7 +37,12 @@ #ifdef CONFIG_NFSD_V4 #include @@ -15335,8 +15335,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs. 
out_nfserr: err = nfserrno(host_err); diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h ---- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-23 12:08:29.097425997 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 12:09:03.327451643 -0400 +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-24 14:14:13.661705000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-24 14:17:48.743747000 -0400 @@ -37,6 +37,8 @@ #ifndef _LINUX_NFSD_XDR4_H #define _LINUX_NFSD_XDR4_H @@ -15413,8 +15413,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr struct nfs4_replay * replay; }; diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c ---- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-23 12:08:29.039491912 -0400 -+++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-23 12:09:03.328501680 -0400 +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-24 14:14:13.079708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-24 14:17:48.749746000 -0400 @@ -28,6 +28,7 @@ #include #include @@ -15540,8 +15540,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file. 
if (!ret) return VM_FAULT_LOCKED; diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c ---- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-23 12:08:29.042511552 -0400 -+++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-23 12:09:03.329501644 -0400 +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-24 14:14:13.095705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-24 14:17:48.757730000 -0400 @@ -48,6 +48,7 @@ #include "internal.h" #include "fscache.h" @@ -15755,8 +15755,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inod nfs_fs_proc_exit(); nfsiod_stop(); diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h ---- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-23 12:08:29.042511552 -0400 -+++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-23 12:09:03.330502148 -0400 +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-24 14:14:13.100708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-24 14:17:48.763734000 -0400 @@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv struct nfs_fattr *); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); @@ -15817,7 +15817,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/i struct page *, struct page *); diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig --- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-23 12:09:03.331395814 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-24 14:17:48.769730000 -0400 @@ -79,10 +79,48 @@ config NFS_V4_1 depends on NFS_V4 && EXPERIMENTAL help @@ -15870,7 +15870,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kcon depends on NFS_FS=y && IP_PNP diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile --- linux-2.6.34.noarch/fs/nfs/Makefile.orig 
2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-23 12:09:03.331395814 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-24 14:17:48.774730000 -0400 @@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ @@ -15885,8 +15885,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Mak +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c ---- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-23 12:08:29.045525837 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-23 12:09:03.332511640 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-24 14:14:13.119708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-24 14:17:48.780730000 -0400 @@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, @@ -15896,8 +15896,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/n .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-23 12:09:03.333512111 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-23 12:09:03.334491472 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-24 14:17:48.784731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-24 14:17:48.786730000 -0400 @@ -0,0 +1,765 @@ +/* + * linux/fs/nfs/nfs4filelayout.c @@ -16665,8 +16665,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 
linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-23 12:09:03.334491472 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-23 12:09:03.335501543 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-24 14:17:48.790731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-24 14:17:48.792730000 -0400 @@ -0,0 +1,636 @@ +/* + * linux/fs/nfs/nfs4filelayoutdev.c @@ -17305,8 +17305,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch +} + diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-23 12:09:03.335501543 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-23 12:09:03.335501543 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-24 14:17:48.795731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-24 14:17:48.796742000 -0400 @@ -0,0 +1,97 @@ +/* + * pnfs_nfs4filelayout.h @@ -17406,8 +17406,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h ---- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-23 12:08:29.047512264 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-23 12:09:03.336490079 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-24 14:14:13.130705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-24 14:17:48.802730000 -0400 @@ -45,8 +45,28 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, @@ -17556,8 +17556,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nf /* nfs4xdr.c */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c ---- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-23 
12:08:29.050481368 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-23 12:09:03.339481253 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-24 14:14:13.143709000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-24 14:17:48.811734000 -0400 @@ -49,12 +49,15 @@ #include #include @@ -19223,7 +19223,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/n .setattr = nfs4_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c --- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-23 12:09:03.341491726 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-24 14:17:48.818733000 -0400 @@ -54,17 +54,17 @@ void nfs4_renew_state(struct work_struct *work) @@ -19246,8 +19246,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c ---- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-23 12:08:29.052491341 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-23 12:09:03.342373443 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-24 14:14:13.150705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-24 14:17:48.825730000 -0400 @@ -53,6 +53,9 @@ #include "callback.h" #include "delegation.h" @@ -19566,8 +19566,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/ test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c ---- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-23 12:08:29.054481400 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-23 12:09:03.346481283 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 
2010-08-24 14:14:13.159705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-24 14:17:48.834738000 -0400 @@ -50,8 +50,11 @@ #include #include @@ -21078,8 +21078,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nf }; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild ---- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-23 12:09:03.348511665 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-23 12:09:03.348511665 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-24 14:17:48.839734000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-24 14:17:48.840742000 -0400 @@ -0,0 +1,11 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module @@ -21093,8 +21093,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs +panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o +obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-23 12:09:03.349501459 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-23 12:09:03.349501459 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-24 14:17:48.843735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-24 14:17:48.845739000 -0400 @@ -0,0 +1,1087 @@ +/* + * objio_osd.c @@ -22184,8 +22184,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noar +module_init(objlayout_init); +module_exit(objlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-23 12:09:03.350491564 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-23 12:09:03.350491564 -0400 
+--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-24 14:17:48.848735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-24 14:17:48.851730000 -0400 @@ -0,0 +1,790 @@ +/* + * objlayout.c @@ -22978,8 +22978,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noar + .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, +}; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-23 12:09:03.351434439 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-23 12:09:03.351434439 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-24 14:17:48.852735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-24 14:17:48.854746000 -0400 @@ -0,0 +1,171 @@ +/* + * objlayout.h @@ -23153,8 +23153,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noar + +#endif /* _OBJLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-23 12:09:03.352501716 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-23 12:09:03.352501716 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-24 14:17:48.857735000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-24 14:17:48.860740000 -0400 @@ -0,0 +1,734 @@ +/* + * panfs_shim.c @@ -23891,8 +23891,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noa +module_init(panlayout_init); +module_exit(panlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-23 12:09:03.353501685 -0400 -+++ 
linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-23 12:09:03.353501685 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-24 14:17:48.863734000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-24 14:17:48.864730000 -0400 @@ -0,0 +1,482 @@ +/* + * panfs_shim.h @@ -24377,8 +24377,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noa + +#endif /* _PANLAYOUT_PANFS_SHIM_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-23 12:09:03.354501721 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-23 12:09:03.354501721 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-24 14:17:48.868731000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-24 14:17:48.869739000 -0400 @@ -0,0 +1,435 @@ +/* + * pnfs_osd_xdr.c @@ -24816,8 +24816,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6. 
+ return 0; +} diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c ---- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-23 12:08:29.056411363 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-23 12:09:03.355511659 -0400 +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-24 14:14:13.169705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-24 14:17:48.875733000 -0400 @@ -20,6 +20,7 @@ #include @@ -24940,8 +24940,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/p if (res == INT_MAX) goto out; diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c ---- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-23 12:09:03.356501413 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-23 12:09:03.357481204 -0400 +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-24 14:17:48.880733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-24 14:17:48.883730000 -0400 @@ -0,0 +1,2027 @@ +/* + * linux/fs/nfs/pnfs.c @@ -26971,8 +26971,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs. +} +EXPORT_SYMBOL(nfs4_put_deviceid_cache); diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h ---- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-23 12:09:03.358501440 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-23 12:09:03.358501440 -0400 +--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-24 14:17:48.886733000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-24 14:17:48.887735000 -0400 @@ -0,0 +1,355 @@ +/* + * fs/nfs/pnfs.h @@ -27330,8 +27330,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs. 
+ +#endif /* FS_NFS_PNFS_H */ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c ---- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-23 12:08:29.057511533 -0400 -+++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-23 12:09:03.359501471 -0400 +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-24 14:14:13.174707000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-24 14:17:48.893730000 -0400 @@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru fattr = nfs_alloc_fattr(); status = -ENOMEM; @@ -27359,8 +27359,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc. .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c ---- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-23 12:08:29.057511533 -0400 -+++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-23 12:09:03.359501471 -0400 +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-24 14:14:13.179708000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-24 14:17:48.899733000 -0400 @@ -18,8 +18,12 @@ #include #include @@ -27575,8 +27575,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read. 
nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c ---- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-23 12:08:29.059491391 -0400 -+++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-23 12:09:03.361501458 -0400 +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-24 14:14:13.186707000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-24 14:17:48.907729000 -0400 @@ -64,6 +64,7 @@ #include "iostat.h" #include "internal.h" @@ -27624,8 +27624,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/supe #endif diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c ---- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-23 12:08:29.060501485 -0400 -+++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-23 12:09:03.362419975 -0400 +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-24 14:14:13.192705000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-24 14:17:48.913730000 -0400 @@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task struct nfs_unlinkdata *data = calldata; struct nfs_server *server = NFS_SERVER(data->dir); @@ -27636,8 +27636,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unl return; rpc_call_start(task); diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c ---- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-23 12:08:27.630563929 -0400 -+++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-23 12:09:03.364491337 -0400 +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-24 14:14:06.360160000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-24 14:17:48.921712000 -0400 @@ -20,6 +20,7 @@ #include #include @@ -28326,7 +28326,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/writ int nfs_wb_page_cancel(struct inode *inode, struct page *page) diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig 
linux-2.6.34.noarch/include/linux/exportfs.h --- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-23 12:09:03.365501459 -0400 ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-24 14:17:48.933713000 -0400 @@ -2,6 +2,7 @@ #define LINUX_EXPORTFS_H 1 @@ -28399,8 +28399,8 @@ diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/i +#endif /* CONFIG_PNFSD */ #endif /* LINUX_EXPORTFS_H */ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h ---- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-23 12:09:03.367491365 -0400 -+++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-23 12:09:03.367491365 -0400 +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-24 14:17:48.945690000 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-24 14:17:48.946693000 -0400 @@ -0,0 +1,141 @@ +#ifndef _LINUX_EXP_XDR_H +#define _LINUX_EXP_XDR_H @@ -28544,8 +28544,8 @@ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/in +} +#endif /* _LINUX_EXP_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h ---- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-23 12:08:29.021511898 -0400 -+++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-23 12:09:03.369481147 -0400 +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-24 14:14:13.014707000 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-24 14:17:48.961675000 -0400 @@ -387,6 +387,7 @@ struct inodes_stat_t { #include @@ -28564,7 +28564,7 @@ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include struct dentry *s_root; diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h --- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ 
linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-23 12:09:03.371491472 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-24 14:17:48.974681000 -0400 @@ -17,7 +17,10 @@ #define NFS4_BITMAP_SIZE 2 @@ -28694,8 +28694,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/inclu #endif diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h ---- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-23 12:09:03.372501550 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-23 12:09:03.372501550 -0400 +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-24 14:17:48.986670000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-24 14:17:48.989666000 -0400 @@ -0,0 +1,330 @@ +/* + * include/linux/nfs4_pnfs.h @@ -29028,8 +29028,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/ + +#endif /* LINUX_NFS4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h ---- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-23 12:09:03.373491892 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-23 12:09:03.374491393 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-24 14:17:48.998668000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-24 14:17:49.000665000 -0400 @@ -0,0 +1,101 @@ +#ifndef NFSD4_BLOCK +#define NFSD4_BLOCK @@ -29133,8 +29133,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarc +#endif /* NFSD4_BLOCK */ + diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h ---- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-23 12:09:03.375501481 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-23 12:09:03.375501481 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 
2010-08-24 14:17:49.012664000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-24 14:17:49.013671000 -0400 @@ -0,0 +1,345 @@ +/* + * include/linux/nfsd4_spnfs.h @@ -29483,7 +29483,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarc +#endif /* NFS_SPNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h --- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-23 12:09:03.376401789 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-24 14:17:49.018668000 -0400 @@ -29,6 +29,7 @@ #ifdef __KERNEL__ @@ -29494,7 +29494,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch * Largest number of bytes we need to allocate for an NFS diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h --- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-23 12:09:03.376401789 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-24 14:17:49.024673000 -0400 @@ -32,6 +32,8 @@ #define NFSDDBG_REPCACHE 0x0080 #define NFSDDBG_XDR 0x0100 @@ -29506,7 +29506,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h --- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-23 12:09:03.377481954 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-24 14:17:49.030665000 -0400 @@ -100,6 +100,7 @@ struct svc_export { uid_t ex_anon_uid; gid_t ex_anon_gid; @@ -29516,8 +29516,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 
linux-2.6.34.noarc struct nfsd4_fs_locations ex_fslocs; int ex_nflavors; diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-23 12:09:03.377481954 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-23 12:09:03.378501747 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-24 14:17:49.033666000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-24 14:17:49.034665000 -0400 @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29652,8 +29652,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.3 + +#endif /* NFSD_NFS4LAYOUTXDR_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-23 12:09:03.378501747 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-23 12:09:03.378501747 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-24 14:17:49.037666000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-24 14:17:49.039665000 -0400 @@ -0,0 +1,54 @@ +/****************************************************************************** + * @@ -29710,8 +29710,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34. 
+ +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-23 12:09:03.379487099 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-23 12:09:03.379487099 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-24 14:17:49.042666000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-24 14:17:49.044665000 -0400 @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29986,7 +29986,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.n +#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h --- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-23 12:09:03.380502500 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-24 14:17:49.049665000 -0400 @@ -29,6 +29,7 @@ /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ @@ -30024,8 +30024,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noar union nfsctl_res { diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h ---- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-23 12:08:29.061494081 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-23 12:09:03.381511751 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-24 14:14:13.201710000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-24 14:17:49.063666000 -0400 @@ -72,13 +72,20 @@ struct nfs_access_entry { int mask; }; @@ -30124,8 +30124,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 
linux-2.6.34.noarch/inc #ifdef __KERNEL__ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h ---- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-23 12:08:29.062501618 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-23 12:09:03.383491395 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-24 14:14:13.206708000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-24 14:17:49.077665000 -0400 @@ -15,6 +15,7 @@ struct nlm_host; struct nfs4_sequence_args; struct nfs4_sequence_res; @@ -30200,7 +30200,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/ atomic_t active; /* Keep trace of any activity to this server */ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h --- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-23 12:09:03.384501540 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-24 14:17:49.089668000 -0400 @@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { NFSIOS_SHORTREAD, NFSIOS_SHORTWRITE, @@ -30213,7 +30213,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h --- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-23 12:09:03.385491518 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-24 14:17:49.103665000 -0400 @@ -39,6 +39,7 @@ struct nfs_page { struct list_head wb_list; /* Defines state of page: */ struct page *wb_page; /* page to read in/write out */ @@ -30262,8 +30262,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/i struct inode *inode, int (*doio)(struct inode *, 
struct list_head *, unsigned int, size_t, int), diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h ---- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-23 12:08:29.062501618 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-23 12:09:03.387491422 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-24 14:14:13.211708000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-24 14:17:49.116665000 -0400 @@ -3,6 +3,8 @@ #include @@ -30415,8 +30415,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/in extern struct rpc_version nfs_version3; extern struct rpc_version nfs_version4; diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h ---- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-23 12:09:03.388491527 -0400 -+++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-23 12:09:03.388491527 -0400 +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-24 14:17:49.128664000 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-24 14:17:49.129670000 -0400 @@ -0,0 +1,57 @@ +#ifndef _PANFS_SHIM_API_H +#define _PANFS_SHIM_API_H @@ -30476,8 +30476,8 @@ diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.no + +#endif /* _PANFS_SHIM_API_H */ diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h ---- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-23 12:09:03.390501461 -0400 -+++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-23 12:09:03.390501461 -0400 +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-24 14:17:49.141664000 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-24 14:17:49.142670000 -0400 @@ -0,0 +1,440 @@ +/* + * pnfs_osd_xdr.h @@ -30920,8 +30920,8 @@ diff -up 
linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noar + +#endif /* __PNFS_OSD_XDR_H__ */ diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_xdr.h ---- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-23 12:09:03.391491550 -0400 -+++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-23 12:09:03.391491550 -0400 +--- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-24 14:17:49.153666000 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-24 14:17:49.155665000 -0400 @@ -0,0 +1,134 @@ +/* + * include/linux/pnfs_xdr.h @@ -31059,7 +31059,7 @@ diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/i +#endif /* LINUX_PNFS_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h --- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-23 12:09:03.393501437 -0400 ++++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-24 14:17:49.168668000 -0400 @@ -8,6 +8,7 @@ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H @@ -31070,7 +31070,7 @@ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/ #define ACL_UNDEFINED_ID (-1) diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h --- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-23 12:09:03.393501437 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-24 14:17:49.174665000 -0400 @@ -14,6 +14,8 @@ /* size of an XDR encoding unit in bytes, i.e. 
32bit */ #define XDR_UNIT (4) @@ -31082,7 +31082,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.n diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h --- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-23 12:09:03.394512138 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-24 14:17:49.179667000 -0400 @@ -3,6 +3,7 @@ #ifdef __KERNEL__ @@ -31103,8 +31103,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.3 struct rpc_pipe_ops { diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h ---- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-23 12:09:03.394512138 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-23 12:09:03.395501822 -0400 +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-24 14:17:49.183664000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-24 14:17:49.184674000 -0400 @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2008 The Regents of the University of Michigan. 
@@ -31219,7 +31219,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux +#endif /* _SIMPLE_RPC_PIPEFS_H_ */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h --- linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-23 12:09:03.395501822 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-24 14:17:49.190665000 -0400 @@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con return buf; @@ -31263,8 +31263,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.n +} #endif /* SUNRPC_SVC_XPRT_H */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h ---- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-23 12:08:29.066475323 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-23 12:09:03.396464612 -0400 +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-24 14:14:13.258707000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-24 14:17:49.195672000 -0400 @@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) return p + 2; } @@ -31287,14 +31287,9 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); -diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs ---- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-23 12:09:03.396464612 -0400 -+++ linux-2.6.34.noarch/localversion-pnfs 2010-08-23 12:09:03.396464612 -0400 -@@ -0,0 +1 @@ -+-pnfs diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile --- 
linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-23 12:09:03.397501662 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-24 14:17:49.204668000 -0400 @@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ @@ -31305,8 +31300,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/su sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c ---- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-23 12:09:03.398522348 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-23 12:09:03.398522348 -0400 +--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-24 14:17:49.208664000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-24 14:17:49.209670000 -0400 @@ -0,0 +1,424 @@ +/* + * net/sunrpc/simple_rpc_pipefs.c @@ -31733,8 +31728,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.no +} +EXPORT_SYMBOL(pipefs_generic_destroy_msg); diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c ---- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-23 12:08:29.081501640 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-23 12:09:03.399443371 -0400 +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-24 14:14:13.447705000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-24 14:17:49.215665000 -0400 @@ -403,16 +403,14 @@ xdr_shrink_pagelen(struct xdr_buf *buf, /* Shift the tail first */ From 93be1cd0134bac1b112038c2c6376d69e4511197 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Tue, 24 Aug 2010 15:13:05 -0400 Subject: [PATCH 14/20] set the kernel flags --with firmware --with debuginfo --without vdso_install --without 
debug --without headers Signed-off-by: Steve Dickson --- kernel.spec | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel.spec b/kernel.spec index 2a47977aa..f3e776e20 100644 --- a/kernel.spec +++ b/kernel.spec @@ -101,23 +101,23 @@ Summary: The Linux kernel # kernel-smp (only valid for ppc 32-bit) %define with_smp %{?_without_smp: 0} %{?!_without_smp: 1} # kernel-debug -%define with_debug %{?_without_debug: 0} %{?!_without_debug: 1} +%define with_debug %{?_without_debug: 0} %{?!_without_debug: 0} # kernel-doc -%define with_doc %{?_without_doc: 0} %{?!_without_doc: 1} +%define with_doc %{?_without_doc: 0} %{?!_without_doc: 0} # kernel-headers -%define with_headers %{?_without_headers: 0} %{?!_without_headers: 1} +%define with_headers %{?_without_headers: 0} %{?!_without_headers: 0} # kernel-firmware %define with_firmware %{?_with_firmware: 1} %{?!_with_firmware: 1} # tools/perf -%define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 1} +%define with_perftool %{?_without_perftool: 0} %{?!_without_perftool: 0} # perf noarch subpkg -%define with_perf %{?_without_perf: 0} %{?!_without_perf: 1} +%define with_perf %{?_without_perf: 0} %{?!_without_perf: 0} # kernel-debuginfo -%define with_debuginfo %{?_without_debuginfo: 0} %{?!_without_debuginfo: 1} +%define with_debuginfo %{?_without_debuginfo: 1} %{?!_without_debuginfo: 1} # kernel-bootwrapper (for creating zImages from kernel + initrd) %define with_bootwrapper %{?_without_bootwrapper: 0} %{?!_without_bootwrapper: 1} # Want to build a the vsdo directories installed -%define with_vdso_install %{?_without_vdso_install: 0} %{?!_without_vdso_install: 1} +%define with_vdso_install %{?_without_vdso_install: 0} %{?!_without_vdso_install: 0} # Build the kernel-doc package, but don't fail the build if it botches. # Here "true" means "continue" and "false" means "fail the build". 
From 27f38a2984d252110bc12e5f2938f55701c22493 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Mon, 23 Aug 2010 12:20:57 -0400 Subject: [PATCH 15/20] Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 Signed-off-by: Steve Dickson --- kernel.spec | 2 +- nfsd-35-fc.patch | 62 +++ pnfs-all-2.6.35-2010-08-19-f13.patch | 550 +++++++++++++++++++++++++++ 3 files changed, 613 insertions(+), 1 deletion(-) diff --git a/kernel.spec b/kernel.spec index f3e776e20..f9c6ff212 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) # -%define buildid .pnfs34.2010.08.19 +%define buildid .pnfs_all_2.6.35_2010_08_19 ################################################################### # The buildid can also be specified on the rpmbuild command line diff --git a/nfsd-35-fc.patch b/nfsd-35-fc.patch index 2825464af..9a97fc6ec 100644 --- a/nfsd-35-fc.patch +++ b/nfsd-35-fc.patch @@ -1,6 +1,10 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt --- linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 14:12:24.165356789 -0400 +======= ++++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 09:57:18.233564439 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | | READ | REQ | | Section 18.22 | | READDIR | REQ | | Section 18.23 | @@ -12,7 +16,11 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig | RENAME | REQ | | Section 18.26 | diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c --- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ 
linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 14:12:24.519356675 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 09:57:18.234564075 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_ca .alloc = expkey_alloc, }; @@ -108,7 +116,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e out_put_clp: diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c --- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 14:12:52.625429773 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 10:00:37.257414684 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -79,11 +79,6 @@ enum nfs_cb_opnum4 { cb_sequence_dec_sz + \ op_dec_sz) @@ -211,7 +223,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ int status; - status = rpc_call_async(cb->cb_client, &msg, +<<<<<<< HEAD + status = rpc_call_async(clp->cl_cb_client, &msg, +======= ++ status = rpc_call_async(cb->cl_cb_client, &msg, +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 RPC_TASK_SOFT | RPC_TASK_SOFTCONN, &nfsd4_cb_probe_ops, (void *)clp); - if (status) { @@ -402,7 +418,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c --- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 14:12:25.698356909 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 09:57:18.237376763 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[ static const 
char *nfsd4_op_name(unsigned opnum); @@ -490,7 +510,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c --- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 14:12:25.700356284 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 09:57:18.240356512 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -45,8 +45,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -1280,6 +1304,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs -{ - user_lease_time = leasetime; -} +<<<<<<< HEAD diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c --- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-23 14:14:22.882428704 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 14:14:33.418376589 -0400 @@ -1295,6 +1320,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c --- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 14:12:25.821359224 -0400 +======= +diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 09:57:20.629370282 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -46,6 +46,7 @@ enum { */ #ifdef CONFIG_NFSD_V4 @@ -1415,7 +1445,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n /* last one */ {""} diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 
linux-2.6.34.noarch/fs/nfsd/nfsd.h --- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 14:12:25.835418441 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 09:57:20.629370282 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -82,7 +82,6 @@ int nfs4_state_init(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); @@ -1452,7 +1486,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs /* diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h --- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 14:12:25.836366516 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 09:57:21.807501619 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -70,6 +70,16 @@ struct nfsd4_cb_sequence { struct nfs4_client *cbs_clp; }; @@ -1570,7 +1608,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st nfs4_put_stateowner(struct nfs4_stateowner *so) diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h --- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 14:12:25.837387292 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 09:57:23.994379831 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -381,6 +381,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; @@ -1612,7 +1654,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h --- 
linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 14:12:25.838377224 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 09:57:23.994379831 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -40,12 +40,12 @@ struct nfs_fhbase_old { * This is the new flexible, extensible style NFSv2/v3 file handle. * by Neil Brown - March 2000 @@ -1631,7 +1677,11 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch * This might allow a file to be confirmed to be in a writable part of a diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sunrpc/cache.c --- linux-2.6.34.noarch/net/sunrpc/cache.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 14:12:25.839376838 -0400 +======= ++++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 09:57:23.995376793 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -49,11 +49,17 @@ static void cache_init(struct cache_head h->last_refresh = now; } @@ -1698,7 +1748,11 @@ diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sun /* entry is valid */ diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/sunrpc/svcsock.c --- linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 14:12:25.840384371 -0400 +======= ++++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 09:57:23.997368707 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -547,7 +547,6 @@ static int svc_udp_recvfrom(struct svc_r dprintk("svc: recvfrom returned error %d\n", -err); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); @@ -1765,7 +1819,11 @@ diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig 
linux-2.6.34.noarch/net/s error: diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/sunrpc/svc_xprt.c --- linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 14:12:25.841371223 -0400 +======= ++++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 09:57:23.996377209 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -744,8 +744,10 @@ int svc_recv(struct svc_rqst *rqstp, lon if (rqstp->rq_deferred) { svc_xprt_received(xprt); @@ -1794,7 +1852,11 @@ diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/ void svc_close_xprt(struct svc_xprt *xprt) diff -up linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c --- linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 14:12:25.842376584 -0400 +======= ++++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 09:57:23.998377481 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -566,7 +566,6 @@ static int rdma_read_complete(struct svc ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, rqstp->rq_arg.head[0].iov_len); diff --git a/pnfs-all-2.6.35-2010-08-19-f13.patch b/pnfs-all-2.6.35-2010-08-19-f13.patch index 10df9b15c..ecc100c30 100644 --- a/pnfs-all-2.6.35-2010-08-19-f13.patch +++ b/pnfs-all-2.6.35-2010-08-19-f13.patch @@ -1,6 +1,11 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c +<<<<<<< HEAD --- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-24 14:14:03.643355000 -0400 +++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-24 14:17:48.415730000 -0400 +======= +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-23 
12:08:27.310584826 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-23 12:09:03.273553977 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -13,6 +13,7 @@ #include #include @@ -11,7 +16,11 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arc #include "os.h" diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c --- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/block/genhd.c 2010-08-24 14:17:48.421730000 -0400 +======= ++++ linux-2.6.34.noarch/block/genhd.c 2010-08-23 12:09:03.273553977 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -1009,6 +1009,7 @@ static void disk_release(struct device * struct class block_class = { .name = "block", @@ -21,8 +30,13 @@ diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd. static char *block_devnode(struct device *dev, mode_t *mode) { diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt +<<<<<<< HEAD --- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-24 14:17:48.423729000 -0400 +++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-24 14:17:48.425730000 -0400 +======= +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-23 12:09:03.274563927 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-23 12:09:03.274563927 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,211 @@ +(c) 2007 Network Appliance Inc. + @@ -237,7 +251,11 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6. 
+ diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c --- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-24 14:17:48.430730000 -0400 +======= ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-23 12:09:03.275584050 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -651,6 +651,12 @@ static int dev_create(struct dm_ioctl *p return r; } @@ -292,7 +310,11 @@ diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/driv int r; diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c --- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-24 14:17:48.435733000 -0400 +======= ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-23 12:09:03.276563906 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct put_device(&class_to_shost(dev)->shost_gendev); } @@ -304,7 +326,11 @@ diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drive }; diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h --- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-24 14:17:48.440733000 -0400 +======= ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-23 12:09:03.277563890 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -36,13 +36,9 @@ #include #include @@ -360,8 +386,13 @@ diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/ + #endif diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/exofs/export.c.orig 
2010-08-24 14:17:48.444731000 -0400 +++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-24 14:17:48.446730000 -0400 +======= +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-23 12:09:03.278386746 -0400 ++++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-23 12:09:03.278386746 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,396 @@ +/* + * export.c - Implementation of the pnfs_export_operations @@ -761,7 +792,11 @@ diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs +} diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c --- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-24 14:17:48.452730000 -0400 +======= ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-23 12:09:03.279502002 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) if (unlikely(wait_obj_created(oi))) goto fail; @@ -781,7 +816,11 @@ diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/ * Fill in an inode read from the OSD and set it up for use diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild --- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-24 14:17:48.457733000 -0400 +======= ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-23 12:09:03.279502002 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -13,4 +13,5 @@ # @@ -790,7 +829,11 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/K obj-$(CONFIG_EXOFS_FS) += exofs.o diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig --- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ 
linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-24 14:17:48.462739000 -0400 +======= ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-23 12:09:03.280553663 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -1,6 +1,7 @@ config EXOFS_FS tristate "exofs: OSD based file system support" @@ -801,7 +844,11 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/ as its backing storage. diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c --- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-24 14:17:48.468730000 -0400 +======= ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-23 12:09:03.281511951 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -621,6 +621,7 @@ static int exofs_fill_super(struct super sb->s_fs_info = sbi; sb->s_op = &exofs_sops; @@ -812,7 +859,11 @@ diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/ EXOFS_ERR("ERROR: exofs_iget failed\n"); diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c --- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-24 14:17:48.473730000 -0400 +======= ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-23 12:09:03.282511528 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -16,6 +16,13 @@ #include #include @@ -829,7 +880,11 @@ diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exp diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile --- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-24 14:17:48.478733000 -0400 +======= ++++ 
linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-23 12:09:03.282511528 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -3,4 +3,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs.o @@ -840,8 +895,13 @@ diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/ex +exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o +exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-24 14:17:48.482731000 -0400 +++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-24 14:17:48.484734000 -0400 +======= +--- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,158 @@ +/* + * linux/fs/nfsd/nfs4blocklayoutxdr.c @@ -1002,8 +1062,13 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34. +} +EXPORT_SYMBOL_GPL(blocklayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-24 14:17:48.487733000 -0400 +++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-24 14:17:48.489734000 -0400 +======= +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. 
@@ -1224,8 +1289,13 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.n +} +EXPORT_SYMBOL(filelayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-24 14:17:48.493729000 -0400 +++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-24 14:17:48.494735000 -0400 +======= +--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-23 12:09:03.284511493 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-23 12:09:03.284511493 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,289 @@ +/* + * pnfs_osd_xdr_enc.c @@ -1518,7 +1588,11 @@ diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.no +EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c --- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-24 14:17:48.499730000 -0400 +======= ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-23 12:09:03.285539075 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -19,6 +19,7 @@ #include #include @@ -1539,7 +1613,11 @@ diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gf sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig --- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/Kconfig 2010-08-24 14:17:48.505733000 -0400 +======= ++++ linux-2.6.34.noarch/fs/Kconfig 2010-08-23 12:09:03.286512316 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -224,6 +224,31 @@ config LOCKD_V4 config EXPORTFS tristate @@ 
-1573,8 +1651,13 @@ diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig tristate select FS_POSIX_ACL diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-24 14:17:48.509734000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-24 14:17:48.511732000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-23 12:09:03.287381619 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-23 12:09:03.287381619 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,66 @@ +#include +#include @@ -1643,8 +1726,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.or + return; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-24 14:17:48.514733000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-24 14:17:48.516731000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-23 12:09:03.288501648 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-23 12:09:03.288501648 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,1160 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c @@ -2807,8 +2895,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34. 
+module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-24 14:17:48.519731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-24 14:17:48.521730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-23 12:09:03.289501933 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-23 12:09:03.289501933 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,335 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdev.c @@ -3146,8 +3239,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6. + goto out; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-24 14:17:48.523733000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-24 14:17:48.525730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-23 12:09:03.290395707 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-23 12:09:03.290395707 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,120 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c @@ -3270,8 +3368,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.3 + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-24 14:17:48.528729000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-24 14:17:48.529735000 -0400 +======= +--- 
linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-23 12:09:03.290395707 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-23 12:09:03.291501560 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,303 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -3577,8 +3680,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34. + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-24 14:17:48.532731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-24 14:17:48.534734000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-23 12:09:03.292511531 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-23 12:09:03.292511531 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,948 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -4529,8 +4637,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noar + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-24 14:17:48.537729000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-24 14:17:48.538739000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-23 12:09:03.292511531 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-23 12:09:03.293491476 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,6 @@ +# +# Makefile for the pNFS block layout driver kernel module @@ -4540,7 +4653,11 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarc + extents.o 
block-device-discovery-pipe.o diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h --- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-24 14:17:48.544730000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-23 12:09:03.293491476 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -8,6 +8,8 @@ #ifndef __LINUX_FS_NFS_CALLBACK_H #define __LINUX_FS_NFS_CALLBACK_H @@ -4613,7 +4730,11 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/c extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c --- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-24 14:17:48.562731000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-23 12:09:03.294522414 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -8,10 +8,15 @@ #include #include @@ -5096,7 +5217,11 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/ return status; diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c --- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-24 14:17:48.568730000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-23 12:09:03.295502055 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -22,6 +22,8 @@ #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) @@ -5298,8 +5423,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/n .process_op = 
(callback_process_op_t)nfs4_callback_sequence, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-24 14:14:13.062705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-24 14:17:48.575730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-23 12:08:29.037481540 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-23 12:09:03.297501650 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -39,6 +39,7 @@ #include #include @@ -5508,8 +5638,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/cli goto error; diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-24 14:17:48.578729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-24 14:17:48.579735000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-23 12:09:03.297501650 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-23 12:09:03.298501447 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,292 @@ +#if defined(CONFIG_SPNFS_BLOCK) + @@ -5804,8 +5939,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/b +} +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-24 14:17:48.584729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-24 14:17:48.586730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-23 12:09:03.299501445 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-23 12:09:03.299501445 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,1672 @@ +/* + * bl_ops.c @@ -7480,8 
+7620,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/b + +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-24 14:14:13.068705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-24 14:17:48.592730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-23 12:08:29.037481540 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-23 12:09:03.300491952 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -104,7 +104,8 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) @@ -7558,7 +7703,11 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs rcu_read_unlock(); diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h --- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-24 14:17:48.597733000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-23 12:09:03.301431797 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -34,9 +34,7 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -7571,8 +7720,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-24 14:14:13.612707000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-24 14:17:48.604730000 -0400 +======= +--- 
linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-23 12:08:29.089481525 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 12:09:03.302511603 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -17,11 +17,19 @@ #include #include @@ -7750,7 +7904,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e svcauth_unix_purge(); diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/direct.c --- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-24 14:17:48.610730000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-23 12:09:03.303491500 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea .rpc_release = nfs_direct_read_release, }; @@ -7996,7 +8154,11 @@ diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/dir user_addr += bytes; diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig --- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-24 14:17:48.616730000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-23 12:09:03.304505472 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -79,3 +79,52 @@ config NFSD_V4 available from http://linux-nfs.org/. @@ -8052,7 +8214,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kc + If unsure, say N. 
diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile --- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-24 14:17:48.621733000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-23 12:09:03.304505472 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ @@ -8062,8 +8228,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/M +nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o +nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-24 14:14:13.618705000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-24 14:17:48.628730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-23 12:08:29.090501507 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 12:09:03.306491345 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -40,7 +40,6 @@ #define NFSPROC4_CB_NULL 0 @@ -8603,8 +8774,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ +} +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-24 14:17:48.633729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-24 14:17:48.641730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-23 12:09:03.307491492 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-23 12:09:03.308491262 -0400 +>>>>>>> Updated to the latest pNFS tag: 
pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,1679 @@ +/****************************************************************************** + * @@ -10286,8 +10462,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfs + return status; +} diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-24 14:17:48.645731000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-24 14:17:48.647730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-23 12:09:03.309501439 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-23 12:09:03.309501439 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,461 @@ +/****************************************************************************** + * @@ -10751,8 +10932,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/n +}; +EXPORT_SYMBOL(pnfs_dlm_export_ops); diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-24 14:17:48.651729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-24 14:17:48.652735000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-23 12:09:03.310501390 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-23 12:09:03.310501390 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,620 @@ +/* +* linux/fs/nfsd/nfs4pnfsds.c @@ -11375,8 +11561,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nf + +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-24 14:14:13.623707000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-24 
14:17:48.658733000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-23 12:08:29.091491685 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 12:09:03.311501496 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -34,10 +34,14 @@ */ #include @@ -11851,8 +12042,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-24 14:14:13.632707000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-24 14:17:48.667732000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-23 12:08:29.093491375 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 12:09:03.313491310 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -42,6 +42,8 @@ #include "xdr4.h" #include "vfs.h" @@ -12368,8 +12564,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-24 14:14:13.639707000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-24 14:17:48.675730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 12:09:03.315491356 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -47,9 +47,14 @@ #include #include @@ -12988,8 +13189,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/ [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c +<<<<<<< HEAD --- 
linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-24 14:14:13.645705000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-24 14:17:48.681730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-23 12:08:29.094491943 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 12:09:03.317501495 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -13,10 +13,15 @@ #include #include @@ -13166,8 +13372,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-24 14:14:13.651705000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-24 14:17:48.687730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-23 12:08:29.095491390 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 12:09:03.318355741 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -285,11 +285,17 @@ extern time_t nfsd4_grace; #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ NFSD4_SUPPORTED_ATTRS_WORD0 @@ -13189,7 +13400,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs { diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c --- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-24 14:17:48.693730000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-23 12:09:03.319511586 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -10,6 +10,7 @@ #include @@ -13227,7 +13442,11 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nf __u32 tfh[2]; diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h --- 
linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-24 14:17:48.698733000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-23 12:09:03.319511586 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -14,6 +14,7 @@ enum nfsd_fsid { FSID_UUID8, FSID_UUID16, @@ -13280,8 +13499,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nf + #endif /* _LINUX_NFSD_FH_INT_H */ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-24 14:14:06.365163000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-24 14:17:48.704731000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-23 12:08:27.631563969 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-23 12:09:03.320416974 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -115,7 +115,7 @@ struct svc_program nfsd_program = { }; @@ -13292,8 +13516,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/n int nfsd_vers(int vers, enum vers_op change) { diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-24 14:17:48.708729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-24 14:17:48.710730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-23 12:09:03.321376171 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-23 12:09:03.321376171 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2005 The Regents of the University of Michigan. 
@@ -13439,8 +13668,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pn + +#endif /* LINUX_NFSD_PNFSD_H */ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-24 14:17:48.713731000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-24 14:17:48.715730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-23 12:09:03.321376171 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-23 12:09:03.322501672 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,225 @@ +/* + * linux/fs/nfsd/pnfs_lexp.c @@ -13668,8 +13902,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nf + inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; +} diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-24 14:17:48.719729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-24 14:17:48.720735000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-23 12:09:03.322501672 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-23 12:09:03.323511608 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,535 @@ +/* + * fs/nfsd/spnfs_com.c @@ -14207,8 +14446,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfs +} +#endif /* CONFIG_PROC_FS */ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-24 14:17:48.724733000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-24 14:17:48.726730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-23 12:09:03.324501390 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-23 
12:09:03.324501390 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,878 @@ +/* + * fs/nfsd/spnfs_ops.c @@ -15089,8 +15333,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfs + return 0; +} diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-24 14:14:13.656705000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-24 14:17:48.731738000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-23 12:08:29.096512142 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 12:09:03.325501424 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -242,6 +242,12 @@ struct nfs4_client { u32 cl_cb_seq_nr; struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ @@ -15207,8 +15456,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st + #endif /* NFSD4_STATE_H */ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-24 14:14:06.371160000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-24 14:17:48.737742000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-23 12:08:27.632564132 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-23 12:09:03.326501490 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -37,7 +37,12 @@ #ifdef CONFIG_NFSD_V4 #include @@ -15335,8 +15589,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs. 
out_nfserr: err = nfserrno(host_err); diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-24 14:14:13.661705000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-24 14:17:48.743747000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-23 12:08:29.097425997 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 12:09:03.327451643 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -37,6 +37,8 @@ #ifndef _LINUX_NFSD_XDR4_H #define _LINUX_NFSD_XDR4_H @@ -15413,8 +15672,13 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr struct nfs4_replay * replay; }; diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-24 14:14:13.079708000 -0400 +++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-24 14:17:48.749746000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-23 12:08:29.039491912 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-23 12:09:03.328501680 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -28,6 +28,7 @@ #include #include @@ -15540,8 +15804,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file. 
if (!ret) return VM_FAULT_LOCKED; diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-24 14:14:13.095705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-24 14:17:48.757730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-23 12:08:29.042511552 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-23 12:09:03.329501644 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -48,6 +48,7 @@ #include "internal.h" #include "fscache.h" @@ -15755,8 +16024,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inod nfs_fs_proc_exit(); nfsiod_stop(); diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-24 14:14:13.100708000 -0400 +++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-24 14:17:48.763734000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-23 12:08:29.042511552 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-23 12:09:03.330502148 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv struct nfs_fattr *); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); @@ -15817,7 +16091,11 @@ diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/i struct page *, struct page *); diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig --- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-24 14:17:48.769730000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-23 12:09:03.331395814 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -79,10 +79,48 @@ config NFS_V4_1 depends on NFS_V4 && EXPERIMENTAL help 
@@ -15870,7 +16148,11 @@ diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kcon depends on NFS_FS=y && IP_PNP diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile --- linux-2.6.34.noarch/fs/nfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-24 14:17:48.774730000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-23 12:09:03.331395814 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ @@ -15885,8 +16167,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Mak +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-24 14:14:13.119708000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-24 14:17:48.780730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-23 12:08:29.045525837 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-23 12:09:03.332511640 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, @@ -15896,8 +16183,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/n .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-24 14:17:48.784731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-24 14:17:48.786730000 -0400 +======= +--- 
linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-23 12:09:03.333512111 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-23 12:09:03.334491472 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,765 @@ +/* + * linux/fs/nfs/nfs4filelayout.c @@ -16665,8 +16957,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-24 14:17:48.790731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-24 14:17:48.792730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-23 12:09:03.334491472 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-23 12:09:03.335501543 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,636 @@ +/* + * linux/fs/nfs/nfs4filelayoutdev.c @@ -17305,8 +17602,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch +} + diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-24 14:17:48.795731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-24 14:17:48.796742000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-23 12:09:03.335501543 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-23 12:09:03.335501543 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,97 @@ +/* + * pnfs_nfs4filelayout.h @@ -17406,8 +17708,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 
linux-2.6.34.noarch/fs/nfs/nfs4_fs.h +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-24 14:14:13.130705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-24 14:17:48.802730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-23 12:08:29.047512264 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-23 12:09:03.336490079 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -45,8 +45,28 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, @@ -17556,8 +17863,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nf /* nfs4xdr.c */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-24 14:14:13.143709000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-24 14:17:48.811734000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-23 12:08:29.050481368 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-23 12:09:03.339481253 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -49,12 +49,15 @@ #include #include @@ -19223,7 +19535,11 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/n .setattr = nfs4_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c --- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-24 14:17:48.818733000 -0400 +======= ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-23 12:09:03.341491726 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -54,17 +54,17 @@ void nfs4_renew_state(struct work_struct *work) @@ -19246,8 +19562,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; 
diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-24 14:14:13.150705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-24 14:17:48.825730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-23 12:08:29.052491341 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-23 12:09:03.342373443 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -53,6 +53,9 @@ #include "callback.h" #include "delegation.h" @@ -19566,8 +19887,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/ test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-24 14:14:13.159705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-24 14:17:48.834738000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-23 12:08:29.054481400 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-23 12:09:03.346481283 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -50,8 +50,11 @@ #include #include @@ -21078,8 +21404,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nf }; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-24 14:17:48.839734000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-24 14:17:48.840742000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-23 12:09:03.348511665 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-23 12:09:03.348511665 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,11 @@ +# +# 
Makefile for the pNFS Objects Layout Driver kernel module @@ -21093,8 +21424,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs +panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o +obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-24 14:17:48.843735000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-24 14:17:48.845739000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-23 12:09:03.349501459 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-23 12:09:03.349501459 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,1087 @@ +/* + * objio_osd.c @@ -22184,8 +22520,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noar +module_init(objlayout_init); +module_exit(objlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-24 14:17:48.848735000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-24 14:17:48.851730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-23 12:09:03.350491564 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-23 12:09:03.350491564 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,790 @@ +/* + * objlayout.c @@ -22978,8 +23319,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noar + .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, +}; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h +<<<<<<< HEAD --- 
linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-24 14:17:48.852735000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-24 14:17:48.854746000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-23 12:09:03.351434439 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-23 12:09:03.351434439 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,171 @@ +/* + * objlayout.h @@ -23153,8 +23499,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noar + +#endif /* _OBJLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-24 14:17:48.857735000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-24 14:17:48.860740000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-23 12:09:03.352501716 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-23 12:09:03.352501716 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,734 @@ +/* + * panfs_shim.c @@ -23891,8 +24242,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noa +module_init(panlayout_init); +module_exit(panlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-24 14:17:48.863734000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-24 14:17:48.864730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-23 12:09:03.353501685 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-23 12:09:03.353501685 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,482 @@ +/* + * 
panfs_shim.h @@ -24377,8 +24733,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noa + +#endif /* _PANLAYOUT_PANFS_SHIM_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-24 14:17:48.868731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-24 14:17:48.869739000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-23 12:09:03.354501721 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-23 12:09:03.354501721 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,435 @@ +/* + * pnfs_osd_xdr.c @@ -24816,8 +25177,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6. + return 0; +} diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-24 14:14:13.169705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-24 14:17:48.875733000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-23 12:08:29.056411363 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-23 12:09:03.355511659 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -20,6 +20,7 @@ #include @@ -24940,8 +25306,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/p if (res == INT_MAX) goto out; diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-24 14:17:48.880733000 -0400 +++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-24 14:17:48.883730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-23 12:09:03.356501413 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-23 
12:09:03.357481204 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,2027 @@ +/* + * linux/fs/nfs/pnfs.c @@ -26971,8 +27342,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs. +} +EXPORT_SYMBOL(nfs4_put_deviceid_cache); diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-24 14:17:48.886733000 -0400 +++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-24 14:17:48.887735000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-23 12:09:03.358501440 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-23 12:09:03.358501440 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,355 @@ +/* + * fs/nfs/pnfs.h @@ -27330,8 +27706,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs. + +#endif /* FS_NFS_PNFS_H */ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-24 14:14:13.174707000 -0400 +++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-24 14:17:48.893730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-23 12:08:29.057511533 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-23 12:09:03.359501471 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru fattr = nfs_alloc_fattr(); status = -ENOMEM; @@ -27359,8 +27740,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc. 
.getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-24 14:14:13.179708000 -0400 +++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-24 14:17:48.899733000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-23 12:08:29.057511533 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-23 12:09:03.359501471 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -18,8 +18,12 @@ #include #include @@ -27575,8 +27961,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read. nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-24 14:14:13.186707000 -0400 +++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-24 14:17:48.907729000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-23 12:08:29.059491391 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-23 12:09:03.361501458 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -64,6 +64,7 @@ #include "iostat.h" #include "internal.h" @@ -27624,8 +28015,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/supe #endif diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-24 14:14:13.192705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-24 14:17:48.913730000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-23 12:08:29.060501485 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-23 12:09:03.362419975 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task struct nfs_unlinkdata *data = calldata; 
struct nfs_server *server = NFS_SERVER(data->dir); @@ -27636,8 +28032,13 @@ diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unl return; rpc_call_start(task); diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c +<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-24 14:14:06.360160000 -0400 +++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-24 14:17:48.921712000 -0400 +======= +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-23 12:08:27.630563929 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-23 12:09:03.364491337 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -20,6 +20,7 @@ #include #include @@ -28326,7 +28727,11 @@ diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/writ int nfs_wb_page_cancel(struct inode *inode, struct page *page) diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/include/linux/exportfs.h --- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-24 14:17:48.933713000 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-23 12:09:03.365501459 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -2,6 +2,7 @@ #define LINUX_EXPORTFS_H 1 @@ -28399,8 +28804,13 @@ diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/i +#endif /* CONFIG_PNFSD */ #endif /* LINUX_EXPORTFS_H */ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-24 14:17:48.945690000 -0400 +++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-24 14:17:48.946693000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-23 12:09:03.367491365 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-23 
12:09:03.367491365 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,141 @@ +#ifndef _LINUX_EXP_XDR_H +#define _LINUX_EXP_XDR_H @@ -28544,8 +28954,13 @@ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/in +} +#endif /* _LINUX_EXP_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-24 14:14:13.014707000 -0400 +++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-24 14:17:48.961675000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-23 12:08:29.021511898 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-23 12:09:03.369481147 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -387,6 +387,7 @@ struct inodes_stat_t { #include @@ -28564,7 +28979,11 @@ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include struct dentry *s_root; diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h --- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-24 14:17:48.974681000 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-23 12:09:03.371491472 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -17,7 +17,10 @@ #define NFS4_BITMAP_SIZE 2 @@ -28694,8 +29113,13 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/inclu #endif diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-24 14:17:48.986670000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-24 14:17:48.989666000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-23 12:09:03.372501550 -0400 ++++ 
linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-23 12:09:03.372501550 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,330 @@ +/* + * include/linux/nfs4_pnfs.h @@ -29028,8 +29452,13 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/ + +#endif /* LINUX_NFS4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-24 14:17:48.998668000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-24 14:17:49.000665000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-23 12:09:03.373491892 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-23 12:09:03.374491393 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,101 @@ +#ifndef NFSD4_BLOCK +#define NFSD4_BLOCK @@ -29133,8 +29562,13 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarc +#endif /* NFSD4_BLOCK */ + diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-24 14:17:49.012664000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-24 14:17:49.013671000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-23 12:09:03.375501481 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-23 12:09:03.375501481 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,345 @@ +/* + * include/linux/nfsd4_spnfs.h @@ -29483,7 +29917,11 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarc +#endif /* NFS_SPNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h --- 
linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-24 14:17:49.018668000 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-23 12:09:03.376401789 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -29,6 +29,7 @@ #ifdef __KERNEL__ @@ -29494,7 +29932,11 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch * Largest number of bytes we need to allocate for an NFS diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h --- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-24 14:17:49.024673000 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-23 12:09:03.376401789 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -32,6 +32,8 @@ #define NFSDDBG_REPCACHE 0x0080 #define NFSDDBG_XDR 0x0100 @@ -29506,7 +29948,11 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h --- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-24 14:17:49.030665000 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-23 12:09:03.377481954 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -100,6 +100,7 @@ struct svc_export { uid_t ex_anon_uid; gid_t ex_anon_gid; @@ -29516,8 +29962,13 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarc struct nfsd4_fs_locations ex_fslocs; int ex_nflavors; diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 
linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-24 14:17:49.033666000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-24 14:17:49.034665000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-23 12:09:03.377481954 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-23 12:09:03.378501747 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29652,8 +30103,13 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.3 + +#endif /* NFSD_NFS4LAYOUTXDR_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-24 14:17:49.037666000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-24 14:17:49.039665000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-23 12:09:03.378501747 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-23 12:09:03.378501747 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,54 @@ +/****************************************************************************** + * @@ -29710,8 +30166,13 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34. 
+ +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-24 14:17:49.042666000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-24 14:17:49.044665000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-23 12:09:03.379487099 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-23 12:09:03.379487099 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29986,7 +30447,11 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.n +#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h --- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-24 14:17:49.049665000 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-23 12:09:03.380502500 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -29,6 +29,7 @@ /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ @@ -30024,8 +30489,13 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noar union nfsctl_res { diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-24 14:14:13.201710000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-24 14:17:49.063666000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-23 12:08:29.061494081 -0400 ++++ 
linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-23 12:09:03.381511751 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -72,13 +72,20 @@ struct nfs_access_entry { int mask; }; @@ -30124,8 +30594,13 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/inc #ifdef __KERNEL__ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-24 14:14:13.206708000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-24 14:17:49.077665000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-23 12:08:29.062501618 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-23 12:09:03.383491395 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -15,6 +15,7 @@ struct nlm_host; struct nfs4_sequence_args; struct nfs4_sequence_res; @@ -30200,7 +30675,11 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/ atomic_t active; /* Keep trace of any activity to this server */ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h --- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-24 14:17:49.089668000 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-23 12:09:03.384501540 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { NFSIOS_SHORTREAD, NFSIOS_SHORTWRITE, @@ -30213,7 +30692,11 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h --- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 
+<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-24 14:17:49.103665000 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-23 12:09:03.385491518 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -39,6 +39,7 @@ struct nfs_page { struct list_head wb_list; /* Defines state of page: */ struct page *wb_page; /* page to read in/write out */ @@ -30262,8 +30745,13 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/i struct inode *inode, int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-24 14:14:13.211708000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-24 14:17:49.116665000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-23 12:08:29.062501618 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-23 12:09:03.387491422 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -3,6 +3,8 @@ #include @@ -30415,8 +30903,13 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/in extern struct rpc_version nfs_version3; extern struct rpc_version nfs_version4; diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-24 14:17:49.128664000 -0400 +++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-24 14:17:49.129670000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-23 12:09:03.388491527 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-23 12:09:03.388491527 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,57 @@ +#ifndef _PANFS_SHIM_API_H +#define 
_PANFS_SHIM_API_H @@ -30476,8 +30969,13 @@ diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.no + +#endif /* _PANFS_SHIM_API_H */ diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-24 14:17:49.141664000 -0400 +++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-24 14:17:49.142670000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-23 12:09:03.390501461 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-23 12:09:03.390501461 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,440 @@ +/* + * pnfs_osd_xdr.h @@ -30920,8 +31418,13 @@ diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noar + +#endif /* __PNFS_OSD_XDR_H__ */ diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_xdr.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-24 14:17:49.153666000 -0400 +++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-24 14:17:49.155665000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-23 12:09:03.391491550 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-23 12:09:03.391491550 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,134 @@ +/* + * include/linux/pnfs_xdr.h @@ -31059,7 +31562,11 @@ diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/i +#endif /* LINUX_PNFS_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h --- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-24 14:17:49.168668000 -0400 +======= ++++ 
linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-23 12:09:03.393501437 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -8,6 +8,7 @@ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H @@ -31070,7 +31577,11 @@ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/ #define ACL_UNDEFINED_ID (-1) diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h --- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-24 14:17:49.174665000 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-23 12:09:03.393501437 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -14,6 +14,8 @@ /* size of an XDR encoding unit in bytes, i.e. 32bit */ #define XDR_UNIT (4) @@ -31082,7 +31593,11 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.n diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h --- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-24 14:17:49.179667000 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-23 12:09:03.394512138 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -3,6 +3,7 @@ #ifdef __KERNEL__ @@ -31103,8 +31618,13 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.3 struct rpc_pipe_ops { diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-24 14:17:49.183664000 -0400 +++ 
linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-24 14:17:49.184674000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-23 12:09:03.394512138 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-23 12:09:03.395501822 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2008 The Regents of the University of Michigan. @@ -31219,7 +31739,11 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux +#endif /* _SIMPLE_RPC_PIPEFS_H_ */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h --- linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 +<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-24 14:17:49.190665000 -0400 +======= ++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-23 12:09:03.395501822 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con return buf; @@ -31263,8 +31787,13 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.n +} #endif /* SUNRPC_SVC_XPRT_H */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h +<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-24 14:14:13.258707000 -0400 +++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-24 14:17:49.195672000 -0400 +======= +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-23 12:08:29.066475323 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-23 12:09:03.396464612 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) return p + 2; } @@ -31287,9 +31816,20 @@ diff -up 
linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +<<<<<<< HEAD diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile --- linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-24 14:17:49.204668000 -0400 +======= +diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs +--- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-23 12:09:03.396464612 -0400 ++++ linux-2.6.34.noarch/localversion-pnfs 2010-08-23 12:09:03.396464612 -0400 +@@ -0,0 +1 @@ ++-pnfs +diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile +--- linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-23 12:09:03.397501662 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ @@ -31300,8 +31840,13 @@ diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/su sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c +<<<<<<< HEAD --- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-24 14:17:49.208664000 -0400 +++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-24 14:17:49.209670000 -0400 +======= +--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-23 12:09:03.398522348 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-23 12:09:03.398522348 -0400 +>>>>>>> 
Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,424 @@ +/* + * net/sunrpc/simple_rpc_pipefs.c @@ -31728,8 +32273,13 @@ diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.no +} +EXPORT_SYMBOL(pipefs_generic_destroy_msg); diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c +<<<<<<< HEAD --- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-24 14:14:13.447705000 -0400 +++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-24 14:17:49.215665000 -0400 +======= +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-23 12:08:29.081501640 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-23 12:09:03.399443371 -0400 +>>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -403,16 +403,14 @@ xdr_shrink_pagelen(struct xdr_buf *buf, /* Shift the tail first */ From 7f5ad3008e5cc3c63310ed4b440290adc3c90309 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Mon, 23 Aug 2010 14:15:46 -0400 Subject: [PATCH 16/20] Fixed a couple compile errors in the server code. 
Signed-off-by: Steve Dickson --- nfsd-35-fc.patch | 62 ------------------------------------------------ 1 file changed, 62 deletions(-) diff --git a/nfsd-35-fc.patch b/nfsd-35-fc.patch index 9a97fc6ec..2825464af 100644 --- a/nfsd-35-fc.patch +++ b/nfsd-35-fc.patch @@ -1,10 +1,6 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt --- linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 14:12:24.165356789 -0400 -======= -+++ linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt 2010-08-23 09:57:18.233564439 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | | READ | REQ | | Section 18.22 | | READDIR | REQ | | Section 18.23 | @@ -16,11 +12,7 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/nfs/nfs41-server.txt.orig | RENAME | REQ | | Section 18.26 | diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c --- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 14:12:24.519356675 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 09:57:18.234564075 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_ca .alloc = expkey_alloc, }; @@ -116,11 +108,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e out_put_clp: diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c --- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 14:12:52.625429773 -0400 
-======= -+++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 10:00:37.257414684 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -79,11 +79,6 @@ enum nfs_cb_opnum4 { cb_sequence_dec_sz + \ op_dec_sz) @@ -223,11 +211,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ int status; - status = rpc_call_async(cb->cb_client, &msg, -<<<<<<< HEAD + status = rpc_call_async(clp->cl_cb_client, &msg, -======= -+ status = rpc_call_async(cb->cl_cb_client, &msg, ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 RPC_TASK_SOFT | RPC_TASK_SOFTCONN, &nfsd4_cb_probe_ops, (void *)clp); - if (status) { @@ -418,11 +402,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c --- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 14:12:25.698356909 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 09:57:18.237376763 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[ static const char *nfsd4_op_name(unsigned opnum); @@ -510,11 +490,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c --- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 14:12:25.700356284 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 09:57:18.240356512 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -45,8 +45,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -1304,7 +1280,6 @@ diff -up 
linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs -{ - user_lease_time = leasetime; -} -<<<<<<< HEAD diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c --- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-23 14:14:22.882428704 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 14:14:33.418376589 -0400 @@ -1320,11 +1295,6 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c --- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 14:12:25.821359224 -0400 -======= -diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c ---- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 09:57:20.629370282 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -46,6 +46,7 @@ enum { */ #ifdef CONFIG_NFSD_V4 @@ -1445,11 +1415,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n /* last one */ {""} diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h --- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 14:12:25.835418441 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 09:57:20.629370282 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -82,7 +82,6 @@ int nfs4_state_init(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); @@ -1486,11 +1452,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs /* diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h --- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-05-16 
17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 14:12:25.836366516 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 09:57:21.807501619 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -70,6 +70,16 @@ struct nfsd4_cb_sequence { struct nfs4_client *cbs_clp; }; @@ -1608,11 +1570,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st nfs4_put_stateowner(struct nfs4_stateowner *so) diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h --- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 14:12:25.837387292 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 09:57:23.994379831 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -381,6 +381,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; @@ -1654,11 +1612,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h --- linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 14:12:25.838377224 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h 2010-08-23 09:57:23.994379831 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -40,12 +40,12 @@ struct nfs_fhbase_old { * This is the new flexible, extensible style NFSv2/v3 file handle. 
* by Neil Brown - March 2000 @@ -1677,11 +1631,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsfh.h.orig linux-2.6.34.noarch * This might allow a file to be confirmed to be in a writable part of a diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sunrpc/cache.c --- linux-2.6.34.noarch/net/sunrpc/cache.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 14:12:25.839376838 -0400 -======= -+++ linux-2.6.34.noarch/net/sunrpc/cache.c 2010-08-23 09:57:23.995376793 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -49,11 +49,17 @@ static void cache_init(struct cache_head h->last_refresh = now; } @@ -1748,11 +1698,7 @@ diff -up linux-2.6.34.noarch/net/sunrpc/cache.c.orig linux-2.6.34.noarch/net/sun /* entry is valid */ diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/sunrpc/svcsock.c --- linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 14:12:25.840384371 -0400 -======= -+++ linux-2.6.34.noarch/net/sunrpc/svcsock.c 2010-08-23 09:57:23.997368707 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -547,7 +547,6 @@ static int svc_udp_recvfrom(struct svc_r dprintk("svc: recvfrom returned error %d\n", -err); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); @@ -1819,11 +1765,7 @@ diff -up linux-2.6.34.noarch/net/sunrpc/svcsock.c.orig linux-2.6.34.noarch/net/s error: diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/sunrpc/svc_xprt.c --- linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 14:12:25.841371223 -0400 -======= -+++ linux-2.6.34.noarch/net/sunrpc/svc_xprt.c 2010-08-23 09:57:23.996377209 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -744,8 
+744,10 @@ int svc_recv(struct svc_rqst *rqstp, lon if (rqstp->rq_deferred) { svc_xprt_received(xprt); @@ -1852,11 +1794,7 @@ diff -up linux-2.6.34.noarch/net/sunrpc/svc_xprt.c.orig linux-2.6.34.noarch/net/ void svc_close_xprt(struct svc_xprt *xprt) diff -up linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c --- linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 14:12:25.842376584 -0400 -======= -+++ linux-2.6.34.noarch/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2010-08-23 09:57:23.998377481 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -566,7 +566,6 @@ static int rdma_read_complete(struct svc ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, rqstp->rq_arg.head[0].iov_len); From d7cf8e11508fe3b3aa3fd8bb68768938ae587158 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Tue, 24 Aug 2010 14:49:23 -0400 Subject: [PATCH 17/20] Removed the localversion-pnfs file from the pnfs patch Signed-off-by: Steve Dickson --- kernel.spec | 2 +- pnfs-all-2.6.35-2010-08-19-f13.patch | 550 --------------------------- 2 files changed, 1 insertion(+), 551 deletions(-) diff --git a/kernel.spec b/kernel.spec index f9c6ff212..f3e776e20 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) 
# -%define buildid .pnfs_all_2.6.35_2010_08_19 +%define buildid .pnfs34.2010.08.19 ################################################################### # The buildid can also be specified on the rpmbuild command line diff --git a/pnfs-all-2.6.35-2010-08-19-f13.patch b/pnfs-all-2.6.35-2010-08-19-f13.patch index ecc100c30..10df9b15c 100644 --- a/pnfs-all-2.6.35-2010-08-19-f13.patch +++ b/pnfs-all-2.6.35-2010-08-19-f13.patch @@ -1,11 +1,6 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c -<<<<<<< HEAD --- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-24 14:14:03.643355000 -0400 +++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-24 14:17:48.415730000 -0400 -======= ---- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-23 12:08:27.310584826 -0400 -+++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-23 12:09:03.273553977 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -13,6 +13,7 @@ #include #include @@ -16,11 +11,7 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arc #include "os.h" diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c --- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/block/genhd.c 2010-08-24 14:17:48.421730000 -0400 -======= -+++ linux-2.6.34.noarch/block/genhd.c 2010-08-23 12:09:03.273553977 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -1009,6 +1009,7 @@ static void disk_release(struct device * struct class block_class = { .name = "block", @@ -30,13 +21,8 @@ diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd. 
static char *block_devnode(struct device *dev, mode_t *mode) { diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt -<<<<<<< HEAD --- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-24 14:17:48.423729000 -0400 +++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-24 14:17:48.425730000 -0400 -======= ---- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-23 12:09:03.274563927 -0400 -+++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-23 12:09:03.274563927 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,211 @@ +(c) 2007 Network Appliance Inc. + @@ -251,11 +237,7 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6. + diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c --- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-24 14:17:48.430730000 -0400 -======= -+++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-23 12:09:03.275584050 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -651,6 +651,12 @@ static int dev_create(struct dm_ioctl *p return r; } @@ -310,11 +292,7 @@ diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/driv int r; diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c --- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-24 14:17:48.435733000 -0400 -======= -+++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-23 12:09:03.276563906 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct put_device(&class_to_shost(dev)->shost_gendev); 
} @@ -326,11 +304,7 @@ diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drive }; diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h --- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-24 14:17:48.440733000 -0400 -======= -+++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-23 12:09:03.277563890 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -36,13 +36,9 @@ #include #include @@ -386,13 +360,8 @@ diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/ + #endif diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-24 14:17:48.444731000 -0400 +++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-24 14:17:48.446730000 -0400 -======= ---- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-23 12:09:03.278386746 -0400 -+++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-23 12:09:03.278386746 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,396 @@ +/* + * export.c - Implementation of the pnfs_export_operations @@ -792,11 +761,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs +} diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c --- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-24 14:17:48.452730000 -0400 -======= -+++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-23 12:09:03.279502002 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) if (unlikely(wait_obj_created(oi))) goto fail; @@ -816,11 +781,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/ * Fill in 
an inode read from the OSD and set it up for use diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild --- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-24 14:17:48.457733000 -0400 -======= -+++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-23 12:09:03.279502002 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -13,4 +13,5 @@ # @@ -829,11 +790,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/K obj-$(CONFIG_EXOFS_FS) += exofs.o diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig --- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-24 14:17:48.462739000 -0400 -======= -+++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-23 12:09:03.280553663 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -1,6 +1,7 @@ config EXOFS_FS tristate "exofs: OSD based file system support" @@ -844,11 +801,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/ as its backing storage. 
diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c --- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-24 14:17:48.468730000 -0400 -======= -+++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-23 12:09:03.281511951 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -621,6 +621,7 @@ static int exofs_fill_super(struct super sb->s_fs_info = sbi; sb->s_op = &exofs_sops; @@ -859,11 +812,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/ EXOFS_ERR("ERROR: exofs_iget failed\n"); diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c --- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-24 14:17:48.473730000 -0400 -======= -+++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-23 12:09:03.282511528 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -16,6 +16,13 @@ #include #include @@ -880,11 +829,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exp diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile --- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-24 14:17:48.478733000 -0400 -======= -+++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-23 12:09:03.282511528 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -3,4 +3,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs.o @@ -895,13 +840,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/ex +exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o +exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o diff -up 
linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-24 14:17:48.482731000 -0400 +++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-24 14:17:48.484734000 -0400 -======= ---- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,158 @@ +/* + * linux/fs/nfsd/nfs4blocklayoutxdr.c @@ -1062,13 +1002,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34. +} +EXPORT_SYMBOL_GPL(blocklayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-24 14:17:48.487733000 -0400 +++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-24 14:17:48.489734000 -0400 -======= ---- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-23 12:09:03.283511561 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-23 12:09:03.283511561 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. 
@@ -1289,13 +1224,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.n +} +EXPORT_SYMBOL(filelayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-24 14:17:48.493729000 -0400 +++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-24 14:17:48.494735000 -0400 -======= ---- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-23 12:09:03.284511493 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-23 12:09:03.284511493 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,289 @@ +/* + * pnfs_osd_xdr_enc.c @@ -1588,11 +1518,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.no +EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c --- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-24 14:17:48.499730000 -0400 -======= -+++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-23 12:09:03.285539075 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -19,6 +19,7 @@ #include #include @@ -1613,11 +1539,7 @@ diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gf sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig --- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/Kconfig 2010-08-24 14:17:48.505733000 -0400 -======= -+++ linux-2.6.34.noarch/fs/Kconfig 2010-08-23 12:09:03.286512316 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -224,6 +224,31 @@ config LOCKD_V4 config EXPORTFS tristate @@ 
-1651,13 +1573,8 @@ diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig tristate select FS_POSIX_ACL diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-24 14:17:48.509734000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-24 14:17:48.511732000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-23 12:09:03.287381619 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-23 12:09:03.287381619 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,66 @@ +#include +#include @@ -1726,13 +1643,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.or + return; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-24 14:17:48.514733000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-24 14:17:48.516731000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-23 12:09:03.288501648 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-23 12:09:03.288501648 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,1160 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c @@ -2895,13 +2807,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34. 
+module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-24 14:17:48.519731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-24 14:17:48.521730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-23 12:09:03.289501933 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-23 12:09:03.289501933 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,335 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdev.c @@ -3239,13 +3146,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6. + goto out; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-24 14:17:48.523733000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-24 14:17:48.525730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-23 12:09:03.290395707 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-23 12:09:03.290395707 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,120 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c @@ -3368,13 +3270,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.3 + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-24 14:17:48.528729000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-24 14:17:48.529735000 -0400 -======= ---- 
linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-23 12:09:03.290395707 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-23 12:09:03.291501560 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,303 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -3680,13 +3577,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34. + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-24 14:17:48.532731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-24 14:17:48.534734000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-23 12:09:03.292511531 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-23 12:09:03.292511531 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,948 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -4637,13 +4529,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noar + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-24 14:17:48.537729000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-24 14:17:48.538739000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-23 12:09:03.292511531 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-23 12:09:03.293491476 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,6 @@ +# +# Makefile for the pNFS block layout driver kernel module @@ -4653,11 +4540,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarc + extents.o 
block-device-discovery-pipe.o diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h --- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-24 14:17:48.544730000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-23 12:09:03.293491476 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -8,6 +8,8 @@ #ifndef __LINUX_FS_NFS_CALLBACK_H #define __LINUX_FS_NFS_CALLBACK_H @@ -4730,11 +4613,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/c extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c --- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-24 14:17:48.562731000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-23 12:09:03.294522414 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -8,10 +8,15 @@ #include #include @@ -5217,11 +5096,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/ return status; diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c --- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-24 14:17:48.568730000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-23 12:09:03.295502055 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -22,6 +22,8 @@ #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) @@ -5423,13 +5298,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/n .process_op = 
(callback_process_op_t)nfs4_callback_sequence, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-24 14:14:13.062705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-24 14:17:48.575730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-23 12:08:29.037481540 -0400 -+++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-23 12:09:03.297501650 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -39,6 +39,7 @@ #include #include @@ -5638,13 +5508,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/cli goto error; diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-24 14:17:48.578729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-24 14:17:48.579735000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-23 12:09:03.297501650 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-23 12:09:03.298501447 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,292 @@ +#if defined(CONFIG_SPNFS_BLOCK) + @@ -5939,13 +5804,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/b +} +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-24 14:17:48.584729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-24 14:17:48.586730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-23 12:09:03.299501445 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-23 12:09:03.299501445 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,1672 @@ +/* + * bl_ops.c @@ -7620,13 
+7480,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/b + +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-24 14:14:13.068705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-24 14:17:48.592730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-23 12:08:29.037481540 -0400 -+++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-23 12:09:03.300491952 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -104,7 +104,8 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) @@ -7703,11 +7558,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs rcu_read_unlock(); diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h --- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-24 14:17:48.597733000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-23 12:09:03.301431797 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -34,9 +34,7 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -7720,13 +7571,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-24 14:14:13.612707000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-24 14:17:48.604730000 -0400 -======= ---- 
linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-23 12:08:29.089481525 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-23 12:09:03.302511603 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -17,11 +17,19 @@ #include #include @@ -7904,11 +7750,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e svcauth_unix_purge(); diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/direct.c --- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-24 14:17:48.610730000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-23 12:09:03.303491500 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea .rpc_release = nfs_direct_read_release, }; @@ -8154,11 +7996,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/dir user_addr += bytes; diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig --- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-24 14:17:48.616730000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-23 12:09:03.304505472 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -79,3 +79,52 @@ config NFSD_V4 available from http://linux-nfs.org/. @@ -8214,11 +8052,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kc + If unsure, say N. 
diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile --- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-24 14:17:48.621733000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-23 12:09:03.304505472 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ @@ -8228,13 +8062,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/M +nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o +nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-24 14:14:13.618705000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-24 14:17:48.628730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-23 12:08:29.090501507 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-23 12:09:03.306491345 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -40,7 +40,6 @@ #define NFSPROC4_CB_NULL 0 @@ -8774,13 +8603,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ +} +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-24 14:17:48.633729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-24 14:17:48.641730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-23 12:09:03.307491492 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-23 12:09:03.308491262 -0400 ->>>>>>> Updated to the latest pNFS tag: 
pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,1679 @@ +/****************************************************************************** + * @@ -10462,13 +10286,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfs + return status; +} diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-24 14:17:48.645731000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-24 14:17:48.647730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-23 12:09:03.309501439 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-23 12:09:03.309501439 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,461 @@ +/****************************************************************************** + * @@ -10932,13 +10751,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/n +}; +EXPORT_SYMBOL(pnfs_dlm_export_ops); diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-24 14:17:48.651729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-24 14:17:48.652735000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-23 12:09:03.310501390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-23 12:09:03.310501390 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,620 @@ +/* +* linux/fs/nfsd/nfs4pnfsds.c @@ -11561,13 +11375,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nf + +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-24 14:14:13.623707000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-24 
14:17:48.658733000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-23 12:08:29.091491685 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-23 12:09:03.311501496 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -34,10 +34,14 @@ */ #include @@ -12042,13 +11851,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-24 14:14:13.632707000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-24 14:17:48.667732000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-23 12:08:29.093491375 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-23 12:09:03.313491310 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -42,6 +42,8 @@ #include "xdr4.h" #include "vfs.h" @@ -12564,13 +12368,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-24 14:14:13.639707000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-24 14:17:48.675730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-23 12:09:03.315491356 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -47,9 +47,14 @@ #include #include @@ -13189,13 +12988,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/ [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c -<<<<<<< HEAD --- 
linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-24 14:14:13.645705000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-24 14:17:48.681730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-23 12:08:29.094491943 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-23 12:09:03.317501495 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -13,10 +13,15 @@ #include #include @@ -13372,13 +13166,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-24 14:14:13.651705000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-24 14:17:48.687730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-23 12:08:29.095491390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-23 12:09:03.318355741 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -285,11 +285,17 @@ extern time_t nfsd4_grace; #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ NFSD4_SUPPORTED_ATTRS_WORD0 @@ -13400,11 +13189,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs { diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c --- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-24 14:17:48.693730000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-23 12:09:03.319511586 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -10,6 +10,7 @@ #include @@ -13442,11 +13227,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nf __u32 tfh[2]; diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h --- 
linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-24 14:17:48.698733000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-23 12:09:03.319511586 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -14,6 +14,7 @@ enum nfsd_fsid { FSID_UUID8, FSID_UUID16, @@ -13499,13 +13280,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nf + #endif /* _LINUX_NFSD_FH_INT_H */ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-24 14:14:06.365163000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-24 14:17:48.704731000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-23 12:08:27.631563969 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-23 12:09:03.320416974 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -115,7 +115,7 @@ struct svc_program nfsd_program = { }; @@ -13516,13 +13292,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/n int nfsd_vers(int vers, enum vers_op change) { diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-24 14:17:48.708729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-24 14:17:48.710730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-23 12:09:03.321376171 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-23 12:09:03.321376171 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2005 The Regents of the University of Michigan. 
@@ -13668,13 +13439,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pn + +#endif /* LINUX_NFSD_PNFSD_H */ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-24 14:17:48.713731000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-24 14:17:48.715730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-23 12:09:03.321376171 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-23 12:09:03.322501672 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,225 @@ +/* + * linux/fs/nfsd/pnfs_lexp.c @@ -13902,13 +13668,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nf + inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; +} diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-24 14:17:48.719729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-24 14:17:48.720735000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-23 12:09:03.322501672 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-23 12:09:03.323511608 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,535 @@ +/* + * fs/nfsd/spnfs_com.c @@ -14446,13 +14207,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfs +} +#endif /* CONFIG_PROC_FS */ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-24 14:17:48.724733000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-24 14:17:48.726730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-23 12:09:03.324501390 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-23 
12:09:03.324501390 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,878 @@ +/* + * fs/nfsd/spnfs_ops.c @@ -15333,13 +15089,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfs + return 0; +} diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-24 14:14:13.656705000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-24 14:17:48.731738000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-23 12:08:29.096512142 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-23 12:09:03.325501424 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -242,6 +242,12 @@ struct nfs4_client { u32 cl_cb_seq_nr; struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ @@ -15456,13 +15207,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st + #endif /* NFSD4_STATE_H */ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-24 14:14:06.371160000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-24 14:17:48.737742000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-23 12:08:27.632564132 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-23 12:09:03.326501490 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -37,7 +37,12 @@ #ifdef CONFIG_NFSD_V4 #include @@ -15589,13 +15335,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs. 
out_nfserr: err = nfserrno(host_err); diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-24 14:14:13.661705000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-24 14:17:48.743747000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-23 12:08:29.097425997 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-23 12:09:03.327451643 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -37,6 +37,8 @@ #ifndef _LINUX_NFSD_XDR4_H #define _LINUX_NFSD_XDR4_H @@ -15672,13 +15413,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr struct nfs4_replay * replay; }; diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-24 14:14:13.079708000 -0400 +++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-24 14:17:48.749746000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-23 12:08:29.039491912 -0400 -+++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-23 12:09:03.328501680 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -28,6 +28,7 @@ #include #include @@ -15804,13 +15540,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file. 
if (!ret) return VM_FAULT_LOCKED; diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-24 14:14:13.095705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-24 14:17:48.757730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-23 12:08:29.042511552 -0400 -+++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-23 12:09:03.329501644 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -48,6 +48,7 @@ #include "internal.h" #include "fscache.h" @@ -16024,13 +15755,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inod nfs_fs_proc_exit(); nfsiod_stop(); diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-24 14:14:13.100708000 -0400 +++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-24 14:17:48.763734000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-23 12:08:29.042511552 -0400 -+++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-23 12:09:03.330502148 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv struct nfs_fattr *); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); @@ -16091,11 +15817,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/i struct page *, struct page *); diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig --- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-24 14:17:48.769730000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-23 12:09:03.331395814 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -79,10 +79,48 @@ config NFS_V4_1 depends on NFS_V4 && EXPERIMENTAL help 
@@ -16148,11 +15870,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kcon depends on NFS_FS=y && IP_PNP diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile --- linux-2.6.34.noarch/fs/nfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-24 14:17:48.774730000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-23 12:09:03.331395814 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ @@ -16167,13 +15885,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Mak +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-24 14:14:13.119708000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-24 14:17:48.780730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-23 12:08:29.045525837 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-23 12:09:03.332511640 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, @@ -16183,13 +15896,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/n .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-24 14:17:48.784731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-24 14:17:48.786730000 -0400 -======= ---- 
linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-23 12:09:03.333512111 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-23 12:09:03.334491472 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,765 @@ +/* + * linux/fs/nfs/nfs4filelayout.c @@ -16957,13 +16665,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-24 14:17:48.790731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-24 14:17:48.792730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-23 12:09:03.334491472 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-23 12:09:03.335501543 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,636 @@ +/* + * linux/fs/nfs/nfs4filelayoutdev.c @@ -17602,13 +17305,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch +} + diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-24 14:17:48.795731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-24 14:17:48.796742000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-23 12:09:03.335501543 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-23 12:09:03.335501543 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,97 @@ +/* + * pnfs_nfs4filelayout.h @@ -17708,13 +17406,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 
linux-2.6.34.noarch/fs/nfs/nfs4_fs.h -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-24 14:14:13.130705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-24 14:17:48.802730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-23 12:08:29.047512264 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-23 12:09:03.336490079 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -45,8 +45,28 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, @@ -17863,13 +17556,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nf /* nfs4xdr.c */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-24 14:14:13.143709000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-24 14:17:48.811734000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-23 12:08:29.050481368 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-23 12:09:03.339481253 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -49,12 +49,15 @@ #include #include @@ -19535,11 +19223,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/n .setattr = nfs4_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c --- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-24 14:17:48.818733000 -0400 -======= -+++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-23 12:09:03.341491726 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -54,17 +54,17 @@ void nfs4_renew_state(struct work_struct *work) @@ -19562,13 +19246,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; 
diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-24 14:14:13.150705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-24 14:17:48.825730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-23 12:08:29.052491341 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-23 12:09:03.342373443 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -53,6 +53,9 @@ #include "callback.h" #include "delegation.h" @@ -19887,13 +19566,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/ test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-24 14:14:13.159705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-24 14:17:48.834738000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-23 12:08:29.054481400 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-23 12:09:03.346481283 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -50,8 +50,11 @@ #include #include @@ -21404,13 +21078,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nf }; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-24 14:17:48.839734000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-24 14:17:48.840742000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-23 12:09:03.348511665 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-23 12:09:03.348511665 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,11 @@ +# +# 
Makefile for the pNFS Objects Layout Driver kernel module @@ -21424,13 +21093,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs +panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o +obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-24 14:17:48.843735000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-24 14:17:48.845739000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-23 12:09:03.349501459 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-23 12:09:03.349501459 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,1087 @@ +/* + * objio_osd.c @@ -22520,13 +22184,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noar +module_init(objlayout_init); +module_exit(objlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-24 14:17:48.848735000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-24 14:17:48.851730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-23 12:09:03.350491564 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-23 12:09:03.350491564 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,790 @@ +/* + * objlayout.c @@ -23319,13 +22978,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noar + .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, +}; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h -<<<<<<< HEAD --- 
linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-24 14:17:48.852735000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-24 14:17:48.854746000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-23 12:09:03.351434439 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-23 12:09:03.351434439 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,171 @@ +/* + * objlayout.h @@ -23499,13 +23153,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noar + +#endif /* _OBJLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-24 14:17:48.857735000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-24 14:17:48.860740000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-23 12:09:03.352501716 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-23 12:09:03.352501716 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,734 @@ +/* + * panfs_shim.c @@ -24242,13 +23891,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noa +module_init(panlayout_init); +module_exit(panlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-24 14:17:48.863734000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-24 14:17:48.864730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-23 12:09:03.353501685 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-23 12:09:03.353501685 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,482 @@ +/* + * 
panfs_shim.h @@ -24733,13 +24377,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noa + +#endif /* _PANLAYOUT_PANFS_SHIM_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-24 14:17:48.868731000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-24 14:17:48.869739000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-23 12:09:03.354501721 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-23 12:09:03.354501721 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,435 @@ +/* + * pnfs_osd_xdr.c @@ -25177,13 +24816,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6. + return 0; +} diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-24 14:14:13.169705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-24 14:17:48.875733000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-23 12:08:29.056411363 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-23 12:09:03.355511659 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -20,6 +20,7 @@ #include @@ -25306,13 +24940,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/p if (res == INT_MAX) goto out; diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-24 14:17:48.880733000 -0400 +++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-24 14:17:48.883730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-23 12:09:03.356501413 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-23 
12:09:03.357481204 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,2027 @@ +/* + * linux/fs/nfs/pnfs.c @@ -27342,13 +26971,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs. +} +EXPORT_SYMBOL(nfs4_put_deviceid_cache); diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-24 14:17:48.886733000 -0400 +++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-24 14:17:48.887735000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-23 12:09:03.358501440 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-23 12:09:03.358501440 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,355 @@ +/* + * fs/nfs/pnfs.h @@ -27706,13 +27330,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs. + +#endif /* FS_NFS_PNFS_H */ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-24 14:14:13.174707000 -0400 +++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-24 14:17:48.893730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-23 12:08:29.057511533 -0400 -+++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-23 12:09:03.359501471 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru fattr = nfs_alloc_fattr(); status = -ENOMEM; @@ -27740,13 +27359,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc. 
.getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-24 14:14:13.179708000 -0400 +++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-24 14:17:48.899733000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-23 12:08:29.057511533 -0400 -+++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-23 12:09:03.359501471 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -18,8 +18,12 @@ #include #include @@ -27961,13 +27575,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read. nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-24 14:14:13.186707000 -0400 +++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-24 14:17:48.907729000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-23 12:08:29.059491391 -0400 -+++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-23 12:09:03.361501458 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -64,6 +64,7 @@ #include "iostat.h" #include "internal.h" @@ -28015,13 +27624,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/supe #endif diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-24 14:14:13.192705000 -0400 +++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-24 14:17:48.913730000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-23 12:08:29.060501485 -0400 -+++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-23 12:09:03.362419975 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task struct nfs_unlinkdata *data = calldata; 
struct nfs_server *server = NFS_SERVER(data->dir); @@ -28032,13 +27636,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unl return; rpc_call_start(task); diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c -<<<<<<< HEAD --- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-24 14:14:06.360160000 -0400 +++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-24 14:17:48.921712000 -0400 -======= ---- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-23 12:08:27.630563929 -0400 -+++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-23 12:09:03.364491337 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -20,6 +20,7 @@ #include #include @@ -28727,11 +28326,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/writ int nfs_wb_page_cancel(struct inode *inode, struct page *page) diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/include/linux/exportfs.h --- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-24 14:17:48.933713000 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-23 12:09:03.365501459 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -2,6 +2,7 @@ #define LINUX_EXPORTFS_H 1 @@ -28804,13 +28399,8 @@ diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/i +#endif /* CONFIG_PNFSD */ #endif /* LINUX_EXPORTFS_H */ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-24 14:17:48.945690000 -0400 +++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-24 14:17:48.946693000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-23 12:09:03.367491365 -0400 -+++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-23 
12:09:03.367491365 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,141 @@ +#ifndef _LINUX_EXP_XDR_H +#define _LINUX_EXP_XDR_H @@ -28954,13 +28544,8 @@ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/in +} +#endif /* _LINUX_EXP_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-24 14:14:13.014707000 -0400 +++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-24 14:17:48.961675000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-23 12:08:29.021511898 -0400 -+++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-23 12:09:03.369481147 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -387,6 +387,7 @@ struct inodes_stat_t { #include @@ -28979,11 +28564,7 @@ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include struct dentry *s_root; diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h --- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-24 14:17:48.974681000 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-23 12:09:03.371491472 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -17,7 +17,10 @@ #define NFS4_BITMAP_SIZE 2 @@ -29113,13 +28694,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/inclu #endif diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-24 14:17:48.986670000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-24 14:17:48.989666000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-23 12:09:03.372501550 -0400 -+++ 
linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-23 12:09:03.372501550 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,330 @@ +/* + * include/linux/nfs4_pnfs.h @@ -29452,13 +29028,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/ + +#endif /* LINUX_NFS4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-24 14:17:48.998668000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-24 14:17:49.000665000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-23 12:09:03.373491892 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-23 12:09:03.374491393 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,101 @@ +#ifndef NFSD4_BLOCK +#define NFSD4_BLOCK @@ -29562,13 +29133,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarc +#endif /* NFSD4_BLOCK */ + diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-24 14:17:49.012664000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-24 14:17:49.013671000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-23 12:09:03.375501481 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-23 12:09:03.375501481 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,345 @@ +/* + * include/linux/nfsd4_spnfs.h @@ -29917,11 +29483,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarc +#endif /* NFS_SPNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h --- 
linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-24 14:17:49.018668000 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-23 12:09:03.376401789 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -29,6 +29,7 @@ #ifdef __KERNEL__ @@ -29932,11 +29494,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch * Largest number of bytes we need to allocate for an NFS diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h --- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-24 14:17:49.024673000 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-23 12:09:03.376401789 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -32,6 +32,8 @@ #define NFSDDBG_REPCACHE 0x0080 #define NFSDDBG_XDR 0x0100 @@ -29948,11 +29506,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h --- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-24 14:17:49.030665000 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-23 12:09:03.377481954 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -100,6 +100,7 @@ struct svc_export { uid_t ex_anon_uid; gid_t ex_anon_gid; @@ -29962,13 +29516,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarc struct nfsd4_fs_locations ex_fslocs; int ex_nflavors; diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 
linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-24 14:17:49.033666000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-24 14:17:49.034665000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-23 12:09:03.377481954 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-23 12:09:03.378501747 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -30103,13 +29652,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.3 + +#endif /* NFSD_NFS4LAYOUTXDR_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-24 14:17:49.037666000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-24 14:17:49.039665000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-23 12:09:03.378501747 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-23 12:09:03.378501747 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,54 @@ +/****************************************************************************** + * @@ -30166,13 +29710,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34. 
+ +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-24 14:17:49.042666000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-24 14:17:49.044665000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-23 12:09:03.379487099 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-23 12:09:03.379487099 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -30447,11 +29986,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.n +#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h --- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-24 14:17:49.049665000 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-23 12:09:03.380502500 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -29,6 +29,7 @@ /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ @@ -30489,13 +30024,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noar union nfsctl_res { diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-24 14:14:13.201710000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-24 14:17:49.063666000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-23 12:08:29.061494081 -0400 -+++ 
linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-23 12:09:03.381511751 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -72,13 +72,20 @@ struct nfs_access_entry { int mask; }; @@ -30594,13 +30124,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/inc #ifdef __KERNEL__ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-24 14:14:13.206708000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-24 14:17:49.077665000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-23 12:08:29.062501618 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-23 12:09:03.383491395 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -15,6 +15,7 @@ struct nlm_host; struct nfs4_sequence_args; struct nfs4_sequence_res; @@ -30675,11 +30200,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/ atomic_t active; /* Keep trace of any activity to this server */ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h --- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-24 14:17:49.089668000 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-23 12:09:03.384501540 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { NFSIOS_SHORTREAD, NFSIOS_SHORTWRITE, @@ -30692,11 +30213,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h --- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 
-<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-24 14:17:49.103665000 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-23 12:09:03.385491518 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -39,6 +39,7 @@ struct nfs_page { struct list_head wb_list; /* Defines state of page: */ struct page *wb_page; /* page to read in/write out */ @@ -30745,13 +30262,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/i struct inode *inode, int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-24 14:14:13.211708000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-24 14:17:49.116665000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-23 12:08:29.062501618 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-23 12:09:03.387491422 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -3,6 +3,8 @@ #include @@ -30903,13 +30415,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/in extern struct rpc_version nfs_version3; extern struct rpc_version nfs_version4; diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-24 14:17:49.128664000 -0400 +++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-24 14:17:49.129670000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-23 12:09:03.388491527 -0400 -+++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-23 12:09:03.388491527 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,57 @@ +#ifndef _PANFS_SHIM_API_H +#define 
_PANFS_SHIM_API_H @@ -30969,13 +30476,8 @@ diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.no + +#endif /* _PANFS_SHIM_API_H */ diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-24 14:17:49.141664000 -0400 +++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-24 14:17:49.142670000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-23 12:09:03.390501461 -0400 -+++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-23 12:09:03.390501461 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,440 @@ +/* + * pnfs_osd_xdr.h @@ -31418,13 +30920,8 @@ diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noar + +#endif /* __PNFS_OSD_XDR_H__ */ diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_xdr.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-24 14:17:49.153666000 -0400 +++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-24 14:17:49.155665000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig 2010-08-23 12:09:03.391491550 -0400 -+++ linux-2.6.34.noarch/include/linux/pnfs_xdr.h 2010-08-23 12:09:03.391491550 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,134 @@ +/* + * include/linux/pnfs_xdr.h @@ -31562,11 +31059,7 @@ diff -up linux-2.6.34.noarch/include/linux/pnfs_xdr.h.orig linux-2.6.34.noarch/i +#endif /* LINUX_PNFS_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h --- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-24 14:17:49.168668000 -0400 -======= -+++ 
linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-23 12:09:03.393501437 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -8,6 +8,7 @@ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H @@ -31577,11 +31070,7 @@ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/ #define ACL_UNDEFINED_ID (-1) diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h --- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-24 14:17:49.174665000 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-23 12:09:03.393501437 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -14,6 +14,8 @@ /* size of an XDR encoding unit in bytes, i.e. 32bit */ #define XDR_UNIT (4) @@ -31593,11 +31082,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.n diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h --- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-24 14:17:49.179667000 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-23 12:09:03.394512138 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -3,6 +3,7 @@ #ifdef __KERNEL__ @@ -31618,13 +31103,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.3 struct rpc_pipe_ops { diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-24 14:17:49.183664000 -0400 +++ 
linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-24 14:17:49.184674000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-23 12:09:03.394512138 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-23 12:09:03.395501822 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2008 The Regents of the University of Michigan. @@ -31739,11 +31219,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux +#endif /* _SIMPLE_RPC_PIPEFS_H_ */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h --- linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 -<<<<<<< HEAD +++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-24 14:17:49.190665000 -0400 -======= -+++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-23 12:09:03.395501822 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con return buf; @@ -31787,13 +31263,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.n +} #endif /* SUNRPC_SVC_XPRT_H */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h -<<<<<<< HEAD --- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-24 14:14:13.258707000 -0400 +++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-24 14:17:49.195672000 -0400 -======= ---- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-23 12:08:29.066475323 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-23 12:09:03.396464612 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) return p + 2; } @@ -31816,20 +31287,9 @@ diff -up 
linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); -<<<<<<< HEAD diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile --- linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-24 14:17:49.204668000 -0400 -======= -diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs ---- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-23 12:09:03.396464612 -0400 -+++ linux-2.6.34.noarch/localversion-pnfs 2010-08-23 12:09:03.396464612 -0400 -@@ -0,0 +1 @@ -+-pnfs -diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile ---- linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-23 12:09:03.397501662 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ @@ -31840,13 +31300,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/su sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c -<<<<<<< HEAD --- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-24 14:17:49.208664000 -0400 +++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-24 14:17:49.209670000 -0400 -======= ---- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-23 12:09:03.398522348 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-23 12:09:03.398522348 -0400 ->>>>>>> 
Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -0,0 +1,424 @@ +/* + * net/sunrpc/simple_rpc_pipefs.c @@ -32273,13 +31728,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.no +} +EXPORT_SYMBOL(pipefs_generic_destroy_msg); diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c -<<<<<<< HEAD --- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-24 14:14:13.447705000 -0400 +++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-24 14:17:49.215665000 -0400 -======= ---- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-23 12:08:29.081501640 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-23 12:09:03.399443371 -0400 ->>>>>>> Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-19 @@ -403,16 +403,14 @@ xdr_shrink_pagelen(struct xdr_buf *buf, /* Shift the tail first */ From c9fe5dbdd69fdb2b0598be1491c3f0c938fa359e Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Tue, 31 Aug 2010 20:57:01 -0400 Subject: [PATCH 18/20] - Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-24 Signed-off-by: Steve Dickson --- kernel.spec | 8 +- pnfs-all-2.6.35-2010-08-24-f13.patch | 31778 +++++++++++++++++++++++++ 2 files changed, 31783 insertions(+), 3 deletions(-) create mode 100644 pnfs-all-2.6.35-2010-08-24-f13.patch diff --git a/kernel.spec b/kernel.spec index f3e776e20..6b9632cdd 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) 
# -%define buildid .pnfs34.2010.08.19 +%define buildid .pnfs34.2010.08.24 ################################################################### # The buildid can also be specified on the rpmbuild command line @@ -818,7 +818,7 @@ Patch12581: xen-use-percpu-interrupts-for-ipis-and-virqs.patch Patch30000: nfs-35-fc.patch Patch30001: nfsd-35-fc.patch -Patch30002: pnfs-all-2.6.35-2010-08-19-f13.patch +Patch30002: pnfs-all-2.6.35-2010-08-24-f13.patch Patch30003: linux-2.6-pnfs-compile.patch Patch30004: linux-2.6.35-inline.patch @@ -1551,7 +1551,7 @@ ApplyPatch xen-use-percpu-interrupts-for-ipis-and-virqs.patch ApplyPatch nfs-35-fc.patch ApplyPatch nfsd-35-fc.patch -ApplyPatch pnfs-all-2.6.35-2010-08-19-f13.patch +ApplyPatch pnfs-all-2.6.35-2010-08-24-f13.patch ApplyPatch linux-2.6-pnfs-compile.patch ApplyPatch linux-2.6.35-inline.patch # END OF PATCH APPLICATIONS @@ -2256,6 +2256,8 @@ fi * Wed Sep 01 2010 Chuck Ebbert 2.6.34.6-48 - Revert commit 6a1a82df91fa0eb1cc76069a9efe5714d087eccd from 2.6.34.1; it breaks ftdi_sio (#613597) +* Tue Aug 31 2010 Steve Dickson +- Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-24 * Fri Aug 27 2010 Chuck Ebbert 2.6.34.6-47 - Linux 2.6.34.6 diff --git a/pnfs-all-2.6.35-2010-08-24-f13.patch b/pnfs-all-2.6.35-2010-08-24-f13.patch new file mode 100644 index 000000000..17d1c844d --- /dev/null +++ b/pnfs-all-2.6.35-2010-08-24-f13.patch @@ -0,0 +1,31778 @@ +diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-31 20:41:16.924243041 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-08-31 20:42:05.486160576 -0400 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include "init.h" + #include "kern_constants.h" + #include "os.h" +diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c +--- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ 
linux-2.6.34.noarch/block/genhd.c 2010-08-31 20:42:05.487160201 -0400 +@@ -1009,6 +1009,7 @@ static void disk_release(struct device * + struct class block_class = { + .name = "block", + }; ++EXPORT_SYMBOL(block_class); + + static char *block_devnode(struct device *dev, mode_t *mode) + { +diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-31 20:42:05.486160576 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-31 20:42:05.486160576 -0400 +@@ -0,0 +1,211 @@ ++(c) 2007 Network Appliance Inc. ++ ++spNFS ++----- ++ ++An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS). ++ ++A file system is mounted by the clients from the MDS, and all file data ++is striped across the DSs. ++ ++Identify the machines that will be filling each of these roles. ++ ++The spnfs kernel will be installed on all machines: clients, the MDS and DSs. ++ ++ ++Building and installing the spNFS kernel ++---------------------------------------- ++ ++Get the spNFS kernel from: ++ ++ git://linux-nfs.org/~bhalevy/linux-pnfs.git ++ ++Use the pnfs-all-latest branch and add these options to your .config file ++ ++ CONFIG_NETWORK_FILESYSTEMS=y ++ CONFIG_NFS_FS=m ++ CONFIG_NFS_V4=y ++ CONFIG_NFS_V4_1=y ++ CONFIG_PNFS=y ++ CONFIG_NFSD=m ++ CONFIG_PNFSD=y ++ # CONFIG_PNFSD_LOCAL_EXPORT is not set ++ CONFIG_SPNFS=y ++ ++By default, spNFS uses whole-file layouts. Layout segments can be enabled ++by adding: ++ ++ CONFIG_SPNFS_LAYOUTSEGMENTS=y ++ ++to your .config file. ++ ++Building and installation of kernel+modules is as usual. ++This kernel should be installed and booted on the client, MDS and DSs. ++ ++Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it ++takes over the pnfs export interface. 
++ ++Building nfs-utils ++------------------ ++ ++Get the nfs-utils package containing spnfsd from: ++ ++ git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git ++ ++Follow the standard instructions for building nfs-utils. ++ ++After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd ++daemon will only be needed on the MDS. ++ ++ ++Installation ++------------ ++ ++The nfs-utils package contains a default spnfsd.conf file in ++utils/spnfsd/spnfsd.conf. Copy this file to /etc/spnfsd.conf. ++ ++By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under ++this directory, mount points must be created for each DS to ++be used for pNFS data stripes. These mount points are named by the IP address ++of the corresponding DS. In the sample spnfsd.conf, there are two ++DSs defined (172.16.28.134 and 172.16.28.141). ++ ++Following the sample spnfsd.conf, ++ ++ mkdir /spnfs ++ ++on the MDS (corresponding to DS-Mount-Directory). Then ++ ++ mkdir /spnfs/172.16.28.134 ++ mkdir /spnfs/172.16.28.141 ++ ++to create the mount points for the DSs. ++ ++On the DSs, choose a directory where data stripes will be created by the MDS. ++For the sample file, this directory is /pnfs, so on each DS execute: ++ ++ mkdir /pnfs ++ ++This directory is specified in the spnfsd.conf file by the DS*_ROOT option ++(where * is replaced by the DS number). DS_ROOT is specified relative to ++the directory being exported by the DSs. In our example, our DSs are exporting ++the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have ++the following entry in /etc/exports: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check) ++ ++N.B. If we had created a /exports directory and a /pnfs directory under ++/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs ++(not /exports/pnfs). ++ ++It may be useful to add entries to /etc/fstab on the MDS to automatically ++mount the DS_ROOT file systems.
For this example, our MDS fstab would ++contain: ++ ++ 172.16.28.134:/pnfs /spnfs/172.16.28.134 nfs defaults 1 2 ++ 172.16.28.141:/pnfs /spnfs/172.16.28.141 nfs defaults 1 2 ++ ++The DS mounts must be performed manually or via fstab at this time (automatic ++mounting, directory creation, etc. are on the todo list). To perform I/O ++through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction ++will eventually be removed). ++ ++ ++On the MDS, choose a file system to use with spNFS and export it, e.g.: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs) ++ ++Make sure nfsd and all supporting processes are running on the MDS and DSs. ++ ++ ++Running ++------- ++ ++If rpc_pipefs is not already mounted (if you're running idmapd it probably is), ++you may want to add the following line to /etc/fstab: ++ ++ rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0 ++ ++to automatically mount rpc_pipefs. ++ ++With spnfsd.conf configured for your environment and the mounts mounted as ++described above, spnfsd can now be started. ++ ++On the MDS, execute spnfsd: ++ ++ spnfsd ++ ++The executable is located in the directory where it was built, and ++may also have been installed elsewhere depending on how you built nfs-utils. ++It will run in the foreground by default, and in fact will do so despite ++any options suggesting the contrary (it's still a debugging build). ++ ++On the client, make sure the nfslayoutdriver module is loaded: ++ ++ modprobe nfslayoutdriver ++ ++Then mount the file system from the MDS: ++ ++ mount -t nfs4 -o minorversion=1 mds:/ /mnt ++ ++I/O through the MDS is now supported. To use it, do not load the ++nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1 ++(NFSv2 and v3 are not yet supported). ++ ++You may now use spNFS by performing file system activities in /mnt. ++If you create files in /mnt, you should see stripe files corresponding to ++new files being created on the DSs.
The current implementation names the ++stripe files based on the inode number of the file on the MDS. For example, ++if you create a file foo in /mnt and do an 'ls -li /mnt/foo': ++ ++ # ls -li foo ++ 1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo ++ ++You should see stripe files on each DS under /pnfs (per the sample) named ++1233. The file /pnfs/1233 on DS1 will contain the first stripe-size bytes ++of data written to foo, DS2 will contain the next stripe-size bytes, etc. ++Removing /mnt/foo will remove the corresponding stripe files on the DSs. ++Other file system operations should behave (mostly :-) as expected. ++ ++ ++Layout Segments ++--------------- ++ ++If the kernel is compiled to support layout segments, there will ++be two files created under /proc/fs/spnfs for controlling layout ++segment functionality. ++ ++To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.: ++ ++ echo 1 > /proc/fs/spnfs/layoutseg ++ ++Layout segments can be disabled (returning to whole-file layouts) by ++writing a '0' to /proc/fs/spnfs/layoutseg: ++ ++ echo 0 > /proc/fs/spnfs/layoutseg ++ ++When layout segments are enabled, the size of the layouts returned can ++be specified by writing a decimal number (ASCII representation) to ++/proc/fs/spnfs/layoutsegsize: ++ ++ echo 1024 > /proc/fs/spnfs/layoutsegsize ++ ++The value '0' has a special meaning--it causes the server to return a ++layout that is exactly the size requested by the client: ++ ++ echo 0 > /proc/fs/spnfs/layoutsegsize ++ ++ ++Troubleshooting ++--------------- ++ ++If you see data being written to the files on the MDS rather than ++the stripe files, make sure the nfslayoutdriver is loaded on the client ++(see above). ++ ++If you get a "permission denied" error, make sure mountd is running on the MDS ++(it occasionally fails to start).
++ ++Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com ++ ++ +diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c +--- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-08-31 20:41:17.063232968 -0400 ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-31 20:42:05.488160560 -0400 +@@ -657,6 +657,12 @@ static int dev_create(struct dm_ioctl *p + return r; + } + ++int dm_dev_create(struct dm_ioctl *param) ++{ ++ return dev_create(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_create); ++ + /* + * Always use UUID for lookups if it's present, otherwise use name or dev. + */ +@@ -751,6 +757,12 @@ static int dev_remove(struct dm_ioctl *p + return 0; + } + ++int dm_dev_remove(struct dm_ioctl *param) ++{ ++ return dev_remove(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_remove); ++ + /* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. +@@ -923,6 +935,12 @@ static int do_resume(struct dm_ioctl *pa + return r; + } + ++int dm_do_resume(struct dm_ioctl *param) ++{ ++ return do_resume(param); ++} ++EXPORT_SYMBOL(dm_do_resume); ++ + /* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. 
+@@ -1200,6 +1218,12 @@ out: + return r; + } + ++int dm_table_load(struct dm_ioctl *param, size_t param_size) ++{ ++ return table_load(param, param_size); ++} ++EXPORT_SYMBOL(dm_table_load); ++ + static int table_clear(struct dm_ioctl *param, size_t param_size) + { + int r; +diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c +--- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-31 20:42:05.489160594 -0400 +@@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct + put_device(&class_to_shost(dev)->shost_gendev); + } + +-static struct class shost_class = { ++struct class shost_class = { + .name = "scsi_host", + .dev_release = scsi_host_cls_release, + }; +diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h +--- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-31 20:42:05.492243039 -0400 +@@ -36,13 +36,9 @@ + #include + #include + #include ++#include + #include "common.h" + +-/* FIXME: Remove once pnfs hits mainline +- * #include +- */ +-#include "pnfs.h" +- + #define EXOFS_ERR(fmt, a...) 
printk(KERN_ERR "exofs: " fmt, ##a) + + #ifdef CONFIG_EXOFS_DEBUG +@@ -103,6 +99,7 @@ struct exofs_sb_info { + struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ ++ spinlock_t i_layout_lock; /* lock for layout/return/recall */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ +@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si + */ + #define OBJ_2BCREATED 0 /* object will be created soon*/ + #define OBJ_CREATED 1 /* object has been created on the osd*/ ++/* Below are not used atomic but reuse the same i_flags */ ++#define OBJ_LAYOUT_IS_GIVEN 2 /* inode has given layouts to clients*/ ++#define OBJ_IN_LAYOUT_RECALL 3 /* inode is in the middle of a layout recall*/ + + static inline int obj_2bcreated(struct exofs_i_info *oi) + { +@@ -304,4 +304,20 @@ extern const struct inode_operations exo + extern const struct inode_operations exofs_symlink_inode_operations; + extern const struct inode_operations exofs_fast_symlink_inode_operations; + ++/* export.c */ ++typedef int (exofs_recall_fn)(struct inode *inode); ++#ifdef CONFIG_PNFSD ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo); ++void exofs_init_export(struct super_block *sb); ++#else ++static inline int exofs_inode_recall_layout(struct inode *inode, ++ enum pnfs_iomode iomode, exofs_recall_fn todo) ++{ ++ return todo(inode); ++} ++ ++static inline void exofs_init_export(struct super_block *sb) {} ++#endif ++ + #endif +diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-31 20:42:05.493222759 -0400 ++++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-31 20:42:05.493222759 -0400 +@@ -0,0 +1,396 @@ ++/* ++ * export.c - Implementation of the 
pnfs_export_operations ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Boaz Harrosh ++ * ++ * This file is part of exofs. ++ * ++ * exofs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation. Since it is based on ext2, and the only ++ * valid version of GPL for the Linux kernel is version 2, the only valid ++ * version of GPL for exofs is version 2. ++ * ++ * exofs is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with exofs; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include "exofs.h" ++ ++static int exofs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_OSD2_OBJECTS; ++} ++ ++static void set_dev_id(struct pnfs_deviceid *pnfs_devid, u64 sbid, u64 devid) ++{ ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)pnfs_devid; ++ ++ dev_id->sbid = sbid; ++ dev_id->devid = devid; ++} ++ ++static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode, ++ u64 offset, u64 length, void *cookie) ++{ ++ struct nfsd4_pnfs_cb_layout cbl; ++ struct pnfsd_cb_ctl cb_ctl; ++ int status; ++ ++ memset(&cb_ctl, 0, sizeof(cb_ctl)); ++ status = pnfsd_get_cb_op(&cb_ctl); ++ if (unlikely(status)) { ++ EXOFS_ERR("%s: nfsd unloaded!! 
inode (0x%lx) status=%d\n", ++ __func__, inode->i_ino, status); ++ goto err; ++ } ++ ++ memset(&cbl, 0, sizeof(cbl)); ++ cbl.cbl_recall_type = RETURN_FILE; ++ cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS; ++ cbl.cbl_seg.iomode = iomode; ++ cbl.cbl_seg.offset = offset; ++ cbl.cbl_seg.length = length; ++ cbl.cbl_cookie = cookie; ++ ++ status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl); ++ pnfsd_put_cb_op(&cb_ctl); ++ ++err: ++ return status; ++} ++ ++static enum nfsstat4 exofs_layout_get( ++ struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; ++ struct exofs_layout *el = &sbi->layout; ++ struct pnfs_osd_object_cred *creds = NULL; ++ struct pnfs_osd_layout layout; ++ __be32 *start; ++ bool in_recall; ++ int i, err; ++ enum nfsstat4 nfserr; ++ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ res->lg_seg.iomode = IOMODE_RW; ++ res->lg_return_on_close = true; /* TODO: unused but will be soon */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL); ++ if (!creds) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto out; ++ } ++ ++ /* Fill in a pnfs_osd_layout struct */ ++ layout.olo_map = sbi->data_map; ++ ++ for (i = 0; i < el->s_numdevs; i++) { ++ struct pnfs_osd_object_cred *cred = &creds[i]; ++ osd_id id = exofs_oi_objno(oi); ++ unsigned dev = exofs_layout_od_id(el, id, i); ++ ++ set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid, ++ dev); ++ cred->oc_object_id.oid_partition_id = el->s_pid; ++ cred->oc_object_id.oid_object_id = id; ++ cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ? 
++ PNFS_OSD_VERSION_1 : ++ PNFS_OSD_VERSION_2; ++ cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE; ++ ++ cred->oc_cap_key.cred_len = 0; ++ cred->oc_cap_key.cred = NULL; ++ ++ cred->oc_cap.cred_len = OSD_CAP_LEN; ++ cred->oc_cap.cred = oi->i_cred; ++ } ++ ++ layout.olo_comps_index = 0; ++ layout.olo_num_comps = el->s_numdevs; ++ layout.olo_comps = creds; ++ ++ err = pnfs_osd_xdr_encode_layout(xdr, &layout); ++ if (err) { ++ nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */ ++ goto out; ++ } ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ if (!in_recall) { ++ __set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ nfserr = NFS4_OK; ++ } else { ++ nfserr = NFS4ERR_RECALLCONFLICT; ++ } ++ spin_unlock(&oi->i_layout_lock); ++ ++out: ++ kfree(creds); ++ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n", ++ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start)); ++ return nfserr; ++} ++ ++/* NOTE: inode mutex must NOT be held */ ++static int exofs_layout_commit( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct timespec mtime; ++ loff_t i_size; ++ int in_recall; ++ ++ /* In case of a recall we ignore the new size and mtime since they ++ * are going to be changed again by truncate, and since we cannot take ++ * the inode lock in that case. ++ */ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ if (in_recall) { ++ EXOFS_DBGMSG("(0x%lx) commit was called during recall\n", ++ inode->i_ino); ++ return 0; ++ } ++ ++ /* NOTE: I would love to call inode_setattr here ++ * but i cannot since this will cause an eventual vmtruncate, ++ * which will cause a layout_recall. So open code the i_size ++ * and mtime/atime changes under i_mutex. 
++ */ ++ mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL); ++ ++ if (args->lc_mtime.seconds) { ++ mtime.tv_sec = args->lc_mtime.seconds; ++ mtime.tv_nsec = args->lc_mtime.nseconds; ++ ++ /* layout commit may only make time bigger, since there might ++ * be reordering of the notifications and it might arrive after ++ * A local change. ++ * TODO: if mtime > ctime then we know set_attr did an mtime ++ * in the future. and we can let this update through ++ */ ++ if (timespec_compare(&mtime, &inode->i_mtime) < 0) ++ mtime = inode->i_mtime; ++ } else { ++ mtime = current_fs_time(inode->i_sb); ++ } ++ ++ /* TODO: Will below work? since mark_inode_dirty has its own ++ * Time handling ++ */ ++ inode->i_atime = inode->i_mtime = mtime; ++ ++ i_size = i_size_read(inode); ++ if (args->lc_newoffset) { ++ loff_t new_size = args->lc_last_wr + 1; ++ ++ if (i_size < new_size) { ++ i_size_write(inode, i_size = new_size); ++ res->lc_size_chg = 1; ++ res->lc_newsize = new_size; ++ } ++ } ++ /* TODO: else { i_size = osd_get_object_length() } */ ++ ++/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */ ++ ++ mark_inode_dirty_sync(inode); ++ ++ mutex_unlock(&inode->i_mutex); ++ EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n", ++ inode->i_ino, i_size, args->lc_last_wr); ++ return 0; ++} ++ ++static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr) ++{ ++ EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ ioerr->oer_errno, ioerr->oer_iswrite, ++ _LLU(ioerr->oer_component.oid_object_id), ++ _LLU(ioerr->oer_comp_offset), ++ _LLU(ioerr->oer_comp_length)); ++} ++ ++static int exofs_layout_return( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ __be32 *p = args->lrf_body; ++ unsigned len = exp_xdr_qwords(args->lrf_body_len); ++ ++ EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n", ++ inode->i_ino, args->lr_cookie, len); ++ ++ while (len >= pnfs_osd_ioerr_xdr_sz()) { ++
struct pnfs_osd_ioerr ioerr; ++ ++ p = pnfs_osd_xdr_decode_ioerr(&ioerr, p); ++ len -= pnfs_osd_ioerr_xdr_sz(); ++ exofs_handle_error(&ioerr); ++ } ++ ++ if (args->lr_cookie) { ++ struct exofs_i_info *oi = exofs_i(inode); ++ bool in_recall; ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ __clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ /* TODO: how to communicate cookie with the waiter */ ++ if (in_recall) ++ wake_up(&oi->i_wq); /* wakeup any recalls */ ++ } ++ ++ return 0; ++} ++ ++int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct exofs_sb_info *sbi = sb->s_fs_info; ++ struct pnfs_osd_deviceaddr devaddr; ++ const struct osd_dev_info *odi; ++ u64 devno = devid->devid; ++ __be32 *start; ++ int err; ++ ++ memset(&devaddr, 0, sizeof(devaddr)); ++ ++ if (unlikely(devno >= sbi->layout.s_numdevs)) ++ return -ENODEV; ++ ++ odi = osduld_device_info(sbi->layout.s_ods[devno]); ++ ++ devaddr.oda_systemid.len = odi->systemid_len; ++ devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */ ++ ++ devaddr.oda_osdname.len = odi->osdname_len ; ++ devaddr.oda_osdname.data = (void *)odi->osdname;/* !const cast */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ err = -E2BIG; ++ goto err; ++ } ++ ++ err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr); ++ if (err) ++ goto err; ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n", ++ exp_xdr_qbytes(xdr->p - start), devno, odi->osdname); ++ return 0; ++ ++err: ++ EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n", ++ err, exp_xdr_qbytes(xdr->p - start)); ++ return err; ++} ++ ++struct pnfs_export_operations exofs_pnfs_ops = { ++ .layout_type = exofs_layout_type, ++ .layout_get = exofs_layout_get, ++ .layout_commit = 
exofs_layout_commit, ++ .layout_return = exofs_layout_return, ++ .get_device_info = exofs_get_device_info, ++}; ++ ++static bool is_layout_returned(struct exofs_i_info *oi) ++{ ++ bool layout_given; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ return !layout_given; ++} ++ ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ int layout_given; ++ int error = 0; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ __set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ if (!layout_given) ++ goto exec; ++ ++ for (;;) { ++ EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n", ++ inode->i_ino); ++ error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64, ++ &oi->i_wq); ++ switch (error) { ++ case 0: ++ case -EAGAIN: ++ break; ++ case -ENOENT: ++ goto exec; ++ default: ++ goto err; ++ } ++ ++ error = wait_event_interruptible(oi->i_wq, ++ is_layout_returned(oi)); ++ if (error) ++ goto err; ++ } ++ ++exec: ++ error = todo(inode); ++ ++err: ++ spin_lock(&oi->i_layout_lock); ++ __clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error); ++ return error; ++} ++ ++void exofs_init_export(struct super_block *sb) ++{ ++ sb->s_pnfs_op = &exofs_pnfs_ops; ++} +diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c +--- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-31 20:42:05.494222756 -0400 +@@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) + if (unlikely(wait_obj_created(oi))) + goto fail; + +- ret = _do_truncate(inode); ++ ret = exofs_inode_recall_layout(inode, IOMODE_ANY, _do_truncate); + if 
(ret) + goto fail; + +@@ -964,6 +964,7 @@ static void __oi_init(struct exofs_i_inf + { + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; ++ spin_lock_init(&oi->i_layout_lock); + } + /* + * Fill in an inode read from the OSD and set it up for use +diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild +--- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-31 20:42:05.490222933 -0400 +@@ -13,4 +13,5 @@ + # + + exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o ++exofs-$(CONFIG_PNFSD) += export.o + obj-$(CONFIG_EXOFS_FS) += exofs.o +diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig +--- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-31 20:42:05.491232880 -0400 +@@ -1,6 +1,7 @@ + config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD ++ select EXPORTFS_OSD_LAYOUT if PNFSD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. 
+diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c +--- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-31 20:42:05.496073173 -0400 +@@ -621,6 +621,7 @@ static int exofs_fill_super(struct super + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; ++ exofs_init_export(sb); + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); +diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c +--- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-31 20:42:05.497212975 -0400 +@@ -16,6 +16,13 @@ + #include + #include + ++#if defined(CONFIG_PNFSD) ++struct pnfsd_cb_ctl pnfsd_cb_ctl = { ++ .lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock) ++}; ++EXPORT_SYMBOL(pnfsd_cb_ctl); ++#endif /* CONFIG_PNFSD */ ++ + #define dprintk(fmt, args...) 
do{}while(0) + + +diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile +--- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-31 20:42:05.496073173 -0400 +@@ -3,4 +3,7 @@ + + obj-$(CONFIG_EXPORTFS) += exportfs.o + +-exportfs-objs := expfs.o ++exportfs-y := expfs.o ++exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o ++exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o ++exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-31 20:42:05.497212975 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-31 20:42:05.498113655 -0400 +@@ -0,0 +1,158 @@ ++/* ++ * linux/fs/nfsd/nfs4blocklayoutxdr.c ++ * ++ * ++ * Created by Rick McNeal on 3/31/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. 
++ * ++ */ ++#include ++#include ++#include ++#include ++ ++static int ++bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 12 + 4 + bld->u.simple.bld_sig_len); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u32(p, 1); ++ p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset); ++ exp_xdr_encode_opaque(p, bld->u.simple.bld_sig, ++ bld->u.simple.bld_sig_len); ++ ++ return 0; ++} ++ ++static int ++bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_start); ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_len); ++ exp_xdr_encode_u32(p, bld->u.slice.bld_index); ++ ++ return 0; ++} ++ ++static int ++bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ return -ENOTSUPP; ++} ++ ++static int ++bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ int i; ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 3 + bld->u.stripe.bld_stripes); ++ ++ if (!p) ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size); ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes); ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]); ++ return 0; ++} ++ ++int ++blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes) ++{ ++ u32 num_vols = 0, ++ *layoutlen_p = xdr->p; ++ pnfs_blocklayout_devinfo_t *bld; ++ int status = 0; ++ __be32 *p; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -ETOOSMALL; ++ p += 2; ++ ++ /* ++ * All simple volumes with their signature are required to be listed ++ * first.
++ */ ++ list_for_each_entry(bld, volumes, bld_list) { ++ num_vols++; ++ p = exp_xdr_reserve_qwords(xdr, 1); ++ if (!p) ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u32(p, bld->bld_type); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ status = bl_encode_simple(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_SLICE: ++ status = bl_encode_slice(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ status = bl_encode_concat(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ status = bl_encode_stripe(xdr, bld); ++ break; ++ default: ++ BUG(); ++ } ++ if (status) ++ goto error; ++ } ++ ++ /* ---- Fill in the overall length and number of volumes ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, num_vols); ++ ++error: ++ return status; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo); ++ ++enum nfsstat4 ++blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *bl_head) ++{ ++ struct pnfs_blocklayout_layout *b; ++ u32 *layoutlen_p = xdr->p, ++ extents = 0; ++ __be32 *p; ++ ++ /* ++ * Save spot for opaque block layout length and number of extents, ++ * fill-in later. 
++ */ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p += 2; ++ ++ list_for_each_entry(b, bl_head, bll_list) { ++ extents++; ++ p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid); ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.devid); ++ p = exp_xdr_encode_u64(p, b->bll_foff); ++ p = exp_xdr_encode_u64(p, b->bll_len); ++ p = exp_xdr_encode_u64(p, b->bll_soff); ++ p = exp_xdr_encode_u32(p, b->bll_es); ++ } ++ ++ /* ---- Fill in the overall length and number of extents ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, extents); ++ ++ return NFS4_OK; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_layout); +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-31 20:42:05.498113655 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-31 20:42:05.498113655 -0400 +@@ -0,0 +1,218 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++/* We do our-own dprintk so filesystems are not dependent on sunrpc */ ++#ifdef dprintk ++#undef dprintk ++#endif ++#define dprintk(fmt, args...) do { } while (0) ++ ++/* Calculate the XDR length of the GETDEVICEINFO4resok structure ++ * excluding the gdir_notification and the gdir_device_addr da_layout_type. ++ */ ++static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev) ++{ ++ struct pnfs_filelayout_devaddr *fl_addr; ++ struct pnfs_filelayout_multipath *mp; ++ int i, j, nwords; ++ ++ /* da_addr_body length, indice length, indices, ++ * multipath_list4 length */ ++ nwords = 1 + 1 + fdev->fl_stripeindices_length + 1; ++ for (i = 0; i < fdev->fl_device_length; i++) { ++ mp = &fdev->fl_device_list[i]; ++ nwords++; /* multipath list length */ ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ fl_addr = &mp->fl_multipath_list[j]; /* size each path */ ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len); ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len); ++ } ++ } ++ dprintk("<-- %s nwords %d\n", __func__, nwords); ++ return nwords; ++} ++ ++/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13 ++ * on the response stream.
++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. ++ */ ++int ++filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ unsigned int i, j, len = 0, opaque_words; ++ u32 *p_in; ++ u32 index_count = fdev->fl_stripeindices_length; ++ u32 dev_count = fdev->fl_device_length; ++ int error = 0; ++ __be32 *p; ++ ++ opaque_words = fl_devinfo_xdr_words(fdev); ++ dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n", ++ __func__, ++ index_count, ++ dev_count, ++ opaque_words*4); ++ ++ /* check space for opaque length */ ++ p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words); ++ if (!p) { ++ error = -ETOOSMALL; ++ goto out; ++ } ++ ++ /* Fill in length later */ ++ p++; ++ ++ /* encode device list indices */ ++ p = exp_xdr_encode_u32(p, index_count); ++ for (i = 0; i < index_count; i++) ++ p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]); ++ ++ /* encode device list */ ++ p = exp_xdr_encode_u32(p, dev_count); ++ for (i = 0; i < dev_count; i++) { ++ struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i]; ++ ++ p = exp_xdr_encode_u32(p, mp->fl_multipath_length); ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ struct pnfs_filelayout_devaddr *da = ++ &mp->fl_multipath_list[j]; ++ ++ /* Encode device info */ ++ p = exp_xdr_encode_opaque(p, da->r_netid.data, ++ da->r_netid.len); ++ p = exp_xdr_encode_opaque(p, da->r_addr.data, ++ da->r_addr.len); ++ } ++ } ++ ++ /* backfill in length. Subtract 4 for da_addr_body size */ ++ len = (char *)p - (char *)p_in; ++ exp_xdr_encode_u32(p_in, len - 4); ++ ++ error = 0; ++out: ++ dprintk("%s: End err %d xdrlen %d\n", ++ __func__, error, len); ++ return error; ++} ++EXPORT_SYMBOL(filelayout_encode_devinfo); ++ ++/* Encodes the loc_body structure from draft 13 ++ * on the response stream. ++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. 
++ */ ++enum nfsstat4 ++filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp) ++{ ++ u32 len = 0, nfl_util, fhlen, i; ++ u32 *layoutlen_p; ++ enum nfsstat4 nfserr; ++ __be32 *p; ++ ++ dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n", ++ __func__, ++ flp->device_id.pnfs_fsid, ++ flp->device_id.pnfs_devid, ++ flp->lg_first_stripe_index, ++ flp->lg_fh_length); ++ ++ /* Ensure file system added at least one file handle */ ++ if (flp->lg_fh_length <= 0) { ++ dprintk("%s: File Layout has no file handles!!\n", __func__); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto out; ++ } ++ ++ /* Ensure room for len, devid, util, first_stripe_index, ++ * pattern_offset, number of filehandles */ ++ p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ /* save spot for opaque file layout length, fill-in later*/ ++ p++; ++ ++ /* encode device id */ ++ p = exp_xdr_encode_u64(p, flp->device_id.sbid); ++ p = exp_xdr_encode_u64(p, flp->device_id.devid); ++ ++ /* set and encode flags */ ++ nfl_util = flp->lg_stripe_unit; ++ if (flp->lg_commit_through_mds) ++ nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS; ++ if (flp->lg_stripe_type == STRIPE_DENSE) ++ nfl_util |= NFL4_UFLG_DENSE; ++ p = exp_xdr_encode_u32(p, nfl_util); ++ ++ /* encode first stripe index */ ++ p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index); ++ ++ /* encode striping pattern start */ ++ p = exp_xdr_encode_u64(p, flp->lg_pattern_offset); ++ ++ /* encode number of file handles */ ++ p = exp_xdr_encode_u32(p, flp->lg_fh_length); ++ ++ /* encode file handles */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ fhlen = flp->lg_fh_list[i].fh_size; ++ p = exp_xdr_reserve_space(xdr, 4 + fhlen); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen); ++ } ++ ++ /* Set number of bytes encoded = total_bytes_encoded - length var */ ++ len = (char 
*)p - (char *)layoutlen_p; ++ exp_xdr_encode_u32(layoutlen_p, len - 4); ++ ++ nfserr = NFS4_OK; ++out: ++ dprintk("%s: End err %u xdrlen %d\n", ++ __func__, nfserr, len); ++ return nfserr; ++} ++EXPORT_SYMBOL(filelayout_encode_layout); +diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c +--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-31 20:42:05.499125509 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-31 20:42:05.499125509 -0400 +@@ -0,0 +1,289 @@ ++/* ++ * pnfs_osd_xdr_enc.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_data_map( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_data_map *data_map) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1); ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, data_map->odm_num_comps); ++ p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_width); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_depth); ++ p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt); ++ p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline int pnfs_osd_xdr_encode_objid( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2); ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id; ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u64(p, dev_id->sbid); ++ p = exp_xdr_encode_u64(p, dev_id->devid); ++ p = exp_xdr_encode_u64(p, object_id->oid_partition_id); ++ p = exp_xdr_encode_u64(p, 
object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * enum pnfs_osd_cap_key_sec4 { ++ * PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ * PNFS_OSD_CAP_KEY_SEC_SSV = 1 ++ * }; ++ * ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_object_cred( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_object_cred *olo_comp) ++{ ++ __be32 *p; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version); ++ ++ /* No sec for now */ ++ p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE); ++ p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */ ++ ++ exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred, ++ olo_comp->oc_cap.cred_len); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_layout { ++ * struct pnfs_osd_data_map olo_map; ++ * u32 olo_comps_index; ++ * u32 olo_num_comps; ++ * struct pnfs_osd_object_cred *olo_comps; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *pol) ++{ ++ __be32 *p; ++ u32 i; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, pol->olo_comps_index); ++ p = exp_xdr_encode_u32(p, pol->olo_num_comps); ++ ++ for (i = 0; i < pol->olo_num_comps; i++) { ++ err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout); ++ ++static int _encode_string(struct exp_xdr_stream *xdr, ++ const struct nfs4_string *str) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, 4 
+ str->len); ++ ++ if (!p) ++ return -E2BIG; ++ exp_xdr_encode_opaque(p, str->data, str->len); ++ return 0; ++} ++ ++/* struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr) ++{ ++ __be32 *p; ++ int err; ++ ++ p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun)); ++ if (!p) ++ return -E2BIG; ++ ++ /* Empty oda_targetid */ ++ p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON); ++ ++ /* Empty oda_targetaddr for now */ ++ p = exp_xdr_encode_u32(p, 0); ++ ++ /* oda_lun */ ++ exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun)); ++ ++ err = _encode_string(xdr, &devaddr->oda_systemid); ++ if (err) ++ return err; ++ ++ err = pnfs_osd_xdr_encode_object_cred(xdr, ++ &devaddr->oda_root_obj_cred); ++ if (err) ++ return err; ++ ++ err = _encode_string(xdr, &devaddr->oda_osdname); ++ if (err) ++ return err; ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr); ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p) ++{ ++ lou->dsu_valid = be32_to_cpu(*p++); ++ if (lou->dsu_valid) ++ p = xdr_decode_hyper(p, &lou->dsu_delta); ++ lou->olu_ioerr_flag = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate); ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline __be32 * ++pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) ++{ ++ /* FIXME: p = xdr_decode_fixed(...) 
*/ ++ memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data)); ++ p += XDR_QUADLEN(sizeof(objid->oid_device_id.data)); ++ ++ p = xdr_decode_hyper(p, &objid->oid_partition_id); ++ p = xdr_decode_hyper(p, &objid->oid_object_id); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p) ++{ ++ p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_offset); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_length); ++ ioerr->oer_iswrite = be32_to_cpu(*p++); ++ ioerr->oer_errno = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); +diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c +--- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-31 20:42:05.500123860 -0400 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include "gfs2.h" + #include "incore.h" +@@ -1146,6 +1147,9 @@ static int fill_super(struct super_block + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_export_op = &gfs2_export_ops; ++#if defined(CONFIG_PNFSD) ++ sb->s_pnfs_op = &pnfs_dlm_export_ops; ++#endif /* CONFIG_PNFSD */ + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; +diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig +--- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/Kconfig 2010-08-31 20:42:05.490222933 -0400 +@@ -224,6 +224,31 @@ config LOCKD_V4 + config EXPORTFS + tristate + ++config EXPORTFS_FILE_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 
files layout type. ++ Must be automatically selected by supporting filesystems. ++ ++config EXPORTFS_OSD_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 objects layout type. ++ Must be automatically selected by supporting osd ++ filesystems. ++ ++ If unsure, say N. ++ ++config EXPORTFS_BLOCK_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 blocks layout type. ++ Must be automatically selected by supporting filesystems. ++ ++ + config NFS_ACL_SUPPORT + tristate + select FS_POSIX_ACL +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-31 20:42:05.503222878 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-31 20:42:05.503222878 -0400 +@@ -0,0 +1,66 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pipefs_list bl_device_list; ++struct dentry *bl_device_pipe; ++ ++ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len) ++{ ++ int err; ++ struct pipefs_hdr *msg; ++ ++ dprintk("Entering %s...\n", __func__); ++ ++ msg = pipefs_readmsg(filp, src, len); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: unable to read pipefs message.\n"); ++ return PTR_ERR(msg); ++ } ++ ++ /* now assign the result, which wakes the blocked thread */ ++ err = pipefs_assign_upcall_reply(msg, &bl_device_list); ++ if (err) { ++ dprintk("ERROR: failed to assign upcall with id %u\n", ++ msg->msgid); ++ kfree(msg); ++ } ++ return len; ++} ++ ++static const struct rpc_pipe_ops bl_pipe_ops = { ++ .upcall = pipefs_generic_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = pipefs_generic_destroy_msg, ++}; ++ ++int bl_pipe_init(void) ++{ ++ dprintk("%s: block_device 
pipefs registering...\n", __func__); ++ bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1); ++ if (IS_ERR(bl_device_pipe)) ++ dprintk("ERROR, unable to make block_device pipe\n"); ++ ++ if (!bl_device_pipe) ++ dprintk("bl_device_pipe is NULL!\n"); ++ else ++ dprintk("bl_device_pipe created!\n"); ++ pipefs_init_list(&bl_device_list); ++ return 0; ++} ++ ++void bl_pipe_exit(void) ++{ ++ dprintk("%s: block_device pipefs unregistering...\n", __func__); ++ if (IS_ERR(bl_device_pipe)) ++ return ; ++ pipefs_closepipe(bl_device_pipe); ++ return; ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-31 20:42:05.504232855 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-31 20:42:05.504232855 -0400 +@@ -0,0 +1,1160 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. 
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#include ++#include ++ ++#include /* various write calls */ ++#include /* struct bio */ ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Andy Adamson "); ++MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); ++ ++/* Callback operations to the pNFS client */ ++static struct pnfs_client_operations *pnfs_block_callback_ops; ++ ++static void print_page(struct page *page) ++{ ++ dprintk("PRINTPAGE page %p\n", page); ++ dprintk(" PagePrivate %d\n", PagePrivate(page)); ++ dprintk(" PageUptodate %d\n", PageUptodate(page)); ++ dprintk(" PageError %d\n", PageError(page)); ++ dprintk(" PageDirty %d\n", PageDirty(page)); ++ dprintk(" PageReferenced %d\n", PageReferenced(page)); ++ dprintk(" PageLocked %d\n", PageLocked(page)); ++ dprintk(" PageWriteback %d\n", PageWriteback(page)); ++ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); ++ dprintk("\n"); ++} ++ ++/* Given the be associated with isect, determine if page data needs to be ++ * initialized. 
++ */ ++static int is_hole(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return !is_sector_initialized(be->be_inval, isect); ++} ++ ++/* Given the be associated with isect, determine if page data can be ++ * written to disk. ++ */ ++static int is_writable(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return is_sector_initialized(be->be_inval, isect); ++} ++ ++static int ++dont_like_caller(struct nfs_page *req) ++{ ++ if (atomic_read(&req->wb_complete)) { ++ /* Called by _multi */ ++ return 1; ++ } else { ++ /* Called by _one */ ++ return 0; ++ } ++} ++ ++static enum pnfs_try_status ++bl_commit(struct nfs_write_data *nfs_data, ++ int sync) ++{ ++ dprintk("%s enter\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++/* The data we are handed might be spread across several bios. We need ++ * to track when the last one is finished. 
++ */ ++struct parallel_io { ++ struct kref refcnt; ++ struct rpc_call_ops call_ops; ++ void (*pnfs_callback) (void *data); ++ void *data; ++}; ++ ++static inline struct parallel_io *alloc_parallel(void *data) ++{ ++ struct parallel_io *rv; ++ ++ rv = kmalloc(sizeof(*rv), GFP_KERNEL); ++ if (rv) { ++ rv->data = data; ++ kref_init(&rv->refcnt); ++ } ++ return rv; ++} ++ ++static inline void get_parallel(struct parallel_io *p) ++{ ++ kref_get(&p->refcnt); ++} ++ ++static void destroy_parallel(struct kref *kref) ++{ ++ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); ++ ++ dprintk("%s enter\n", __func__); ++ p->pnfs_callback(p->data); ++ kfree(p); ++} ++ ++static inline void put_parallel(struct parallel_io *p) ++{ ++ kref_put(&p->refcnt, destroy_parallel); ++} ++ ++static struct bio * ++bl_submit_bio(int rw, struct bio *bio) ++{ ++ if (bio) { ++ get_parallel(bio->bi_private); ++ dprintk("%s submitting %s bio %u@%llu\n", __func__, ++ rw == READ ? "read" : "write", ++ bio->bi_size, (u64)bio->bi_sector); ++ submit_bio(rw, bio); ++ } ++ return NULL; ++} ++ ++static inline void ++bl_done_with_rpage(struct page *page, const int ok) ++{ ++ if (ok) { ++ ClearPagePnfsErr(page); ++ SetPageUptodate(page); ++ } else { ++ ClearPageUptodate(page); ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ } ++ /* Page is unlocked via rpc_release. Should really be done here. 
*/ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_read(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_rpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++static void bl_read_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ pnfs_block_callback_ops->nfs_readlist_complete(rdata); ++} ++ ++static void ++bl_end_par_io_read(void *data) ++{ ++ struct nfs_read_data *rdata = data; ++ ++ INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); ++ schedule_work(&rdata->task.u.tk_work); ++} ++ ++/* We don't want normal .rpc_call_done callback used, so we replace it ++ * with this stub. 
 */
static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
{
	return;
}

/* Try to service a read directly from the block devices described by the
 * layout's extents.
 *
 * Returns PNFS_NOT_ATTEMPTED, before any I/O has been issued, when
 * dont_like_caller() rejects the request, when this is a single-page read
 * of a page already flagged PagePnfsErr, or when the parallel_io tracker
 * cannot be allocated.  Otherwise returns PNFS_ATTEMPTED; completion is
 * then reported through bl_end_par_io_read() once the last tracker
 * reference is dropped.
 */
static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data *rdata,
		 unsigned nr_pages)
{
	int i, hole;
	struct bio *bio = NULL;
	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
	sector_t isect, extent_length = 0;
	struct parallel_io *par;
	loff_t f_offset = rdata->args.offset;
	size_t count = rdata->args.count;
	struct page **pages = rdata->args.pages;
	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;

	dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
	       nr_pages, f_offset, count);

	if (dont_like_caller(rdata->req)) {
		dprintk("%s dont_like_caller failed\n", __func__);
		goto use_mds;
	}
	if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) {
		/* We want to fall back to mds in case of read_page
		 * after error on read_pages.
		 */
		dprintk("%s PG_pnfserr set\n", __func__);
		goto use_mds;
	}
	par = alloc_parallel(rdata);
	if (!par)
		goto use_mds;
	/* Private copy of the caller's call_ops with rpc_call_done stubbed */
	par->call_ops = *rdata->pdata.call_ops;
	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
	par->pnfs_callback = bl_end_par_io_read;
	/* At this point, we can no longer jump to use_mds */

	/* File offset in 512-byte sectors */
	isect = (sector_t) (f_offset >> 9);
	/* Code assumes extents are page-aligned */
	for (i = pg_index; i < nr_pages; i++) {
		if (!extent_length) {
			/* We've used up the previous extent */
			put_extent(be);
			put_extent(cow_read);
			bio = bl_submit_bio(READ, bio);
			/* Get the next one */
			be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg),
					     isect, &cow_read);
			if (!be) {
				/* Error out this page */
				bl_done_with_rpage(pages[i], 0);
				break;
			}
			/* Sectors left in this extent from isect onward,
			 * clipped to what the COW source extent also covers
			 */
			extent_length = be->be_length -
				(isect - be->be_f_offset);
			if (cow_read) {
				sector_t cow_length = cow_read->be_length -
					(isect - cow_read->be_f_offset);
				extent_length = min(extent_length, cow_length);
			}
		}
		hole = is_hole(be, isect);
		if (hole && !cow_read) {
			/* Flush the pending bio: this page needs no device I/O */
			bio = bl_submit_bio(READ, bio);
			/* Fill hole w/ zeroes w/o accessing device */
			dprintk("%s Zeroing page for hole\n", __func__);
			zero_user(pages[i], 0,
				  min_t(int, PAGE_CACHE_SIZE, count));
			print_page(pages[i]);
			bl_done_with_rpage(pages[i], 1);
		} else {
			struct pnfs_block_extent *be_read;

			/* A hole with a COW source is read from that source */
			be_read = (hole && cow_read) ? cow_read : be;
			/* Retry loop: if the page does not fit in the current
			 * bio, submit that bio and start a new one
			 */
			for (;;) {
				if (!bio) {
					bio = bio_alloc(GFP_NOIO, nr_pages - i);
					if (!bio) {
						/* Error out this page */
						bl_done_with_rpage(pages[i], 0);
						break;
					}
					/* Translate file sector to device sector */
					bio->bi_sector = isect -
						be_read->be_f_offset +
						be_read->be_v_offset;
					bio->bi_bdev = be_read->be_mdev;
					bio->bi_end_io = bl_end_io_read;
					bio->bi_private = par;
				}
				if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
					break;
				bio = bl_submit_bio(READ, bio);
			}
		}
		/* Advance one page worth of sectors */
		isect += PAGE_CACHE_SIZE >> 9;
		extent_length -= PAGE_CACHE_SIZE >> 9;
	}
	/* Report how far we got, clamped to the inode size */
	if ((isect << 9) >= rdata->inode->i_size) {
		rdata->res.eof = 1;
		rdata->res.count = rdata->inode->i_size - f_offset;
	} else {
		rdata->res.count = (isect << 9) - f_offset;
	}
	put_extent(be);
	put_extent(cow_read);
	bl_submit_bio(READ, bio);
	/* Drop the initial reference from alloc_parallel() */
	put_parallel(par);
	return PNFS_ATTEMPTED;

 use_mds:
	dprintk("Giving up and using normal NFS\n");
	return PNFS_NOT_ATTEMPTED;
}

/* Walk the extents covering the byte range [offset, offset + count),
 * widened outward to page boundaries, and call mark_for_commit() on the
 * parts that lie in INVALID_DATA extents.  Does nothing when count is 0.
 * A range with no covering extent trips BUG_ON().
 */
static void mark_extents_written(struct pnfs_block_layout *bl,
				 __u64 offset, __u32 count)
{
	sector_t isect, end;
	struct pnfs_block_extent *be;

	dprintk("%s(%llu, %u)\n", __func__, offset, count);
	if (count == 0)
		return;
	/* Round start down and end up to page boundaries, in sectors */
	isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9;
	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
	end >>= 9;
	while (isect < end) {
		sector_t len;
		be = find_get_extent(bl, isect, NULL);
		BUG_ON(!be); /* FIXME */
		/* Portion of the range that falls inside this extent */
		len = min(end, be->be_f_offset + be->be_length) - isect;
		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
			mark_for_commit(be, isect, len); /* What if fails? */
		isect += len;
		put_extent(be);
	}
}

/* STUB - this needs thought */
/* Record the outcome of writing one page.  On failure the page gets the
 * Error and PnfsErr flags and, if its mapping holds any pages, the owning
 * inode's cache_validity gains NFS_INO_INVALID_DATA under i_lock.
 * Success leaves the page untouched.
 */
static inline void
bl_done_with_wpage(struct page *page, const int ok)
{
	if (!ok) {
		SetPageError(page);
		SetPagePnfsErr(page);
		/* This is an inline copy of nfs_zap_mapping */
		/* This is oh so fishy, and needs deep thought */
		if (page->mapping->nrpages != 0) {
			struct inode *inode = page->mapping->host;
			spin_lock(&inode->i_lock);
			NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
			spin_unlock(&inode->i_lock);
		}
	}
	/* end_page_writeback called in rpc_release.  Should be done here. */
}

/* This is basically copied from mpage_end_io_read */
/* Write bio completion: flag every page in the bio (last entry to first)
 * with the bio's BIO_UPTODATE state via bl_done_with_wpage(), then release
 * the bio and the tracker reference taken in bl_submit_bio().
 */
static void bl_end_io_write(struct bio *bio, int err)
{
	/* Saved before bio_put() below */
	void *data = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		bl_done_with_wpage(page, uptodate);
	} while (bvec >= bio->bi_io_vec);
	bio_put(bio);
	put_parallel(data);
}

/* Function scheduled for call during bl_end_par_io_write,
 * it marks sectors as written and extends the commitlist.
 */
static void bl_write_cleanup(struct work_struct *work)
{
	struct rpc_task *task;
	struct nfs_write_data *wdata;
	dprintk("%s enter\n", __func__);
	/* The work struct lives inside the rpc_task embedded in wdata */
	task = container_of(work, struct rpc_task, u.tk_work);
	wdata = container_of(task, struct nfs_write_data, task);
	if (!wdata->task.tk_status) {
		/* Marks for LAYOUTCOMMIT */
		/* BUG - this should be called after each bio, not after
		 * all finish, unless have some way of storing success/failure
		 */
		mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg),
				     wdata->args.offset, wdata->args.count);
	}
	pnfs_block_callback_ops->nfs_writelist_complete(wdata);
}

/* Called when last of bios associated with a bl_write_pagelist call finishes */
/* Forces tk_status to 0 and the verifier to NFS_FILE_SYNC regardless of
 * individual bio results, then defers the rest to bl_write_cleanup() on a
 * work item.
 */
static void
bl_end_par_io_write(void *data)
{
	struct nfs_write_data *wdata = data;

	/* STUB - ignoring error handling */
	wdata->task.tk_status = 0;
	wdata->verf.committed = NFS_FILE_SYNC;
	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
	schedule_work(&wdata->task.u.tk_work);
}

/* Try to write the request's pages directly to the block devices described
 * by the layout's extents.
 *
 * Returns PNFS_NOT_ATTEMPTED, before any I/O has been issued, when the
 * request has no layout segment, when dont_like_caller() rejects it, or
 * when the parallel_io tracker cannot be allocated.  Otherwise returns
 * PNFS_ATTEMPTED; completion is reported through bl_end_par_io_write()
 * once the last tracker reference is dropped.  The sync argument is not
 * used.
 */
static enum pnfs_try_status
bl_write_pagelist(struct nfs_write_data *wdata,
		  unsigned nr_pages,
		  int sync)
{
	int i;
	struct bio *bio = NULL;
	struct pnfs_block_extent *be = NULL;
	sector_t isect, extent_length = 0;
	struct parallel_io *par;
	loff_t offset = wdata->args.offset;
	size_t count = wdata->args.count;
	struct page **pages = wdata->args.pages;
	int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;

	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
	if (!wdata->req->wb_lseg) {
		dprintk("%s no lseg, falling back to MDS\n", __func__);
		return PNFS_NOT_ATTEMPTED;
	}
	if (dont_like_caller(wdata->req)) {
		dprintk("%s dont_like_caller failed\n", __func__);
		return PNFS_NOT_ATTEMPTED;
	}
	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
	 * We want to write each, and if there is an error remove it from
	 * list and call
	 * nfs_retry_request(req) to have it redone using nfs.
	 * QUEST? Do as block or per req?  Think have to do per block
	 * as part of end_bio
	 */
	par = alloc_parallel(wdata);
	if (!par)
		return PNFS_NOT_ATTEMPTED;
	/* Private copy of the caller's call_ops with rpc_call_done stubbed */
	par->call_ops = *wdata->pdata.call_ops;
	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
	par->pnfs_callback = bl_end_par_io_write;
	/* At this point, have to be more careful with error handling */

	/* Page-aligned file offset, in 512-byte sectors */
	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9);
	for (i = pg_index; i < nr_pages; i++) {
		if (!extent_length) {
			/* We've used up the previous extent */
			put_extent(be);
			bio = bl_submit_bio(WRITE, bio);
			/* Get the next one */
			be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg),
					     isect, NULL);
			if (!be || !is_writable(be, isect)) {
				/* FIXME */
				bl_done_with_wpage(pages[i], 0);
				break;
			}
			/* Sectors left in this extent from isect onward */
			extent_length = be->be_length -
				(isect - be->be_f_offset);
		}
		/* Retry loop: if the page does not fit in the current bio,
		 * submit that bio and start a new one
		 */
		for (;;) {
			if (!bio) {
				bio = bio_alloc(GFP_NOIO, nr_pages - i);
				if (!bio) {
					/* Error out this page */
					/* FIXME */
					bl_done_with_wpage(pages[i], 0);
					break;
				}
				/* Translate file sector to device sector */
				bio->bi_sector = isect - be->be_f_offset +
					be->be_v_offset;
				bio->bi_bdev = be->be_mdev;
				bio->bi_end_io = bl_end_io_write;
				bio->bi_private = par;
			}
			if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
				break;
			bio = bl_submit_bio(WRITE, bio);
		}
		/* Advance one page worth of sectors */
		isect += PAGE_CACHE_SIZE >> 9;
		extent_length -= PAGE_CACHE_SIZE >> 9;
	}
	/* Bytes covered, measured from the page-aligned start */
	wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK);
	put_extent(be);
	bl_submit_bio(WRITE, bio);
	/* Drop the initial reference from alloc_parallel() */
	put_parallel(par);
	return PNFS_ATTEMPTED;
}

/* FIXME - range ignored */
/* Empty every extent list of the layout under bl_ext_lock, unlinking each
 * extent and dropping the list's reference to it.
 */
static void
release_extents(struct pnfs_block_layout *bl,
		struct pnfs_layout_range *range)
{
	int i;
	struct pnfs_block_extent *be;

	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		while (!list_empty(&bl->bl_extents[i])) {
			be = list_first_entry(&bl->bl_extents[i],
					      struct pnfs_block_extent,
					      be_node);
			list_del(&be->be_node);
			put_extent(be);
		}
	}
	spin_unlock(&bl->bl_ext_lock);
}

/* Unlink and free every tracking entry on marks->im_tree.mtt_stub. */
static void
release_inval_marks(struct pnfs_inval_markings *marks)
{
	struct pnfs_inval_tracking *pos, *temp;

	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
		list_del(&pos->it_link);
		kfree(pos);
	}
	return;
}

/* Note we are relying on caller locking to prevent nasty races. */
/* Tear down a block layout: release its extents and invalid-sector
 * markings, then free the containing pnfs_block_layout.
 */
static void
bl_free_layout(struct pnfs_layout_hdr *lo)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);

	dprintk("%s enter\n", __func__);
	release_extents(bl, NULL);
	release_inval_marks(&bl->bl_inval);
	kfree(bl);
}

/* Allocate and initialise a zeroed block layout for inode and return its
 * embedded generic header, or NULL if the allocation fails.
 */
static struct pnfs_layout_hdr *
bl_alloc_layout(struct inode *inode)
{
	struct pnfs_block_layout *bl;

	dprintk("%s enter\n", __func__);
	bl = kzalloc(sizeof(*bl), GFP_KERNEL);
	if (!bl)
		return NULL;
	spin_lock_init(&bl->bl_ext_lock);
	INIT_LIST_HEAD(&bl->bl_extents[0]);
	INIT_LIST_HEAD(&bl->bl_extents[1]);
	INIT_LIST_HEAD(&bl->bl_commit);
	bl->bl_count = 0;
	/* Server block size converted from bytes to 512-byte sectors */
	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9;
	INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
	return &bl->bl_layout;
}

/* Free a layout segment obtained from bl_alloc_lseg(). */
static void
bl_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s enter\n", __func__);
	kfree(lseg);
}

/* Because the generic infrastructure does not correctly merge layouts,
 * we pretty much ignore lseg, and store all data layout wide, so we
 * can correctly merge.  Eventually we should push some correct merge
 * behavior up to the generic code, as the current behavior tends to
 * cause lots of unnecessary overlapping LAYOUTGET requests.
++ */ ++static struct pnfs_layout_segment * ++bl_alloc_lseg(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct pnfs_layout_segment *lseg; ++ int status; ++ ++ dprintk("%s enter\n", __func__); ++ lseg = kzalloc(sizeof(*lseg) + 0, GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ status = nfs4_blk_process_layoutget(lo, lgr); ++ if (status) { ++ /* We don't want to call the full-blown bl_free_lseg, ++ * since on error extents were not touched. ++ */ ++ /* STUB - we really want to distinguish between 2 error ++ * conditions here. This lseg failed, but lo data structures ++ * are OK, or we hosed the lo data structures. The calling ++ * code probably needs to distinguish this too. ++ */ ++ kfree(lseg); ++ return ERR_PTR(status); ++ } ++ return lseg; ++} ++ ++static int ++bl_setup_layoutcommit(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutcommit_args *arg) ++{ ++ struct nfs_server *nfss = PNFS_NFS_SERVER(lo); ++ struct bl_layoutupdate_data *layoutupdate_data; ++ ++ dprintk("%s enter\n", __func__); ++ /* Need to ensure commit is block-size aligned */ ++ if (nfss->pnfs_blksize) { ++ u64 mask = nfss->pnfs_blksize - 1; ++ u64 offset = arg->range.offset & mask; ++ ++ arg->range.offset -= offset; ++ arg->range.length += offset + mask; ++ arg->range.length &= ~mask; ++ } ++ ++ layoutupdate_data = kmalloc(sizeof(struct bl_layoutupdate_data), ++ GFP_KERNEL); ++ if (unlikely(!layoutupdate_data)) ++ return -ENOMEM; ++ INIT_LIST_HEAD(&layoutupdate_data->ranges); ++ arg->layoutdriver_data = layoutupdate_data; ++ ++ return 0; ++} ++ ++static void ++bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg) ++{ ++ dprintk("%s enter\n", __func__); ++ encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); ++} ++ ++static void ++bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutcommit_args *arg, int status) ++{ ++ dprintk("%s enter\n", __func__); ++ 
clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), arg, status); ++ kfree(arg->layoutdriver_data); ++} ++ ++static void free_blk_mountid(struct block_mount_id *mid) ++{ ++ if (mid) { ++ struct pnfs_block_dev *dev; ++ spin_lock(&mid->bm_lock); ++ while (!list_empty(&mid->bm_devlist)) { ++ dev = list_first_entry(&mid->bm_devlist, ++ struct pnfs_block_dev, ++ bm_node); ++ list_del(&dev->bm_node); ++ free_block_dev(dev); ++ } ++ spin_unlock(&mid->bm_lock); ++ kfree(mid); ++ } ++} ++ ++/* This is mostly copied form the filelayout's get_device_info function. ++ * It seems much of this should be at the generic pnfs level. ++ */ ++static struct pnfs_block_dev * ++nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, ++ struct pnfs_deviceid *d_id, ++ struct list_head *sdlist) ++{ ++ struct pnfs_device *dev; ++ struct pnfs_block_dev *rv = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ int i, rc; ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s max_resp_sz %u max_pages %d\n", ++ __func__, max_resp_sz, max_pages); ++ ++ dev = kmalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) { ++ dprintk("%s kmalloc failed\n", __func__); ++ return NULL; ++ } ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(dev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set dev->area */ ++ dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!dev->area) ++ goto out_free; ++ ++ memcpy(&dev->dev_id, d_id, sizeof(*d_id)); ++ dev->layout_type = LAYOUT_BLOCK_VOLUME; ++ dev->dev_notify_types = 0; ++ dev->pages = pages; ++ dev->pgbase = 0; ++ dev->pglen = PAGE_SIZE * max_pages; ++ dev->mincount = 0; ++ ++ dprintk("%s: 
dev_id: %s\n", __func__, dev->dev_id.data); ++ rc = pnfs_block_callback_ops->nfs_getdeviceinfo(server, dev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ rv = nfs4_blk_decode_device(server, dev, sdlist); ++ out_free: ++ if (dev->area != NULL) ++ vunmap(dev->area); ++ for (i = 0; i < max_pages; i++) ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(dev); ++ return rv; ++} ++ ++ ++/* ++ * Retrieve the list of available devices for the mountpoint. ++ */ ++static int ++bl_initialize_mountpoint(struct nfs_server *server, const struct nfs_fh *fh) ++{ ++ struct block_mount_id *b_mt_id = NULL; ++ struct pnfs_mount_type *mtype = NULL; ++ struct pnfs_devicelist *dlist = NULL; ++ struct pnfs_block_dev *bdev; ++ LIST_HEAD(block_disklist); ++ int status = 0, i; ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (server->pnfs_blksize == 0) { ++ dprintk("%s Server did not return blksize\n", __func__); ++ return -EINVAL; ++ } ++ b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL); ++ if (!b_mt_id) { ++ status = -ENOMEM; ++ goto out_error; ++ } ++ /* Initialize nfs4 block layout mount id */ ++ spin_lock_init(&b_mt_id->bm_lock); ++ INIT_LIST_HEAD(&b_mt_id->bm_devlist); ++ ++ dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL); ++ if (!dlist) ++ goto out_error; ++ dlist->eof = 0; ++ while (!dlist->eof) { ++ status = pnfs_block_callback_ops->nfs_getdevicelist( ++ server, fh, dlist); ++ if (status) ++ goto out_error; ++ dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", ++ __func__, dlist->num_devs, dlist->eof); ++ /* For each device returned in dlist, call GETDEVICEINFO, and ++ * decode the opaque topology encoding to create a flat ++ * volume topology, matching VOLUME_SIMPLE disk signatures ++ * to disks in the visible block disk list. ++ * Construct an LVM meta device from the flat volume topology. 
++ */ ++ for (i = 0; i < dlist->num_devs; i++) { ++ bdev = nfs4_blk_get_deviceinfo(server, fh, ++ &dlist->dev_id[i], ++ &block_disklist); ++ if (!bdev) ++ goto out_error; ++ spin_lock(&b_mt_id->bm_lock); ++ list_add(&bdev->bm_node, &b_mt_id->bm_devlist); ++ spin_unlock(&b_mt_id->bm_lock); ++ } ++ } ++ dprintk("%s SUCCESS\n", __func__); ++ server->pnfs_ld_data = b_mt_id; ++ ++ out_return: ++ kfree(dlist); ++ return status; ++ ++ out_error: ++ free_blk_mountid(b_mt_id); ++ kfree(mtype); ++ goto out_return; ++} ++ ++static int ++bl_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ struct block_mount_id *b_mt_id = server->pnfs_ld_data; ++ ++ dprintk("%s enter\n", __func__); ++ free_blk_mountid(b_mt_id); ++ dprintk("%s RETURNS\n", __func__); ++ return 0; ++} ++ ++/* STUB - mark intersection of layout and page as bad, so is not ++ * used again. ++ */ ++static void mark_bad_read(void) ++{ ++ return; ++} ++ ++/* Copied from buffer.c */ ++static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) ++{ ++ if (uptodate) { ++ set_buffer_uptodate(bh); ++ } else { ++ /* This happens, due to failed READA attempts. 
*/ ++ clear_buffer_uptodate(bh); ++ } ++ unlock_buffer(bh); ++} ++ ++/* Copied from buffer.c */ ++static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) ++{ ++ __end_buffer_read_notouch(bh, uptodate); ++} ++ ++/* ++ * map_block: map a requested I/0 block (isect) into an offset in the LVM ++ * meta block_device ++ */ ++static void ++map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh) ++{ ++ dprintk("%s enter be=%p\n", __func__, be); ++ ++ set_buffer_mapped(bh); ++ bh->b_bdev = be->be_mdev; ++ bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> ++ (be->be_mdev->bd_inode->i_blkbits - 9); ++ ++ dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n", ++ __func__, (long)isect, ++ (long)bh->b_blocknr, ++ bh->b_size); ++ return; ++} ++ ++/* Given an unmapped page, zero it (or read in page for COW), ++ * and set appropriate flags/markings, but it is safe to not initialize ++ * the range given in [from, to). ++ */ ++/* This is loosely based on nobh_write_begin */ ++static int ++init_page_for_write(struct pnfs_block_layout *bl, struct page *page, ++ unsigned from, unsigned to, sector_t **pages_to_mark) ++{ ++ struct buffer_head *bh; ++ int inval, ret = -EIO; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect; ++ ++ dprintk("%s enter, %p\n", __func__, page); ++ bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); ++ if (!bh) { ++ ret = -ENOMEM; ++ goto cleanup; ++ } ++ ++ isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9); ++ be = find_get_extent(bl, isect, &cow_read); ++ if (!be) ++ goto cleanup; ++ inval = is_hole(be, isect); ++ dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to); ++ if (inval) { ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) { ++ dprintk("%s PANIC - got NONE_DATA extent %p\n", ++ __func__, be); ++ goto cleanup; ++ } ++ map_block(isect, be, bh); ++ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); ++ } ++ if (PageUptodate(page)) { ++ /* Do nothing */ ++ 
} else if (inval & !cow_read) { ++ zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE); ++ } else if (0 < from || PAGE_CACHE_SIZE > to) { ++ struct pnfs_block_extent *read_extent; ++ ++ read_extent = (inval && cow_read) ? cow_read : be; ++ map_block(isect, read_extent, bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_read_nobh; ++ submit_bh(READ, bh); ++ dprintk("%s: Waiting for buffer read\n", __func__); ++ /* XXX Don't really want to hold layout lock here */ ++ wait_on_buffer(bh); ++ if (!buffer_uptodate(bh)) ++ goto cleanup; ++ } ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ /* There is a BUG here if is a short copy after write_begin, ++ * but I think this is a generic fs bug. The problem is that ++ * we have marked the page as initialized, but it is possible ++ * that the section not copied may never get copied. ++ */ ++ ret = mark_initialized_sectors(be->be_inval, isect, ++ PAGE_CACHE_SECTORS, ++ pages_to_mark); ++ /* Want to preallocate mem so above can't fail */ ++ if (ret) ++ goto cleanup; ++ } ++ SetPageMappedToDisk(page); ++ ret = 0; ++ ++cleanup: ++ free_buffer_head(bh); ++ put_extent(be); ++ put_extent(cow_read); ++ if (ret) { ++ /* Need to mark layout with bad read...should now ++ * just use nfs4 for reads and writes. 
++ */ ++ mark_bad_read(); ++ } ++ return ret; ++} ++ ++static int ++bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos, ++ unsigned count, struct pnfs_fsdata *fsdata) ++{ ++ unsigned from, to; ++ int ret; ++ sector_t *pages_to_mark = NULL; ++ struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg); ++ ++ dprintk("%s enter, %u@%lld\n", __func__, count, pos); ++ print_page(page); ++ /* The following code assumes blocksize >= PAGE_CACHE_SIZE */ ++ if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) { ++ dprintk("%s Can't handle blocksize %llu\n", __func__, ++ (u64)bl->bl_blocksize); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ return 0; ++ } ++ if (PageMappedToDisk(page)) { ++ /* Basically, this is a flag that says we have ++ * successfully called write_begin already on this page. ++ */ ++ /* NOTE - there are cache consistency issues here. ++ * For example, what if the layout is recalled, then regained? ++ * If the file is closed and reopened, will the page flags ++ * be reset? If not, we'll have to use layout info instead of ++ * the page flag. ++ */ ++ return 0; ++ } ++ from = pos & (PAGE_CACHE_SIZE - 1); ++ to = from + count; ++ ret = init_page_for_write(bl, page, from, to, &pages_to_mark); ++ if (ret) { ++ dprintk("%s init page failed with %i", __func__, ret); ++ /* Revert back to plain NFS and just continue on with ++ * write. This assumes there is no request attached, which ++ * should be true if we get here. ++ */ ++ BUG_ON(PagePrivate(page)); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ kfree(pages_to_mark); ++ ret = 0; ++ } else { ++ fsdata->private = pages_to_mark; ++ } ++ return ret; ++} ++ ++/* CAREFUL - what happens if copied < count??? 
*/ ++static int ++bl_write_end(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg); ++ print_page(page); ++ if (lseg) ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* Return any memory allocated to fsdata->private, and take advantage ++ * of no page locks to mark pages noted in write_begin as needing ++ * initialization. ++ */ ++static void ++bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata) ++{ ++ struct page *page; ++ pgoff_t index; ++ sector_t *pos; ++ struct address_space *mapping = filp->f_mapping; ++ struct pnfs_fsdata *fake_data; ++ struct pnfs_layout_segment *lseg; ++ ++ if (!fsdata) ++ return; ++ lseg = fsdata->lseg; ++ if (!lseg) ++ return; ++ pos = fsdata->private; ++ if (!pos) ++ return; ++ dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos)); ++ for (; *pos != ~0; pos++) { ++ index = *pos >> (PAGE_CACHE_SHIFT - 9); ++ /* XXX How do we properly deal with failures here??? */ ++ page = grab_cache_page_write_begin(mapping, index, 0); ++ if (!page) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__); ++ continue; ++ } ++ dprintk("%s: Examining block page\n", __func__); ++ print_page(page); ++ if (!PageMappedToDisk(page)) { ++ /* XXX How do we properly deal with failures here??? 
*/ ++ dprintk("%s Marking block page\n", __func__); ++ init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page, ++ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, ++ NULL); ++ print_page(page); ++ fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL); ++ if (!fake_data) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", ++ __func__); ++ unlock_page(page); ++ continue; ++ } ++ get_lseg(lseg); ++ fake_data->lseg = lseg; ++ fake_data->bypass_eof = 1; ++ mapping->a_ops->write_end(filp, mapping, ++ index << PAGE_CACHE_SHIFT, ++ PAGE_CACHE_SIZE, ++ PAGE_CACHE_SIZE, ++ page, fake_data); ++ /* Note fake_data is freed by nfs_write_end */ ++ } else ++ unlock_page(page); ++ } ++ kfree(fsdata->private); ++ fsdata->private = NULL; ++} ++ ++static ssize_t ++bl_get_stripesize(struct pnfs_layout_hdr *lo) ++{ ++ dprintk("%s enter\n", __func__); ++ return 0; ++} ++ ++/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request. ++ * Should return False if there is a reason requests can not be coalesced, ++ * otherwise, should default to returning True. 
++ */ ++static int ++bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ dprintk("%s enter\n", __func__); ++ if (pgio->pg_iswrite) ++ return prev->wb_lseg == req->wb_lseg; ++ else ++ return 1; ++} ++ ++static struct layoutdriver_io_operations blocklayout_io_operations = { ++ .commit = bl_commit, ++ .read_pagelist = bl_read_pagelist, ++ .write_pagelist = bl_write_pagelist, ++ .write_begin = bl_write_begin, ++ .write_end = bl_write_end, ++ .write_end_cleanup = bl_write_end_cleanup, ++ .alloc_layout = bl_alloc_layout, ++ .free_layout = bl_free_layout, ++ .alloc_lseg = bl_alloc_lseg, ++ .free_lseg = bl_free_lseg, ++ .setup_layoutcommit = bl_setup_layoutcommit, ++ .encode_layoutcommit = bl_encode_layoutcommit, ++ .cleanup_layoutcommit = bl_cleanup_layoutcommit, ++ .initialize_mountpoint = bl_initialize_mountpoint, ++ .uninitialize_mountpoint = bl_uninitialize_mountpoint, ++}; ++ ++static struct layoutdriver_policy_operations blocklayout_policy_operations = { ++ .get_stripesize = bl_get_stripesize, ++ .pg_test = bl_pg_test, ++}; ++ ++static struct pnfs_layoutdriver_type blocklayout_type = { ++ .id = LAYOUT_BLOCK_VOLUME, ++ .name = "LAYOUT_BLOCK_VOLUME", ++ .ld_io_ops = &blocklayout_io_operations, ++ .ld_policy_ops = &blocklayout_policy_operations, ++}; ++ ++static int __init nfs4blocklayout_init(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); ++ ++ pnfs_block_callback_ops = pnfs_register_layoutdriver(&blocklayout_type); ++ bl_pipe_init(); ++ return 0; ++} ++ ++static void __exit nfs4blocklayout_exit(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", ++ __func__); ++ ++ pnfs_unregister_layoutdriver(&blocklayout_type); ++ bl_pipe_exit(); ++} ++ ++module_init(nfs4blocklayout_init); ++module_exit(nfs4blocklayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c +--- 
linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-31 20:42:05.506119071 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-31 20:42:05.506119071 -0400 +@@ -0,0 +1,335 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 file layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. 
++ */ ++#include ++#include /* __bread */ ++ ++#include ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes) /* Bounds check used by BLK_READBUF: returns p itself (not advanced) when XDR_QUADLEN(nbytes) words fit between p and end, NULL otherwise */ ++{ ++ uint32_t *q = p + XDR_QUADLEN(nbytes); ++ if (unlikely(q > end || q < p)) /* q < p rejects a pointer that wrapped around because nbytes was huge */ ++ return NULL; ++ return p; ++} ++EXPORT_SYMBOL(blk_overflow); ++ ++/* Open a block_device by device number. */ ++struct block_device *nfs4_blkdev_get(dev_t dev) /* Opens with FMODE_READ; returns NULL (never an ERR_PTR) on failure, so callers must test for NULL */ ++{ ++ struct block_device *bd; ++ ++ dprintk("%s enter\n", __func__); ++ bd = open_by_devnum(dev, FMODE_READ); ++ if (IS_ERR(bd)) ++ goto fail; ++ return bd; ++fail: ++ dprintk("%s failed to open device : %ld\n", ++ __func__, PTR_ERR(bd)); ++ return NULL; /* the error code is only logged, not propagated */ ++} ++ ++/* ++ * Release the block device ++ */ ++int nfs4_blkdev_put(struct block_device *bdev) /* Returns blkdev_put()'s result. NOTE(review): bd_release() is called here but nfs4_blkdev_get() takes no visible bd_claim() - confirm the claim is made elsewhere */ ++{ ++ dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), ++ MINOR(bdev->bd_dev)); ++ bd_release(bdev); ++ return blkdev_put(bdev, FMODE_READ); ++} ++ ++/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded ++ * in dev->dev_addr_buf.
++ */ ++struct pnfs_block_dev * ++nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist) ++{ ++ struct pnfs_block_dev *rv = NULL; ++ struct block_device *bd = NULL; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint32_t major, minor; ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return NULL; ++ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); ++ dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, ++ dev->mincount); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area, ++ dev->mincount); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out_err; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ dprintk("%s CALLING USERSPACE DAEMON\n", __func__); ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out_err; ++ } ++ if (reply->status != BL_DEVICE_REQUEST_PROC) { ++ dprintk("%s failed to open device: %ld\n", ++ __func__, PTR_ERR(bd)); ++ goto out_err; ++ } ++ memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t)); ++ memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)), ++ sizeof(uint32_t)); ++ bd = nfs4_blkdev_get(MKDEV(major, minor)); ++ if (IS_ERR(bd)) { ++ dprintk("%s failed to open device : %ld\n", ++ __func__, PTR_ERR(bd)); ++ goto out_err; ++ } ++ ++ rv = kzalloc(sizeof(*rv), GFP_KERNEL); ++ if (!rv) ++ goto out_err; ++ ++ rv->bm_mdev = bd; ++ memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct pnfs_deviceid)); ++ dprintk("%s Created device %s with bd_block_size %u\n", ++ __func__, ++ bd->bd_disk->disk_name, ++ bd->bd_block_size); ++ kfree(reply); ++ kfree(msg); ++ return rv; ++ ++out_err: ++ kfree(rv); ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return NULL; ++} ++ ++/* Map 
deviceid returned by the server to constructed block_device */ ++static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, ++ struct pnfs_deviceid *id) ++{ ++ struct block_device *rv = NULL; ++ struct block_mount_id *mid; ++ struct pnfs_block_dev *dev; ++ ++ dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); ++ mid = BLK_ID(lo); ++ spin_lock(&mid->bm_lock); ++ list_for_each_entry(dev, &mid->bm_devlist, bm_node) { ++ if (memcmp(id->data, dev->bm_mdevid.data, ++ NFS4_PNFS_DEVICEID4_SIZE) == 0) { ++ rv = dev->bm_mdev; ++ goto out; ++ } ++ } ++ out: ++ spin_unlock(&mid->bm_lock); ++ dprintk("%s returning %p\n", __func__, rv); ++ return rv; ++} ++ ++/* Tracks info needed to ensure extents in layout obey constraints of spec */ ++struct layout_verification { ++ u32 mode; /* R or RW */ ++ u64 start; /* Expected start of next non-COW extent */ ++ u64 inval; /* Start of INVAL coverage */ ++ u64 cowread; /* End of COW read coverage */ ++}; ++ ++/* Verify the extent meets the layout requirements of the pnfs-block draft, ++ * section 2.3.1. 
++ */ ++static int verify_extent(struct pnfs_block_extent *be, ++ struct layout_verification *lv) ++{ ++ if (lv->mode == IOMODE_READ) { ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA || ++ be->be_state == PNFS_BLOCK_INVALID_DATA) ++ return -EIO; ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } ++ /* lv->mode == IOMODE_RW */ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ if (lv->cowread > lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ lv->inval = lv->start; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_READ_DATA) { ++ if (be->be_f_offset > lv->start) ++ return -EIO; ++ if (be->be_f_offset < lv->inval) ++ return -EIO; ++ if (be->be_f_offset < lv->cowread) ++ return -EIO; ++ /* It looks like you might want to min this with lv->start, ++ * but you really don't. 
++ */ ++ lv->inval = lv->inval + be->be_length; ++ lv->cowread = be->be_f_offset + be->be_length; ++ return 0; ++ } else ++ return -EIO; ++} ++ ++/* XDR decode pnfs_block_layout4 structure */ ++int ++nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len); ++ int i, status = -EIO; ++ uint32_t count; ++ struct pnfs_block_extent *be = NULL, *save; ++ uint64_t tmp; /* Used by READSECTOR */ ++ struct layout_verification lv = { ++ .mode = lgr->range.iomode, ++ .start = lgr->range.offset >> 9, ++ .inval = lgr->range.offset >> 9, ++ .cowread = lgr->range.offset >> 9, ++ }; ++ ++ LIST_HEAD(extents); ++ ++ BLK_READBUF(p, end, 4); ++ READ32(count); ++ ++ dprintk("%s enter, number of extents %i\n", __func__, count); ++ BLK_READBUF(p, end, (28 + NFS4_PNFS_DEVICEID4_SIZE) * count); ++ ++ /* Decode individual extents, putting them in temporary ++ * staging area until whole layout is decoded to make error ++ * recovery easier. 
++ */ ++ for (i = 0; i < count; i++) { ++ be = alloc_extent(); ++ if (!be) { ++ status = -ENOMEM; ++ goto out_err; ++ } ++ READ_DEVID(&be->be_devid); ++ be->be_mdev = translate_devid(lo, &be->be_devid); ++ if (!be->be_mdev) ++ goto out_err; ++ /* The next three values are read in as bytes, ++ * but stored as 512-byte sector lengths ++ */ ++ READ_SECTOR(be->be_f_offset); ++ READ_SECTOR(be->be_length); ++ READ_SECTOR(be->be_v_offset); ++ READ32(be->be_state); ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ be->be_inval = &bl->bl_inval; ++ if (verify_extent(be, &lv)) { ++ dprintk("%s verify failed\n", __func__); ++ goto out_err; ++ } ++ list_add_tail(&be->be_node, &extents); ++ } ++ if (p != end) { ++ dprintk("%s Undecoded cruft at end of opaque\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lgr->range.offset + lgr->range.length != lv.start << 9) { ++ dprintk("%s Final length mismatch\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lv.start < lv.cowread) { ++ dprintk("%s Final uncovered COW extent\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ /* Extents decoded properly, now try to merge them in to ++ * existing layout extents. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ list_for_each_entry_safe(be, save, &extents, be_node) { ++ list_del(&be->be_node); ++ status = add_and_merge_extent(bl, be); ++ if (status) { ++ spin_unlock(&bl->bl_ext_lock); ++ /* This is a fairly catastrophic error, as the ++ * entire layout extent lists are now corrupted. ++ * We should have some way to distinguish this. 
++ */ ++ be = NULL; ++ goto out_err; ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ status = 0; ++ out: ++ dprintk("%s returns %i\n", __func__, status); ++ return status; ++ ++ out_err: ++ put_extent(be); ++ while (!list_empty(&extents)) { ++ be = list_first_entry(&extents, struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ put_extent(be); ++ } ++ goto out; ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-31 20:42:05.506119071 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-31 20:42:05.506119071 -0400 +@@ -0,0 +1,120 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdm.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2007 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Fred Isaman ++ * Andy Adamson ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. 
the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include /* gendisk - used in a dprintk*/ ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Defines used for calculating memory usage in nfs4_blk_flatten() */ ++#define ARGSIZE 24 /* Max bytes needed for linear target arg string */ ++#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE) ++#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE) ++#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \ ++ (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE) ++#define roundup8(x) (((x)+7) & ~7) ++#define sizeof8(x) roundup8(sizeof(x)) ++ ++static int dev_remove(dev_t dev) ++{ ++ int ret = 1; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint64_t bl_dev; ++ uint32_t major = MAJOR(dev), minor = MINOR(dev); ++ ++ dprintk("Entering %s\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return ret; ++ ++ memcpy((void *)&bl_dev, &major, sizeof(uint32_t)); ++ memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t)); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev, ++ sizeof(uint64_t)); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out; ++ } ++ ++ if (reply->status == BL_DEVICE_REQUEST_PROC) ++ ret = 0; /*TODO: what to return*/ ++out: ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return ret; ++} ++ ++/* ++ * 
Release meta device ++ */ ++static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) /* Puts the meta device, then issues the dev_remove() upcall for it; returns dev_remove()'s result */ ++{ ++ int rv; ++ dev_t dev = bdev->bm_mdev->bd_dev; /* read the number now: the block_device must not be touched after the put below */ ++ ++ dprintk("%s Releasing\n", __func__); ++ /* XXX Check return? */ ++ rv = nfs4_blkdev_put(bdev->bm_mdev); ++ dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv); ++ rv = dev_remove(dev); ++ dprintk("%s Returns %d\n", __func__, rv); ++ return rv; ++} ++ ++void free_block_dev(struct pnfs_block_dev *bdev) /* NULL-safe: releases the meta device if one is attached, then frees bdev */ ++{ ++ if (bdev) { ++ if (bdev->bm_mdev) { ++ dprintk("%s Removing DM device: %d:%d\n", ++ __func__, ++ MAJOR(bdev->bm_mdev->bd_dev), ++ MINOR(bdev->bm_mdev->bd_dev)); ++ /* XXX Check status ?? */ ++ nfs4_blk_metadev_release(bdev); ++ } ++ kfree(bdev); ++ } ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-31 20:42:05.505169618 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-31 20:42:05.505169618 -0400 +@@ -0,0 +1,302 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.h ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included.
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#ifndef FS_NFS_NFS4BLOCKLAYOUT_H ++#define FS_NFS_NFS4BLOCKLAYOUT_H ++ ++#include ++#include ++#include /* Needed for struct dm_ioctl*/ ++ ++#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9) ++ ++#define PG_pnfserr PG_owner_priv_1 ++#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags) ++#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags) ++#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags) ++ ++extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_do_resume(struct dm_ioctl *param); ++extern int dm_table_load(struct dm_ioctl *param, size_t param_size); ++ ++struct block_mount_id { ++ spinlock_t bm_lock; /* protects list */ ++ struct list_head bm_devlist; /* holds pnfs_block_dev */ ++}; ++ ++struct pnfs_block_dev { ++ struct list_head bm_node; ++ struct pnfs_deviceid bm_mdevid; /* associated devid */ ++ struct block_device *bm_mdev; /* meta device itself */ ++}; ++ ++/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */ ++struct visible_block_device { ++ struct list_head vi_node; ++ struct block_device *vi_bdev; ++ int vi_mapped; ++ int vi_put_done; ++}; ++ ++enum blk_vol_type { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, 
/* maps to a single LU */ ++ PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */ ++ PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */ ++ PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */ ++}; ++ ++/* All disk offset/lengths are stored in 512-byte sectors */ ++struct pnfs_blk_volume { ++ uint32_t bv_type; ++ sector_t bv_size; ++ struct pnfs_blk_volume **bv_vols; ++ int bv_vol_n; ++ union { ++ dev_t bv_dev; ++ sector_t bv_stripe_unit; ++ sector_t bv_offset; ++ }; ++}; ++ ++/* Since components need not be aligned, cannot use sector_t */ ++struct pnfs_blk_sig_comp { ++ int64_t bs_offset; /* In bytes */ ++ uint32_t bs_length; /* In bytes */ ++ char *bs_string; ++}; ++ ++/* Maximum number of signatures components in a simple volume */ ++# define PNFS_BLOCK_MAX_SIG_COMP 16 ++ ++struct pnfs_blk_sig { ++ int si_num_comps; ++ struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP]; ++}; ++ ++enum exstate4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ ++ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ ++}; ++ ++#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ ++ ++struct my_tree_t { ++ sector_t mtt_step_size; /* Internal sector alignment */ ++ struct list_head mtt_stub; /* Should be a radix tree */ ++}; ++ ++struct pnfs_inval_markings { ++ spinlock_t im_lock; ++ struct my_tree_t im_tree; /* Sectors that need LAYOUTCOMMIT */ ++ sector_t im_block_size; /* Server blocksize in sectors */ ++}; ++ ++struct pnfs_inval_tracking { ++ struct list_head it_link; ++ int it_sector; ++ int it_tags; ++}; ++ ++/* sector_t fields are all in 512-byte sectors */ ++struct pnfs_block_extent { ++ struct kref be_refcnt; ++ struct list_head be_node; /* link into lseg list */ ++ struct pnfs_deviceid be_devid; /* STUB - remevable??? 
*/ ++ struct block_device *be_mdev; ++ sector_t be_f_offset; /* the starting offset in the file */ ++ sector_t be_length; /* the size of the extent */ ++ sector_t be_v_offset; /* the starting offset in the volume */ ++ enum exstate4 be_state; /* the state of this extent */ ++ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ ++}; ++ ++/* Shortened extent used by LAYOUTCOMMIT */ ++struct pnfs_block_short_extent { ++ struct list_head bse_node; ++ struct pnfs_deviceid bse_devid; /* STUB - removable??? */ ++ struct block_device *bse_mdev; ++ sector_t bse_f_offset; /* the starting offset in the file */ ++ sector_t bse_length; /* the size of the extent */ ++}; ++ ++static inline void ++INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) ++{ ++ spin_lock_init(&marks->im_lock); ++ INIT_LIST_HEAD(&marks->im_tree.mtt_stub); ++ marks->im_block_size = blocksize; ++ marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, ++ blocksize); ++} ++ ++enum extentclass4 { ++ RW_EXTENT = 0, /* READWRTE and INVAL */ ++ RO_EXTENT = 1, /* READ and NONE */ ++ EXTENT_LISTS = 2, ++}; ++ ++static inline int choose_list(enum exstate4 state) ++{ ++ if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) ++ return RO_EXTENT; ++ else ++ return RW_EXTENT; ++} ++ ++struct pnfs_block_layout { ++ struct pnfs_layout_hdr bl_layout; ++ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ ++ spinlock_t bl_ext_lock; /* Protects list manipulation */ ++ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ ++ struct list_head bl_commit; /* Needs layout commit */ ++ unsigned int bl_count; /* entries in bl_commit */ ++ sector_t bl_blocksize; /* Server blocksize in sectors */ ++}; ++ ++/* this struct is comunicated between: ++ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit ++ */ ++struct bl_layoutupdate_data { ++ struct list_head ranges; ++}; ++ ++#define BLK_ID(lo) ((struct block_mount_id 
*)(PNFS_NFS_SERVER(lo)->pnfs_ld_data)) ++ ++static inline struct pnfs_block_layout * ++BLK_LO2EXT(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct pnfs_block_layout, bl_layout); ++} ++ ++static inline struct pnfs_block_layout * ++BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) ++{ ++ return BLK_LO2EXT(lseg->layout); ++} ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes); ++ ++#define BLK_READBUF(p, e, nbytes) do { \ ++ p = blk_overflow(p, e, nbytes); \ ++ if (!p) { \ ++ printk(KERN_WARNING \ ++ "%s: reply buffer overflowed in line %d.\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++} while (0) ++ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (uint64_t)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++#define READ_DEVID(x) COPYMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++#define READ_SECTOR(x) do { \ ++ READ64(tmp); \ ++ if (tmp & 0x1ff) { \ ++ printk(KERN_WARNING \ ++ "%s Value not 512-byte aligned at line %d\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++ (x) = tmp >> 9; \ ++} while (0) ++ ++#define WRITE32(n) do { \ ++ *p++ = htonl(n); \ ++ } while (0) ++#define WRITE64(n) do { \ ++ *p++ = htonl((uint32_t)((n) >> 32)); \ ++ *p++ = htonl((uint32_t)(n)); \ ++} while (0) ++#define WRITEMEM(ptr, nbytes) do { \ ++ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ ++} while (0) ++#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++ ++/* blocklayoutdev.c */ ++struct block_device *nfs4_blkdev_get(dev_t dev); ++int nfs4_blkdev_put(struct block_device *bdev); ++struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist); ++int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr); ++int nfs4_blk_create_block_disk_list(struct list_head *); ++void 
nfs4_blk_destroy_disk_list(struct list_head *); ++/* blocklayoutdm.c */ ++int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *); ++void free_block_dev(struct pnfs_block_dev *bdev); ++/* extents.c */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read); ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages); ++void put_extent(struct pnfs_block_extent *be); ++struct pnfs_block_extent *alloc_extent(void); ++struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be); ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect); ++int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg); ++void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status); ++int add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new); ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length); ++ ++#include ++ ++extern struct pipefs_list bl_device_list; ++extern struct dentry *bl_device_pipe; ++ ++int bl_pipe_init(void); ++void bl_pipe_exit(void); ++ ++#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ ++#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ ++#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ ++#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ ++#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ ++ ++#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-31 20:42:05.507113260 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-31 20:42:05.508119925 -0400 +@@ -0,0 
+1,948 @@ ++/* ++ * linux/fs/nfs/blocklayout/extents.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include "blocklayout.h" ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Bit numbers */ ++#define EXTENT_INITIALIZED 0 ++#define EXTENT_WRITTEN 1 ++#define EXTENT_IN_COMMIT 2 ++#define INTERNAL_EXISTS MY_MAX_TAGS ++#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) ++ ++/* Returns largest t<=s s.t.
t%base==0 */ ++static inline sector_t normalize(sector_t s, int base) ++{ ++ sector_t tmp = s; /* Since do_div modifies its argument */ ++ return s - do_div(tmp, base); ++} ++ ++static inline sector_t normalize_up(sector_t s, int base) ++{ ++ return normalize(s + base - 1, base); ++} ++ ++/* Complete stub using list while determine API wanted */ ++ ++/* Returns tags, or negative */ ++static int32_t _find_entry(struct my_tree_t *tree, u64 s) ++{ ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu) enter\n", __func__, s); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) ++ return pos->it_tags & INTERNAL_MASK; ++ else ++ break; ++ } ++ return -ENOENT; ++} ++ ++static inline ++int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag) ++{ ++ int32_t tags; ++ ++ dprintk("%s(%llu, %i) enter\n", __func__, s, tag); ++ s = normalize(s, tree->mtt_step_size); ++ tags = _find_entry(tree, s); ++ if ((tags < 0) || !(tags & (1 << tag))) ++ return 0; ++ else ++ return 1; ++} ++ ++/* Creates entry with tag, or if entry already exists, unions tag to it. ++ * If storage is not NULL, newly created entry will use it. ++ * Returns number of entries added, or negative on error. 
++ */ ++static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag, ++ struct pnfs_inval_tracking *storage) ++{ ++ int found = 0; ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) { ++ found = 1; ++ break; ++ } else ++ break; ++ } ++ if (found) { ++ pos->it_tags |= (1 << tag); ++ return 0; ++ } else { ++ struct pnfs_inval_tracking *new; ++ if (storage) ++ new = storage; ++ else { ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ } ++ new->it_sector = s; ++ new->it_tags = (1 << tag); ++ list_add(&new->it_link, &pos->it_link); ++ return 1; ++ } ++} ++ ++/* XXXX Really want option to not create */ ++/* Over range, unions tag with existing entries, else creates entry with tag */ ++static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length) ++{ ++ u64 i; ++ ++ dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); ++ for (i = normalize(s, tree->mtt_step_size); i < s + length; ++ i += tree->mtt_step_size) ++ if (_add_entry(tree, i, tag, NULL)) ++ return -ENOMEM; ++ return 0; ++} ++ ++/* Ensure that future operations on given range of tree will not malloc */ ++static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length) ++{ ++ u64 start, end, s; ++ int count, i, used = 0, status = -ENOMEM; ++ struct pnfs_inval_tracking **storage; ++ ++ dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); ++ start = normalize(offset, tree->mtt_step_size); ++ end = normalize_up(offset + length, tree->mtt_step_size); ++ count = (int)(end - start) / (int)tree->mtt_step_size; ++ ++ /* Pre-malloc what memory we might need */ ++ storage = kmalloc(sizeof(*storage) * count, GFP_KERNEL); ++ if (!storage) ++ return -ENOMEM; ++ for (i = 0; i < count; i++) { ++ storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), ++ 
GFP_KERNEL); ++ if (!storage[i]) ++ goto out_cleanup; ++ } ++ ++ /* Now need lock - HOW??? */ ++ ++ for (s = start; s < end; s += tree->mtt_step_size) ++ used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); ++ ++ /* Unlock - HOW??? */ ++ status = 0; ++ ++ out_cleanup: ++ for (i = used; i < count; i++) { ++ if (!storage[i]) ++ break; ++ kfree(storage[i]); ++ } ++ kfree(storage); ++ return status; ++} ++ ++static void set_needs_init(sector_t *array, sector_t offset) ++{ ++ sector_t *p = array; ++ ++ dprintk("%s enter\n", __func__); ++ if (!p) ++ return; ++ while (*p < offset) ++ p++; ++ if (*p == offset) ++ return; ++ else if (*p == ~0) { ++ *p++ = offset; ++ *p = ~0; ++ return; ++ } else { ++ sector_t *save = p; ++ dprintk("%s Adding %llu\n", __func__, (u64)offset); ++ while (*p != ~0) ++ p++; ++ p++; ++ memmove(save + 1, save, (char *)p - (char *)save); ++ *save = offset; ++ return; ++ } ++} ++ ++/* We are relying on page lock to serialize this */ ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Assume start, end already sector aligned */ ++static int ++_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag) ++{ ++ struct pnfs_inval_tracking *pos; ++ u64 expect = 0; ++ ++ dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector >= end) ++ continue; ++ if (!expect) { ++ if ((pos->it_sector == end - tree->mtt_step_size) && ++ (pos->it_tags & (1 << tag))) { ++ expect = pos->it_sector - tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ continue; ++ } else { ++ return 0; ++ } ++ } ++ if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) ++ return 0; ++ expect -= tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ } ++ return 0; ++} 
++ ++static int is_range_written(struct pnfs_inval_markings *marks, ++ sector_t start, sector_t end) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Marks sectors in [offset, offset+length) as having been initialized. ++ * All lengths are step-aligned, where step is min(pagesize, blocksize). ++ * Notes where partial block is initialized, and helps prepare it for ++ * complete initialization later. ++ */ ++/* Currently assumes offset is page-aligned. Returns 0 or -ENOMEM. */ ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages) ++{ ++ sector_t s, start, end; ++ sector_t *array = NULL; /* Pages to mark */ ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", ++ __func__, (u64)offset, (u64)length); ++ s = max((sector_t) 3, ++ 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); ++ dprintk("%s set max=%llu\n", __func__, (u64)s); ++ if (pages) { ++ array = kmalloc(s * sizeof(sector_t), GFP_KERNEL); ++ if (!array) ++ goto outerr; ++ array[0] = ~0; ++ } ++ ++ start = normalize(offset, marks->im_block_size); ++ end = normalize_up(offset + length, marks->im_block_size); ++ if (_preload_range(&marks->im_tree, start, end - start)) ++ goto outerr; ++ ++ spin_lock(&marks->im_lock); ++ ++ for (s = normalize_up(start, PAGE_CACHE_SECTORS); ++ s < offset; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s pre-area pages\n", __func__); ++ /* Portion of used block is not initialized */ ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) ++ goto out_unlock; ++ for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); ++ s < end; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s post-area pages\n", __func__); ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ ++
spin_unlock(&marks->im_lock); ++ ++ if (pages) { ++ if (array[0] == ~0) { ++ kfree(array); ++ *pages = NULL; ++ } else ++ *pages = array; ++ } ++ return 0; ++ ++ out_unlock: ++ spin_unlock(&marks->im_lock); ++ outerr: ++ if (pages) { ++ kfree(array); ++ *pages = NULL; ++ } ++ return -ENOMEM; ++} ++ ++/* Marks sectors in [offset, offset+length) as having been written to disk. ++ * All lengths should be block aligned. ++ */ ++int mark_written_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length) ++{ ++ int status; ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, ++ (u64)offset, (u64)length); ++ spin_lock(&marks->im_lock); ++ status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); ++ spin_unlock(&marks->im_lock); ++ return status; ++} ++ ++static void print_short_extent(struct pnfs_block_short_extent *be) ++{ ++ dprintk("PRINT SHORT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->bse_length); ++ } ++} ++ ++void print_clist(struct list_head *list, unsigned int count) ++{ ++ struct pnfs_block_short_extent *be; ++ unsigned int i = 0; ++ ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, bse_node) { ++ i++; ++ print_short_extent(be); ++ } ++ if (i != count) ++ dprintk("\n\nExpected %u entries\n\n\n", count); ++ dprintk("****************\n"); ++} ++ ++/* Note: In theory, we should do more checking that devid's match between ++ * old and new, but if they don't, the lists are too corrupt to salvage anyway.
++ */ ++/* Note this is very similar to add_and_merge_extent */ ++static void add_to_commitlist(struct pnfs_block_layout *bl, ++ struct pnfs_block_short_extent *new) ++{ ++ struct list_head *clist = &bl->bl_commit; ++ struct pnfs_block_short_extent *old, *save; ++ sector_t end = new->bse_f_offset + new->bse_length; ++ ++ dprintk("%s enter\n", __func__); ++ print_short_extent(new); ++ print_clist(clist, bl->bl_count); ++ bl->bl_count++; ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. ++ */ ++ list_for_each_entry_safe(old, save, clist, bse_node) { ++ if (new->bse_f_offset < old->bse_f_offset) ++ break; ++ if (end <= old->bse_f_offset + old->bse_length) { ++ /* Range is already in list */ ++ bl->bl_count--; ++ kfree(new); ++ return; ++ } else if (new->bse_f_offset <= ++ old->bse_f_offset + old->bse_length) { ++ /* new overlaps or abuts existing be */ ++ if (new->bse_mdev == old->bse_mdev) { ++ /* extend new to fully replace old */ ++ new->bse_length += new->bse_f_offset - ++ old->bse_f_offset; ++ new->bse_f_offset = old->bse_f_offset; ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ } ++ /* Note that if we never hit the above break, old will not point to a ++ * valid extent. However, in that case &old->bse_node==list. ++ */ ++ list_add_tail(&new->bse_node, &old->bse_node); ++ /* Scan forward for overlaps. If we find any, extend new and ++ * remove the overlapped extent. 
++ */ ++ old = list_prepare_entry(new, clist, bse_node); ++ list_for_each_entry_safe_continue(old, save, clist, bse_node) { ++ if (end < old->bse_f_offset) ++ break; ++ /* new overlaps or abuts old */ ++ if (new->bse_mdev == old->bse_mdev) { ++ if (end < old->bse_f_offset + old->bse_length) { ++ /* extend new to fully cover old */ ++ end = old->bse_f_offset + old->bse_length; ++ new->bse_length = end - new->bse_f_offset; ++ } ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ dprintk("%s: after merging\n", __func__); ++ print_clist(clist, bl->bl_count); ++} ++ ++/* Note the range described by offset, length is guaranteed to be contained ++ * within be. ++ */ ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length) ++{ ++ sector_t new_end, end = offset + length; ++ struct pnfs_block_short_extent *new; ++ struct pnfs_block_layout *bl = container_of(be->be_inval, ++ struct pnfs_block_layout, ++ bl_inval); ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ mark_written_sectors(be->be_inval, offset, length); ++ /* We want to add the range to commit list, but it must be ++ * block-normalized, and verified that the normalized range has ++ * been entirely written to disk. 
++ */ ++ new->bse_f_offset = offset; ++ offset = normalize(offset, bl->bl_blocksize); ++ if (offset < new->bse_f_offset) { ++ if (is_range_written(be->be_inval, offset, new->bse_f_offset)) ++ new->bse_f_offset = offset; ++ else ++ new->bse_f_offset = offset + bl->bl_blocksize; ++ } ++ new_end = normalize_up(end, bl->bl_blocksize); ++ if (end < new_end) { ++ if (is_range_written(be->be_inval, end, new_end)) ++ end = new_end; ++ else ++ end = new_end - bl->bl_blocksize; ++ } ++ if (end <= new->bse_f_offset) { ++ kfree(new); ++ return 0; ++ } ++ new->bse_length = end - new->bse_f_offset; ++ new->bse_devid = be->be_devid; ++ new->bse_mdev = be->be_mdev; ++ ++ spin_lock(&bl->bl_ext_lock); ++ /* new will be freed, either by add_to_commitlist if it decides not ++ * to use it, or after LAYOUTCOMMIT uses it in the commitlist. ++ */ ++ add_to_commitlist(bl, new); ++ spin_unlock(&bl->bl_ext_lock); ++ return 0; ++} ++ ++static void print_bl_extent(struct pnfs_block_extent *be) ++{ ++ dprintk("PRINT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->be_length); ++ dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); ++ dprintk(" be_state %d\n", be->be_state); ++ } ++} ++ ++static void ++destroy_extent(struct kref *kref) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = container_of(kref, struct pnfs_block_extent, be_refcnt); ++ dprintk("%s be=%p\n", __func__, be); ++ kfree(be); ++} ++ ++void ++put_extent(struct pnfs_block_extent *be) ++{ ++ if (be) { ++ dprintk("%s enter %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_put(&be->be_refcnt, destroy_extent); ++ } ++} ++ ++struct pnfs_block_extent *alloc_extent(void) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL); ++ if (!be) ++ return NULL; ++ INIT_LIST_HEAD(&be->be_node); ++ kref_init(&be->be_refcnt); ++ be->be_inval = NULL; ++ return be; ++} ++ ++struct 
pnfs_block_extent * ++get_extent(struct pnfs_block_extent *be) ++{ ++ if (be) ++ kref_get(&be->be_refcnt); ++ return be; ++} ++ ++void print_elist(struct list_head *list) ++{ ++ struct pnfs_block_extent *be; ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, be_node) { ++ print_bl_extent(be); ++ } ++ dprintk("****************\n"); ++} ++ ++static inline int ++extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) ++{ ++ /* Note this assumes new->be_f_offset >= old->be_f_offset */ ++ return (new->be_state == old->be_state) && ++ ((new->be_state == PNFS_BLOCK_NONE_DATA) || ++ ((new->be_v_offset - old->be_v_offset == ++ new->be_f_offset - old->be_f_offset) && ++ new->be_mdev == old->be_mdev)); ++} ++ ++/* Adds new to appropriate list in bl, modifying new and removing existing ++ * extents as appropriate to deal with overlaps. ++ * ++ * See find_get_extent for list constraints. ++ * ++ * Refcount on new is already set. If end up not using it, or error out, ++ * need to put the reference. ++ * ++ * Lock is held by caller. ++ */ ++int ++add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new) ++{ ++ struct pnfs_block_extent *be, *tmp; ++ sector_t end = new->be_f_offset + new->be_length; ++ struct list_head *list; ++ ++ dprintk("%s enter with be=%p\n", __func__, new); ++ print_bl_extent(new); ++ list = &bl->bl_extents[choose_list(new->be_state)]; ++ print_elist(list); ++ ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. 
++ */ ++ list_for_each_entry_safe_reverse(be, tmp, list, be_node) { ++ if (new->be_f_offset >= be->be_f_offset + be->be_length) ++ break; ++ if (new->be_f_offset >= be->be_f_offset) { ++ if (end <= be->be_f_offset + be->be_length) { ++ /* new is a subset of existing be */ ++ if (extents_consistent(be, new)) { ++ dprintk("%s: new is subset, ignoring\n", ++ __func__); ++ put_extent(new); ++ return 0; ++ } else { ++ goto out_err; ++ } ++ } else { ++ /* |<-- be -->| ++ * |<-- new -->| */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ new->be_length += new->be_f_offset - ++ be->be_f_offset; ++ new->be_f_offset = be->be_f_offset; ++ new->be_v_offset = be->be_v_offset; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } else if (end >= be->be_f_offset + be->be_length) { ++ /* new extent fully covers existing be */ ++ if (extents_consistent(be, new)) { ++ /* new already spans be, so just drop be */ ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } else if (end > be->be_f_offset) { ++ /* |<-- be -->| ++ *|<-- new -->| */ ++ if (extents_consistent(new, be)) { ++ /* extend new to fully replace be */ ++ new->be_length += be->be_f_offset + be->be_length - ++ new->be_f_offset - new->be_length; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } ++ /* Note that if we never hit the above break, be will not point to a ++ * valid extent. However, in that case &be->be_node==list. ++ */ ++ list_add(&new->be_node, &be->be_node); ++ dprintk("%s: inserting new\n", __func__); ++ print_elist(list); ++ /* STUB - The per-list consistency checks have all been done, ++ * should now check cross-list consistency. ++ */ ++ return 0; ++ ++ out_err: ++ put_extent(new); ++ return -EIO; ++} ++ ++/* Returns extent, or NULL.
If a second READ extent exists, it is returned ++ * in cow_read, if given. ++ * ++ * The extents are kept in two separate ordered lists, one for READ and NONE, ++ * one for READWRITE and INVALID. Within each list, we assume: ++ * 1. Extents are ordered by file offset. ++ * 2. For any given isect, there is at most one extent that matches. ++ */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read) ++{ ++ struct pnfs_block_extent *be, *cow, *ret; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ cow = ret = NULL; ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret && ++ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ if (!ret) ++ ret = be; ++ else if (be->be_state != PNFS_BLOCK_READ_DATA) ++ put_extent(be); ++ else ++ cow = be; ++ break; ++ } ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ if (cow_read) ++ *cow_read = cow; ++ print_bl_extent(ret); ++ return ret; ++} ++ ++/* Similar to find_get_extent, but called with lock held, and ignores cow */ ++static struct pnfs_block_extent * ++find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) ++{ ++ struct pnfs_block_extent *be, *ret = NULL; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++
atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ ret = be; ++ break; ++ } ++ } ++ } ++ print_bl_extent(ret); ++ return ret; ++} ++ ++int ++encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg) ++{ ++ sector_t start, end; ++ struct pnfs_block_short_extent *lce, *save; ++ unsigned int count = 0; ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct list_head *ranges = &bld->ranges; ++ __be32 *p, *xdr_start; ++ ++ dprintk("%s enter\n", __func__); ++ start = arg->range.offset >> 9; ++ end = start + (arg->range.length >> 9); ++ dprintk("%s set start=%llu, end=%llu\n", ++ __func__, (u64)start, (u64)end); ++ ++ /* BUG - creation of bl_commit is buggy - need to wait for ++ * entire block to be marked WRITTEN before it can be added. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ /* Want to adjust for possible truncate */ ++ /* We now want to adjust argument range */ ++ ++ /* XDR encode the ranges found */ ++ xdr_start = xdr_reserve_space(xdr, 8); ++ if (!xdr_start) ++ goto out; ++ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { ++ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); ++ if (!p) ++ break; ++ WRITE_DEVID(&lce->bse_devid); ++ WRITE64(lce->bse_f_offset << 9); ++ WRITE64(lce->bse_length << 9); ++ WRITE64(0LL); ++ WRITE32(PNFS_BLOCK_READWRITE_DATA); ++ list_del(&lce->bse_node); ++ list_add_tail(&lce->bse_node, ranges); ++ bl->bl_count--; ++ count++; ++ } ++ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); ++ xdr_start[1] = cpu_to_be32(count); ++out: ++ spin_unlock(&bl->bl_ext_lock); ++ dprintk("%s found %i ranges\n", __func__, count); ++ return 0; ++} ++ ++/* Helper function to set_to_rw that initialize a new extent */ ++static void ++_prep_new_extent(struct pnfs_block_extent *new, ++ struct pnfs_block_extent *orig, ++ sector_t offset, sector_t length, int state) ++{ ++ kref_init(&new->be_refcnt); ++ /* don't need to 
INIT_LIST_HEAD(&new->be_node) */ ++ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct pnfs_deviceid)); ++ new->be_mdev = orig->be_mdev; ++ new->be_f_offset = offset; ++ new->be_length = length; ++ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; ++ new->be_state = state; ++ new->be_inval = orig->be_inval; ++} ++ ++/* Tries to merge be with extent in front of it in list. ++ * Frees storage if not used. ++ */ ++static struct pnfs_block_extent * ++_front_merge(struct pnfs_block_extent *be, struct list_head *head, ++ struct pnfs_block_extent *storage) ++{ ++ struct pnfs_block_extent *prev; ++ ++ if (!storage) ++ goto no_merge; ++ if (&be->be_node == head || be->be_node.prev == head) ++ goto no_merge; ++ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); ++ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || ++ !extents_consistent(prev, be)) ++ goto no_merge; ++ _prep_new_extent(storage, prev, prev->be_f_offset, ++ prev->be_length + be->be_length, prev->be_state); ++ list_replace(&prev->be_node, &storage->be_node); ++ put_extent(prev); ++ list_del(&be->be_node); ++ put_extent(be); ++ return storage; ++ ++ no_merge: ++ kfree(storage); ++ return be; ++} ++ ++static u64 ++set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) ++{ ++ u64 rv = offset + length; ++ struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; ++ struct pnfs_block_extent *children[3]; ++ struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; ++ int i = 0, j; ++ ++ dprintk("%s(%llu, %llu)\n", __func__, offset, length); ++ /* Create storage for up to three new extents e1, e2, e3 */ ++ e1 = kmalloc(sizeof(*e1), GFP_KERNEL); ++ e2 = kmalloc(sizeof(*e2), GFP_KERNEL); ++ e3 = kmalloc(sizeof(*e3), GFP_KERNEL); ++ /* BUG - we are ignoring any failure */ ++ if (!e1 || !e2 || !e3) ++ goto out_nosplit; ++ ++ spin_lock(&bl->bl_ext_lock); ++ be = find_get_extent_locked(bl, offset); ++ rv = be->be_f_offset + be->be_length; ++ if (be->be_state != 
PNFS_BLOCK_INVALID_DATA) { ++ spin_unlock(&bl->bl_ext_lock); ++ goto out_nosplit; ++ } ++ /* Add e* to children, bumping e*'s krefs */ ++ if (be->be_f_offset != offset) { ++ _prep_new_extent(e1, be, be->be_f_offset, ++ offset - be->be_f_offset, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e1; ++ print_bl_extent(e1); ++ } else ++ merge1 = e1; ++ _prep_new_extent(e2, be, offset, ++ min(length, be->be_f_offset + be->be_length - offset), ++ PNFS_BLOCK_READWRITE_DATA); ++ children[i++] = e2; ++ print_bl_extent(e2); ++ if (offset + length < be->be_f_offset + be->be_length) { ++ _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, ++ be->be_f_offset + be->be_length - ++ offset - length, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e3; ++ print_bl_extent(e3); ++ } else ++ merge2 = e3; ++ ++ /* Remove be from list, and insert the e* */ ++ /* We don't get refs on e*, since this list is the base reference ++ * set when init'ed. ++ */ ++ if (i < 3) ++ children[i] = NULL; ++ new = children[0]; ++ list_replace(&be->be_node, &new->be_node); ++ put_extent(be); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); ++ for (j = 1; j < i; j++) { ++ old = new; ++ new = children[j]; ++ list_add(&new->be_node, &old->be_node); ++ } ++ if (merge2) { ++ /* This is a HACK, should just create a _back_merge function */ ++ new = list_entry(new->be_node.next, ++ struct pnfs_block_extent, be_node); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ ++ /* Since we removed the base reference above, be is now scheduled for ++ * destruction. 
++ */ ++ put_extent(be); ++ dprintk("%s returns %llu after split\n", __func__, rv); ++ return rv; ++ ++ out_nosplit: ++ kfree(e1); ++ kfree(e2); ++ kfree(e3); ++ dprintk("%s returns %llu without splitting\n", __func__, rv); ++ return rv; ++} ++ ++void ++clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status) ++{ ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct pnfs_block_short_extent *lce, *save; ++ ++ dprintk("%s status %d\n", __func__, status); ++ list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) { ++ if (likely(!status)) { ++ u64 offset = lce->bse_f_offset; ++ u64 end = offset + lce->bse_length; ++ ++ do { ++ offset = set_to_rw(bl, offset, end - offset); ++ } while (offset < end); ++ ++ kfree(lce); ++ } else { ++ spin_lock(&bl->bl_ext_lock); ++ add_to_commitlist(bl, lce); ++ spin_unlock(&bl->bl_ext_lock); ++ } ++ } ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-31 20:42:05.502212803 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-31 20:42:05.502212803 -0400 +@@ -0,0 +1,6 @@ ++# ++# Makefile for the pNFS block layout driver kernel module ++# ++obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o ++blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \ ++ extents.o block-device-discovery-pipe.o +diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h +--- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-31 20:42:05.508119925 -0400 +@@ -111,6 +111,13 @@ extern int nfs41_validate_delegation_sta + + #define RCA4_TYPE_MASK_RDATA_DLG 0 + #define RCA4_TYPE_MASK_WDATA_DLG 1 ++#define RCA4_TYPE_MASK_DIR_DLG 2 ++#define RCA4_TYPE_MASK_FILE_LAYOUT 3 ++#define RCA4_TYPE_MASK_BLK_LAYOUT 4 
++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 ++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 + + struct cb_recallanyargs { + struct sockaddr *craa_addr; +@@ -127,6 +134,39 @@ struct cb_recallslotargs { + extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy); + ++struct cb_layoutrecallargs { ++ struct sockaddr *cbl_addr; ++ struct nfs_fh cbl_fh; ++ struct pnfs_layout_range cbl_seg; ++ struct nfs_fsid cbl_fsid; ++ uint32_t cbl_recall_type; ++ uint32_t cbl_layout_type; ++ uint32_t cbl_layoutchanged; ++ nfs4_stateid cbl_stateid; ++}; ++ ++extern unsigned nfs4_callback_layoutrecall( ++ struct cb_layoutrecallargs *args, ++ void *dummy); ++ ++struct cb_devicenotifyitem { ++ uint32_t cbd_notify_type; ++ uint32_t cbd_layout_type; ++ struct pnfs_deviceid cbd_dev_id; ++ uint32_t cbd_immediate; ++}; ++ ++/* XXX: Should be dynamic up to max compound size */ ++#define NFS4_DEV_NOTIFY_MAXENTRIES 10 ++struct cb_devicenotifyargs { ++ struct sockaddr *addr; ++ int ndevs; ++ struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES]; ++}; ++ ++extern unsigned nfs4_callback_devicenotify( ++ struct cb_devicenotifyargs *args, ++ void *dummy); + #endif /* CONFIG_NFS_V4_1 */ + + extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); +diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c +--- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-31 20:42:05.509093330 -0400 +@@ -8,10 +8,15 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + #include "nfs4_fs.h" + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include "pnfs.h" + + #ifdef NFS_DEBUG + #define NFSDBG_FACILITY NFSDBG_CALLBACK +@@ -62,16 +67,6 @@ out: + return res->status; + } + +-static int 
(*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) +-{ +-#if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion > 0) +- return nfs41_validate_delegation_stateid; +-#endif +- return nfs4_validate_delegation_stateid; +-} +- +- + __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) + { + struct nfs_client *clp; +@@ -92,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_re + inode = nfs_delegation_find_inode(clp, &args->fh); + if (inode != NULL) { + /* Set up a helper thread to actually return the delegation */ +- switch (nfs_async_inode_return_delegation(inode, &args->stateid, +- nfs_validate_delegation_stateid(clp))) { ++ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; +@@ -116,24 +110,364 @@ out: + + int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { +- if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (delegation == NULL || memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data))) + return 0; + return 1; + } + + #if defined(CONFIG_NFS_V4_1) + ++static bool ++pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo, ++ const nfs4_stateid stateid) ++{ ++ int seqlock; ++ bool res; ++ u32 oldseqid, newseqid; ++ ++ do { ++ seqlock = read_seqbegin(&lo->seqlock); ++ oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid); ++ newseqid = be32_to_cpu(stateid.u.stateid.seqid); ++ res = !memcmp(lo->stateid.u.stateid.other, ++ stateid.u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE); ++ if (res) { /* comparing layout stateids */ ++ if (oldseqid == ~0) ++ res = (newseqid == 1); ++ else ++ res = (newseqid == oldseqid + 1); ++ } else { /* open stateid */ ++ res = !memcmp(lo->stateid.u.data, ++ &zero_stateid, ++ NFS4_STATEID_SIZE); ++ if (res) ++ res = (newseqid == 1); ++ } ++ } while 
(read_seqretry(&lo->seqlock, seqlock)); ++ ++ return res; ++} ++ ++/* ++ * Retrieve an inode based on layout recall parameters ++ * ++ * Note: caller must iput(inode) to dereference the inode. ++ */ ++static struct inode * ++nfs_layoutrecall_find_inode(struct nfs_client *clp, ++ const struct cb_layoutrecallargs *args) ++{ ++ struct nfs_inode *nfsi; ++ struct pnfs_layout_hdr *lo; ++ struct nfs_server *server; ++ struct inode *ino = NULL; ++ ++ dprintk("%s: Begin recall_type=%d clp %p\n", ++ __func__, args->cbl_recall_type, clp); ++ ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry(lo, &clp->cl_layouts, layouts) { ++ nfsi = PNFS_NFS_INODE(lo); ++ if (!nfsi) ++ continue; ++ ++ dprintk("%s: Searching inode=%lu\n", ++ __func__, nfsi->vfs_inode.i_ino); ++ ++ if (args->cbl_recall_type == RETURN_FILE) { ++ if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh)) ++ continue; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ server = NFS_SERVER(&nfsi->vfs_inode); ++ if (server->fsid.major != args->cbl_fsid.major || ++ server->fsid.minor != args->cbl_fsid.minor) ++ continue; ++ } ++ ++ /* Make sure client didn't clean up layout without ++ * telling the server */ ++ if (!has_layout(nfsi)) ++ continue; ++ ++ ino = igrab(&nfsi->vfs_inode); ++ dprintk("%s: Found inode=%p\n", __func__, ino); ++ break; ++ } ++ spin_unlock(&clp->cl_lock); ++ return ino; ++} ++ ++struct recall_layout_threadargs { ++ struct inode *inode; ++ struct nfs_client *clp; ++ struct completion started; ++ struct cb_layoutrecallargs *rl; ++ int result; ++}; ++ ++static int pnfs_recall_layout(void *data) ++{ ++ struct inode *inode, *ino; ++ struct nfs_client *clp; ++ struct cb_layoutrecallargs rl; ++ struct nfs4_layoutreturn *lrp; ++ struct recall_layout_threadargs *args = ++ (struct recall_layout_threadargs *)data; ++ int status = 0; ++ ++ daemonize("nfsv4-layoutreturn"); ++ ++ dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n", ++ __func__, args->rl->cbl_recall_type, ++ args->rl->cbl_fsid.major, 
args->rl->cbl_fsid.minor); ++ ++ clp = args->clp; ++ inode = args->inode; ++ rl = *args->rl; ++ ++ /* support whole file layouts only */ ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ ++ if (rl.cbl_recall_type == RETURN_FILE) { ++ if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout, ++ rl.cbl_stateid)) ++ status = pnfs_return_layout(inode, &rl.cbl_seg, ++ &rl.cbl_stateid, RETURN_FILE, ++ false); ++ else ++ status = cpu_to_be32(NFS4ERR_DELAY); ++ if (status) ++ dprintk("%s RETURN_FILE error: %d\n", __func__, status); ++ else ++ status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ args->result = status; ++ complete(&args->started); ++ goto out; ++ } ++ ++ status = cpu_to_be32(NFS4_OK); ++ args->result = status; ++ complete(&args->started); ++ args = NULL; ++ ++ /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */ ++ while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) { ++ /* FIXME: need to check status on pnfs_return_layout */ ++ pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false); ++ iput(ino); ++ } ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (!lrp) { ++ dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n", ++ __func__); ++ goto out; ++ } ++ ++ /* send final layoutreturn */ ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = rl.cbl_layout_type; ++ lrp->args.return_type = rl.cbl_recall_type; ++ lrp->args.range = rl.cbl_seg; ++ lrp->args.inode = inode; ++ nfs4_proc_layoutreturn(lrp, true); ++ ++out: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ nfs_put_client(clp); ++ module_put_and_exit(0); ++ dprintk("%s: exit status %d\n", __func__, 0); ++ return 0; ++} ++ ++/* ++ * Asynchronous layout recall! 
++ */
++static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
++				    struct cb_layoutrecallargs *rl)
++{
++	/* 'data' lives on this stack frame: the thread must be done with
++	 * it by the time it completes 'started'. */
++	struct recall_layout_threadargs data = {
++		.clp = clp,
++		.inode = inode,
++		.rl = rl,
++	};
++	struct task_struct *t;
++	int status = -EAGAIN;
++
++	dprintk("%s: -->\n", __func__);
++
++	/* FIXME: do not allow two concurrent layout recalls */
++	if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state))
++		return status;
++
++	/* Module and client references are handed to the thread, which
++	 * drops them when it exits; on failure they are dropped below. */
++	init_completion(&data.started);
++	__module_get(THIS_MODULE);
++	if (!atomic_inc_not_zero(&clp->cl_count))
++		goto out_put_no_client;
++
++	t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
++	if (IS_ERR(t)) {
++		printk(KERN_INFO "NFS: Layout recall callback thread failed "
++			"for client (clientid %08x/%08x)\n",
++			(unsigned)(clp->cl_clientid >> 32),
++			(unsigned)(clp->cl_clientid));
++		status = PTR_ERR(t);
++		goto out_module_put;
++	}
++	wait_for_completion(&data.started);
++	return data.result;
++out_module_put:
++	nfs_put_client(clp);
++out_put_no_client:
++	clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
++	module_put(THIS_MODULE);
++	return status;
++}
++
++/*
++ * Start an asynchronous return of every layout held for @clp.
++ *
++ * Returns 0 when there is nothing to return, otherwise the result of
++ * pnfs_async_return_layout() (-EAGAIN when a recall is already running).
++ */
++static int pnfs_recall_all_layouts(struct nfs_client *clp)
++{
++	/* Zero-fill every member that is not named here: the recall thread
++	 * copies the whole structure and reads cbl_fsid and cbl_layout_type
++	 * from the copy, so none of it may be left as stack garbage. */
++	struct cb_layoutrecallargs rl = {
++		.cbl_recall_type = RETURN_ALL,
++		.cbl_seg = {
++			.iomode = IOMODE_ANY,
++			.offset = 0,
++			.length = NFS4_MAX_UINT64,
++		},
++	};
++	struct inode *inode;
++	int status = 0;
++
++	/* we need the inode to get the nfs_server struct */
++	inode = nfs_layoutrecall_find_inode(clp, &rl);
++	if (!inode)
++		return status;
++	status = pnfs_async_return_layout(clp, inode, &rl);
++	iput(inode);
++
++	return status;
++}
++
++__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
++				  void *dummy)
++{
++	struct nfs_client *clp;
++	struct inode *inode = NULL;
++	__be32 res;
++	int status;
++	unsigned int num_client = 0;
++
++	dprintk("%s: -->\n", __func__);
++
++	res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
++	clp =
nfs_find_client(args->cbl_addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->cbl_addr)); ++ goto out; ++ } ++ ++ res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ /* the callback must come from the MDS personality */ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) ++ goto loop; ++ if (args->cbl_recall_type == RETURN_FILE) { ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (inode != NULL) { ++ status = pnfs_async_return_layout(clp, inode, ++ args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++ } else { /* _ALL or _FSID */ ++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (!inode) ++ goto loop; ++ status = pnfs_async_return_layout(clp, inode, args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++loop: ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ ++/* Remove the deviceid(s) from the nfs_client deviceid cache */ ++static __be32 pnfs_devicenotify_client(struct nfs_client *clp, ++ struct cb_devicenotifyargs *args) ++{ ++ uint32_t type; ++ int i; ++ ++ dprintk("%s: --> clp %p\n", __func__, clp); ++ ++ for (i = 0; i < args->ndevs; i++) { ++ struct cb_devicenotifyitem *dev = &args->devs[i]; ++ type = dev->cbd_notify_type; ++ if (type == NOTIFY_DEVICEID4_DELETE && clp->cl_devid_cache) ++ nfs4_delete_device(clp->cl_devid_cache, ++ &dev->cbd_dev_id); ++ else if (type == NOTIFY_DEVICEID4_CHANGE) ++ printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE " ++ "not supported\n", __func__); ++ } ++ return 0; ++} ++ ++__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ __be32 res = 0; ++ unsigned int 
num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = __constant_htonl(NFS4ERR_INVAL); ++ clp = nfs_find_client(args->addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->addr)); ++ goto out; ++ } ++ ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ res = pnfs_devicenotify_client(clp, args); ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { + if (delegation == NULL) + return 0; + +- /* seqid is 4-bytes long */ +- if (((u32 *) &stateid->data)[0] != 0) ++ if (stateid->u.stateid.seqid != 0) + return 0; +- if (memcmp(&delegation->stateid.data[4], &stateid->data[4], +- sizeof(stateid->data)-4)) ++ if (memcmp(&delegation->stateid.u.stateid.other, ++ &stateid->u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE)) + return 0; + + return 1; +@@ -335,13 +669,37 @@ out: + return status; + } + ++static inline bool ++validate_bitmap_values(const unsigned long *mask) ++{ ++ int i; ++ ++ if (*mask == 0) ++ return true; ++ if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_WDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_DIR_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, mask) || ++ test_bit(RCA4_TYPE_MASK_BLK_LAYOUT, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OBJ_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OBJ_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OTHER_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OTHER_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ return false; ++} ++ + __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) + { + struct nfs_client *clp; + __be32 status; + fmode_t flags = 0; + +- status = htonl(NFS4ERR_OP_NOT_IN_SESSION); ++ 
status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->craa_addr, 4); + if (clp == NULL) + goto out; +@@ -349,16 +707,25 @@ __be32 nfs4_callback_recallany(struct cb + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + ++ status = cpu_to_be32(NFS4ERR_INVAL); ++ if (!validate_bitmap_values((const unsigned long *) ++ &args->craa_type_mask)) ++ return status; ++ ++ status = cpu_to_be32(NFS4_OK); + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; ++ if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) ++ &args->craa_type_mask)) ++ if (pnfs_recall_all_layouts(clp) == -EAGAIN) ++ status = cpu_to_be32(NFS4ERR_DELAY); + + if (flags) + nfs_expire_all_delegation_types(clp, flags); +- status = htonl(NFS4_OK); + out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c +--- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-31 20:42:05.510143651 -0400 +@@ -22,6 +22,8 @@ + #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + + #if defined(CONFIG_NFS_V4_1) ++#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) ++#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) + #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +@@ -136,7 +138,7 @@ static __be32 decode_stateid(struct xdr_ + p = read_buf(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); +- memcpy(stateid->data, p, 16); ++ memcpy(stateid->u.data, p, 16); + return 0; + } + +@@ -220,6 +222,148 @@ out: + + #if defined(CONFIG_NFS_V4_1) 
+ 
++/*
++ * Decode the arguments of CB_LAYOUTRECALL into @args.
++ *
++ * The fixed part (layout type, iomode, "changed" flag, recall type) is
++ * always present.  RETURN_FILE is followed by a filehandle, an
++ * offset/length pair and a stateid; RETURN_FSID by an fsid; any other
++ * recall type carries no further data.
++ *
++ * Returns 0 or a network-order NFS4ERR_* status.
++ */
++static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
++				       struct xdr_stream *xdr,
++				       struct cb_layoutrecallargs *args)
++{
++	__be32 *p;
++	__be32 status = 0;
++
++	args->cbl_addr = svc_addr(rqstp);
++	p = read_buf(xdr, 4 * sizeof(uint32_t));
++	if (unlikely(p == NULL)) {
++		status = htonl(NFS4ERR_BADXDR);
++		goto out;
++	}
++
++	args->cbl_layout_type = ntohl(*p++);
++	args->cbl_seg.iomode = ntohl(*p++);
++	args->cbl_layoutchanged = ntohl(*p++);
++	args->cbl_recall_type = ntohl(*p++);
++
++	if (likely(args->cbl_recall_type == RETURN_FILE)) {
++		status = decode_fh(xdr, &args->cbl_fh);
++		if (unlikely(status != 0))
++			goto out;
++
++		p = read_buf(xdr, 2 * sizeof(uint64_t));
++		if (unlikely(p == NULL)) {
++			status = htonl(NFS4ERR_BADXDR);
++			goto out;
++		}
++		p = xdr_decode_hyper(p, &args->cbl_seg.offset);
++		p = xdr_decode_hyper(p, &args->cbl_seg.length);
++		status = decode_stateid(xdr, &args->cbl_stateid);
++		if (unlikely(status != 0))
++			goto out;
++	} else if (args->cbl_recall_type == RETURN_FSID) {
++		p = read_buf(xdr, 2 * sizeof(uint64_t));
++		if (unlikely(p == NULL)) {
++			status = htonl(NFS4ERR_BADXDR);
++			goto out;
++		}
++		p = xdr_decode_hyper(p, &args->cbl_fsid.major);
++		p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
++	}
++	dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d "
++		"fsid %llx-%llx fhsize %d\n", __func__,
++		args->cbl_layout_type, args->cbl_seg.iomode,
++		args->cbl_layoutchanged, args->cbl_recall_type,
++		args->cbl_fsid.major, args->cbl_fsid.minor,
++		args->cbl_fh.size);
++out:
++	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
++	return status;
++}
++
++/*
++ * Decode the arguments of CB_NOTIFY_DEVICEID into @args.
++ *
++ * At most NFS4_DEV_NOTIFY_MAXENTRIES notifications are decoded; extra
++ * ones are ignored.  args->ndevs counts the entries decoded so far,
++ * also when an error is returned part-way through.
++ *
++ * Returns 0 or a network-order NFS4ERR_* status.
++ */
++static
++__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
++				struct xdr_stream *xdr,
++				struct cb_devicenotifyargs *args)
++{
++	__be32 *p;
++	__be32 status = 0;
++	u32 tmp;
++	int n, i;
++	args->ndevs = 0;
++
++	args->addr = svc_addr(rqstp);
++
++	/* Num of device notifications */
++	p = read_buf(xdr, sizeof(uint32_t));
++	if (unlikely(p == NULL)) {
++		status = htonl(NFS4ERR_RESOURCE);
++		goto out;
++	}
++	n = ntohl(*p++);
++	if (n <= 0)
++		goto out;
++
++	/* XXX: need to possibly return error in this case */
++	if (n > NFS4_DEV_NOTIFY_MAXENTRIES) {
++		dprintk("%s: Processing (%d) notifications out of (%d)\n",
++			__func__, NFS4_DEV_NOTIFY_MAXENTRIES, n);
++		n = NFS4_DEV_NOTIFY_MAXENTRIES;
++	}
++
++	/* Decode each dev notification */
++	for (i = 0; i < n; i++) {
++		struct cb_devicenotifyitem *dev = &args->devs[i];
++
++		/* bitmap size, notify type, opaque size, layout type and
++		 * the device id are common to both notification kinds */
++		p = read_buf(xdr, (4 * sizeof(uint32_t))
++			     + NFS4_PNFS_DEVICEID4_SIZE);
++		if (unlikely(p == NULL)) {
++			status = htonl(NFS4ERR_RESOURCE);
++			goto out;
++		}
++
++		tmp = ntohl(*p++);	/* bitmap size */
++		if (tmp != 1) {
++			status = htonl(NFS4ERR_INVAL);
++			goto out;
++		}
++		dev->cbd_notify_type = ntohl(*p++);
++		if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
++		    dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
++			status = htonl(NFS4ERR_INVAL);
++			goto out;
++		}
++
++		/* CHANGE carries 4 more bytes (the "immediate" flag)
++		 * than DELETE */
++		tmp = ntohl(*p++);	/* opaque size */
++		if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
++		     (tmp != NFS4_PNFS_DEVICEID4_SIZE + 8)) ||
++		    ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
++		     (tmp != NFS4_PNFS_DEVICEID4_SIZE + 4))) {
++			status = htonl(NFS4ERR_INVAL);
++			goto out;
++		}
++		dev->cbd_layout_type = ntohl(*p++);
++		memcpy(dev->cbd_dev_id.data, p, NFS4_PNFS_DEVICEID4_SIZE);
++		p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE);
++
++		/* The trailing "immediate" word depends on the kind of
++		 * notification, not on the layout type: testing
++		 * cbd_layout_type here would read one word too many (or
++		 * too few) and desynchronise the following entries. */
++		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
++			p = read_buf(xdr, sizeof(uint32_t));
++			if (unlikely(p == NULL)) {
++				/* truncated request: retrying cannot help */
++				status = htonl(NFS4ERR_BADXDR);
++				goto out;
++			}
++			dev->cbd_immediate = ntohl(*p++);
++		} else {
++			dev->cbd_immediate = 0;
++		}
++
++		args->ndevs++;
++
++		dprintk("%s: type %d layout 0x%x immediate %d\n",
++			__func__, dev->cbd_notify_type, dev->cbd_layout_type,
++			dev->cbd_immediate);
++	}
++out:
++	dprintk("%s: status %d ndevs %d\n",
++		__func__, ntohl(status), args->ndevs);
++	return status;
++}
++
+ static __be32 decode_sessionid(struct xdr_stream
*xdr, + struct nfs4_sessionid *sid) + { +@@ -574,11 +718,11 @@ preprocess_nfs41_op(int nop, unsigned in + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: ++ case OP_CB_LAYOUTRECALL: ++ case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + +- case OP_CB_LAYOUTRECALL: +- case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: +@@ -739,6 +883,18 @@ static struct callback_op callback_ops[] + .res_maxsize = CB_OP_RECALL_RES_MAXSZ, + }, + #if defined(CONFIG_NFS_V4_1) ++ [OP_CB_LAYOUTRECALL] = { ++ .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, ++ .decode_args = ++ (callback_decode_arg_t)decode_layoutrecall_args, ++ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, ++ }, ++ [OP_CB_NOTIFY_DEVICEID] = { ++ .process_op = (callback_process_op_t)nfs4_callback_devicenotify, ++ .decode_args = ++ (callback_decode_arg_t)decode_devicenotify_args, ++ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, ++ }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, +diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-31 20:41:19.144140225 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-31 20:42:05.511222861 -0400 +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + + #include + +@@ -48,6 +49,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_CLIENT + +@@ -150,11 +152,14 @@ static struct nfs_client *nfs_alloc_clie + clp->cl_boot_time = CURRENT_TIME; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; ++ clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + #endif + cred = rpc_lookup_machine_cred(); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; +- ++#if 
defined(CONFIG_NFS_V4_1) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++#endif + nfs_fscache_get_client_cookie(clp); + + return clp; +@@ -178,7 +183,7 @@ static void nfs4_clear_client_minor_vers + clp->cl_session = NULL; + } + +- clp->cl_call_sync = _nfs4_call_sync; ++ clp->cl_mvops = nfs_v4_minor_ops[0]; + #endif /* CONFIG_NFS_V4_1 */ + } + +@@ -188,7 +193,7 @@ static void nfs4_clear_client_minor_vers + static void nfs4_destroy_callback(struct nfs_client *clp) + { + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) +- nfs_callback_down(clp->cl_minorversion); ++ nfs_callback_down(clp->cl_mvops->minor_version); + } + + static void nfs4_shutdown_client(struct nfs_client *clp) +@@ -251,6 +256,7 @@ void nfs_put_client(struct nfs_client *c + nfs_free_client(clp); + } + } ++EXPORT_SYMBOL(nfs_put_client); + + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + /* +@@ -343,7 +349,7 @@ static int nfs_sockaddr_match_ipaddr(con + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. 
+ */ +-static int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) + { + if (sa1->sa_family != sa2->sa_family) +@@ -357,6 +363,7 @@ static int nfs_sockaddr_cmp(const struct + } + return 0; + } ++EXPORT_SYMBOL(nfs_sockaddr_cmp); + + /* + * Find a client by IP address and protocol version +@@ -548,6 +555,7 @@ int nfs4_check_client_ready(struct nfs_c + return -EPROTONOSUPPORT; + return 0; + } ++EXPORT_SYMBOL(nfs4_check_client_ready); + + /* + * Initialise the timeout values for a connection +@@ -865,9 +873,34 @@ error: + } + + /* ++ * Initialize the pNFS layout driver and setup pNFS related parameters ++ */ ++static void nfs4_init_pnfs(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ struct nfs_client *clp = server->nfs_client; ++ ++ if (nfs4_has_session(clp) && ++ (clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) { ++ server->pnfs_blksize = fsinfo->blksize; ++ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); ++ pnfs_set_ds_iosize(server); ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++static void nfs4_uninit_pnfs(struct nfs_server *server) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ if (server->nfs_client && nfs4_has_session(server->nfs_client)) ++ unmount_pnfs_layoutdriver(server); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++/* + * Load up the server record from information gained in an fsinfo record + */ +-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) ++static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) + { + unsigned long max_rpc_payload; + +@@ -897,6 +930,8 @@ static void nfs_server_set_fsinfo(struct + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; ++ nfs4_init_pnfs(server, mntfh, fsinfo); ++ + server->wtmult = 
nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); +@@ -938,7 +973,7 @@ static int nfs_probe_fsinfo(struct nfs_s + if (error < 0) + goto out_error; + +- nfs_server_set_fsinfo(server, &fsinfo); ++ nfs_server_set_fsinfo(server, mntfh, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { +@@ -1016,6 +1051,7 @@ void nfs_free_server(struct nfs_server * + { + dprintk("--> nfs_free_server()\n"); + ++ nfs4_uninit_pnfs(server); + spin_lock(&nfs_client_lock); + list_del(&server->client_link); + list_del(&server->master_link); +@@ -1126,7 +1162,7 @@ static int nfs4_init_callback(struct nfs + return error; + } + +- error = nfs_callback_up(clp->cl_minorversion, ++ error = nfs_callback_up(clp->cl_mvops->minor_version, + clp->cl_rpcclient->cl_xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", +@@ -1143,10 +1179,8 @@ static int nfs4_init_callback(struct nfs + */ + static int nfs4_init_client_minor_version(struct nfs_client *clp) + { +- clp->cl_call_sync = _nfs4_call_sync; +- + #if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion) { ++ if (clp->cl_mvops->minor_version) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. +@@ -1158,7 +1192,13 @@ static int nfs4_init_client_minor_versio + return -ENOMEM; + + clp->cl_session = session; +- clp->cl_call_sync = _nfs4_call_sync_session; ++ /* ++ * The create session reply races with the server back ++ * channel probe. 
Mark the client NFS_CS_SESSION_INITING ++ * so that the client back channel can find the ++ * nfs_client struct ++ */ ++ clp->cl_cons_state = NFS_CS_SESSION_INITING; + } + #endif /* CONFIG_NFS_V4_1 */ + +@@ -1216,7 +1256,7 @@ error: + /* + * Set up an NFS4 client + */ +-static int nfs4_set_client(struct nfs_server *server, ++int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, +@@ -1259,6 +1299,7 @@ error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; + } ++EXPORT_SYMBOL(nfs4_set_client); + + + /* +@@ -1448,7 +1489,7 @@ struct nfs_server *nfs4_create_referral_ + data->authflavor, + parent_server->client->cl_xprt->prot, + parent_server->client->cl_timeout, +- parent_client->cl_minorversion); ++ parent_client->cl_mvops->minor_version); + if (error < 0) + goto error; + +diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c +--- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-31 20:42:05.550110844 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-31 20:42:05.550110844 -0400 +@@ -0,0 +1,292 @@ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); ++static void bl_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops bl_upcall_ops = { ++ .upcall = bl_pipe_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = bl_pipe_destroy_msg, ++}; ++ ++bl_comm_t *bl_comm_global; ++ ++int ++nfsd_bl_start(void) ++{ ++ bl_comm_t *bl_comm = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ 
dprintk("%s: starting pipe\n", __func__);
++	if (bl_comm_global)
++		return -EEXIST;
++
++	path.mnt = rpc_get_mount();
++	if (IS_ERR(path.mnt))
++		return PTR_ERR(path.mnt);
++
++	/* FIXME: do not abuse rpc_pipefs/nfs */
++	rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
++	if (rc)
++		goto err;
++
++	bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL);
++	if (!bl_comm) {
++		rc = -ENOMEM;
++		goto err_path;
++	}
++	/* The pipe can be used as soon as rpc_mkpipe() returns, so the
++	 * locks and the waitqueue must be ready before it is created. */
++	mutex_init(&bl_comm->lock);
++	mutex_init(&bl_comm->pipe_lock);
++	init_waitqueue_head(&bl_comm->pipe_wq);
++
++	/* FIXME: rename to "spnfs_block" */
++	bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm,
++					 &bl_upcall_ops, 0);
++	if (IS_ERR(bl_comm->pipe_dentry)) {
++		rc = -EPIPE;
++		goto err_path;
++	}
++	/* drop the reference taken by vfs_path_lookup() */
++	path_put(&nd.path);
++
++	bl_comm_global = bl_comm;
++	return 0;
++err_path:
++	path_put(&nd.path);
++err:
++	rpc_put_mount();
++	kfree(bl_comm);
++	return rc;
++}
++
++/*
++ * Tear down the pipe created by nfsd_bl_start(); a no-op when the pipe
++ * was never started.
++ */
++void
++nfsd_bl_stop(void)
++{
++	bl_comm_t *c = bl_comm_global;
++
++	dprintk("%s: stopping pipe\n", __func__);
++	if (!c)
++		return;
++	rpc_unlink(c->pipe_dentry);
++	rpc_put_mount();
++	bl_comm_global = NULL;
++	kfree(c);
++}
++
++/*
++ * Pipe read side: copy the not yet delivered part of @msg to the
++ * daemon's buffer, at most @buflen bytes.
++ *
++ * Returns the number of bytes copied, or -EFAULT (also stored in
++ * msg->errno) when there was something to copy but none of it could be
++ * written to @dst.
++ */
++static ssize_t
++bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst,
++	       size_t buflen)
++{
++	char *data = (char *)msg->data + msg->copied;
++	ssize_t mlen = msg->len - msg->copied,
++		left;
++
++	if (mlen > buflen)
++		mlen = buflen;
++
++	/* copy_to_user() returns the number of bytes it could NOT copy;
++	 * it never returns a negative errno. */
++	left = copy_to_user(dst, data, mlen);
++	if (left != 0 && left == mlen) {
++		msg->errno = -EFAULT;
++		return -EFAULT;
++	}
++	mlen -= left;
++	msg->copied += mlen;
++	msg->errno = 0;
++
++	return mlen;
++}
++
++static ssize_t
++bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
++{
++	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
++	bl_comm_t *bc = (bl_comm_t *)rpci->private;
++	bl_comm_msg_t *im = &bc->msg;
++	int ret;
++	bl_comm_res_t *res;
++
++
++	if (mlen == 0) {
++		im->msg_status = PNFS_BLOCK_FAILURE;
++		im->msg_res = NULL;
++		wake_up(&bc->pipe_wq);
++		return -EFAULT;
++	}
++
++	if ((res = kmalloc(mlen,
GFP_KERNEL)) == NULL)
++		return -ENOMEM;
++
++	if (copy_from_user(res, src, mlen)) {
++		kfree(res);
++		return -EFAULT;
++	}
++
++	mutex_lock(&bc->pipe_lock);
++
++	ret = mlen;
++	im->msg_status = res->res_status;
++	im->msg_res = res;
++
++	wake_up(&bc->pipe_wq);
++	mutex_unlock(&bc->pipe_lock);
++	return ret;
++}
++
++/*
++ * Called when the pipe is done with @msg.  If delivery failed
++ * (msg->errno < 0), mark the upcall as failed and wake the waiter in
++ * bl_upcall().
++ */
++static void
++bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
++{
++	bl_comm_msg_t *im = msg->data;
++	bl_comm_t *bc = container_of(im, struct bl_comm, msg);
++
++	if (msg->errno >= 0)
++		return;
++
++	mutex_lock(&bc->pipe_lock);
++	im->msg_status = PNFS_BLOCK_FAILURE;
++	wake_up(&bc->pipe_wq);
++	mutex_unlock(&bc->pipe_lock);
++}
++
++/*
++ * Send @upmsg to the block daemon and sleep until a reply arrives.
++ *
++ * Returns 0 on success, with *res set to the daemon's reply (the caller
++ * must kfree() it).  Any other return value is a failure, and *res is
++ * then NULL so that callers may kfree() it unconditionally.
++ */
++int
++bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res)
++{
++	struct rpc_pipe_msg msg;
++	DECLARE_WAITQUEUE(wq, current);
++	int rval = 1;
++	bl_comm_msg_t *m;
++
++	/* Callers free *res on their error paths too; never hand back
++	 * whatever happened to be in their uninitialised pointer. */
++	*res = NULL;
++
++	if (bc == NULL) {
++		dprintk("%s: No pNFS block daemon available\n", __func__);
++		return 1;
++	}
++	m = &bc->msg;
++
++	mutex_lock(&bc->lock);
++	mutex_lock(&bc->pipe_lock);
++
++	memcpy(m, upmsg, sizeof (*m));
++
++	memset(&msg, 0, sizeof (msg));
++	msg.data = m;
++	msg.len = sizeof (*m);
++
++	add_wait_queue(&bc->pipe_wq, &wq);
++	rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg);
++	if (rval < 0) {
++		remove_wait_queue(&bc->pipe_wq, &wq);
++		goto out;
++	}
++
++	/* pipe_lock is held until the task state is set, so a reply that
++	 * takes pipe_lock before waking us cannot slip in between. */
++	set_current_state(TASK_UNINTERRUPTIBLE);
++	mutex_unlock(&bc->pipe_lock);
++	schedule();
++	__set_current_state(TASK_RUNNING);
++	remove_wait_queue(&bc->pipe_wq, &wq);
++	mutex_lock(&bc->pipe_lock);
++
++	if (m->msg_status == PNFS_BLOCK_SUCCESS) {
++		*res = m->msg_res;
++		rval = 0;
++	} else
++		rval = 1;
++
++out:
++	mutex_unlock(&bc->pipe_lock);
++	mutex_unlock(&bc->lock);
++	return rval;
++}
++
++static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len,
++			 loff_t *offset)
++{
++	int cmd,
++	    rc;
++	bl_comm_t *bc = bl_comm_global;
++	bl_comm_msg_t msg;
++	bl_comm_res_t *res;
++
++	if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int)))
++		return
-EFAULT; ++ switch (cmd) { ++ case PNFS_BLOCK_CTL_STOP: ++ msg.msg_type = PNFS_UPCALL_MSG_STOP; ++ (void) bl_upcall(bc, &msg, &res); ++ kfree(res); ++ nfsd_bl_stop(); ++ break; ++ ++ case PNFS_BLOCK_CTL_START: ++ rc = nfsd_bl_start(); ++ if (rc != 0) ++ return rc; ++ break; ++ ++ case PNFS_BLOCK_CTL_VERS: ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bc, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ kfree(res); ++ break; ++ ++ default: ++ dprintk("%s: unknown ctl command %d\n", __func__, cmd); ++ break; ++ } ++ return len; ++} ++ ++static struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++ ++/* ++ * bl_init_proc -- set up proc interfaces ++ * ++ * Creating a pnfs_block directory isn't really required at this point ++ * since we've only got a single node in that directory. If the need for ++ * more nodes doesn't present itself shortly this code should revert ++ * to a single top level node. McNeal 11-Aug-2008. ++ */ ++int ++bl_init_proc(void) ++{ ++ struct proc_dir_entry *e; ++ ++ e = proc_mkdir("fs/pnfs_block", NULL); ++ if (!e) ++ return -ENOMEM; ++ ++ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL); ++ if (!e) ++ return -ENOMEM; ++ e->proc_fops = &ctl_ops; ++ ++ return 0; ++} ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-31 20:42:05.551222888 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-31 20:42:05.551222888 -0400 +@@ -0,0 +1,1672 @@ ++/* ++ * bl_ops.c ++ * spNFS ++ * ++ * Created by Rick McNeal on 4/1/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. ++ * ++ */ ++ ++/* ++ * Block layout operations. ++ * ++ * These functions, with the exception of pnfs_block_enabled, are assigned to ++ * the super block s_export_op structure. 
++ */ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) ++ ++#define BL_LAYOUT_HASH_BITS 4 ++#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS) ++#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1) ++#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256) ++ ++#define bl_layout_hashval(id) \ ++ ((id) & BL_LAYOUT_HASH_MASK) ++ ++#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len) ++#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len) ++#define _2SECTS(v) ((v) >> 9) ++ ++#ifndef READ32 ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++(x) = (u64)ntohl(*p++) << 32; \ ++(x) |= ntohl(*p++); \ ++} while (0) ++#endif ++ ++ ++typedef enum {True, False} boolean_t; ++/* ---- block layoutget and commit structure ---- */ ++typedef struct bl_layout_rec { ++ struct list_head blr_hash, ++ blr_layouts; ++ dev_t blr_rdev; ++ struct inode *blr_inode; ++ int blr_recalled; // debug ++ u64 blr_orig_size, ++ blr_commit_size, ++ blr_ext_size; ++ spinlock_t blr_lock; // Protects blr_layouts ++} bl_layout_rec_t; ++ ++static struct list_head layout_hash; ++static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE]; ++static spinlock_t layout_hashtbl_lock; ++ ++/* ---- prototypes ---- */ ++static boolean_t device_slice(dev_t devid); ++static boolean_t device_dm(dev_t devid); ++static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **); ++static bl_layout_rec_t *layout_inode_find(struct inode *i); ++static void layout_inode_del(struct inode *i); ++static char *map_state2name(enum pnfs_block_extent_state4 s); ++static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type); ++static void bld_free(pnfs_blocklayout_devinfo_t *bld); ++static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head 
*volumes, ++ dev_t devid, int local_index); ++static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes, ++ dev_t devid, int my_loc, int idx); ++static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg); ++struct list_head *layout_cache_iter(bl_layout_rec_t *r, ++ struct list_head *bl_possible, struct nfsd4_layout_seg *seg); ++static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h); ++static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h); ++static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg); ++static void print_bll(pnfs_blocklayout_layout_t *b, char *); ++static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r, ++ struct list_head *h, struct nfsd4_layout_seg *seg); ++static inline void bll_collapse(bl_layout_rec_t *r, ++ pnfs_blocklayout_layout_t *c); ++static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len, ++ enum bl_cache_state state, struct list_head *h); ++static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b, ++ enum bl_cache_state c, struct list_head *h); ++static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s); ++static void extents_setup(struct fiemap_extent_info *fei); ++static void extents_count(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_process(struct fiemap_extent_info *fei, ++ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev, ++ pnfs_blocklayout_layout_t *b); ++static void extents_cleanup(struct fiemap_extent_info *fei); ++ ++void ++nfsd_bl_init(void) ++{ ++ int i; ++ dprintk("%s loaded\n", __func__); ++ ++ spin_lock_init(&layout_hashtbl_lock); ++ INIT_LIST_HEAD(&layout_hash); ++ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++) ++ 
INIT_LIST_HEAD(&layout_hashtbl[i]); ++ bl_init_proc(); ++} ++ ++/* ++ * pnfs_block_enabled -- check to see if this file system should be export as ++ * block pnfs ++ */ ++int ++pnfs_block_enabled(struct inode *inode, int ex_flags) ++{ ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ static int bl_comm_once = 0; ++ ++ dprintk("--> %s\n", __func__); ++ /* ++ * FIXME: Figure out method to determine if this file system should ++ * be exported. The following areas need to be checked. ++ * (1) Validate that this file system was exported as a pNFS ++ * block-layout ++ * (2) Has there been successful communication with the ++ * volume daemon? ++ */ ++ /* Check #1 */ ++#ifdef notyet ++ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) { ++ dprintk("%s: pnfs_block not set in export\n", __func__); ++ return 0; ++ } ++#endif ++ ++ /* Check #1 */ ++ if (!bl_comm_once) { ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ if (msg.u.msg_vers != res->u.vers) { ++ dprintk("%s: vers mismatch, kernel != daemon\n", ++ __func__); ++ kfree(res); ++ return 0; ++ } ++ } ++ bl_comm_once = 1; ++ ++ kfree(res); ++ ++ dprintk("<-- %s okay\n", __func__); ++ return 1; ++} ++ ++int ++bl_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_BLOCK_VOLUME; ++} ++ ++int ++bl_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_devid = sb->s_dev; ++ res->gd_verf = 1; ++ res->gd_cookie = 1; ++ return 0; ++} ++ ++static int ++bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld_slice_p, ++ *bld_simple_p, ++ *bld; ++ int status = -EIO, ++ location = 0; ++ struct list_head volumes; ++ ++ dprintk("--> %s\n", __func__); ++ 
INIT_LIST_HEAD(&volumes); ++ ++ bld_simple_p = bld_simple(&volumes, devid->devid, ++ location++); ++ if (!bld_simple_p) ++ goto out; ++ bld_slice_p = bld_slice(&volumes, devid->devid, location++, ++ bld_simple_p->bld_index_loc); ++ ++ if (!bld_slice_p) ++ goto out; ++ ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE) ++ kfree(bld->u.simple.bld_sig); ++ bld_free(bld); ++ } ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++static int ++bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ int status = -EIO, // default to error ++ i, ++ location = 0; ++ struct list_head volumes; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ INIT_LIST_HEAD(&volumes); ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMGET; ++ msg.u.msg_dev = devid->devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: upcall for DMGET failed\n", __func__); ++ goto out; ++ } ++ ++ /* ++ * Don't use bld_alloc() here. If used this will be the first volume ++ * type added to the list whereas the protocol requires it to be the ++ * last. 
++ */ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ goto out; ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE; ++ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes; ++ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL; ++ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__, ++ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL); ++ ++ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes * ++ sizeof (int), GFP_KERNEL); ++ if (!bld->u.stripe.bld_stripe_indexs) ++ goto out; ++ ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) { ++ dev_t dev; ++ pnfs_blocklayout_devinfo_t *bldp; ++ ++ dev = MKDEV(res->u.stripe.devs[i].major, ++ res->u.stripe.devs[i].minor); ++ if (dev == 0) ++ goto out; ++ ++ bldp = bld_simple(&volumes, dev, location++); ++ if (!bldp) { ++ dprintk("%s: bld_simple failed\n", __func__); ++ goto out; ++ } ++ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc); ++ ++ if (!bldp) { ++ dprintk("%s: bld_slice failed\n", __func__); ++ goto out; ++ } ++ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc; ++ ++ } ++ list_add_tail(&bld->bld_list, &volumes); ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SLICE: ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ // No memory to release for these ++ break; ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ kfree(bld->u.simple.bld_sig); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ kfree(bld->u.stripe.bld_stripe_indexs); ++ break; ++ } ++ bld_free(bld); ++ } ++ kfree(res); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++/* ++ * bl_getdeviceinfo -- determine device tree for requested devid ++ */ ++int ++bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct 
nfsd4_pnfs_deviceid *devid) ++{ ++ if (device_slice(devid->devid) == True) ++ return bl_getdeviceinfo_slice(sb, xdr, devid); ++ else if (device_dm(devid->devid) == True) ++ return bl_getdeviceinfo_dm(sb, xdr, devid); ++ return -EINVAL; ++} ++ ++enum nfsstat4 ++bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ pnfs_blocklayout_layout_t *b; ++ bl_layout_rec_t *r; ++ struct list_head bl_possible, ++ *bl_candidates = NULL; ++ boolean_t del_on_error = False; ++ int adj; ++ enum nfsstat4 nfserr = NFS4_OK; ++ ++ dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n", ++ __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset), ++ _2SECTS(res->lg_seg.length), res->lg_seg.iomode); ++ ++ if (res->lg_seg.length == 0) { ++ printk("%s: request length of 0, error condition\n", __func__); ++ return NFS4ERR_BADLAYOUT; ++ } ++ ++ /* ++ * Adjust the length as required per spec. ++ * - First case is were the length is set to (u64)-1. Cheap means to ++ * define the end of the file. ++ * - Second case is were the I/O mode is read-only, but the request is ++ * past the end of the file so the request needs to be trimed. ++ */ ++ if ((res->lg_seg.length == NFS4_MAX_UINT64) || ++ (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) && ++ (res->lg_seg.iomode == IOMODE_READ))) ++ res->lg_seg.length = i->i_size - res->lg_seg.offset; ++ ++ adj = (res->lg_seg.offset & 511) ? 
res->lg_seg.offset & 511 : 0; ++ res->lg_seg.offset -= adj; ++ res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511; ++ ++ if (res->lg_seg.iomode != IOMODE_READ) ++ if (i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE, ++ res->lg_seg.offset, res->lg_seg.length)) ++ return NFS4ERR_IO; ++ ++ INIT_LIST_HEAD(&bl_possible); ++ ++ if ((r = layout_inode_find(i)) == NULL) { ++ if (layout_inode_add(i, &r) == False) { ++ printk("%s: layout_inode_add failed\n", __func__); ++ return NFS4ERR_IO; ++ } ++ del_on_error = True; ++ } ++ BUG_ON(!r); ++ ++ spin_lock(&r->blr_lock); ++ ++ if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) { ++ /* ++ * This will send LAYOUTTRYAGAIN error to the client. ++ */ ++ dprintk("%s: layout_cache_fill_from() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res->lg_return_on_close = 1; ++ res->lg_seg.length = 0; ++ ++ bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg); ++ if (!bl_candidates) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ layout_cache_merge(r, bl_candidates); ++ if (layout_cache_update(r, bl_candidates)) { ++ /* ---- Failed to allocate memory. 
---- */ ++ dprintk("%s: layout_cache_update() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ nfserr = blocklayout_encode_layout(xdr, bl_candidates); ++ if (nfserr) ++ dprintk("%s: layoutget xdr routine failed\n", __func__); ++ ++layoutget_cleanup: ++ if (bl_candidates) { ++ while (!list_empty(bl_candidates)) { ++ b = list_entry(bl_candidates->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ } ++ ++ spin_unlock(&r->blr_lock); ++ if (unlikely(nfserr)) { ++ if (del_on_error == True) ++ layout_inode_del(i); ++ res->lg_seg.length = 0; ++ res->lg_seg.offset = 0; ++ } ++ ++ dprintk("<-- %s (rval %u)\n", __func__, nfserr); ++ return nfserr; ++} ++ ++/* ++ * bl_layoutcommit -- commit changes, especially size, to file systemj ++ * ++ * Currently this routine isn't called and everything is handled within ++ * nfsd4_layoutcommit(). By not calling this routine the server doesn't ++ * handle a partial return, a set of extents, of the layout. The extents ++ * are decoded here, but nothing is done with them. If this routine is ++ * be called the interface must change to pass the 'dentry' pointer such ++ * that notify_change() can be called. 
++ */ ++int ++bl_layoutcommit(struct inode *i, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ bl_layout_rec_t *r; ++ int status = 0; ++ u64 lw_plus; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ r = layout_inode_find(i); ++ if (r) { ++ lw_plus = args->lc_last_wr + 1; ++ if (args->lc_newoffset) { ++ dprintk(" lc_last_wr %Lu\n", lw_plus); ++ if (r->blr_orig_size < lw_plus) { ++ r->blr_orig_size = lw_plus; ++ res->lc_size_chg = 1; ++ res->lc_newsize = lw_plus; ++ } ++ } ++ ++ if (args->lc_up_len) { ++ int extents, ++ i; ++ struct pnfs_blocklayout_layout *b; ++ __be32 *p = args->lc_up_layout; ++ ++ /* ++ * Client is returning a set of extents which ++ * should/could be used to update the file system. ++ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08 ++ */ ++ READ32(extents); ++ dprintk(" Client returning %d extents: data size %d\n", ++ extents, args->lc_up_len); ++ b = kmalloc(sizeof (struct pnfs_blocklayout_layout) * ++ extents, GFP_KERNEL); ++ if (b) { ++ for (i = 0; i < extents; i++) { ++ READ64(b[i].bll_vol_id.sbid); ++ READ64(b[i].bll_vol_id.devid); ++ READ64(b[i].bll_foff); ++ READ64(b[i].bll_len); ++ READ64(b[i].bll_soff); ++ READ32(b[i].bll_es); ++ dprintk(" %d: foff %Lu, len %Lu, soff %Lu " ++ "state %s\n", ++ i, _2SECTS(b[i].bll_foff), ++ _2SECTS(b[i].bll_len), ++ _2SECTS(b[i].bll_soff), ++ map_state2name(b[i].bll_es)); ++ } ++ kfree(b); ++ } else { ++ status = -ENOMEM; ++ } ++ } ++ } else ++ dprintk("%s: Unexpected commit to inode %p\n", __func__, i); ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutreturn(struct inode *i, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ int status = 0; ++ bl_layout_rec_t *r; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ ++ r = layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); ++ layout_cache_del(r, &args->lr_seg); ++ 
spin_unlock(&r->blr_lock); ++ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n", ++ r->blr_ext_size, i->i_size, r->blr_orig_size); ++ } ++ ++ layout_inode_del(i); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ bl_layout_rec_t *r; ++ pnfs_blocklayout_layout_t *b; ++ u64 adj; ++ ++ dprintk("--> %s\n", __func__); ++ BUG_ON(!len); ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n", ++ inode->i_sb->s_dev, inode->i_ino, ++ _2SECTS(offset), _2SECTS(len)); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* ++ * XXX figure out how to get a sb since there's no ++ * inode ptr ++ */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++restart: ++ r = layout_inode_find(inode); ++ if (r && len && !r->blr_recalled) { ++ spin_lock(&r->blr_lock); ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (!r->blr_recalled && !b->bll_recalled && ++ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) { ++ b->bll_recalled = 1; ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = 0; ++ lr.cbl_seg.length = NFS4_MAX_UINT64; ++ r->blr_recalled = 1; ++ dprintk(" FULL LAYOUTRECALL\n"); ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ ++ /* ++ * Currently there are only two cases where the ++ * layout is being returned. ++ * (1) Someone is issuing a NFS_WRITE operation ++ * to this layout. ++ * (2) The file has been truncated which means ++ * the layout is immediately made invalid. ++ * In both cases the client must write any ++ * uncommitted modifications to the server via ++ * NFS_WRITE. 
++ */ ++ lr.cbl_layoutchanged = 1; ++ ++ /* ++ * Need to drop the lock because we'll get a ++ * layoutreturn which will block waiting for ++ * the lock. The request will come in on the ++ * same thread which will cause a deadlock. ++ */ ++ spin_unlock(&r->blr_lock); ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ adj = MIN(b->bll_len - (offset - b->bll_foff), ++ len); ++ offset += adj; ++ len -= adj; ++ if (!len) { ++ spin_lock(&r->blr_lock); ++ break; ++ } ++ /* ++ * Since layoutreturn will have been called we ++ * can't assume blr_layouts is still valid, ++ * so restart. ++ */ ++ goto restart; ++ } ++ } ++ spin_unlock(&r->blr_lock); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++/* ++ * []------------------------------------------------------------------[] ++ * | Support functions from here on down. | ++ * []------------------------------------------------------------------[] ++ */ ++ ++/* ++ * bld_simple -- given a dev_t build a simple volume structure ++ * ++ * Simple volume contains the device signature and offset to that data in ++ * the storage volume. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_simple(struct list_head *volumes, dev_t devid, int local_index) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSIG; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to get signature information\n", __func__); ++ goto error; ++ } ++ ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE); ++ if (!bld) ++ return NULL; ++ ++ bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset; ++ bld->u.simple.bld_sig_len = res->u.sig.len; ++ bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL); ++ if (!bld->u.simple.bld_sig) ++ goto error; ++ ++ memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len); ++ kfree(res); ++ return bld; ++ ++error: ++ if (bld) ++ bld_free(bld); ++ if (res) ++ kfree(res); ++ dprintk("%s: error in bld_simple\n", __func__); ++ return NULL; ++} ++ ++/* ++ * bld_slice -- given a dev_t build a slice volume structure ++ * ++ * A slice volume contains the length of the slice/partition and its offset ++ * from the beginning of the storage volume. There's also a reference to ++ * the "simple" volume which contains this slice. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE); ++ if (!bld) ++ return NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Upcall to get slice info failed\n"); ++ bld_free(bld); ++ return NULL; ++ } ++ ++ bld->bld_devid.devid = devid; ++ bld->bld_index_loc = my_loc; ++ bld->u.slice.bld_start = res->u.slice.start * 512LL; ++ bld->u.slice.bld_len = res->u.slice.length * 512LL; ++ bld->u.slice.bld_index = simple_loc; ++ ++ dprintk("%s: start %Lu, len %Lu\n", __func__, ++ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL); ++ ++ kfree(res); ++ dprintk("<-- %s (rval %p)\n", __func__, bld); ++ return bld; ++} ++ ++static int ++layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!list_empty(&r->blr_layouts)) ++ if (layout_cache_fill_from_list(r, h, seg) == False) ++ return -EIO; ++ ++ /* ++ * This deals with two conditions. ++ * (1) When blr_layouts is empty we need to create the first entry ++ * (2) When the range requested falls past the end of any current ++ * layout the residual must be taken care of. 
++ */ ++ if (seg->length) { ++ n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h); ++ if (!n) ++ return -ENOMEM; ++ dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len)); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++struct list_head * ++layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n = NULL; ++ struct list_head *bl_candidates = NULL; ++ struct fiemap_extent_info fei; ++ struct inode *i; ++ dev_t dev; ++ ++ dev = r->blr_rdev; ++ i = r->blr_inode; ++ ++ dprintk("--> %s\n", __func__); ++ bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL); ++ if (!bl_candidates) ++ return NULL; ++ INIT_LIST_HEAD(bl_candidates); ++ extents_setup(&fei); ++ ++ list_for_each_entry(b, bl_possible, bll_list) { ++ if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ extents_count(&fei, i, b->bll_foff, b->bll_len); ++ if (fei.fi_extents_mapped) { ++ ++ /* ++ * Common case here. Got a range which has ++ * extents. Now get those extents and process ++ * them into pNFS extents. ++ */ ++ if (extents_get(&fei, i, b->bll_foff, ++ b->bll_len) == False) ++ goto cleanup; ++ if (extents_process(&fei, bl_candidates, ++ seg, dev, b) == False) ++ goto cleanup; ++ extents_cleanup(&fei); ++ ++ } else if (seg->iomode == IOMODE_READ) { ++ ++ /* ++ * Found a hole in a file while reading. No ++ * problem, just create a pNFS extent for the ++ * range and let the client know there's no ++ * backing store. ++ */ ++ n = bll_alloc(b->bll_foff, b->bll_len, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += b->bll_len; ++ } else { ++ ++ /* ++ * There's a problem here. Since the iomode ++ * is read/write fallocate should have allocated ++ * any necessary storage for the given range. 
++ */ ++ dprintk(" Extent count for RW is 0\n"); ++ goto cleanup; ++ } ++ ++ } else { ++ n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates); ++ seg->length += n->bll_len; ++ } ++ ++ if (r->blr_ext_size < (b->bll_foff + b->bll_len)) ++ r->blr_ext_size = b->bll_foff + b->bll_len; ++ } ++ ++ while (!list_empty(bl_possible)) { ++ b = list_entry(bl_possible->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ ++ b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout, ++ bll_list); ++ seg->offset = b->bll_foff; ++ dprintk("<-- %s okay\n", __func__); ++ return bl_candidates; ++ ++cleanup: ++ extents_cleanup(&fei); ++ if (bl_candidates) ++ kfree(bl_candidates); ++ dprintk("<-- %s, error occurred\n", __func__); ++ return NULL; ++} ++ ++/* ++ * layout_cache_merge -- collapse layouts which make up a contiguous range. ++ */ ++static void ++layout_cache_merge(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *p; ++ ++ dprintk("--> %s\n", __func__); ++restart: ++ p = NULL; ++ list_for_each_entry(b, h, bll_list) { ++ if (p && (BLL_S_END(p) == b->bll_soff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ /* ++ * We've got a condidate. 
++ */ ++#ifdef too_verbose ++ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), ++ _2SECTS(p->bll_foff), _2SECTS(p->bll_len), ++ _2SECTS(b->bll_soff)); ++#endif ++ ++ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE) ++ p->bll_cache_state = BLOCK_LAYOUT_UPDATE; ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else if (p && (BLL_F_END(p) == b->bll_foff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es == PNFS_BLOCK_NONE_DATA)) { ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else ++ p = b; ++ } ++ dprintk("<-- %s\n", __func__); ++} ++ ++static int ++layout_cache_update(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *c, ++ *n; ++ boolean_t status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ if (list_empty(&r->blr_layouts)) { ++ /* ---- Just add entries and return ---- */ ++ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev, ++ r->blr_inode->i_ino); ++ list_for_each_entry(b, h, bll_list) { ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, ++ &r->blr_layouts); ++ if (!c) { ++ status = -ENOMEM; ++ break; ++ } ++ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), c->bll_es); ++ } ++ return status; ++ } ++ ++ list_for_each_entry(b, h, bll_list) { ++ BUG_ON(!b->bll_vol_id.devid); ++ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) { ++ boolean_t found = False; ++ list_for_each_entry(c, &r->blr_layouts, bll_list) { ++ if ((b->bll_soff >= c->bll_soff) && ++ (b->bll_soff < BLL_S_END(c)) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ u64 u; ++ ++ if ((b->bll_foff < c->bll_foff) || ++ (b->bll_foff > BLL_F_END(c))) ++ BUG(); ++ ++ u = BLL_S_END(b) - BLL_S_END(c); ++ /* ++ * The updated cache entry has to be ++ * different than the current. ++ * Otherwise the cache state for 'b' ++ * should be BLOCK_LAYOUT_CACHE. 
++ */ ++ BUG_ON(BLL_S_END(b) < BLL_S_END(c)); ++ ++ dprintk(" " ++ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n", ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), ++ _2SECTS(c->bll_len + u)); ++ c->bll_len += u; ++ bll_collapse(r, c); ++ found = True; ++ break; ++ } ++ } ++ ++ if (found == False) { ++ dprintk(" ERROR Expected to find" ++ " %Lu(f):%Lu(l):%Lu(s), but didn't\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_for_each_entry(c, &r->blr_layouts, bll_list) ++ print_bll(c, "Cached"); ++ BUG(); ++ } ++ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ c = list_first_entry(&r->blr_layouts, ++ struct pnfs_blocklayout_layout, bll_list); ++ if (b->bll_foff < c->bll_foff) { ++ /* ++ * Special case where new entry is before ++ * first cached entry. ++ */ ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL); ++ list_add(&c->bll_list, &r->blr_layouts); ++ dprintk(" new entry at head of list at %Lu, " ++ "len %Lu\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len)); ++ } else { ++ list_for_each_entry(c, &r->blr_layouts, ++ bll_list) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, ++ bll_list); ++ /* ++ * This is ugly, but can't think of ++ * another way to examine this case. ++ * Consider the following. Need to ++ * add an entry which starts at 40 ++ * and the cache has the following ++ * entries: ++ * Start Length ++ * 10 5 ++ * 30 5 ++ * 50 5 ++ * So, need to look and see if the new ++ * entry starts after the current ++ * cache, but before the next one. ++ * There's a catch in that the next ++ * entry might not be valid as it's ++ * really just a pointer to the list ++ * head. 
++ */ ++ if (((b->bll_foff >= ++ BLL_F_END(c)) && ++ (c->bll_list.next == &r->blr_layouts)) || ++ ((b->bll_foff >= ++ BLL_F_END(c)) && ++ (b->bll_foff < n->bll_foff))) { ++ ++ n = bll_alloc_dup(b, ++ BLOCK_LAYOUT_CACHE, NULL); ++ dprintk(" adding new %Lu:%Lu" ++ " after %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len), ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len)); ++ list_add(&n->bll_list, ++ &c->bll_list); ++ break; ++ } ++ } ++ } ++ } ++ } ++ dprintk("<-- %s\n", __func__); ++ return status; ++} ++ ++static void ++layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in) ++{ ++ struct pnfs_blocklayout_layout *b, ++ *n; ++ u64 len; ++ struct nfsd4_layout_seg seg = *seg_in; ++ ++ dprintk("--> %s\n", __func__); ++ if (seg.length == NFS4_MAX_UINT64) { ++ r->blr_recalled = 0; ++ dprintk(" Fast return of all layouts\n"); ++ while (!list_empty(&r->blr_layouts)) { ++ b = list_entry(r->blr_layouts.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ dprintk(" foff %Lu, len %Lu, soff %Lu\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ dprintk("<-- %s\n", __func__); ++ return; ++ } ++ ++restart: ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg.offset == b->bll_foff) { ++ /* ++ * This handle the following three cases: ++ * (1) return layout matches entire cache layout ++ * (2) return layout matches beginning portion of cache ++ * (3) return layout matches entire cache layout and ++ * into next entry. Varies from #1 in end case. 
++ */ ++ dprintk(" match on offsets, %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length)); ++ len = MIN(seg.length, b->bll_len); ++ b->bll_foff += len; ++ b->bll_soff += len; ++ b->bll_len -= len; ++ seg.length -= len; ++ seg.offset += len; ++ if (!b->bll_len) { ++ list_del(&b->bll_list); ++ kfree(b); ++ dprintk(" removing cache line\n"); ++ if (!seg.length) { ++ dprintk(" also finished\n"); ++ goto complete; ++ } ++ /* ++ * Since 'b' was freed we can't continue at the ++ * next entry which is referenced as ++ * b->bll_list.next by the list_for_each_entry ++ * macro. Need to restart the loop. ++ * TODO: Think about creating a dummy 'b' which ++ * would keep list_for_each_entry() happy. ++ */ ++ goto restart; ++ } ++ if (!seg.length) { ++ dprintk(" finished, but cache line not" ++ "empty\n"); ++ goto complete; ++ } ++ } else if ((seg.offset >= b->bll_foff) && ++ (seg.offset < BLL_F_END(b))) { ++ /* ++ * layout being returned is within this cache line. ++ */ ++ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ BUG_ON(!seg.length); ++ if ((seg.offset + seg.length) >= BLL_F_END(b)) { ++ /* ++ * Layout returned starts in the middle of ++ * cache entry and just need to trim back ++ * cache to shorter length. ++ */ ++ dprintk(" trim back cache line\n"); ++ len = seg.offset - b->bll_foff; ++ seg.offset += b->bll_len - len; ++ seg.length -= b->bll_len - len; ++ b->bll_len = len; ++ if (!seg.length) ++ return; ++ } else { ++ /* ++ * Need to split current cache layout because ++ * chunk is being removed from the middle. 
++ */ ++ dprintk(" split cache line\n"); ++ len = seg.offset + seg.length; ++ n = bll_alloc(len, ++ (b->bll_foff + b->bll_len) - len, ++ BLOCK_LAYOUT_CACHE, NULL); ++ n->bll_soff = b->bll_soff + len; ++ list_add(&n->bll_list, &b->bll_list); ++ b->bll_len = seg.offset - b->bll_foff; ++ return; ++ } ++ } ++ } ++complete: ++ if (list_empty(&r->blr_layouts)) ++ r->blr_recalled = 0; ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++ * layout_cache_fill_from_list -- fills from cache list ++ * ++ * NOTE: This routine was only seperated out from layout_cache_file_from() ++ * to reduce the indentation level which makes the code easier to read. ++ */ ++static inline boolean_t ++layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n; ++ enum pnfs_block_extent_state4 s; ++ ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg->offset < b->bll_foff) { ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, b->bll_foff - seg->offset), ++ BLOCK_LAYOUT_NEW, NULL); ++ if (!n) ++ return False; ++ ++ list_add(&n->bll_list, h->prev); ++ dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ ++ if ((seg->offset >= b->bll_foff) && ++ (seg->offset < BLL_F_END(b))) { ++ if (layout_conflict(b, seg->iomode, &s) == False) { ++ dprintk(" CONFLICT FOUND: " ++ "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), b->bll_es, ++ seg->iomode); ++ return False; ++ } ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, BLL_F_END(b) - seg->offset), ++ BLOCK_LAYOUT_CACHE, h); ++ dprintk(" CACHE hit: Found %Lu(f):%Lu(l): " ++ "in %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ 
_2SECTS(b->bll_soff), b->bll_es); ++ if (!n) ++ return False; ++ ++ n->bll_soff = b->bll_soff + seg->offset - b->bll_foff; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ n->bll_es = s; ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ } ++ return True; ++} ++ ++static u64 ++bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length, ++ dev_t dev) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates); ++ if (!n) ++ return 0; ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ ++ return n->bll_len; ++} ++ ++static void ++extents_setup(struct fiemap_extent_info *fei) ++{ ++ fei->fi_extents_start = NULL; ++} ++ ++/* ++ * extents_count -- Determine the number of extents for a given range. ++ * ++ * No need to call set_fs() here because the function ++ * doesn't use copy_to_user() if it's only counting ++ * the number of extents needed. ++ */ ++static void ++extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len)); ++ fei->fi_flags = FIEMAP_FLAG_SYNC; ++ fei->fi_extents_max = 0; ++ fei->fi_extents_start = NULL; ++ fei->fi_extents_mapped = 0; ++ i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1); ++} ++ ++/* ++ * extents_get -- Get list of extents for range ++ * ++ * extents_count() must have been called before this routine such that ++ * fi_extents_mapped is known. ++ */ ++static boolean_t ++extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ int m_space, ++ rval; ++ struct fiemap_extent *fe; ++ mm_segment_t old_fs = get_fs(); ++ ++ /* ++ * Now malloc the correct amount of space ++ * needed. It's possible for the file to have changed ++ * between calls which would require more space for ++ * the extents. 
If that occurs the last extent will ++ * not have FIEMAP_EXTENT_LAST set and the error will ++ * be caught in extents_process(). ++ */ ++ m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent); ++ fe = kmalloc(m_space, GFP_KERNEL); ++ if (!fe) ++ return False; ++ memset(fe, 0, m_space); ++ ++ fei->fi_extents_max = fei->fi_extents_mapped; ++ fei->fi_extents_mapped = 0; ++ fei->fi_extents_start = fe; ++ ++ set_fs(KERNEL_DS); ++ rval = i->i_op->fiemap(i, fei, foff, len + ++ (1 << i->i_sb->s_blocksize_bits) - 1); ++ set_fs(old_fs); ++ ++ if (rval || !fei->fi_extents_mapped) { ++ dprintk(" No extents. Wanted %d, got %d\n", ++ fei->fi_extents_max, fei->fi_extents_mapped); ++ kfree(fe); ++ fei->fi_extents_start = NULL; ++ return False; ++ } else ++ return True; ++} ++ ++/* ++ * extents_process -- runs through the extent returned from the file system and ++ * creates block layout entries. ++ */ ++static boolean_t ++extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates, ++ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b) ++{ ++ struct fiemap_extent *fep, ++ *fep_last = NULL; ++ int i; ++ pnfs_blocklayout_layout_t *n; ++ u64 last_end, ++ rval; ++ ++ dprintk("--> %s\n", __func__); ++ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped; ++ i++, fep++) { ++ ++ BUG_ON(!fep->fe_physical); ++ /* ++ * Deal with corner cases of hoel-y files. ++ */ ++ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) != ++ fep->fe_logical)) { ++ ++ /* ++ * If the last extent doesn't end logically ++ * at the beginning of the current we've got ++ * hole and need to create a pNFS extent. 
++ */ ++ dprintk(" Got a hole at %Ld:%Ld \n", ++ _2SECTS(fep_last->fe_logical), ++ _2SECTS(fep_last->fe_length)); ++ last_end = fep_last->fe_logical + fep_last->fe_length; ++ rval = bll_alloc_holey(bl_candidates, last_end, ++ fep->fe_logical - last_end, dev); ++ if (!rval) ++ return False; ++ seg->length += rval; ++ } ++ ++ n = bll_alloc(fep->fe_logical, fep->fe_length, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ if (unlikely(n == NULL)) { ++ dprintk("%s: bll_alloc failed\n", __func__); ++ return False; ++ } ++ ++ n->bll_soff = fep->fe_physical; ++ n->bll_es = seg->iomode == IOMODE_READ ? ++ PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += fep->fe_length; ++ print_bll(n, "New extent"); ++ fep_last = fep; ++ } ++ dprintk("<-- %s (i=%d)\n", __func__, i); ++ ++ return True; ++} ++ ++static void ++extents_cleanup(struct fiemap_extent_info *fei) ++{ ++ if (fei->fi_extents_start) { ++ kfree(fei->fi_extents_start); ++ fei->fi_extents_start = NULL; ++ } ++} ++ ++/* ++ * device_slice -- check to see if device is a slice or DM ++ */ ++static boolean_t ++device_slice(dev_t devid) ++{ ++ struct block_device *bd = open_by_devnum(devid, FMODE_READ); ++ boolean_t rval = False; ++ ++ if (bd) { ++ if (bd->bd_disk->minors > 1) ++ rval = True; ++ blkdev_put(bd, FMODE_READ); ++ } ++ return rval; ++} ++ ++/* ++ * device_dm -- check to see if device is a Device Mapper volume. 
++ * ++ * Returns True for DM or False if not (also False if the upcall fails) ++ */ ++static boolean_t ++device_dm(dev_t devid) ++{ ++ boolean_t rval = False; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; /* kfree()d below even when the upcall fails */ ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMCHK; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Failed upcall to check on DM status\n"); ++ } else if (res->u.dm_vol) { ++ rval = True; ++ dprintk("Device is DM volume\n"); ++ } else ++ dprintk("Device is not DM volume\n"); ++ kfree(res); ++ ++ return rval; ++} ++ ++static boolean_t ++layout_inode_add(struct inode *i, bl_layout_rec_t **p) ++{ ++ bl_layout_rec_t *r = NULL; ++ ++ if (!i->i_op->fiemap || !i->i_op->fallocate) { ++ printk("pNFS: file system doesn't support required fiemap or " ++ "fallocate methods\n"); ++ return False; ++ } ++ ++ r = kmalloc(sizeof (*r), GFP_KERNEL); ++ if (!r) ++ goto error; ++ ++ r->blr_rdev = i->i_sb->s_dev; ++ r->blr_inode = i; ++ r->blr_orig_size = i->i_size; ++ r->blr_ext_size = 0; ++ r->blr_recalled = 0; ++ INIT_LIST_HEAD(&r->blr_layouts); ++ spin_lock_init(&r->blr_lock); ++ spin_lock(&layout_hashtbl_lock); ++ list_add_tail(&r->blr_hash, &layout_hash); ++ spin_unlock(&layout_hashtbl_lock); ++ *p = r; ++ return True; ++ ++error: ++ if (r) ++ kfree(r); ++ return False; ++} ++ ++static bl_layout_rec_t * ++__layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ if (!list_empty(&layout_hash)) { ++ list_for_each_entry(r, &layout_hash, blr_hash) { ++ if ((r->blr_inode->i_ino == i->i_ino) && ++ (r->blr_rdev == i->i_sb->s_dev)) { ++ return r; ++ } ++ } ++ } ++ return NULL; ++} ++ ++static bl_layout_rec_t * ++layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ spin_unlock(&layout_hashtbl_lock); ++ ++ return r; ++} ++ ++static void ++layout_inode_del(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock);
++ if (list_empty(&r->blr_layouts)) { ++ list_del(&r->blr_hash); ++ spin_unlock(&r->blr_lock); ++ kfree(r); ++ } else { ++ spin_unlock(&r->blr_lock); ++ } ++ } else { ++ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n", ++ __func__, i->i_sb->s_dev, i->i_ino); ++ } ++ spin_unlock(&layout_hashtbl_lock); ++} ++ ++/* ++ * map_state2name -- converts state in ascii string. ++ * ++ * Used for debug messages only. ++ */ ++static char * ++map_state2name(enum pnfs_block_extent_state4 s) ++{ ++ switch (s) { ++ case PNFS_BLOCK_READWRITE_DATA: return " RW"; ++ case PNFS_BLOCK_READ_DATA: return " RO"; ++ case PNFS_BLOCK_INVALID_DATA: return "INVALID"; ++ case PNFS_BLOCK_NONE_DATA: return " NONE"; ++ default: ++ BUG(); ++ } ++} ++ ++static pnfs_blocklayout_devinfo_t * ++bld_alloc(struct list_head *volumes, int type) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ return NULL; ++ ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = type; ++ list_add_tail(&bld->bld_list, volumes); ++ ++ return bld; ++} ++ ++static void ++bld_free(pnfs_blocklayout_devinfo_t *bld) ++{ ++ list_del(&bld->bld_list); ++ kfree(bld); ++} ++ ++static void ++print_bll(pnfs_blocklayout_layout_t *b, char *text) ++{ ++ dprintk(" BLL: %s\n", text); ++ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len), ++ map_state2name(b->bll_es)); ++} ++ ++static inline void ++bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c) ++{ ++ pnfs_blocklayout_layout_t *n; ++ int dbg_count = 0; ++ u64 endpoint; ++ ++ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA); ++ while (c->bll_list.next != &r->blr_layouts) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ endpoint = BLL_S_END(c); ++ if ((n->bll_soff >= c->bll_soff) && ++ (n->bll_soff < endpoint)) { ++ if (endpoint < BLL_S_END(n)) { ++ /* ++ * The following is possible. 
++ * ++ * ++ * Existing: +---+ +---+ ++ * New: +-----------------------+ ++ * The client request merge entries together ++ * but didn't require picking up all of the ++ * last entry. So, we still need to delete ++ * the last entry and add the remaining space ++ * to the new entry. ++ */ ++ c->bll_len += BLL_S_END(n) - endpoint; ++ } ++ dbg_count++; ++ list_del(&n->bll_list); ++ kfree(n); ++ } else { ++ break; ++ } ++ } ++ /* ---- Debug only, remove before integration ---- */ ++ if (dbg_count) ++ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n", ++ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c))); ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = kmalloc(sizeof (*n), GFP_KERNEL); ++ if (n) { ++ memset(n, 0, sizeof (*n)); ++ n->bll_foff = offset; ++ n->bll_len = len; ++ n->bll_cache_state = state; ++ if (h) ++ list_add_tail(&n->bll_list, h); ++ } ++ return n; ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c, ++ struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = bll_alloc(b->bll_foff, b->bll_len, c, h); ++ if (n) { ++ n->bll_es = b->bll_es; ++ n->bll_soff = b->bll_soff; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ } ++ return n; ++} ++ ++static inline boolean_t ++layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s) ++{ ++ /* ---- Normal case ---- */ ++ *s = b->bll_es; ++ ++ switch (b->bll_es) { ++ case PNFS_BLOCK_READWRITE_DATA: ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_READ_DATA; ++ /* ---- Any use is permitted. ---- */ ++ break; ++ case PNFS_BLOCK_READ_DATA: ++ /* ---- Committed as read only data. 
---- */ ++ if (iomode == IOMODE_RW) ++ return False; ++ break; ++ case PNFS_BLOCK_INVALID_DATA: ++ /* ---- Blocks have been allocated, but not initialized ---- */ ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_NONE_DATA; ++ break; ++ case PNFS_BLOCK_NONE_DATA: ++ /* ---- Hole-y file. No backing store avail. ---- */ ++ if (iomode != IOMODE_READ) ++ return False; ++ break; ++ default: ++ BUG(); ++ } ++ return True; ++} ++ ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-31 20:41:19.144140225 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-31 20:42:05.512106042 -0400 +@@ -104,7 +104,8 @@ again: + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; +- if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) ++ if (memcmp(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)) != 0) + continue; + get_nfs_open_context(ctx); + spin_unlock(&inode->i_lock); +@@ -133,8 +134,8 @@ void nfs_inode_reclaim_delegation(struct + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; +@@ -187,8 +188,9 @@ static struct nfs_delegation *nfs_detach + if (delegation == NULL) + goto nomatch; + spin_lock(&delegation->lock); +- if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (stateid != NULL && memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data)) != 0) + goto nomatch_unlock; + list_del_rcu(&delegation->super_list); + 
delegation->inode = NULL; +@@ -216,8 +218,8 @@ int nfs_inode_set_delegation(struct inod + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + delegation->change_attr = nfsi->change_attr; +@@ -471,9 +473,7 @@ void nfs_expire_unreferenced_delegations + /* + * Asynchronous delegation recall! + */ +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)) ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) + { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; +@@ -481,7 +481,7 @@ int nfs_async_inode_return_delegation(st + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + +- if (!validate_stateid(delegation, stateid)) { ++ if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { + rcu_read_unlock(); + return -ENOENT; + } +@@ -562,7 +562,8 @@ int nfs4_copy_delegation_stateid(nfs4_st + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) { +- memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); ++ memcpy(dst->u.data, delegation->stateid.u.data, ++ sizeof(dst->u.data)); + ret = 1; + } + rcu_read_unlock(); +diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h +--- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-31 20:42:05.513114811 -0400 +@@ -34,9 +34,7 @@ enum { + int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct 
nfs_openres *res); + void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); + int nfs_inode_return_delegation(struct inode *inode); +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)); ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); + void nfs_inode_return_delegation_noreclaim(struct inode *inode); + + struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); +diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-31 20:41:19.196140434 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-31 20:42:05.553222784 -0400 +@@ -17,11 +17,19 @@ + #include + #include + ++#include ++#if defined(CONFIG_SPNFS) ++#include ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif ++#endif + #include + #include + + #include "nfsd.h" + #include "nfsfh.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_EXPORT + +@@ -352,6 +360,40 @@ static int svc_export_upcall(struct cach + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); + } + ++#if defined(CONFIG_PNFSD) ++static struct pnfsd_cb_operations pnfsd_cb_op = { ++ .cb_layout_recall = nfsd_layout_recall_cb, ++ .cb_device_notify = nfsd_device_notify_cb, ++ ++ .cb_get_state = nfs4_pnfs_cb_get_state, ++ .cb_change_state = nfs4_pnfs_cb_change_state, ++}; ++ ++#if defined(CONFIG_SPNFS) ++static struct pnfs_export_operations spnfs_export_ops = { ++ .layout_type = spnfs_layout_type, ++ .get_device_info = spnfs_getdeviceinfo, ++ .get_device_iter = spnfs_getdeviceiter, ++ .layout_get = spnfs_layoutget, ++ .layout_return = spnfs_layoutreturn, ++}; ++ ++static struct pnfs_export_operations spnfs_ds_export_ops = { ++ .get_state = spnfs_get_state, ++}; ++ ++#if 
defined(CONFIG_SPNFS_BLOCK) ++static struct pnfs_export_operations bl_export_ops = { ++ .layout_type = bl_layout_type, ++ .get_device_info = bl_getdeviceinfo, ++ .get_device_iter = bl_getdeviceiter, ++ .layout_get = bl_layoutget, ++ .layout_return = bl_layoutreturn, ++}; ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_SPNFS */ ++#endif /* CONFIG_PNFSD */ ++ + static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); + static struct svc_export *svc_export_lookup(struct svc_export *); +@@ -395,6 +437,47 @@ static int check_export(struct inode *in + return -EINVAL; + } + ++#if !defined(CONFIG_SPNFS) ++ if (inode->i_sb->s_pnfs_op && ++ (!inode->i_sb->s_pnfs_op->layout_type || ++ !inode->i_sb->s_pnfs_op->get_device_info || ++ !inode->i_sb->s_pnfs_op->layout_get)) { ++ dprintk("exp_export: export of invalid fs pnfs export ops.\n"); ++ return -EINVAL; ++ } ++#endif /* CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ if (!inode->i_sb->s_pnfs_op) ++ pnfsd_lexp_init(inode); ++ return 0; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, *flags)) { ++ dprintk("set pnfs block export structure... \n"); ++ inode->i_sb->s_pnfs_op = &bl_export_ops; ++ } else ++#endif /* CONFIG_SPNFS_BLOCK */ ++ /* ++ * spnfs_enabled() indicates we're an MDS. ++ * XXX Better to check an export time option as well. ++ */ ++ if (spnfs_enabled()) { ++ dprintk("set spnfs export structure...\n"); ++ inode->i_sb->s_pnfs_op = &spnfs_export_ops; ++ } else { ++ dprintk("%s spnfs not in use\n", __func__); ++ ++ /* ++ * get_state is needed if we're a DS using spnfs. ++ * XXX Better to check an export time option instead. 
++ */ ++ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops; ++ } ++#endif /* CONFIG_SPNFS */ ++ + return 0; + + } +@@ -586,6 +669,8 @@ static int svc_export_parse(struct cache + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } ++ } else if (strcmp(buf, "pnfs") == 0) { ++ exp.ex_pnfs = 1; + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else +@@ -660,6 +745,8 @@ static int svc_export_show(struct seq_fi + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } ++ if (exp->ex_pnfs) ++ seq_puts(m, ",pnfs"); + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); +@@ -687,6 +774,7 @@ static void svc_export_init(struct cache + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; ++ new->ex_pnfs = 0; + } + + static void export_update(struct cache_head *cnew, struct cache_head *citem) +@@ -699,6 +787,7 @@ static void export_update(struct cache_h + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; ++ new->ex_pnfs = item->ex_pnfs; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_pathname = item->ex_pathname; +@@ -1635,8 +1724,17 @@ nfsd_export_init(void) + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); +- if (rv) ++ if (rv) { + cache_unregister(&svc_export_cache); ++ goto out; ++ } ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = THIS_MODULE; ++ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ ++out: + return rv; + + } +@@ -1664,6 +1762,12 @@ nfsd_export_shutdown(void) + + exp_writelock(); + ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = NULL; ++ pnfsd_cb_ctl.cb_op = NULL; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); + svcauth_unix_purge(); +diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig 
linux-2.6.34.noarch/fs/nfs/direct.c +--- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-31 20:42:05.514196343 -0400 +@@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea + .rpc_release = nfs_direct_read_release, + }; + ++static long nfs_direct_read_execute(struct nfs_read_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ nfs_fattr_init(&data->fattr); ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ NFS_PROTO(inode)->read_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct read call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. 
If nfs_readdata_alloc() or get_user_pages() fails, +@@ -283,7 +315,6 @@ static ssize_t nfs_direct_read_schedule_ + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -343,26 +374,9 @@ static ssize_t nfs_direct_read_schedule_ + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; +- nfs_fattr_init(&data->fattr); +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct read call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_read_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +@@ -448,12 +462,15 @@ static void nfs_direct_free_writedata(st + } + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg); ++ + static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) + { + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; +@@ -487,25 +504,7 @@ static void nfs_direct_write_reschedule( + * Reuse data->task; data->args should not have changed + * since the original request was sent. 
+ */ +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- /* +- * We're called via an RPC callback, so BKL is already held. +- */ +- task = rpc_run_task(&task_setup_data); +- if (!IS_ERR(task)) +- rpc_put_task(task); +- +- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- data->args.count, +- (unsigned long long)data->args.offset); ++ nfs_direct_write_execute(data, &task_setup_data, &msg); + } + + if (put_dreq(dreq)) +@@ -548,10 +547,31 @@ static const struct rpc_call_ops nfs_com + .rpc_release = nfs_direct_commit_release, + }; + ++static long nfs_direct_commit_execute(struct nfs_direct_req *dreq, ++ struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct rpc_task *task; ++ ++ NFS_PROTO(data->inode)->commit_setup(data, msg); ++ ++ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ ++ dreq->commit_data = NULL; ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ return 0; ++} ++ + static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) + { + struct nfs_write_data *data = dreq->commit_data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +@@ -579,16 +599,7 @@ static void nfs_direct_commit_schedule(s + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- NFS_PROTO(data->inode)->commit_setup(data, &msg); +- +- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ +- dreq->commit_data = NULL; +- +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +- +- task = rpc_run_task(&task_setup_data); +- if 
(!IS_ERR(task)) +- rpc_put_task(task); ++ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg); + } + + static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +@@ -690,6 +701,36 @@ static const struct rpc_call_ops nfs_wri + .rpc_release = nfs_direct_write_release, + }; + ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ NFS_PROTO(inode)->write_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. 
If nfs_writedata_alloc() or get_user_pages() fails, +@@ -705,7 +746,6 @@ static ssize_t nfs_direct_write_schedule + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -771,24 +811,8 @@ static ssize_t nfs_direct_write_schedule + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct write call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_write_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig +--- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-31 20:42:05.549222922 -0400 +@@ -79,3 +79,52 @@ config NFSD_V4 + available from http://linux-nfs.org/. + + If unsure, say N. ++ ++config PNFSD ++ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)" ++ depends on NFSD_V4 && EXPERIMENTAL ++ select EXPORTFS_FILE_LAYOUT ++ help ++ This option enables support for the parallel NFS features of the ++ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) ++ in the kernel's NFS server. ++ ++ Unless you're an NFS developer, say N. 
++ ++config PNFSD_LOCAL_EXPORT ++ bool "Enable pNFS support for exporting local filesystems for debugging purposes" ++ depends on PNFSD ++ help ++ Say Y here if you want your pNFS server to export local file systems ++ over the files layout type. With this option the MDS (metadata ++ server) functions also as a single DS (data server). This is mostly ++ useful for development and debugging purposes. ++ ++ If unsure, say N. ++ ++config SPNFS ++ bool "Provide spNFS server support (EXPERIMENTAL)" ++ depends on PNFSD ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS server support. ++ ++ If unsure, say N. ++ ++config SPNFS_LAYOUTSEGMENTS ++ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)" ++ depends on SPNFS ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS to be able to return layout segments. ++ ++ If unsure, say N. ++ ++config SPNFS_BLOCK ++ bool "Provide Block Layout server support (EXPERIMENTAL)" ++ depends on SPNFS ++ select EXPORTFS_BLOCK_LAYOUT ++ help ++ Say Y here if you want spNFS block layout support ++ ++ If unsure, say N. 
+diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile +--- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-31 20:42:05.549222922 -0400 +@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs + nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o + nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o ++nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o ++nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o ++nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o ++nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-31 20:41:19.197150385 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-31 20:42:05.554114789 -0400 +@@ -40,7 +40,6 @@ + + #define NFSPROC4_CB_NULL 0 + #define NFSPROC4_CB_COMPOUND 1 +-#define NFS4_STATEID_SIZE 16 + + /* Index of predefined Linux callback client operations */ + +@@ -48,11 +47,17 @@ enum { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, ++#if defined(CONFIG_PNFSD) ++ NFSPROC4_CLNT_CB_LAYOUT, ++ NFSPROC4_CLNT_CB_DEVICE, ++#endif + }; + + enum nfs_cb_opnum4 { + OP_CB_RECALL = 4, ++ OP_CB_LAYOUT = 5, + OP_CB_SEQUENCE = 11, ++ OP_CB_DEVICE = 14, + }; + + #define NFS4_MAXTAGLEN 20 +@@ -78,6 +83,19 @@ enum nfs_cb_opnum4 { + #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) ++#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 3 + \ ++ enc_nfs4_fh_sz + 4) ++#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) ++#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 6) ++#define NFS4_dec_cb_device_sz 
(cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) + + /* + * Generic encode routines from fs/nfs/nfs4xdr.c +@@ -94,6 +112,10 @@ xdr_writemem(__be32 *p, const void *ptr, + } + + #define WRITE32(n) *p++ = htonl(n) ++#define WRITE64(n) do { \ ++ *p++ = htonl((u32)((n) >> 32)); \ ++ *p++ = htonl((u32)(n)); \ ++} while (0) + #define WRITEMEM(ptr,nbytes) do { \ + p = xdr_writemem(p, ptr, nbytes); \ + } while (0) +@@ -204,6 +226,16 @@ nfs_cb_stat_to_errno(int stat) + */ + + static void ++encode_stateid(struct xdr_stream *xdr, stateid_t *sid) ++{ ++ __be32 *p; ++ ++ RESERVE_SPACE(sizeof(stateid_t)); ++ WRITE32(sid->si_generation); ++ WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); ++} ++ ++static void + encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) + { + __be32 * p; +@@ -228,10 +260,10 @@ encode_cb_recall(struct xdr_stream *xdr, + __be32 *p; + int len = dp->dl_fh.fh_size; + +- RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); ++ RESERVE_SPACE(4); + WRITE32(OP_CB_RECALL); +- WRITE32(dp->dl_stateid.si_generation); +- WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); ++ encode_stateid(xdr, &dp->dl_stateid); ++ RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); + WRITE32(0); /* truncate optimization not implemented */ + WRITE32(len); + WRITEMEM(&dp->dl_fh.fh_base, len); +@@ -259,6 +291,111 @@ encode_cb_sequence(struct xdr_stream *xd + hdr->nops++; + } + ++#if defined(CONFIG_PNFSD) ++ ++#include "pnfsd.h" ++ ++static void ++encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(20); ++ WRITE32(OP_CB_LAYOUT); ++ WRITE32(clr->cb.cbl_seg.layout_type); ++ WRITE32(clr->cb.cbl_seg.iomode); ++ WRITE32(clr->cb.cbl_layoutchanged); ++ WRITE32(clr->cb.cbl_recall_type); ++ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) { ++ struct nfs4_fsid fsid = clr->cb.cbl_fsid; ++ ++ RESERVE_SPACE(16); ++ 
WRITE64(fsid.major); ++ WRITE64(fsid.minor); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "fsid 0x%llx-0x%llx\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, fsid.major, fsid.minor); ++ } else if (clr->cb.cbl_recall_type == RETURN_FILE) { ++ int len = clr->clr_file->fi_fhlen; ++ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid; ++ ++ RESERVE_SPACE(20 + len); ++ WRITE32(len); ++ WRITEMEM(clr->clr_file->fi_fhval, len); ++ WRITE64(clr->cb.cbl_seg.offset); ++ WRITE64(clr->cb.cbl_seg.length); ++ encode_stateid(xdr, cbl_sid); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "offset %lld length %lld stateid " STATEID_FMT "\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, ++ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length, ++ STATEID_VAL(cbl_sid)); ++ } else { ++ dprintk("%s: type %x iomode %d changed %d recall_type %d\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type); ++ } ++ hdr->nops++; ++} ++ ++static void ++encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ int i; ++ int len = nd->nd_list->cbd_len; ++ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list; ++ ++ dprintk("NFSD %s: --> num %d\n", __func__, len); ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(8); ++ WRITE32(OP_CB_DEVICE); ++ ++ /* notify4 cnda_changes<>; */ ++ WRITE32(len); ++ for (i = 0; i < len; i++) { ++ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n", ++ __func__, cbd[i].cbd_notify_type, ++ cbd[i].cbd_layout_type, ++ cbd[i].cbd_devid.sbid, ++ cbd[i].cbd_devid.devid, ++ cbd[i].cbd_immediate, i); ++ ++ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE); ++ 
RESERVE_SPACE(32); ++ /* bitmap4 notify_mask; */ ++ WRITE32(1); ++ WRITE32(cbd[i].cbd_notify_type); ++ /* opaque notify_vals<>; */ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) ++ WRITE32(24); ++ else ++ WRITE32(20); ++ WRITE32(cbd[i].cbd_layout_type); ++ WRITE64(cbd[i].cbd_devid.sbid); ++ WRITE64(cbd[i].cbd_devid.devid); ++ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { ++ RESERVE_SPACE(4); ++ WRITE32(cbd[i].cbd_immediate); ++ } ++ } ++ hdr->nops++; ++} ++#endif /* CONFIG_PNFSD */ ++ + static int + nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) + { +@@ -288,6 +425,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst * + return 0; + } + ++#if defined(CONFIG_PNFSD) ++static int ++nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_layoutrecall *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_layout(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++ ++static int ++nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_notify_device *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_device(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++#endif /* CONFIG_PNFSD */ + + static int + decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ +@@ -403,6 +579,48 @@ out: + return status; + } + ++#if defined(CONFIG_PNFSD) ++static int 
++nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT); ++out: ++ return status; ++} ++ ++static int ++nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE); ++out: ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * RPC procedure tables + */ +@@ -420,6 +638,10 @@ out: + static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), + PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), ++#if defined(CONFIG_PNFSD) ++ PROC(CB_LAYOUT, COMPOUND, enc_cb_layout, dec_cb_layout), ++ PROC(CB_DEVICE, COMPOUND, enc_cb_device, dec_cb_device), ++#endif + }; + + static struct rpc_version nfs_cb_version4 = { +@@ -606,10 +828,9 @@ out: + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. 
+ */ +-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_prepare_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) + { +- struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; + struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; + u32 minorversion = clp->cl_cb_conn.cb_minorversion; + int status = 0; +@@ -629,11 +850,15 @@ static void nfsd4_cb_prepare(struct rpc_ + rpc_call_start(task); + } + +-static void nfsd4_cb_done(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata) + { + struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; ++ nfsd4_cb_prepare_sequence(task, dp->dl_client); ++} + ++static void nfsd4_cb_done_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) ++{ + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_cb_conn.cb_minorversion); + +@@ -657,7 +882,7 @@ static void nfsd4_cb_recall_done(struct + struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + +- nfsd4_cb_done(task, calldata); ++ nfsd4_cb_done_sequence(task, clp); + + if (current_rpc_client == NULL) { + /* We're shutting down; give up. 
*/ +@@ -688,7 +913,7 @@ static void nfsd4_cb_recall_done(struct + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); + task->tk_status = 0; +- rpc_restart_call(task); ++ rpc_restart_call_prepare(task); + return; + } else { + atomic_set(&clp->cl_cb_set, 0); +@@ -704,7 +929,7 @@ static void nfsd4_cb_recall_release(void + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +- .rpc_call_prepare = nfsd4_cb_prepare, ++ .rpc_call_prepare = nfsd4_cb_recall_prepare, + .rpc_call_done = nfsd4_cb_recall_done, + .rpc_release = nfsd4_cb_recall_release, + }; +@@ -781,3 +1006,173 @@ void nfsd4_cb_recall(struct nfs4_delegat + { + queue_work(callback_wq, &dp->dl_recall.cb_work); + } ++ ++#if defined(CONFIG_PNFSD) ++static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ nfsd4_cb_prepare_sequence(task, clr->clr_client); ++} ++ ++static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ struct nfs4_client *clp = clr->clr_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ if (!task->tk_status) ++ return; ++ ++ printk("%s: clp %p cb_client %p fp %p failed with status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ clr->clr_file, ++ task->tk_status); ++ ++ switch (task->tk_status) { ++ case -EIO: ++ /* Network partition? 
*/ ++ atomic_set(&clp->cl_cb_set, 0); ++ warn_no_callback_path(clp, task->tk_status); ++ /* FIXME: ++ * The pnfs standard states that we need to only expire ++ * the client after at-least "lease time", e.g. lease-time * 2 ++ * when failing to communicate a recall ++ */ ++ break; ++ case -NFS4ERR_DELAY: ++ /* Poll the client until it's done with the layout */ ++ rpc_delay(task, HZ/100); /* 10 milliseconds */ ++ task->tk_status = 0; ++ rpc_restart_call_prepare(task); ++ break; ++ case -NFS4ERR_NOMATCHING_LAYOUT: ++ task->tk_status = 0; ++ nomatching_layout(clr); ++ } ++} ++ ++static void nfsd4_cb_layout_release(void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ kfree(clr->clr_args); ++ clr->clr_args = NULL; ++ put_layoutrecall(clr); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_layout_ops = { ++ .rpc_call_prepare = nfsd4_cb_layout_prepare, ++ .rpc_call_done = nfsd4_cb_layout_done, ++ .rpc_release = nfsd4_cb_layout_release, ++}; ++ ++/* ++ * Called with state lock. ++ */ ++int ++nfsd4_cb_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfs4_client *clp = clr->clr_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT], ++ .rpc_cred = callback_cred ++ }; ++ int status; ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ clr->clr_args = args; ++ args->args_op = clr; ++ msg.rpc_argp = args; ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_layout_ops, clr); ++out: ++ if (status) { ++ kfree(args); ++ put_layoutrecall(clr); ++ } ++ dprintk("NFSD: nfsd4_cb_layout: status %d\n", status); ++ return status; ++} ++ ++static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ nfsd4_cb_prepare_sequence(task, cbnd->nd_client); ++} ++ ++static void nfsd4_cb_device_done(struct rpc_task *task, void
*calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ struct nfs4_client *clp = cbnd->nd_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ dprintk("%s: clp %p cb_client %p: status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ task->tk_status); ++ ++ if (task->tk_status == -EIO) { ++ /* Network partition? */ ++ atomic_set(&clp->cl_cb_set, 0); ++ warn_no_callback_path(clp, task->tk_status); ++ } ++} ++ ++static void nfsd4_cb_device_release(void *calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ kfree(cbnd->nd_args); ++ cbnd->nd_args = NULL; ++ kfree(cbnd); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_device_ops = { ++ .rpc_call_prepare = nfsd4_cb_device_prepare, ++ .rpc_call_done = nfsd4_cb_device_done, ++ .rpc_release = nfsd4_cb_device_release, ++}; ++ ++/* ++ * Called with state lock. ++ */ ++int ++nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd) ++{ ++ struct nfs4_client *clp = cbnd->nd_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE], ++ .rpc_cred = callback_cred ++ }; ++ int status = -EIO; ++ ++ dprintk("%s: clp %p\n", __func__, clp); ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ args->args_op = cbnd; ++ msg.rpc_argp = args; ++ ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_device_ops, cbnd); ++out: ++ if (status) ++ kfree(args); ++ dprintk("%s: status %d\n", __func__, status); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-31 20:42:05.556172071 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-31 20:42:05.556172071 -0400 +@@ -0,0 +1,1679 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network 
Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ *****************************************************************************/ ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Globals */ ++static u32 current_layoutid = 1; ++ ++/* ++ * Currently used for manipulating the layout state. 
++ */ ++static DEFINE_SPINLOCK(layout_lock); ++ ++#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP) ++# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock)) ++#else ++# define BUG_ON_UNLOCKED_LAYOUT() ++#endif ++ ++/* ++ * Layout state - NFSv4.1 pNFS ++ */ ++static struct kmem_cache *pnfs_layout_slab; ++static struct kmem_cache *pnfs_layoutrecall_slab; ++ ++/* hash table for nfsd4_pnfs_deviceid.sbid */ ++#define SBID_HASH_BITS 8 ++#define SBID_HASH_SIZE (1 << SBID_HASH_BITS) ++#define SBID_HASH_MASK (SBID_HASH_SIZE - 1) ++ ++struct sbid_tracker { ++ u64 id; ++ struct super_block *sb; ++ struct list_head hash; ++}; ++ ++static u64 current_sbid; ++static struct list_head sbid_hashtbl[SBID_HASH_SIZE]; ++ ++static inline unsigned long ++sbid_hashval(struct super_block *sb) ++{ ++ return hash_ptr(sb, SBID_HASH_BITS); ++} ++ ++static inline struct sbid_tracker * ++alloc_sbid(void) ++{ ++ return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL); ++} ++ ++static void ++destroy_sbid(struct sbid_tracker *sbid) ++{ ++ spin_lock(&layout_lock); ++ list_del(&sbid->hash); ++ spin_unlock(&layout_lock); ++ kfree(sbid); ++} ++ ++void ++nfsd4_free_pnfs_slabs(void) ++{ ++ int i; ++ struct sbid_tracker *sbid; ++ ++ nfsd4_free_slab(&pnfs_layout_slab); ++ nfsd4_free_slab(&pnfs_layoutrecall_slab); ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ while (!list_empty(&sbid_hashtbl[i])) { ++ sbid = list_first_entry(&sbid_hashtbl[i], ++ struct sbid_tracker, ++ hash); ++ destroy_sbid(sbid); ++ } ++ } ++} ++ ++int ++nfsd4_init_pnfs_slabs(void) ++{ ++ int i; ++ ++ pnfs_layout_slab = kmem_cache_create("pnfs_layouts", ++ sizeof(struct nfs4_layout), 0, 0, NULL); ++ if (pnfs_layout_slab == NULL) ++ return -ENOMEM; ++ pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls", ++ sizeof(struct nfs4_layoutrecall), 0, 0, NULL); ++ if (pnfs_layoutrecall_slab == NULL) ++ return -ENOMEM; ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ INIT_LIST_HEAD(&sbid_hashtbl[i]); ++ } ++ ++ 
return 0; ++} ++ ++/* XXX: Need to implement the notify types and track which ++ * clients have which devices. */ ++void pnfs_set_device_notify(clientid_t *clid, unsigned int types) ++{ ++ struct nfs4_client *clp; ++ dprintk("%s: -->\n", __func__); ++ ++ nfs4_lock_state(); ++ /* Indicate that client has a device so we can only notify ++ * the correct clients */ ++ clp = find_confirmed_client(clid); ++ if (clp) { ++ atomic_inc(&clp->cl_deviceref); ++ dprintk("%s: Incr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++ } ++ nfs4_unlock_state(); ++} ++ ++/* Clear notifications for this client ++ * XXX: Do we need to loop through a clean up all ++ * krefs when nfsd cleans up the client? */ ++void pnfs_clear_device_notify(struct nfs4_client *clp) ++{ ++ atomic_dec(&clp->cl_deviceref); ++ dprintk("%s: Decr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++} ++ ++static struct nfs4_layout_state * ++alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid) ++{ ++ struct nfs4_layout_state *new; ++ ++ /* FIXME: use a kmem_cache */ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return new; ++ get_nfs4_file(fp); ++ INIT_LIST_HEAD(&new->ls_perfile); ++ INIT_LIST_HEAD(&new->ls_layouts); ++ kref_init(&new->ls_ref); ++ new->ls_client = clp; ++ new->ls_file = fp; ++ new->ls_stateid.si_boot = stateid->si_boot; ++ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */ ++ new->ls_stateid.si_generation = 1; ++ spin_lock(&layout_lock); ++ new->ls_stateid.si_fileid = current_layoutid++; ++ list_add(&new->ls_perfile, &fp->fi_layout_states); ++ spin_unlock(&layout_lock); ++ return new; ++} ++ ++static inline void ++get_layout_state(struct nfs4_layout_state *ls) ++{ ++ kref_get(&ls->ls_ref); ++} ++ ++static void ++destroy_layout_state_common(struct nfs4_layout_state *ls) ++{ ++ struct nfs4_file *fp = ls->ls_file; ++ ++ dprintk("pNFS %s: ls %p fp %p clp 
%p\n", __func__, ls, fp, ++ ls->ls_client); ++ BUG_ON(!list_empty(&ls->ls_layouts)); ++ kfree(ls); ++ put_nfs4_file(fp); ++} ++ ++static void ++destroy_layout_state(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ spin_lock(&layout_lock); ++ list_del(&ls->ls_perfile); ++ spin_unlock(&layout_lock); ++ destroy_layout_state_common(ls); ++} ++ ++static void ++destroy_layout_state_locked(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ list_del(&ls->ls_perfile); ++ destroy_layout_state_common(ls); ++} ++ ++static inline void ++put_layout_state(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state); ++} ++ ++static inline void ++put_layout_state_locked(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state_locked); ++} ++ ++/* ++ * Search the fp->fi_layout_state list for a layout state with the clientid. ++ * If not found, then this is a 'first open/delegation/lock stateid' from ++ * the client for this file. ++ * Called under the layout_lock. 
++ */ ++static struct nfs4_layout_state * ++find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp) ++{ ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) { ++ if (ls->ls_client == clp) { ++ dprintk("pNFS %s: before GET ls %p ls_ref %d\n", ++ __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ get_layout_state(ls); ++ return ls; ++ } ++ } ++ return NULL; ++} ++ ++static __be32 ++verify_stateid(struct nfs4_file *fp, stateid_t *stateid) ++{ ++ struct nfs4_stateid *local = NULL; ++ struct nfs4_delegation *temp = NULL; ++ ++ /* check if open or lock stateid */ ++ local = find_stateid(stateid, RD_STATE); ++ if (local) ++ return 0; ++ temp = find_delegation_stateid(fp->fi_inode, stateid); ++ if (temp) ++ return 0; ++ return nfserr_bad_stateid; ++} ++ ++/* ++ * nfs4_preocess_layout_stateid () ++ * ++ * We have looked up the nfs4_file corresponding to the current_fh, and ++ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op() ++ * that make sense with a layout stateid. ++ * ++ * Called with the state_lock held ++ * Returns zero and stateid is updated, or error. ++ * ++ * Note: the struct nfs4_layout_state pointer is only set by layoutget. ++ */ ++static __be32 ++nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid, struct nfs4_layout_state **lsp) ++{ ++ struct nfs4_layout_state *ls = NULL; ++ __be32 status = 0; ++ ++ dprintk("--> %s clp %p fp %p \n", __func__, clp, fp); ++ ++ dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ goto out; ++ ++ /* Is this the first use of this layout ? */ ++ spin_lock(&layout_lock); ++ ls = find_get_layout_state(clp, fp); ++ spin_unlock(&layout_lock); ++ if (!ls) { ++ /* Only alloc layout state on layoutget (which sets lsp). 
*/ ++ if (!lsp) { ++ dprintk("%s ERROR: Not layoutget & no layout stateid\n", ++ __func__); ++ status = nfserr_bad_stateid; ++ goto out; ++ } ++ dprintk("%s Initial stateid for layout: file %p client %p\n", ++ __func__, fp, clp); ++ ++ /* verify input stateid */ ++ status = verify_stateid(fp, stateid); ++ if (status) { ++ dprintk("%s ERROR: invalid open/deleg/lock stateid\n", ++ __func__); ++ goto out; ++ } ++ ls = alloc_init_layout_state(clp, fp, stateid); ++ if (!ls) { ++ dprintk("%s pNFS ERROR: no memory for layout state\n", ++ __func__); ++ status = nfserr_resource; ++ goto out; ++ } ++ } else { ++ dprintk("%s Not initial stateid. Layout state %p file %p\n", ++ __func__, ls, fp); ++ ++ /* BAD STATEID */ ++ status = nfserr_bad_stateid; ++ if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque, ++ sizeof(stateid_opaque_t)) != 0) { ++ ++ /* if a LAYOUTGET operation and stateid is a valid ++ * open/deleg/lock stateid, accept it as a parallel ++ * initial layout stateid ++ */ ++ if (lsp && ((verify_stateid(fp, stateid)) == 0)) { ++ dprintk("%s parallel initial layout state\n", ++ __func__); ++ goto update; ++ } ++ ++ dprintk("%s ERROR bad opaque in stateid 1\n", __func__); ++ goto out_put; ++ } ++ ++ /* stateid is a valid layout stateid for this file. 
*/ ++ if (stateid->si_generation > ls->ls_stateid.si_generation) { ++ dprintk("%s bad stateid 1\n", __func__); ++ goto out_put; ++ } ++update: ++ update_stateid(&ls->ls_stateid); ++ dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", ++ __func__, ls->ls_stateid.si_generation, ls); ++ } ++ status = 0; ++ /* Set the stateid to be encoded */ ++ memcpy(stateid, &ls->ls_stateid, sizeof(stateid_t)); ++ ++ /* Return the layout state if requested */ ++ if (lsp) { ++ get_layout_state(ls); ++ *lsp = ls; ++ } ++ dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(&ls->ls_stateid)); ++out_put: ++ dprintk("%s PUT LO STATE:\n", __func__); ++ put_layout_state(ls); ++out: ++ dprintk("<-- %s status %d\n", __func__, htonl(status)); ++ ++ return status; ++} ++ ++static inline struct nfs4_layout * ++alloc_layout(void) ++{ ++ return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL); ++} ++ ++static inline void ++free_layout(struct nfs4_layout *lp) ++{ ++ kmem_cache_free(pnfs_layout_slab, lp); ++} ++ ++static void ++init_layout(struct nfs4_layout_state *ls, ++ struct nfs4_layout *lp, ++ struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__, ++ ls, lp, clp, fp, fp->fi_inode); ++ ++ get_nfs4_file(fp); ++ lp->lo_client = clp; ++ lp->lo_file = fp; ++ get_layout_state(ls); ++ lp->lo_state = ls; ++ memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg)); ++ spin_lock(&layout_lock); ++ list_add_tail(&lp->lo_perstate, &ls->ls_layouts); ++ list_add_tail(&lp->lo_perclnt, &clp->cl_layouts); ++ list_add_tail(&lp->lo_perfile, &fp->fi_layouts); ++ spin_unlock(&layout_lock); ++ dprintk("pNFS %s end\n", __func__); ++} ++ ++static void ++dequeue_layout(struct nfs4_layout *lp) ++{ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del(&lp->lo_perclnt); ++ list_del(&lp->lo_perfile); ++ list_del(&lp->lo_perstate); ++} ++ ++static void ++destroy_layout(struct nfs4_layout *lp) ++{ 
++ struct nfs4_client *clp; ++ struct nfs4_file *fp; ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ clp = lp->lo_client; ++ fp = lp->lo_file; ++ ls = lp->lo_state; ++ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n", ++ __func__, lp, clp, fp, fp->fi_inode, ++ list_empty(&ls->ls_layouts)); ++ ++ kmem_cache_free(pnfs_layout_slab, lp); ++ /* release references taken by init_layout */ ++ put_layout_state_locked(ls); ++ put_nfs4_file(fp); ++} ++ ++void fs_layout_return(struct super_block *sb, struct inode *ino, ++ struct nfsd4_pnfs_layoutreturn *lrp, int flags, ++ void *recall_cookie) ++{ ++ int ret; ++ ++ if (unlikely(!sb->s_pnfs_op->layout_return)) ++ return; ++ ++ lrp->lr_flags = flags; ++ lrp->args.lr_cookie = recall_cookie; ++ ++ if (!ino) /* FSID or ALL */ ++ ino = sb->s_root->d_inode; ++ ++ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args); ++ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx " ++ "cookie = %p flags 0x%x status=%d\n", ++ __func__, ino->i_ino, lrp->args.lr_seg.iomode, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, ++ recall_cookie, flags, ret); ++} ++ ++static u64 ++alloc_init_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ struct sbid_tracker *new = alloc_sbid(); ++ unsigned long hash_idx = sbid_hashval(sb); ++ u64 id = 0; ++ ++ if (likely(new)) { ++ spin_lock(&layout_lock); ++ id = ++current_sbid; ++ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK); ++ id = new->id; ++ BUG_ON(id == 0); ++ new->sb = sb; ++ ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) ++ if (sbid->sb == sb) { ++ kfree(new); ++ id = sbid->id; ++ spin_unlock(&layout_lock); ++ return id; ++ } ++ list_add(&new->hash, &sbid_hashtbl[hash_idx]); ++ spin_unlock(&layout_lock); ++ } ++ return id; ++} ++ ++struct super_block * ++find_sbid_id(u64 id) ++{ ++ struct sbid_tracker *sbid; ++ struct super_block *sb = NULL; ++ unsigned long hash_idx = id & SBID_HASH_MASK; ++ int pos = 0; ++ ++ 
spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->id != id) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ sb = sbid->sb; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return sb; ++} ++ ++u64 ++find_create_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ unsigned long hash_idx = sbid_hashval(sb); ++ int pos = 0; ++ u64 id = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->sb != sb) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ id = sbid->id; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ if (!id) ++ id = alloc_init_sbid(sb); ++ ++ return id; ++} ++ ++/* ++ * Create a layoutrecall structure ++ * An optional layoutrecall can be cloned (except for the layoutrecall lists) ++ */ ++static struct nfs4_layoutrecall * ++alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_client *clp, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ dprintk("NFSD %s\n", __func__); ++ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL); ++ if (clr == NULL) ++ return clr; ++ ++ dprintk("NFSD %s -->\n", __func__); ++ ++ memset(clr, 0, sizeof(*clr)); ++ if (lrfile) ++ get_nfs4_file(lrfile); ++ clr->clr_client = clp; ++ clr->clr_file = lrfile; ++ clr->cb = *cbl; ++ ++ kref_init(&clr->clr_ref); ++ INIT_LIST_HEAD(&clr->clr_perclnt); ++ ++ dprintk("NFSD %s return %p\n", __func__, clr); ++ return clr; ++} ++ ++static void ++get_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ kref_get(&clr->clr_ref); ++} ++ ++static void ++destroy_layoutrecall(struct kref *kref) ++{ ++ struct nfs4_layoutrecall *clr = ++ container_of(kref, struct nfs4_layoutrecall, clr_ref); ++ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr, ++ 
clr->clr_file, clr->clr_client); ++ BUG_ON(!list_empty(&clr->clr_perclnt)); ++ if (clr->clr_file) ++ put_nfs4_file(clr->clr_file); ++ kmem_cache_free(pnfs_layoutrecall_slab, clr); ++} ++ ++int ++put_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ return kref_put(&clr->clr_ref, destroy_layoutrecall); ++} ++ ++void * ++layoutrecall_done(struct nfs4_layoutrecall *clr) ++{ ++ void *recall_cookie = clr->cb.cbl_cookie; ++ struct nfs4_layoutrecall *parent = clr->parent; ++ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del_init(&clr->clr_perclnt); ++ put_layoutrecall(clr); ++ ++ if (parent && !put_layoutrecall(parent)) ++ recall_cookie = NULL; ++ ++ return recall_cookie; ++} ++ ++/* ++ * get_state() and cb_get_state() are ++ */ ++void ++release_pnfs_ds_dev_list(struct nfs4_stateid *stp) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ while (!list_empty(&stp->st_pnfs_ds_id)) { ++ ddp = list_entry(stp->st_pnfs_ds_id.next, ++ struct pnfs_ds_dev_entry, dd_dev_entry); ++ list_del(&ddp->dd_dev_entry); ++ kfree(ddp); ++ } ++} ++ ++static int ++nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL); ++ if (!ddp) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&ddp->dd_dev_entry); ++ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id); ++ ddp->dd_dsid = dsid; ++ return 0; ++} ++ ++/* ++ * are two octet ranges overlapping? 
++ * start1 last1 ++ * |-----------------| ++ * start2 last2 ++ * |----------------| ++ */ ++static inline int ++lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 last1 = last_byte_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 last2 = last_byte_offset(start2, l2->length); ++ int ret; ++ ++ /* if last1 == start2 there's a single byte overlap */ ++ ret = (last2 >= start1) && (last1 >= start2); ++ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__, ++ l1->offset, l1->length, l2->offset, l2->length, ret); ++ return ret; ++} ++ ++static inline int ++same_fsid_major(struct nfs4_fsid *fsid, u64 major) ++{ ++ return fsid->major == major; ++} ++ ++static inline int ++same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh) ++{ ++ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid); ++} ++ ++/* ++ * find a layout recall conflicting with the specified layoutget ++ */ ++static int ++is_layout_recalled(struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != seg->layout_type) ++ continue; ++ if (clr->cb.cbl_recall_type == RETURN_ALL) ++ goto found; ++ if (clr->cb.cbl_recall_type == RETURN_FSID) { ++ if (same_fsid(&clr->cb.cbl_fsid, current_fh)) ++ goto found; ++ else ++ continue; ++ } ++ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE); ++ if (clr->cb.cbl_seg.clientid == seg->clientid && ++ lo_seg_overlapping(&clr->cb.cbl_seg, seg)) ++ goto found; ++ } ++ spin_unlock(&layout_lock); ++ return 0; ++found: ++ spin_unlock(&layout_lock); ++ return 1; ++} ++ ++/* ++ * are two octet ranges overlapping or adjacent? 
++ */ ++static inline int ++lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ /* if end1 == start2 the ranges are adjacent */ ++ return (end2 >= start1) && (end1 >= start2); ++} ++ ++static void ++extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lg_start = lg->offset; ++ u64 lg_end = end_offset(lg_start, lg->length); ++ ++ /* lo already covers lg? */ ++ if (lo_start <= lg_start && lg_end <= lo_end) ++ return; ++ ++ /* extend start offset */ ++ if (lo_start > lg_start) ++ lo_start = lg_start; ++ ++ /* extend end offset */ ++ if (lo_end < lg_end) ++ lo_end = lg_end; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? ++ lo_end : lo_end - lo_start; ++} ++ ++static struct nfs4_layout * ++merge_layout(struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layout *lp = NULL; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (lp, &fp->fi_layouts, lo_perfile) ++ if (lp->lo_seg.layout_type == seg->layout_type && ++ lp->lo_seg.clientid == seg->clientid && ++ lp->lo_seg.iomode == seg->iomode && ++ lo_seg_mergeable(&lp->lo_seg, seg)) { ++ extend_layout(&lp->lo_seg, seg); ++ break; ++ } ++ spin_unlock(&layout_lock); ++ /* Without a break, lp aliases the list head rather than a layout. */ ++ return (&lp->lo_perfile != &fp->fi_layouts) ? lp : NULL; ++} ++ ++__be32 ++nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp, ++ struct exp_xdr_stream *xdr) ++{ ++ u32 status; ++ __be32 nfserr; ++ struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode; ++ struct super_block *sb = ino->i_sb; ++ int can_merge; ++ struct nfs4_file *fp; ++ struct nfs4_client *clp; ++ struct nfs4_layout *lp = NULL; ++ struct nfs4_layout_state *ls = NULL; ++ struct nfsd4_pnfs_layoutget_arg args = { ++ .lg_minlength = lgp->lg_minlength, ++ .lg_fh =
&lgp->lg_fhp->fh_handle, ++ }; ++ struct nfsd4_pnfs_layoutget_res res = { ++ .lg_seg = lgp->lg_seg, ++ }; ++ ++ dprintk("NFSD: %s Begin\n", __func__); ++ ++ args.lg_sbid = find_create_sbid(sb); ++ if (!args.lg_sbid) { ++ nfserr = nfserr_layouttrylater; ++ goto out; ++ } ++ ++ can_merge = sb->s_pnfs_op->can_merge_layouts != NULL && ++ sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type); ++ ++ nfs4_lock_state(); ++ fp = find_alloc_file(ino, lgp->lg_fhp); ++ clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid); ++ dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp); ++ if (!fp || !clp) { ++ nfserr = nfserr_inval; ++ goto out_unlock; ++ } ++ ++ /* Check decoded layout stateid */ ++ nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls); ++ if (nfserr) ++ goto out_unlock; ++ ++ if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) { ++ nfserr = nfserr_recallconflict; ++ goto out_unlock; /* must drop the state lock taken above */ ++ } ++ ++ /* pre-alloc layout in case we can't merge after we call ++ * the file system ++ */ ++ lp = alloc_layout(); ++ if (!lp) { ++ nfserr = nfserr_layouttrylater; ++ goto out_unlock; ++ } ++ ++ dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd " ++ "iomode %u offset %llu length %llu\n", ++ __func__, lgp->lg_seg.layout_type, ++ exp_xdr_qbytes(xdr->end - xdr->p), ++ lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length); ++ ++ /* FIXME: need to eliminate the use of the state lock */ ++ nfs4_unlock_state(); ++ status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res); ++ nfs4_lock_state(); ++ ++ dprintk("pNFS %s: post-export status %u " ++ "iomode %u offset %llu length %llu\n", ++ __func__, status, res.lg_seg.iomode, ++ res.lg_seg.offset, res.lg_seg.length); ++ ++ /* ++ * The allowable error codes for the layout_get pNFS export ++ * operations vector function (from the file system) can be ++ * expanded as needed to include other errors defined for ++ * the RFC 5661 LAYOUTGET operation.
++ */ ++ switch (status) { ++ case 0: ++ nfserr = NFS4_OK; ++ break; ++ case NFS4ERR_ACCESS: ++ case NFS4ERR_BADIOMODE: ++ /* No support for LAYOUTIOMODE4_RW layouts */ ++ case NFS4ERR_BADLAYOUT: ++ /* No layout matching loga_minlength rules */ ++ case NFS4ERR_INVAL: ++ case NFS4ERR_IO: ++ case NFS4ERR_LAYOUTTRYLATER: ++ case NFS4ERR_LAYOUTUNAVAILABLE: ++ case NFS4ERR_LOCKED: ++ case NFS4ERR_NOSPC: ++ case NFS4ERR_RECALLCONFLICT: ++ case NFS4ERR_SERVERFAULT: ++ case NFS4ERR_TOOSMALL: ++ /* Requested layout too big for loga_maxcount */ ++ case NFS4ERR_WRONG_TYPE: ++ /* Not a regular file */ ++ nfserr = cpu_to_be32(status); ++ goto out_freelayout; ++ default: ++ BUG(); ++ nfserr = nfserr_serverfault; ++ } ++ ++ lgp->lg_seg = res.lg_seg; ++ lgp->lg_roc = res.lg_return_on_close; ++ ++ /* SUCCESS! ++ * Can the new layout be merged into an existing one? ++ * If so, free unused layout struct ++ */ ++ if (can_merge && merge_layout(fp, clp, &res.lg_seg)) ++ goto out_freelayout; ++ ++ /* Can't merge, so let's initialize this new layout */ ++ init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg); ++out_unlock: ++ if (ls) ++ put_layout_state(ls); ++ if (fp) ++ put_nfs4_file(fp); ++ nfs4_unlock_state(); ++out: ++ dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp, ++ be32_to_cpu(nfserr)); ++ return nfserr; ++out_freelayout: ++ free_layout(lp); ++ goto out_unlock; ++} ++ ++static void ++trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lr_start = lr->offset; ++ u64 lr_end = end_offset(lr_start, lr->length); ++ ++ dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__, ++ lo->offset, lo->length, lr->offset, lr->length); ++ ++ /* lr fully covers lo? */ ++ if (lr_start <= lo_start && lo_end <= lr_end) { ++ lo->length = 0; ++ goto out; ++ } ++ ++ /* ++ * split not supported yet. retain layout segment. 
++ * remains must be returned by the client ++ * on the final layout return. ++ */ ++ if (lo_start < lr_start && lr_end < lo_end) { ++ dprintk("%s: split not supported\n", __func__); ++ goto out; ++ } ++ ++ if (lo_start < lr_start) ++ lo_end = lr_start - 1; ++ else /* lr_end < lo_end */ ++ lo_start = lr_end + 1; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start; ++out: ++ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length); ++} ++ ++static int ++pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ dprintk("%s: clp %p fp %p\n", __func__, clp, fp); ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) { ++ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n", ++ __func__, lp, ++ lp->lo_client, clp, ++ lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type, ++ lp->lo_seg.iomode, lrp->args.lr_seg.iomode); ++ if (lp->lo_client != clp || ++ lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type || ++ (lp->lo_seg.iomode != lrp->args.lr_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) || ++ !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg)) ++ continue; ++ layouts_found++; ++ trim_layout(&lp->lo_seg, &lrp->args.lr_seg); ++ if (!lp->lo_seg.length) { ++ lrp->lrs_present = 0; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++pnfs_return_client_layouts(struct nfs4_client *clp, ++ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) { ++ if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type || ++ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode && ++ lrp->args.lr_seg.iomode != 
IOMODE_ANY)) ++ continue; ++ ++ if (lrp->args.lr_return_type == RETURN_FSID && ++ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid)) ++ continue; ++ ++ layouts_found++; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++recall_return_perfect_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode || ++ clr->cb.cbl_recall_type != lrp->args.lr_return_type) ++ return 0; ++ ++ return (clr->cb.cbl_recall_type == RETURN_FILE && ++ clr->clr_file == fp && ++ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset && ++ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) || ++ ++ (clr->cb.cbl_recall_type == RETURN_FSID && ++ same_fsid(&clr->cb.cbl_fsid, current_fh)) || ++ ++ clr->cb.cbl_recall_type == RETURN_ALL; ++} ++ ++static int ++recall_return_partial_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ /* iomode matching? */ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode && ++ clr->cb.cbl_seg.iomode != IOMODE_ANY && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) ++ return 0; ++ ++ if (clr->cb.cbl_recall_type == RETURN_ALL || ++ lrp->args.lr_return_type == RETURN_ALL) ++ return 1; ++ ++ /* fsid matches? */ ++ if (clr->cb.cbl_recall_type == RETURN_FSID || ++ lrp->args.lr_return_type == RETURN_FSID) ++ return same_fsid(&clr->cb.cbl_fsid, current_fh); ++ ++ /* file matches, range overlapping? 
*/ ++ return clr->clr_file == fp && ++ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg); ++} ++ ++int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status = 0; ++ int layouts_found = 0; ++ struct inode *ino = current_fh->fh_dentry->d_inode; ++ struct nfs4_file *fp = NULL; ++ struct nfs4_client *clp; ++ struct nfs4_layoutrecall *clr, *nextclr; ++ u64 ex_fsid = current_fh->fh_export->ex_fsid; ++ void *recall_cookie = NULL; ++ ++ dprintk("NFSD: %s\n", __func__); ++ ++ nfs4_lock_state(); ++ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid); ++ if (!clp) ++ goto out; ++ ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ fp = find_file(ino); ++ if (!fp) { ++ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for " ++ "ino %p:%lu\n", ++ __func__, ino, ino ? ino->i_ino : 0L); ++ goto out; ++ } ++ ++ /* Check the stateid */ ++ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino); ++ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, ++ NULL); ++ if (status) ++ goto out_put_file; ++ ++ /* update layouts */ ++ layouts_found = pnfs_return_file_layouts(clp, fp, lrp); ++ /* optimize for the all-empty case */ ++ if (list_empty(&fp->fi_layouts)) ++ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ } else { ++ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid); ++ } ++ ++ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d " ++ "return_type %d fsid 0x%llx offset %llu length %llu: " ++ "layouts_found %d\n", ++ __func__, clp, fp, lrp->args.lr_seg.layout_type, ++ lrp->args.lr_seg.iomode, lrp->args.lr_return_type, ++ ex_fsid, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found); ++ ++ /* update layoutrecalls ++ * note: for RETURN_{FSID,ALL}, fp may be NULL ++ */ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls, ++ clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != 
lrp->args.lr_seg.layout_type) ++ continue; ++ ++ if (recall_return_perfect_match(clr, lrp, fp, current_fh)) ++ recall_cookie = layoutrecall_done(clr); ++ else if (layouts_found && ++ recall_return_partial_match(clr, lrp, fp, current_fh)) ++ clr->clr_time = CURRENT_TIME; ++ } ++ spin_unlock(&layout_lock); ++ ++out_put_file: ++ if (fp) ++ put_nfs4_file(fp); ++out: ++ nfs4_unlock_state(); ++ ++ /* call exported filesystem layout_return (ignore return-code) */ ++ fs_layout_return(sb, ino, lrp, 0, recall_cookie); ++ ++ dprintk("pNFS %s: exit status %d \n", __func__, status); ++ return status; ++} ++ ++/* ++ * PNFS Metadata server export operations callback for get_state ++ * ++ * called by the cluster fs when it receives a get_state() from a data ++ * server. ++ * returns status, or pnfs_get_state* with pnfs_get_state->status set. ++ * ++ */ ++int ++nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg) ++{ ++ struct nfs4_stateid *stp; ++ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */ ++ int status = -EINVAL; ++ struct inode *ino; ++ struct nfs4_delegation *dl; ++ stateid_t *stid = (stateid_t *)&arg->stid; ++ ++ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__, ++ STATEID_VAL(stid), arg->ino); ++ ++ nfs4_lock_state(); ++ stp = find_stateid(stid, flags); ++ if (!stp) { ++ ino = iget_locked(sb, arg->ino); ++ if (!ino) ++ goto out; ++ ++ if (ino->i_state & I_NEW) { ++ iget_failed(ino); ++ goto out; ++ } ++ ++ dl = find_delegation_stateid(ino, stid); ++ if (dl) ++ status = 0; ++ ++ iput(ino); ++ } else { ++ /* XXX ANDROS: marc removed nfs4_check_fh - how come? 
*/ ++ ++ /* arg->devid is the Data server id, set by the cluster fs */ ++ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid); ++ if (status) ++ goto out; ++ ++ arg->access = stp->st_access_bmap; ++ *(clientid_t *)&arg->clid = ++ stp->st_stateowner->so_client->cl_clientid; ++ } ++out: ++ nfs4_unlock_state(); ++ return status; ++} ++ ++static int ++cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile, ++ stateid_t *lsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ struct nfs4_layout_state *ls; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) { ++ if (lp->lo_file != lrfile) ++ continue; ++ ++ ls = find_get_layout_state(clp, lrfile); ++ if (!ls) { ++ /* This shouldn't happen as the file should have a ++ * layout stateid if it has a layout. ++ */ ++ printk(KERN_ERR "%s: file %p has no layout stateid\n", ++ __func__, lrfile); ++ WARN_ON(1); ++ break; ++ } ++ update_stateid(&ls->ls_stateid); ++ memcpy(lsid, &ls->ls_stateid, sizeof(stateid_t)); ++ put_layout_state_locked(ls); ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ return found; ++} ++ ++static int ++cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ ++ /* note: minor version unused */ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) ++ if (lp->lo_file->fi_fsid.major == fsid->major) { ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return found; ++} ++ ++static int ++cl_has_any_layout(struct nfs4_client *clp) ++{ ++ return !list_empty(&clp->cl_layouts); ++} ++ ++static int ++cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile, stateid_t *lsid) ++{ ++ switch (cbl->cbl_recall_type) { ++ case RETURN_FILE: ++ return cl_has_file_layout(clp, lrfile, lsid); ++ case RETURN_FSID: ++ return cl_has_fsid_layout(clp, &cbl->cbl_fsid); ++ default: ++ return cl_has_any_layout(clp); ++ } ++} 
++ ++/* ++ * Called without the layout_lock. ++ */ ++void ++nomatching_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfsd4_pnfs_layoutreturn lr = { ++ .args.lr_return_type = clr->cb.cbl_recall_type, ++ .args.lr_seg = clr->cb.cbl_seg, ++ }; ++ struct inode *inode; ++ void *recall_cookie; ++ ++ if (clr->clr_file) { ++ inode = igrab(clr->clr_file->fi_inode); ++ if (WARN_ON(!inode)) ++ return; ++ } else { ++ inode = NULL; ++ } ++ ++ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__, ++ clr->clr_client, clr->clr_file); ++ ++ if (clr->cb.cbl_recall_type == RETURN_FILE) ++ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr); ++ else ++ pnfs_return_client_layouts(clr->clr_client, &lr, ++ clr->cb.cbl_fsid.major); ++ ++ spin_lock(&layout_lock); ++ recall_cookie = layoutrecall_done(clr); ++ spin_unlock(&layout_lock); ++ ++ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN, ++ recall_cookie); ++ iput(inode); ++} ++ ++void pnfs_expire_client(struct nfs4_client *clp) ++{ ++ for (;;) { ++ struct nfs4_layoutrecall *lrp = NULL; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layoutrecalls)) { ++ lrp = list_entry(clp->cl_layoutrecalls.next, ++ struct nfs4_layoutrecall, clr_perclnt); ++ get_layoutrecall(lrp); ++ } ++ spin_unlock(&layout_lock); ++ if (!lrp) ++ break; ++ ++ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file); ++ BUG_ON(lrp->clr_client != clp); ++ nomatching_layout(lrp); ++ put_layoutrecall(lrp); ++ } ++ ++ for (;;) { ++ struct nfs4_layout *lp = NULL; ++ struct inode *inode = NULL; ++ struct nfsd4_pnfs_layoutreturn lr; ++ bool empty = false; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layouts)) { ++ lp = list_entry(clp->cl_layouts.next, ++ struct nfs4_layout, lo_perclnt); ++ inode = igrab(lp->lo_file->fi_inode); ++ memset(&lr, 0, sizeof(lr)); ++ lr.args.lr_return_type = RETURN_FILE; ++ lr.args.lr_seg = lp->lo_seg; ++ empty = list_empty(&lp->lo_file->fi_layouts); ++ BUG_ON(lp->lo_client != clp); ++ 
dequeue_layout(lp); ++ destroy_layout(lp); /* do not access lp after this */ ++ } ++ spin_unlock(&layout_lock); ++ if (!lp) ++ break; ++ ++ if (WARN_ON(!inode)) ++ break; ++ ++ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino, ++ lp, clp); ++ ++ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE, ++ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL); ++ iput(inode); ++ } ++} ++ ++struct create_recall_list_arg { ++ struct nfsd4_pnfs_cb_layout *cbl; ++ struct nfs4_file *lrfile; ++ struct list_head *todolist; ++ unsigned todo_count; ++}; ++ ++/* ++ * look for matching layout for the given client ++ * and add a pending layout recall to the todo list ++ * if found any. ++ * returns: ++ * 0 if layouts found or negative error. ++ */ ++static int ++lo_recall_per_client(struct nfs4_client *clp, void *p) ++{ ++ stateid_t lsid; ++ struct nfs4_layoutrecall *pending; ++ struct create_recall_list_arg *arg = p; ++ ++ memset(&lsid, 0, sizeof(lsid)); ++ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid)) ++ return 0; ++ ++ /* Matching put done by layoutreturn */ ++ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile); ++ /* out of memory, drain todo queue */ ++ if (!pending) ++ return -ENOMEM; ++ ++ *(stateid_t *)&pending->cb.cbl_sid = lsid; ++ list_add(&pending->clr_perclnt, arg->todolist); ++ arg->todo_count++; ++ return 0; ++} ++ ++/* Create a layoutrecall structure for each client based on the ++ * original structure. 
*/ ++int ++create_layout_recall_list(struct list_head *todolist, unsigned *todo_len, ++ struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_client *clp; ++ struct create_recall_list_arg arg = { ++ .cbl = cbl, ++ .lrfile = lrfile, ++ .todolist = todolist, ++ }; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* If client given by fs, just do single client */ ++ if (cbl->cbl_seg.clientid) { ++ clp = find_confirmed_client( ++ (clientid_t *)&cbl->cbl_seg.clientid); ++ if (!clp) { ++ status = -ENOENT; ++ dprintk("%s: clientid %llx not found\n", __func__, ++ (unsigned long long)cbl->cbl_seg.clientid); ++ goto out; ++ } ++ ++ status = lo_recall_per_client(clp, &arg); ++ } else { ++ /* Check all clients for layout matches */ ++ status = filter_confirmed_clients(lo_recall_per_client, &arg); ++ } ++ ++out: ++ *todo_len = arg.todo_count; ++ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status); ++ return status; ++} ++ ++/* ++ * Recall layouts asynchronously ++ * Called with state lock. ++ */ ++static int ++spawn_layout_recall(struct super_block *sb, struct list_head *todolist, ++ unsigned todo_len) ++{ ++ struct nfs4_layoutrecall *pending; ++ struct nfs4_layoutrecall *parent = NULL; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ if (todo_len > 1) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ ++ parent = alloc_init_layoutrecall(&pending->cb, NULL, ++ pending->clr_file); ++ if (unlikely(!parent)) { ++ /* We want forward progress. If parent cannot be ++ * allocated take the first one as parent but don't ++ * execute it. Caller must check for -EAGAIN, if so ++ * When the partial recalls return, ++ * nfsd_layout_recall_cb should be called again. 
++ */ ++ list_del_init(&pending->clr_perclnt); ++ if (todo_len > 2) { ++ parent = pending; ++ } else { ++ parent = NULL; ++ put_layoutrecall(pending); ++ } ++ --todo_len; ++ status = -ENOMEM; ++ } ++ } ++ ++ while (!list_empty(todolist)) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ list_del_init(&pending->clr_perclnt); ++ dprintk("%s: clp %p cb_client %p fp %p\n", __func__, ++ pending->clr_client, ++ pending->clr_client->cl_cb_client, ++ pending->clr_file); ++ if (unlikely(!pending->clr_client->cl_cb_client)) { ++ printk(KERN_INFO ++ "%s: clientid %08x/%08x has no callback path\n", ++ __func__, ++ pending->clr_client->cl_clientid.cl_boot, ++ pending->clr_client->cl_clientid.cl_id); ++ put_layoutrecall(pending); ++ continue; ++ } ++ ++ pending->clr_time = CURRENT_TIME; ++ pending->clr_sb = sb; ++ if (parent) { ++ /* If we created a parent its initial ref count is 1. ++ * We will need to de-ref it eventually. So we just ++ * don't increment on behalf of the last one. 
++ */ ++ if (todo_len != 1) ++ get_layoutrecall(parent); ++ } ++ pending->parent = parent; ++ get_layoutrecall(pending); ++ /* Add to list so corresponding layoutreturn can find req */ ++ list_add(&pending->clr_perclnt, ++ &pending->clr_client->cl_layoutrecalls); ++ ++ nfsd4_cb_layout(pending); ++ --todo_len; ++ } ++ ++ return status; ++} ++ ++/* ++ * Spawn a thread to perform a recall layout ++ * ++ */ ++int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode, ++ struct nfsd4_pnfs_cb_layout *cbl) ++{ ++ int status; ++ struct nfs4_file *lrfile = NULL; ++ struct list_head todolist; ++ unsigned todo_len = 0; ++ ++ dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl); ++ BUG_ON(!cbl); ++ BUG_ON(cbl->cbl_recall_type != RETURN_FILE && ++ cbl->cbl_recall_type != RETURN_FSID && ++ cbl->cbl_recall_type != RETURN_ALL); ++ BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode); ++ BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ && ++ cbl->cbl_seg.iomode != IOMODE_RW && ++ cbl->cbl_seg.iomode != IOMODE_ANY); ++ ++ if (nfsd_serv == NULL) { ++ dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n"); ++ return -ENOENT; ++ } ++ ++ nfs4_lock_state(); ++ status = -ENOENT; ++ if (inode) { ++ lrfile = find_file(inode); ++ if (!lrfile) { ++ dprintk("NFSD nfsd_layout_recall_cb: " ++ "nfs4_file not found\n"); ++ goto err; ++ } ++ if (cbl->cbl_recall_type == RETURN_FSID) ++ cbl->cbl_fsid = lrfile->fi_fsid; ++ } ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ /* If no cookie provided by FS, return a default one */ ++ if (!cbl->cbl_cookie) ++ cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ ++ status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile); ++ if (list_empty(&todolist)) { ++ status = -ENOENT; ++ } else { ++ /* process todolist even if create_layout_recall_list ++ * returned an error */ ++ int status2 = spawn_layout_recall(sb, &todolist, todo_len); ++ if (status2) ++ status = status2; ++ } ++ ++err: ++ nfs4_unlock_state(); ++ if (lrfile) ++ 
put_nfs4_file(lrfile); ++ return (todo_len && status) ? -EAGAIN : status; ++} ++ ++struct create_device_notify_list_arg { ++ struct list_head *todolist; ++ struct nfsd4_pnfs_cb_dev_list *ndl; ++}; ++ ++static int ++create_device_notify_per_cl(struct nfs4_client *clp, void *p) ++{ ++ struct nfs4_notify_device *cbnd; ++ struct create_device_notify_list_arg *arg = p; ++ ++ if (atomic_read(&clp->cl_deviceref) <= 0) ++ return 0; ++ ++ cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL); ++ if (!cbnd) ++ return -ENOMEM; ++ ++ cbnd->nd_list = arg->ndl; ++ cbnd->nd_client = clp; ++ list_add(&cbnd->nd_perclnt, arg->todolist); ++ return 0; ++} ++ ++/* Create a list of clients to send device notifications. */ ++int ++create_device_notify_list(struct list_head *todolist, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ int status; ++ struct create_device_notify_list_arg arg = { ++ .todolist = todolist, ++ .ndl = ndl, ++ }; ++ ++ nfs4_lock_state(); ++ status = filter_confirmed_clients(create_device_notify_per_cl, &arg); ++ nfs4_unlock_state(); ++ ++ return status; ++} ++ ++/* ++ * For each client that a device, send a device notification. ++ * XXX: Need to track which clients have which devices. 
++ */ ++int nfsd_device_notify_cb(struct super_block *sb, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ struct nfs4_notify_device *cbnd; ++ unsigned int notify_num = 0; ++ int status2, status = 0; ++ struct list_head todolist; ++ ++ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list); ++ ++ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len); ++ ++ if (nfsd_serv == NULL) ++ return -ENOENT; ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ status = create_device_notify_list(&todolist, ndl); ++ ++ while (!list_empty(&todolist)) { ++ cbnd = list_entry(todolist.next, struct nfs4_notify_device, ++ nd_perclnt); ++ list_del_init(&cbnd->nd_perclnt); ++ status2 = nfsd4_cb_notify_device(cbnd); ++ pnfs_clear_device_notify(cbnd->nd_client); ++ if (status2) { ++ kfree(cbnd); ++ status = status2; ++ } ++ notify_num++; ++ } ++ ++ dprintk("NFSD %s: status %d clients %u\n", ++ __func__, status, notify_num); ++ return status; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-31 20:42:05.557222774 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-31 20:42:05.557222774 -0400 +@@ -0,0 +1,461 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsfh.h" ++#include "nfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Just use a linked list. Do not expect more than 32 dlm_device_entries ++ * the first implementation will just use one device per cluster file system ++ */ ++ ++static LIST_HEAD(dlm_device_list); ++static DEFINE_SPINLOCK(dlm_device_list_lock); ++ ++struct dlm_device_entry { ++ struct list_head dlm_dev_list; ++ char disk_name[DISK_NAME_LEN]; ++ int num_ds; ++ char ds_list[NFSD_DLM_DS_LIST_MAX]; ++}; ++ ++static struct dlm_device_entry * ++_nfsd4_find_pnfs_dlm_device(char *disk_name) ++{ ++ struct dlm_device_entry *dlm_pdev; ++ ++ dprintk("--> %s disk name %s\n", __func__, disk_name); ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) { ++ dprintk("%s Look for dlm_pdev %s\n", __func__, ++ dlm_pdev->disk_name); ++ if (!memcmp(dlm_pdev->disk_name, disk_name, strlen(disk_name))) { ++ spin_unlock(&dlm_device_list_lock); ++ return dlm_pdev; ++ } ++ } ++ spin_unlock(&dlm_device_list_lock); ++ return NULL; ++} ++ ++static struct dlm_device_entry * ++nfsd4_find_pnfs_dlm_device(struct super_block *sb) { ++ char dname[BDEVNAME_SIZE]; ++ ++ bdevname(sb->s_bdev, dname); ++ return _nfsd4_find_pnfs_dlm_device(dname); ++} ++ ++ssize_t 
++nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen) ++{ ++ char *pos = buf; ++ ssize_t size = 0; ++ struct dlm_device_entry *dlm_pdev; ++ int ret = -EINVAL; ++ ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) ++ { ++ int advanced; ++ advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list); ++ if (advanced >= buflen - size) ++ goto out; ++ size += advanced; ++ pos += advanced; ++ } ++ ret = size; ++ ++out: ++ spin_unlock(&dlm_device_list_lock); ++ return ret; ++} ++ ++bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds) ++{ ++ char *start = ds_list; ++ ++ *num_ds = 0; ++ ++ while (*start) { ++ struct sockaddr_storage tempAddr; ++ int ipLen = strcspn(start, ","); ++ ++ if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr))) ++ return false; ++ (*num_ds)++; ++ start += ipLen + (start[ipLen] == ','); /* skip ',' but never the NUL */ ++ } ++ return true; ++} ++ ++/* ++ * pnfs_dlm_device string format: ++ * block-device-path:ds1-ipv4-address,ds2-ipv4-address[,...] ++ * ++ * Examples ++ * /dev/sda:192.168.1.96,192.168.1.97' creates a data server list with ++ * two data servers for the dlm cluster file system mounted on /dev/sda. ++ * ++ * /dev/sda:192.168.1.96,192.168.1.100' ++ * replaces the data server list for /dev/sda ++ * ++ * Only the deviceid == 1 is supported. Can add device id to ++ * pnfs_dlm_device string when needed. ++ * ++ * Only the round robin each data server once stripe index is supported. ++ */ ++int ++nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len) ++ ++{ ++ struct dlm_device_entry *new, *found; ++ char *bufp = pnfs_dlm_device; ++ char *endp = bufp + strlen(bufp); ++ int err = -ENOMEM; ++ ++ dprintk("--> %s len %d\n", __func__, len); ++ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return err; ++ ++ err = -EINVAL; ++ /* disk_name */ ++ /* FIXME: need to check for valid disk_name. search superblocks? ++ * check for slash dev slash ? 
++ */ ++ len = strcspn(bufp, ":"); ++ if (len >= DISK_NAME_LEN) /* keep room for the terminating NUL */ ++ goto out_free; ++ memcpy(new->disk_name, bufp, len); ++ ++ err = -EINVAL; ++ bufp += len + 1; ++ if (bufp >= endp) ++ goto out_free; ++ ++ /* data server list */ ++ /* FIXME: need to check for comma separated valid ip format */ ++ len = strcspn(bufp, ":"); ++ if (len >= NFSD_DLM_DS_LIST_MAX) /* keep room for the terminating NUL */ ++ goto out_free; ++ memcpy(new->ds_list, bufp, len); ++ ++ ++ /* validate the ips */ ++ if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds))) ++ goto out_free; ++ ++ dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__, ++ new->disk_name, new->num_ds, new->ds_list); ++ ++ found = _nfsd4_find_pnfs_dlm_device(new->disk_name); ++ if (found) { ++ /* FIXME: should compare found->ds_list with new->ds_list ++ * and if it is different, kick off a CB_NOTIFY change ++ * deviceid. ++ */ ++ dprintk("%s pnfs_dlm_device %s:%s already in cache " ++ " replace ds_list with new ds_list %s\n", __func__, ++ found->disk_name, found->ds_list, new->ds_list); ++ memset(found->ds_list, 0, sizeof(found->ds_list)); /* clear the whole old list */ ++ memcpy(found->ds_list, new->ds_list, strlen(new->ds_list)); ++ found->num_ds = new->num_ds; ++ kfree(new); ++ } else { ++ dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__, ++ new->disk_name, new->ds_list); ++ spin_lock(&dlm_device_list_lock); ++ list_add(&new->dlm_dev_list, &dlm_device_list); ++ spin_unlock(&dlm_device_list_lock); ++ } ++ dprintk("<-- %s Success\n", __func__); ++ return 0; ++ ++out_free: ++ kfree(new); ++ dprintk("<-- %s returns %d\n", __func__, err); ++ return err; ++} ++ ++void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ struct dlm_device_entry *dlm_pdev, *next; ++ ++ dprintk("--> %s\n", __func__); ++ ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list, ++ dlm_dev_list) { ++ list_del(&dlm_pdev->dlm_dev_list); ++ kfree(dlm_pdev); ++ } ++ spin_unlock(&dlm_device_list_lock); ++} ++ ++static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb, ++ u32 
layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ if (layout_type != LAYOUT_NFSV4_1_FILES) { ++ printk(KERN_ERR "%s: ERROR: layout type isn't 'file' " ++ "(type: %x)\n", __func__, layout_type); ++ return -ENOTSUPP; ++ } ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ return 0; ++} ++ ++static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err, len, i = 0; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_devaddr *daddr; ++ struct dlm_device_entry *dlm_pdev; ++ char *bufp; ++ ++ err = -ENOTSUPP; ++ if (layout_type != LAYOUT_NFSV4_1_FILES) { ++ dprintk("%s: ERROR: layout type isn't 'file' " ++ "(type: %x)\n", __func__, layout_type); ++ return err; ++ } ++ ++ /* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO ++ * with a gdia_device_id != 1 is invalid. 
++ */ ++ err = -EINVAL; ++ if (devid->devid != 1) { ++ dprintk("%s: WARNING: didn't receive a deviceid of " ++ "1 (got: 0x%llx)\n", __func__, devid->devid); ++ return err; ++ } ++ ++ /* ++ * If the DS list has not been established, return -EINVAL ++ */ ++ dlm_pdev = nfsd4_find_pnfs_dlm_device(sb); ++ if (!dlm_pdev) { ++ dprintk("%s: DEBUG: disk %s Not Found\n", __func__, ++ sb->s_bdev->bd_disk->disk_name); ++ return err; ++ } ++ ++ dprintk("%s: Found disk %s with DS list |%s|\n", ++ __func__, dlm_pdev->disk_name, dlm_pdev->ds_list); ++ ++ memset(&fdev, '\0', sizeof(fdev)); ++ fdev.fl_device_length = dlm_pdev->num_ds; ++ ++ err = -ENOMEM; ++ len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length; ++ fdev.fl_device_list = kzalloc(len, GFP_KERNEL); ++ if (!fdev.fl_device_list) { ++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list " ++ "buffer for %d DSes.\n", __func__, i); ++ fdev.fl_device_length = 0; ++ goto out; ++ } ++ ++ /* Set a simple stripe indicie */ ++ fdev.fl_stripeindices_length = fdev.fl_device_length; ++ fdev.fl_stripeindices_list = kzalloc(sizeof(u32) * ++ fdev.fl_stripeindices_length, GFP_KERNEL); ++ ++ if (!fdev.fl_stripeindices_list) { ++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices " ++ "list buffer for %d DSes.\n", __func__, i); ++ goto out; ++ } ++ for (i = 0; i < fdev.fl_stripeindices_length; i++) ++ fdev.fl_stripeindices_list[i] = i; ++ ++ /* Transfer the data server list with a single multipath entry */ ++ bufp = dlm_pdev->ds_list; ++ for (i = 0; i < fdev.fl_device_length; i++) { ++ daddr = kmalloc(sizeof(*daddr), GFP_KERNEL); ++ if (!daddr) { ++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device " ++ "addr buffer.\n", __func__); ++ goto out; ++ } ++ ++ daddr->r_netid.data = "tcp"; ++ daddr->r_netid.len = 3; ++ ++ len = strcspn(bufp, ","); ++ daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL); ++ memcpy(daddr->r_addr.data, bufp, len); ++ /* ++ * append the port number. 
interpreted as two more bytes ++ * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049. ++ */ ++ memcpy(daddr->r_addr.data + len, ".8.1", 4); ++ daddr->r_addr.len = len + 4; ++ ++ fdev.fl_device_list[i].fl_multipath_length = 1; ++ fdev.fl_device_list[i].fl_multipath_list = daddr; ++ ++ dprintk("%s: encoding DS |%s|\n", __func__, bufp); ++ ++ bufp += len + 1; ++ } ++ ++ /* have nfsd encode the device info */ ++ err = filelayout_encode_devinfo(xdr, &fdev); ++out: ++ for (i = 0; i < fdev.fl_device_length; i++) ++ kfree(fdev.fl_device_list[i].fl_multipath_list); ++ kfree(fdev.fl_device_list); ++ kfree(fdev.fl_stripeindices_list); ++ dprintk("<-- %s returns %d\n", __func__, err); ++ return err; ++} ++ ++static int get_stripe_unit(int blocksize) ++{ ++ if (blocksize >= NFSSVC_MAXBLKSIZE) ++ return blocksize; ++ return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize); ++} ++ ++/* ++ * Look up inode block device in pnfs_dlm_device list. ++ * Hash on the inode->i_ino and number of data servers. 
++ */ ++static int dlm_ino_hash(struct inode *ino) ++{ ++ struct dlm_device_entry *de; ++ u32 hash_mask = 0; ++ ++ /* If can't find the inode block device in the pnfs_dlm_deivce list ++ * then don't hand out a layout ++ */ ++ de = nfsd4_find_pnfs_dlm_device(ino->i_sb); ++ if (!de) ++ return -1; ++ hash_mask = de->num_ds - 1; ++ return ino->i_ino & hash_mask; ++} ++ ++static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct pnfs_filelayout_layout *layout = NULL; ++ struct knfsd_fh *fhp = NULL; ++ int index; ++ enum nfsstat4 rc = NFS4_OK; ++ ++ dprintk("%s: LAYOUT_GET\n", __func__); ++ ++ /* DLM exported file systems only support layouts for READ */ ++ if (res->lg_seg.iomode == IOMODE_RW) ++ return NFS4ERR_BADIOMODE; ++ ++ index = dlm_ino_hash(inode); ++ dprintk("%s first stripe index %d i_ino %lu\n", __func__, index, ++ inode->i_ino); ++ if (index < 0) ++ return NFS4ERR_LAYOUTUNAVAILABLE; ++ ++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ /* Always give out whole file layouts */ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ /* Always give out READ ONLY layouts */ ++ res->lg_seg.iomode = IOMODE_READ; ++ ++ layout = kzalloc(sizeof(*layout), GFP_KERNEL); ++ if (layout == NULL) { ++ rc = NFS4ERR_LAYOUTTRYLATER; ++ goto error; ++ } ++ ++ /* Set file layout response args */ ++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; ++ layout->lg_stripe_type = STRIPE_SPARSE; ++ layout->lg_commit_through_mds = false; ++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); ++ layout->lg_fh_length = 1; ++ layout->device_id.sbid = args->lg_sbid; ++ layout->device_id.devid = 1; /*FSFTEMP*/ ++ layout->lg_first_stripe_index = index; /*FSFTEMP*/ ++ layout->lg_pattern_offset = 0; ++ ++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); ++ if (fhp == NULL) { ++ rc = NFS4ERR_LAYOUTTRYLATER; ++ goto error; ++ } ++ ++ 
memcpy(fhp, args->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc = filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++nfsd4_pnfs_dlm_layouttype(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++const struct pnfs_export_operations pnfs_dlm_export_ops = { ++ .layout_type = nfsd4_pnfs_dlm_layouttype, ++ .get_device_info = nfsd4_pnfs_dlm_getdevinfo, ++ .get_device_iter = nfsd4_pnfs_dlm_getdeviter, ++ .layout_get = nfsd4_pnfs_dlm_layoutget, ++}; ++EXPORT_SYMBOL(pnfs_dlm_export_ops); +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-31 20:42:05.558141620 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-31 20:42:05.558141620 -0400 +@@ -0,0 +1,620 @@ ++/* ++* linux/fs/nfsd/nfs4pnfsds.c ++* ++* Copyright (c) 2005 The Regents of the University of Michigan. ++* All rights reserved. ++* ++* Andy Adamson ++* ++* Redistribution and use in source and binary forms, with or without ++* modification, are permitted provided that the following conditions ++* are met: ++* ++* 1. Redistributions of source code must retain the above copyright ++* notice, this list of conditions and the following disclaimer. ++* 2. Redistributions in binary form must reproduce the above copyright ++* notice, this list of conditions and the following disclaimer in the ++* documentation and/or other materials provided with the distribution. ++* 3. Neither the name of the University nor the names of its ++* contributors may be used to endorse or promote products derived ++* from this software without specific prior written permission. 
++* ++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++* ++*/ ++#if defined(CONFIG_PNFSD) ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsd.h" ++#include "pnfsd.h" ++#include "state.h" ++ ++/* ++ ******************* ++ * PNFS ++ ******************* ++ */ ++/* ++ * Hash tables for pNFS Data Server state ++ * ++ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using ++ * this data server (DS). ++ * ++ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained ++ * from any MDS. ++ * ++ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained ++ * from any MDS. 
++ * ++ */ ++/* Hash tables for clientid state */ ++#define CLIENT_HASH_BITS 4 ++#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) ++#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) ++ ++#define clientid_hashval(id) \ ++ ((id) & CLIENT_HASH_MASK) ++ ++/* hash table for pnfs_ds_stateid */ ++#define STATEID_HASH_BITS 10 ++#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) ++#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) ++ ++#define stateid_hashval(owner_id, file_id) \ ++ (((owner_id) + (file_id)) & STATEID_HASH_MASK) ++ ++static struct list_head mds_id_tbl; ++static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE]; ++static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE]; ++ ++static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp); ++static inline void put_ds_mdsid(struct pnfs_mds_id *mdp); ++ ++/* Mutex for data server state. Needs to be separate from ++ * mds state mutex since a node can be both mds and ds */ ++static DEFINE_MUTEX(ds_mutex); ++static struct thread_info *ds_mutex_owner; ++ ++static void ++ds_lock_state(void) ++{ ++ mutex_lock(&ds_mutex); ++ ds_mutex_owner = current_thread_info(); ++} ++ ++static void ++ds_unlock_state(void) ++{ ++ BUG_ON(ds_mutex_owner != current_thread_info()); ++ ds_mutex_owner = NULL; ++ mutex_unlock(&ds_mutex); ++} ++ ++static int ++cmp_clid(const clientid_t *cl1, const clientid_t *cl2) ++{ ++ return (cl1->cl_boot == cl2->cl_boot) && ++ (cl1->cl_id == cl2->cl_id); ++} ++ ++void ++nfs4_pnfs_state_init(void) ++{ ++ int i; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&mds_clid_hashtbl[i]); ++ ++ for (i = 0; i < STATEID_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ds_stid_hashtbl[i]); ++ ++ INIT_LIST_HEAD(&mds_id_tbl); ++} ++ ++static struct pnfs_mds_id * ++find_pnfs_mds_id(u32 mdsid) ++{ ++ struct pnfs_mds_id *local = NULL; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ list_for_each_entry(local, &mds_id_tbl, di_hash) { ++ if (local->di_mdsid == mdsid) ++ return local; ++ } ++ return NULL; ++} ++ ++static 
struct pnfs_ds_clientid * ++find_pnfs_ds_clientid(const clientid_t *clid) ++{ ++ struct pnfs_ds_clientid *local = NULL; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = clientid_hashval(clid->cl_id); ++ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) { ++ if (cmp_clid(&local->dc_mdsclid, clid)) ++ return local; ++ } ++ return NULL; ++} ++ ++static struct pnfs_ds_stateid * ++find_pnfs_ds_stateid(stateid_t *stid) ++{ ++ struct pnfs_ds_stateid *local = NULL; ++ u32 st_id = stid->si_stateownerid; ++ u32 f_id = stid->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash) ++ if ((local->ds_stid.si_stateownerid == st_id) && ++ (local->ds_stid.si_fileid == f_id) && ++ (local->ds_stid.si_boot == stid->si_boot)) { ++ stateid_t *sid = &local->ds_stid; ++ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, local, local->ds_flags, ++ STATEID_VAL(sid)); ++ return local; ++ } ++ return NULL; ++} ++ ++static void ++release_ds_mdsid(struct kref *kref) ++{ ++ struct pnfs_mds_id *mdp = ++ container_of(kref, struct pnfs_mds_id, di_ref); ++ dprintk("pNFSD: %s\n", __func__); ++ ++ list_del(&mdp->di_hash); ++ list_del(&mdp->di_mdsclid); ++ kfree(mdp); ++} ++ ++static void ++release_ds_clientid(struct kref *kref) ++{ ++ struct pnfs_ds_clientid *dcp = ++ container_of(kref, struct pnfs_ds_clientid, dc_ref); ++ struct pnfs_mds_id *mdp; ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(dcp->dc_mdsid); ++ if (mdp) ++ put_ds_mdsid(mdp); ++ ++ list_del(&dcp->dc_hash); ++ list_del(&dcp->dc_stateid); ++ list_del(&dcp->dc_permdsid); ++ kfree(dcp); ++} ++ ++static void ++release_ds_stateid(struct kref *kref) ++{ ++ struct pnfs_ds_stateid *dsp = ++ container_of(kref, struct pnfs_ds_stateid, ds_ref); ++ struct pnfs_ds_clientid *dcp; ++ dprintk("pNFS %s: dsp %p\n", __func__, dsp); ++ ++ dcp 
= find_pnfs_ds_clientid(&dsp->ds_mdsclid); ++ if (dcp) ++ put_ds_clientid(dcp); ++ ++ list_del(&dsp->ds_hash); ++ list_del(&dsp->ds_perclid); ++ kfree(dsp); ++} ++ ++static inline void ++put_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_put(&dcp->dc_ref, release_ds_clientid); ++} ++ ++static inline void ++get_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_get(&dcp->dc_ref); ++} ++ ++static inline void ++put_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_put(&mdp->di_ref, release_ds_mdsid); ++} ++ ++static inline void ++get_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_get(&mdp->di_ref); ++} ++ ++static inline void ++put_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_put(&dsp->ds_ref, release_ds_stateid); ++} ++ ++static inline void ++get_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_get(&dsp->ds_ref); ++} ++ ++void ++nfs4_pnfs_state_shutdown(void) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int i; ++ ++ dprintk("pNFSD %s: -->\n", __func__); ++ ++ ds_lock_state(); ++ for (i = 0; i < STATEID_HASH_SIZE; i++) { ++ while (!list_empty(&ds_stid_hashtbl[i])) { ++ dsp = list_entry(ds_stid_hashtbl[i].next, ++ struct pnfs_ds_stateid, ds_hash); ++ put_ds_stateid(dsp); ++ } ++ } ++ ds_unlock_state(); ++} ++ ++static struct pnfs_mds_id * ++alloc_init_mds_id(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL); ++ 
if (!mdp) ++ return NULL; ++ INIT_LIST_HEAD(&mdp->di_hash); ++ INIT_LIST_HEAD(&mdp->di_mdsclid); ++ list_add(&mdp->di_hash, &mds_id_tbl); ++ mdp->di_mdsid = gsp->dsid; ++ mdp->di_mdsboot = 0; ++ kref_init(&mdp->di_ref); ++ return mdp; ++} ++ ++static struct pnfs_ds_clientid * ++alloc_init_ds_clientid(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ struct pnfs_ds_clientid *dcp; ++ clientid_t *clid = (clientid_t *)&gsp->clid; ++ unsigned int hashval = clientid_hashval(clid->cl_id); ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ dcp = kmalloc(sizeof(*dcp), GFP_KERNEL); /* before taking mdp ref */ ++ if (!dcp) ++ return NULL; ++ mdp = find_pnfs_mds_id(gsp->dsid); ++ if (mdp) ++ get_ds_mdsid(mdp); ++ else ++ mdp = alloc_init_mds_id(gsp); ++ if (!mdp) { ++ kfree(dcp); /* not hashed yet, safe to drop */ ++ return NULL; ++ } ++ ++ INIT_LIST_HEAD(&dcp->dc_hash); ++ INIT_LIST_HEAD(&dcp->dc_stateid); ++ INIT_LIST_HEAD(&dcp->dc_permdsid); ++ list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]); ++ list_add(&dcp->dc_permdsid, &mdp->di_mdsclid); ++ dcp->dc_mdsclid = *clid; ++ kref_init(&dcp->dc_ref); ++ dcp->dc_mdsid = gsp->dsid; ++ return dcp; ++} ++ ++static struct pnfs_ds_stateid * ++alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp) ++{ ++ struct pnfs_ds_stateid *dsp; ++ u32 st_id = stidp->si_stateownerid; ++ u32 f_id = stidp->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ dsp = kmalloc(sizeof(*dsp), GFP_KERNEL); ++ if (!dsp) ++ return dsp; ++ ++ INIT_LIST_HEAD(&dsp->ds_hash); ++ INIT_LIST_HEAD(&dsp->ds_perclid); ++ memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t)); ++ fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle); ++ dsp->ds_access = 0; ++ dsp->ds_status = 0; ++ dsp->ds_flags = 0L; ++ kref_init(&dsp->ds_ref); ++ set_bit(DS_STATEID_NEW, &dsp->ds_flags); ++ clear_bit(DS_STATEID_VALID, &dsp->ds_flags); ++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ init_waitqueue_head(&dsp->ds_waitq); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]); ++
dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return dsp; ++} ++ ++static int ++update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh, ++ struct pnfs_get_state *gsp) ++{ ++ struct pnfs_ds_clientid *dcp; ++ int new = 0; ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid); ++ if (!dcp) { ++ dcp = alloc_init_ds_clientid(gsp); ++ if (!dcp) ++ return 1; ++ new = 1; ++ } ++ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) { ++ list_add(&dsp->ds_perclid, &dcp->dc_stateid); ++ if (!new) ++ get_ds_clientid(dcp); ++ } ++ ++ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t)); ++ dsp->ds_access = gsp->access; ++ dsp->ds_status = 0; ++ dsp->ds_verifier[0] = gsp->verifier[0]; ++ dsp->ds_verifier[1] = gsp->verifier[1]; ++ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t)); ++ set_bit(DS_STATEID_VALID, &dsp->ds_flags); ++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ clear_bit(DS_STATEID_NEW, &dsp->ds_flags); ++ return 0; ++} ++ ++int ++nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs) ++{ ++ stateid_t *stid = (stateid_t *)&gs->stid; ++ struct pnfs_ds_stateid *dsp; ++ ++ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stid)); ++ ++ ds_lock_state(); ++ dsp = find_pnfs_ds_stateid(stid); ++ if (dsp) ++ put_ds_stateid(dsp); ++ ds_unlock_state(); ++ ++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); ++ ++ if (dsp) ++ return 0; ++ return -ENOENT; ++} ++ ++/* Retrieves and validates stateid. ++ * If stateid exists and its fields match, return it. ++ * If stateid exists but either the generation or ++ * ownerids don't match, check with mds to see if it is valid. ++ * If the stateid doesn't exist, the first thread creates a ++ * invalid *marker* stateid, then checks to see if the ++ * stateid exists on the mds. If so, it validates the *marker* ++ * stateid and updates its fields. Subsequent threads that ++ * find the *marker* stateid wait until it is valid or an error ++ * occurs. 
++ * Called with ds_state_lock. ++ */ ++static struct pnfs_ds_stateid * ++nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp) ++{ ++ struct inode *ino = cfh->fh_dentry->d_inode; ++ struct super_block *sb; ++ struct pnfs_ds_stateid *dsp = NULL; ++ struct pnfs_get_state gs = { ++ .access = 0, ++ }; ++ int status = 0, waiter = 0; ++ ++ dprintk("pNFSD: %s -->\n", __func__); ++ ++ dsp = find_pnfs_ds_stateid(stidp); ++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) && ++ (stidp->si_generation == dsp->ds_stid.si_generation)) ++ goto out_noput; ++ ++ sb = ino->i_sb; ++ if (!sb || !sb->s_pnfs_op || !sb->s_pnfs_op->get_state) ++ goto out_noput; ++ ++ /* Uninitialize current state if it exists yet it doesn't match. ++ * If it is already invalid, another thread is checking state */ ++ if (dsp) { ++ if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags)) ++ waiter = 1; ++ } else { ++ dsp = alloc_init_ds_stateid(cfh, stidp); ++ if (!dsp) ++ goto out_noput; ++ } ++ ++ dprintk("pNFSD: %s Starting loop\n", __func__); ++ get_ds_stateid(dsp); ++ while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { ++ ds_unlock_state(); ++ ++ /* Another thread is checking the state */ ++ if (waiter) { ++ dprintk("pNFSD: %s waiting\n", __func__); ++ wait_event_interruptible_timeout(dsp->ds_waitq, ++ (test_bit(DS_STATEID_VALID, &dsp->ds_flags) || ++ test_bit(DS_STATEID_ERROR, &dsp->ds_flags)), ++ msecs_to_jiffies(1024)); ++ dprintk("pNFSD: %s awake\n", __func__); ++ ds_lock_state(); ++ if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) ++ goto out; ++ ++ continue; ++ } ++ ++ /* Validate stateid on mds */ ++ dprintk("pNFSD: %s Checking state on MDS\n", __func__); ++ memcpy(&gs.stid, stidp, sizeof(stateid_t)); ++ status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs); ++ dprintk("pNFSD: %s from MDS status %d\n", __func__, status); ++ ds_lock_state(); ++ /* if !status and stateid is valid, update id and mark valid */ ++ if (status || update_ds_stateid(dsp, cfh, &gs)) { ++ set_bit(DS_STATEID_ERROR,
&dsp->ds_flags); ++ /* remove invalid stateid from list */ ++ wake_up(&dsp->ds_waitq); /* wake waiters while dsp is still held */ ++ put_ds_stateid(dsp); ++ goto out; ++ } ++ ++ wake_up(&dsp->ds_waitq); ++ } ++out: ++ status = test_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ put_ds_stateid(dsp); /* may free dsp: error bit sampled first */ ++ dsp = status ? NULL : dsp; ++out_noput: ++ if (dsp) ++ dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid)); ++ if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) ++ dsp = NULL; /* If error, return null */ ++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return dsp; ++} ++ ++int ++nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int status = 0; ++ ++ dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ /* Must release state lock while verifying stateid on mds */ ++ nfs4_unlock_state(); ++ ds_lock_state(); ++ dsp = nfsv4_ds_get_state(cfh, stateid); ++ if (dsp) { ++ get_ds_stateid(dsp); ++ dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__, ++ STATEID_VAL(&dsp->ds_stid)); ++ ++ dprintk("NFSD: %s: dsp %p fh_size %u:%u " ++ "fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] " ++ "gen %x:%x\n", ++ __func__, dsp, ++ cfh->fh_handle.fh_size, dsp->ds_fh.fh_size, ++ ((unsigned *)&cfh->fh_handle.fh_base)[0], ++ ((unsigned *)&cfh->fh_handle.fh_base)[1], ++ ((unsigned *)&cfh->fh_handle.fh_base)[2], ++ ((unsigned *)&cfh->fh_handle.fh_base)[3], ++ ((unsigned *)&dsp->ds_fh.fh_base)[0], ++ ((unsigned *)&dsp->ds_fh.fh_base)[1], ++ ((unsigned *)&dsp->ds_fh.fh_base)[2], ++ ((unsigned *)&dsp->ds_fh.fh_base)[3], ++ stateid->si_generation, dsp->ds_stid.si_generation); ++ } ++ ++ if (!dsp || ++ (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) || ++ (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base, ++ dsp->ds_fh.fh_size) != 0) || ++ (stateid->si_generation > dsp->ds_stid.si_generation)) ++ status = nfserr_bad_stateid; ++ else if (stateid->si_generation < dsp->ds_stid.si_generation) ++ status = nfserr_old_stateid; ++ ++ if (dsp) ++
put_ds_stateid(dsp); ++ ds_unlock_state(); ++ nfs4_lock_state(); ++ dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status)); ++ return status; ++} ++ ++void ++nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p) ++{ ++ struct pnfs_ds_stateid *dsp = NULL; ++ ++ dprintk("pNFSD: %s --> stid %p\n", __func__, stateid); ++ ++ ds_lock_state(); ++ if (stateid != NULL) { ++ dsp = find_pnfs_ds_stateid(stateid); ++ if (dsp) ++ get_ds_stateid(dsp); ++ } ++ ++ /* XXX: Should we fetch the stateid or wait if some other ++ * thread is currently retrieving the stateid ? */ ++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { ++ *p++ = dsp->ds_verifier[0]; ++ *p++ = dsp->ds_verifier[1]; ++ } else { ++ /* must be on MDS */ ++ ds_unlock_state(); ++ sb->s_pnfs_op->get_verifier(sb, p); ++ ds_lock_state(); ++ } ++ if (dsp) ++ put_ds_stateid(dsp); /* drop the lookup ref on both paths */ ++ ds_unlock_state(); ++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-31 20:41:19.198160463 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-31 20:42:05.559129617 -0400 +@@ -34,10 +34,14 @@ + */ + #include + #include ++#include ++#include ++#include + + #include "cache.h" + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_PROC + +@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc + * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
+ */ + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); ++#if defined(CONFIG_SPNFS) ++ if (!status && spnfs_enabled()) { ++ struct inode *inode = cstate->current_fh.fh_dentry->d_inode; ++ ++ status = spnfs_open(inode, open); ++ if (status) { ++ dprintk( ++ "nfsd: pNFS could not be enabled for inode: %lu\n", ++ inode->i_ino); ++ /* ++ * XXX When there's a failure then need to indicate to ++ * future ops that no pNFS is available. Should I save ++ * the status in the inode? It's kind of a big hammer. ++ * But there may be no stripes available? ++ */ ++ } ++ } ++#endif /* CONFIG_SPNFS */ + out: + if (open->op_stateowner) { + nfs4_get_stateowner(open->op_stateowner); +@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str + &access->ac_supported); + } + ++static void ++nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf) ++{ ++ u32 *p = (u32 *)verf->data; ++ ++#if defined(CONFIG_PNFSD) ++ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) { ++ nfs4_ds_get_verifier(NULL, sb, p); ++ return; ++ } ++#endif /* CONFIG_PNFSD */ ++ ++ *p++ = nfssvc_boot.tv_sec; ++ *p++ = nfssvc_boot.tv_usec; ++} ++ + static __be32 + nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) + { + __be32 status; + +- u32 *p = (u32 *)commit->co_verf.data; +- *p++ = nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; +- ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &commit->co_verf); + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); + if (status == nfserr_symlink) +@@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru + { + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; +- u32 *p; + __be32 status = nfs_ok; + unsigned long cnt; + +@@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; +- p = (u32 *)write->wr_verifier.data; +- *p++ = 
nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; + ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &write->wr_verifier); ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) { ++ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode, ++ RETURN_FILE, write->wr_offset, write->wr_buflen); ++ if (!status) { ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++ } ++ } else ++#endif ++ ++ if (spnfs_enabled()) { ++ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode, ++ write->wr_offset, write->wr_buflen, write->wr_vlen, ++ rqstp); ++ if (status == nfs_ok) { ++ /* DMXXX: HACK to get filesize set */ ++ /* write one byte at offset+length-1 */ ++ struct kvec k[1]; ++ char zero = 0; ++ unsigned long cnt = 1; ++ ++ k[0].iov_base = (void *)&zero; ++ k[0].iov_len = 1; ++ nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset+write->wr_buflen-1, k, 1, ++ &cnt, &write->wr_how_written); ++ } ++ } else /* we're not an MDS */ ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++#else + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, &write->wr_how_written); ++#endif /* CONFIG_SPNFS */ ++ + if (filp) + fput(filp); + +@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str + return status == nfserr_same ? nfs_ok : status; + } + ++#if defined(CONFIG_PNFSD) ++ ++static __be32 ++nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp, ++ unsigned int layout_type) ++{ ++ int status, type; ++ ++ /* check to see if pNFS is supported. 
*/ ++ status = nfserr_layoutunavailable; ++ if (exp && exp->ex_pnfs == 0) { ++ dprintk("%s: Underlying file system " ++ "is not exported over pNFS\n", __func__); ++ goto out; ++ } ++ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) { ++ dprintk("%s: Underlying file system " ++ "does not support pNFS\n", __func__); ++ goto out; ++ } ++ ++ type = sb->s_pnfs_op->layout_type(sb); ++ ++ /* check to see if requested layout type is supported. */ ++ status = nfserr_unknown_layouttype; ++ if (!type) ++ dprintk("BUG: %s: layout_type 0 is reserved and must not be " ++ "used by filesystem\n", __func__); ++ else if (type != layout_type) ++ dprintk("%s: requested layout type %d " ++ "does not match supported type %d\n", ++ __func__, layout_type, type); ++ else ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevlist(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevlist *gdlp) ++{ ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ int status; ++ ++ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n", ++ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices, ++ gdlp->gd_cookie, gdlp->gd_verf); ++ ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* We must be able to encode at least one device */ ++ if (!gdlp->gd_maxdevices) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ gdlp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Do nothing if underlying file system does not support ++ * getdevicelist */ ++ if (!sb->s_pnfs_op->get_device_iter) { ++ status = nfserr_notsupp; ++ goto out; ++ } ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdlp->gd_fhp = &cstate->current_fh;
++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevinfo(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevinfo *gdp) ++{ ++ struct super_block *sb; ++ int status; ++ clientid_t clid; ++ ++ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n", ++ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid, ++ gdp->gd_devid.devid, gdp->gd_maxcount); ++ ++ status = nfserr_inval; ++ sb = find_sbid_id(gdp->gd_devid.sbid); ++ dprintk("%s: sb %p\n", __func__, sb); ++ if (!sb) { ++ status = nfserr_noent; ++ goto out; ++ } ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdp->gd_sb = sb; ++ ++ /* Update notifications */ ++ copy_clientid(&clid, cstate->session); ++ pnfs_set_device_notify(&clid, gdp->gd_notify_types); ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutget(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lgp->lg_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_badiomode; ++ if (lgp->lg_seg.iomode != IOMODE_READ && ++ lgp->lg_seg.iomode != IOMODE_RW) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lgp->lg_seg.iomode); ++ goto out; ++ } ++ ++ /* Set up arguments so layout can be retrieved at encode time */ ++ lgp->lg_fhp = current_fh; ++ copy_clientid((clientid_t 
*)&lgp->lg_seg.clientid, cstate->session); ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutcommit(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ int status; ++ struct inode *ino = NULL; ++ struct iattr ia; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ dprintk("NFSD: nfsd4_layoutcommit \n"); ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ ino = current_fh->fh_dentry->d_inode; ++ if (!ino) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = ino->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lcp->args.lc_seg.layout_type); ++ if (status) ++ goto out; ++ ++ /* This will only extend the file length. Do a quick ++ * check to see if there is any point in waiting for the update ++ * locks. ++ * TODO: Is this correct for all back ends? 
++ */ ++ dprintk("%s:new offset: %d new size: %llu old size: %lld\n", ++ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1, ++ ino->i_size); ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session); ++ lcp->res.lc_size_chg = 0; ++ if (sb->s_pnfs_op->layout_commit) { ++ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res); ++ dprintk("%s:layout_commit result %d\n", __func__, status); ++ } else { ++ fh_lock(current_fh); ++ if ((lcp->args.lc_newoffset == 0) || ++ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) { ++ status = 0; ++ lcp->res.lc_size_chg = 0; ++ fh_unlock(current_fh); ++ goto out; ++ } ++ ++ /* Try our best to update the file size */ ++ dprintk("%s: Modifying file size\n", __func__); ++ ia.ia_valid = ATTR_SIZE; ++ ia.ia_size = lcp->args.lc_last_wr + 1; ++ status = notify_change(current_fh->fh_dentry, &ia); ++ fh_unlock(current_fh); ++ dprintk("%s:notify_change result %d\n", __func__, status); ++ } ++ ++ if (!status && lcp->res.lc_size_chg && ++ EX_ISSYNC(current_fh->fh_export)) { ++ dprintk("%s: Synchronously writing inode size %llu\n", ++ __func__, ino->i_size); ++ write_inode_now(ino, 1); ++ lcp->res.lc_newsize = i_size_read(ino); ++ } ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutreturn(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lrp->args.lr_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_return_type != 
RETURN_FILE && ++ lrp->args.lr_return_type != RETURN_FSID && ++ lrp->args.lr_return_type != RETURN_ALL) { ++ dprintk("pNFS %s: invalid return_type %d\n", __func__, ++ lrp->args.lr_return_type); ++ goto out; ++ } ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_seg.iomode != IOMODE_READ && ++ lrp->args.lr_seg.iomode != IOMODE_RW && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lrp->args.lr_seg.iomode); ++ goto out; ++ } ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session); ++ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE); ++ status = nfs4_pnfs_return_layout(sb, current_fh, lrp); ++out: ++ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n", ++ __func__, status, lrp->args.lr_return_type, lrp->lrs_present); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * NULL call. + */ +@@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[ + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_RECLAIM_COMPLETE", + }, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICELIST] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevlist, ++ .op_name = "OP_GETDEVICELIST", ++ }, ++ [OP_GETDEVICEINFO] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevinfo, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_GETDEVICEINFO", ++ }, ++ [OP_LAYOUTGET] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutget, ++ .op_name = "OP_LAYOUTGET", ++ }, ++ [OP_LAYOUTCOMMIT] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutcommit, ++ .op_name = "OP_LAYOUTCOMMIT", ++ }, ++ [OP_LAYOUTRETURN] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutreturn, ++ .op_name = "OP_LAYOUTRETURN", ++ }, ++#endif /* CONFIG_PNFSD */ + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-31 20:41:19.200150153 -0400 ++++ 
linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-31 20:42:05.561202607 -0400 +@@ -42,6 +42,8 @@ + #include "xdr4.h" + #include "vfs.h" + ++#include "pnfsd.h" ++ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +@@ -60,8 +62,6 @@ static u64 current_sessionid = 1; + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + + /* forward declarations */ +-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); + static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; + static void nfs4_set_recdir(char *recdir); + +@@ -69,6 +69,7 @@ static void nfs4_set_recdir(char *recdir + + /* Currently used for almost all code touching nfsv4 state: */ + static DEFINE_MUTEX(client_mutex); ++struct task_struct *client_mutex_owner; + + /* + * Currently used for the del_recall_lru and file hash table. In an +@@ -86,11 +87,21 @@ void + nfs4_lock_state(void) + { + mutex_lock(&client_mutex); ++ client_mutex_owner = current; ++} ++ ++#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current) ++ ++void ++nfs4_bug_on_unlocked_state(void) ++{ ++ BUG_ON(client_mutex_owner != current); + } + + void + nfs4_unlock_state(void) + { ++ client_mutex_owner = NULL; + mutex_unlock(&client_mutex); + } + +@@ -109,7 +120,7 @@ opaque_hashval(const void *ptr, int nbyt + + static struct list_head del_recall_lru; + +-static inline void ++inline void + put_nfs4_file(struct nfs4_file *fi) + { + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { +@@ -120,7 +131,7 @@ put_nfs4_file(struct nfs4_file *fi) + } + } + +-static inline void ++inline void + get_nfs4_file(struct nfs4_file *fi) + { + atomic_inc(&fi->fi_ref); +@@ -230,7 +241,10 @@ nfs4_close_delegation(struct nfs4_delega + * but we want to remove the lease in any case. 
*/ + if (dp->dl_flock) + vfs_setlease(filp, F_UNLCK, &dp->dl_flock); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(filp); ++ nfs4_lock_state(); + } + + /* Called under the state lock. */ +@@ -266,8 +280,8 @@ static DEFINE_SPINLOCK(client_lock); + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * +- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed +- * setclientid_confirmed info. ++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold ++ * confirmed setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * setclientid info. +@@ -292,6 +306,7 @@ static void unhash_generic_stateid(struc + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); ++ release_pnfs_ds_dev_list(stp); + } + + static void free_generic_stateid(struct nfs4_stateid *stp) +@@ -345,7 +360,10 @@ static void release_open_stateid(struct + { + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(stp->st_vfs_file); ++ nfs4_lock_state(); + free_generic_stateid(stp); + } + +@@ -739,6 +757,8 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + ++ BUG_ON_UNLOCKED_STATE(); ++ + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -758,6 +778,7 @@ expire_client(struct nfs4_client *clp) + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } ++ pnfs_expire_client(clp); + nfsd4_set_callback_client(clp, NULL); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); +@@ -770,6 +791,13 @@ expire_client(struct nfs4_client *clp) + spin_unlock(&client_lock); + } + ++void expire_client_lock(struct nfs4_client 
*clp) ++{ ++ nfs4_lock_state(); ++ expire_client(clp); ++ nfs4_unlock_state(); ++} ++ + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) + { + memcpy(target->cl_verifier.data, source->data, +@@ -859,6 +887,11 @@ static struct nfs4_client *create_client + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++ INIT_LIST_HEAD(&clp->cl_layoutrecalls); ++ atomic_set(&clp->cl_deviceref, 0); ++#endif /* CONFIG_PNFSD */ + INIT_LIST_HEAD(&clp->cl_sessions); + INIT_LIST_HEAD(&clp->cl_lru); + clp->cl_time = get_seconds(); +@@ -908,7 +941,7 @@ move_to_confirmed(struct nfs4_client *cl + renew_client(clp); + } + +-static struct nfs4_client * ++struct nfs4_client * + find_confirmed_client(clientid_t *clid) + { + struct nfs4_client *clp; +@@ -978,6 +1011,24 @@ find_unconfirmed_client_by_str(const cha + return NULL; + } + ++int ++filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), ++ void *arg) ++{ ++ struct nfs4_client *clp, *next; ++ int i, status = 0; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i], ++ cl_strhash) { ++ status = func(clp, arg); ++ if (status) ++ break; ++ } ++ ++ return status; ++} ++ + static void + gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) + { +@@ -1110,8 +1161,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co + static void + nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) + { +- /* pNFS is not supported */ ++#if defined(CONFIG_PNFSD) ++ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS | ++ EXCHGID4_FLAG_USE_PNFS_DS; ++#else /* CONFIG_PNFSD */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; ++#endif /* CONFIG_PNFSD */ + + /* Referrals are supported, Migration is not. 
*/ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; +@@ -1301,6 +1356,13 @@ nfsd4_create_session(struct svc_rqst *rq + struct nfsd4_clid_slot *cs_slot = NULL; + int status = 0; + ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ /* XXX hack to get local ip address */ ++ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local, ++ sizeof(pnfsd_lexp_addr)); ++ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); +@@ -1340,25 +1402,26 @@ nfsd4_create_session(struct svc_rqst *rq + cs_slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + +- if (cr_ses->flags & SESSION4_BACK_CHAN) { +- unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; +- svc_xprt_get(rqstp->rq_xprt); +- rpc_copy_addr( +- (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, +- sa); +- unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); +- unconf->cl_cb_conn.cb_minorversion = +- cstate->minorversion; +- unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; +- unconf->cl_cb_seq_nr = 1; +- nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); +- } ++ if (is_ds_only_session(unconf->cl_exchange_flags)) ++ cr_ses->flags &= ~SESSION4_BACK_CHAN; ++ + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + ++ if (cr_ses->flags & SESSION4_BACK_CHAN) { ++ conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; ++ svc_xprt_get(rqstp->rq_xprt); ++ rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa); ++ conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); ++ conf->cl_cb_conn.cb_minorversion = cstate->minorversion; ++ conf->cl_cb_conn.cb_prog = cr_ses->callback_prog; ++ conf->cl_cb_seq_nr = 1; ++ nfsd4_probe_callback(conf, &conf->cl_cb_conn); ++ } ++ + /* + * We do not support RDMA or persistent sessions + */ +@@ -1746,7 +1809,7 @@ out: + + /* OPEN Share state helper functions */ + static inline struct nfs4_file * +-alloc_init_file(struct 
inode *ino) ++alloc_init_file(struct inode *ino, struct svc_fh *current_fh) + { + struct nfs4_file *fp; + unsigned int hashval = file_hashval(ino); +@@ -1760,6 +1823,16 @@ alloc_init_file(struct inode *ino) + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&fp->fi_layouts); ++ INIT_LIST_HEAD(&fp->fi_layout_states); ++ fp->fi_fsid.major = current_fh->fh_export->ex_fsid; ++ fp->fi_fsid.minor = 0; ++ fp->fi_fhlen = current_fh->fh_handle.fh_size; ++ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval)); ++ memcpy(fp->fi_fhval, &current_fh->fh_handle.fh_base, ++ fp->fi_fhlen); ++#endif /* CONFIG_PNFSD */ + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); +@@ -1768,7 +1841,7 @@ alloc_init_file(struct inode *ino) + return NULL; + } + +-static void ++void + nfsd4_free_slab(struct kmem_cache **slab) + { + if (*slab == NULL) +@@ -1784,6 +1857,7 @@ nfsd4_free_slabs(void) + nfsd4_free_slab(&file_slab); + nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); ++ nfsd4_free_pnfs_slabs(); + } + + static int +@@ -1805,6 +1879,8 @@ nfsd4_init_slabs(void) + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_nomem; ++ if (nfsd4_init_pnfs_slabs()) ++ goto out_nomem; + return 0; + out_nomem: + nfsd4_free_slabs(); +@@ -1878,6 +1954,9 @@ init_stateid(struct nfs4_stateid *stp, s + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); + INIT_LIST_HEAD(&stp->st_perfile); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); +@@ -1919,6 +1998,7 @@ find_openstateowner_str(unsigned int has + { + struct nfs4_stateowner *so = NULL; + ++ BUG_ON_UNLOCKED_STATE(); + list_for_each_entry(so, 
&ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) + return so; +@@ -1927,7 +2007,7 @@ find_openstateowner_str(unsigned int has + } + + /* search file_hashtbl[] for file */ +-static struct nfs4_file * ++struct nfs4_file * + find_file(struct inode *ino) + { + unsigned int hashval = file_hashval(ino); +@@ -1945,6 +2025,18 @@ find_file(struct inode *ino) + return NULL; + } + ++struct nfs4_file * ++find_alloc_file(struct inode *ino, struct svc_fh *current_fh) ++{ ++ struct nfs4_file *fp; ++ ++ fp = find_file(ino); ++ if (fp) ++ return fp; ++ ++ return alloc_init_file(ino, current_fh); ++} ++ + static inline int access_valid(u32 x, u32 minorversion) + { + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) +@@ -2503,7 +2595,7 @@ nfsd4_process_open2(struct svc_rqst *rqs + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; + status = nfserr_resource; +- fp = alloc_init_file(ino); ++ fp = alloc_init_file(ino, current_fh); + if (fp == NULL) + goto out; + } +@@ -2730,7 +2822,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct + return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; + } + +-static int ++int + STALE_STATEID(stateid_t *stateid) + { + if (stateid->si_boot == boot_time) +@@ -2740,6 +2832,16 @@ STALE_STATEID(stateid_t *stateid) + return 1; + } + ++__be32 ++nfs4_check_stateid(stateid_t *stateid) ++{ ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ return nfserr_bad_stateid; ++ if (STALE_STATEID(stateid)) ++ return nfserr_stale_stateid; ++ return 0; ++} ++ + static inline int + access_permit_read(unsigned long access_bmap) + { +@@ -2848,6 +2950,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + if (grace_disallows_io(ino)) + return nfserr_grace; + ++#if defined(CONFIG_PNFSD) ++ if (pnfs_fh_is_ds(&current_fh->fh_handle)) { ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ status = nfserr_bad_stateid; ++ else ++#ifdef CONFIG_GFS2_FS_LOCKING_DLM ++ { ++ dprintk("%s 
Don't check DS stateid\n", __func__); ++ return 0; ++ } ++#else /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ status = nfs4_preprocess_pnfs_ds_stateid(current_fh, ++ stateid); ++#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ goto out; ++ } ++#endif /* CONFIG_PNFSD */ ++ + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + +@@ -2924,13 +3044,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + *stpp = NULL; + *sopp = NULL; + +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { +- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); +- return nfserr_bad_stateid; +- } +- +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ return status; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; +@@ -3205,11 +3321,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + nfs4_lock_state(); +- status = nfserr_bad_stateid; +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) +- goto out; +- status = nfserr_stale_stateid; +- if (STALE_STATEID(stateid)) ++ status = nfs4_check_stateid(stateid); ++ if (status) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) +@@ -3238,26 +3351,6 @@ out: + #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) + #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +-static inline u64 +-end_offset(u64 start, u64 len) +-{ +- u64 end; +- +- end = start + len; +- return end >= start ? end: NFS4_MAX_UINT64; +-} +- +-/* last octet in a range */ +-static inline u64 +-last_byte_offset(u64 start, u64 len) +-{ +- u64 end; +- +- BUG_ON(!len); +- end = start + len; +- return end > start ? 
end - 1: NFS4_MAX_UINT64; +-} +- + #define lockownerid_hashval(id) \ + ((id) & LOCK_HASH_MASK) + +@@ -3274,7 +3367,7 @@ static struct list_head lock_ownerid_has + static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; + static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; + +-static struct nfs4_stateid * ++struct nfs4_stateid * + find_stateid(stateid_t *stid, int flags) + { + struct nfs4_stateid *local; +@@ -3303,7 +3396,7 @@ find_stateid(stateid_t *stid, int flags) + return NULL; + } + +-static struct nfs4_delegation * ++struct nfs4_delegation * + find_delegation_stateid(struct inode *ino, stateid_t *stid) + { + struct nfs4_file *fp; +@@ -3436,6 +3529,9 @@ alloc_init_lock_stateid(struct nfs4_stat + INIT_LIST_HEAD(&stp->st_perfile); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); +@@ -3998,6 +4094,9 @@ nfs4_state_init(void) + INIT_LIST_HEAD(&client_lru); + INIT_LIST_HEAD(&del_recall_lru); + reclaim_str_hashtbl_size = 0; ++#if defined(CONFIG_PNFSD) ++ nfs4_pnfs_state_init(); ++#endif /* CONFIG_PNFSD */ + return 0; + } + +@@ -4110,6 +4209,7 @@ __nfs4_state_shutdown(void) + } + + nfsd4_shutdown_recdir(); ++ nfs4_pnfs_state_shutdown(); + nfs4_init = 0; + } + +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-31 20:41:19.202150173 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-31 20:42:05.563232916 -0400 +@@ -47,9 +47,14 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +@@ -1234,6 +1239,138 @@ 
nfsd4_decode_sequence(struct nfsd4_compo + DECODE_TAIL; + } + ++#if defined(CONFIG_PNFSD) ++static __be32 ++nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16 + sizeof(nfs4_verifier)); ++ READ32(gdevl->gd_layout_type); ++ READ32(gdevl->gd_maxdevices); ++ READ64(gdevl->gd_cookie); ++ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ u32 num; ++ DECODE_HEAD; ++ ++ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid)); ++ READ64(gdev->gd_devid.sbid); ++ READ64(gdev->gd_devid.devid); ++ READ32(gdev->gd_layout_type); ++ READ32(gdev->gd_maxcount); ++ READ32(num); ++ if (num) { ++ READ_BUF(4); ++ READ32(gdev->gd_notify_types); ++ } else { ++ gdev->gd_notify_types = 0; ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(36); ++ READ32(lgp->lg_signal); ++ READ32(lgp->lg_seg.layout_type); ++ READ32(lgp->lg_seg.iomode); ++ READ64(lgp->lg_seg.offset); ++ READ64(lgp->lg_seg.length); ++ READ64(lgp->lg_minlength); ++ nfsd4_decode_stateid(argp, &lgp->lg_sid); ++ READ_BUF(4); ++ READ32(lgp->lg_maxcount); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ DECODE_HEAD; ++ u32 timechange; ++ ++ READ_BUF(20); ++ READ64(lcp->args.lc_seg.offset); ++ READ64(lcp->args.lc_seg.length); ++ READ32(lcp->args.lc_reclaim); ++ nfsd4_decode_stateid(argp, &lcp->lc_sid); ++ READ_BUF(4); ++ READ32(lcp->args.lc_newoffset); ++ if (lcp->args.lc_newoffset) { ++ READ_BUF(8); ++ READ64(lcp->args.lc_last_wr); ++ } else ++ lcp->args.lc_last_wr = 0; ++ READ_BUF(4); ++ READ32(timechange); ++ if (timechange) { ++ READ_BUF(12); ++ READ64(lcp->args.lc_mtime.seconds); ++ 
READ32(lcp->args.lc_mtime.nseconds); ++ } else { ++ lcp->args.lc_mtime.seconds = 0; ++ lcp->args.lc_mtime.nseconds = 0; ++ } ++ READ_BUF(8); ++ READ32(lcp->args.lc_seg.layout_type); ++ /* XXX: saving XDR'ed layout update. Since we don't have the ++ * current_fh yet, and therefore no export_ops, we can't call ++ * the layout specific decode routines. File and pVFS2 ++ * do not use the layout update.... ++ */ ++ READ32(lcp->args.lc_up_len); ++ if (lcp->args.lc_up_len > 0) { ++ READ_BUF(lcp->args.lc_up_len); ++ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len); ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16); ++ READ32(lrp->args.lr_reclaim); ++ READ32(lrp->args.lr_seg.layout_type); ++ READ32(lrp->args.lr_seg.iomode); ++ READ32(lrp->args.lr_return_type); ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ READ_BUF(16); ++ READ64(lrp->args.lr_seg.offset); ++ READ64(lrp->args.lr_seg.length); ++ nfsd4_decode_stateid(argp, &lrp->lr_sid); ++ READ_BUF(4); ++ READ32(lrp->args.lrf_body_len); ++ if (lrp->args.lrf_body_len > 0) { ++ READ_BUF(lrp->args.lrf_body_len); ++ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len); ++ } ++ } ++ ++ DECODE_TAIL; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) + { +@@ -1335,11 +1472,19 @@ static nfsd4_dec nfsd41_dec_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, ++#else /* 
CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, +@@ -2136,6 +2281,36 @@ out_acl: + } + WRITE64(stat.ino); + } ++#if defined(CONFIG_PNFSD) ++ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { ++ struct super_block *sb = dentry->d_inode->i_sb; ++ int type = 0; ++ ++ /* Query the filesystem for supported pNFS layout types. ++ * Currently, we only support one layout type per file system. ++ * The export_ops->layout_type() returns the pnfs_layouttype4. ++ */ ++ buflen -= 4; ++ if (buflen < 0) /* length */ ++ goto out_resource; ++ ++ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type) ++ type = sb->s_pnfs_op->layout_type(sb); ++ if (type) { ++ if ((buflen -= 4) < 0) /* type */ ++ goto out_resource; ++ WRITE32(1); /* length */ ++ WRITE32(type); /* type */ ++ } else ++ WRITE32(0); /* length */ ++ } ++ ++ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ if ((buflen -= 4) < 0) ++ goto out_resource; ++ WRITE32(stat.blksize); ++ } ++#endif /* CONFIG_PNFSD */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); +@@ -2366,6 +2541,10 @@ nfsd4_encode_commit(struct nfsd4_compoun + if (!nfserr) { + RESERVE_SPACE(8); + WRITEMEM(commit->co_verf.data, 8); ++ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n", ++ ((u32 *)(&commit->co_verf.data))[0], ++ ((u32 *)(&commit->co_verf.data))[1]); ++ + ADJUST_ARGS(); + } + return nfserr; +@@ -2620,9 +2799,20 @@ nfsd4_encode_read(struct nfsd4_compoundr + } + read->rd_vlen = v; + ++#if defined(CONFIG_SPNFS) ++ if (spnfs_enabled()) ++ nfserr = 
spnfs_read(read->rd_fhp->fh_dentry->d_inode, ++ read->rd_offset, &maxcount, read->rd_vlen, ++ resp->rqstp); ++ else /* we're not an MDS */ ++ nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, ++ read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, ++ &maxcount); ++#else + nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); ++#endif /* CONFIG_SPNFS */ + + if (nfserr == nfserr_symlink) + nfserr = nfserr_inval; +@@ -2926,6 +3116,9 @@ nfsd4_encode_write(struct nfsd4_compound + WRITE32(write->wr_bytes_written); + WRITE32(write->wr_how_written); + WRITEMEM(write->wr_verifier.data, 8); ++ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n", ++ ((u32 *)(&write->wr_verifier.data))[0], ++ ((u32 *)(&write->wr_verifier.data))[1]); + ADJUST_ARGS(); + } + return nfserr; +@@ -3069,6 +3262,343 @@ nfsd4_encode_sequence(struct nfsd4_compo + return 0; + } + ++#if defined(CONFIG_PNFSD) ++ ++/* Uses the export interface to iterate through the available devices ++ * and encodes them on the response stream. 
++ */ ++static __be32 ++nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp, ++ struct nfsd4_pnfs_getdevlist *gdevl, ++ unsigned int *dev_count) ++{ ++ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb; ++ __be32 nfserr; ++ int status; ++ __be32 *p; ++ struct nfsd4_pnfs_dev_iter_res res = { ++ .gd_cookie = gdevl->gd_cookie, ++ .gd_verf = gdevl->gd_verf, ++ .gd_eof = 0 ++ }; ++ u64 sbid; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ sbid = find_create_sbid(sb); ++ *dev_count = 0; ++ do { ++ status = sb->s_pnfs_op->get_device_iter(sb, ++ gdevl->gd_layout_type, ++ &res); ++ if (status) { ++ if (status == -ENOENT) { ++ res.gd_eof = 1; ++ /* return success */ ++ break; ++ } ++ nfserr = nfserrno(status); ++ goto out_err; ++ } ++ ++ /* Encode device id and layout type */ ++ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid)); ++ WRITE64((__be64)sbid); ++ WRITE64(res.gd_devid); /* devid minor */ ++ ADJUST_ARGS(); ++ (*dev_count)++; ++ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof); ++ gdevl->gd_cookie = res.gd_cookie; ++ gdevl->gd_verf = res.gd_verf; ++ gdevl->gd_eof = res.gd_eof; ++ nfserr = nfs_ok; ++out_err: ++ dprintk("%s: Encoded %u devices\n", __func__, *dev_count); ++ return nfserr; ++} ++ ++/* Encodes the response of get device list. 
++*/ ++static __be32 ++nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ unsigned int dev_count = 0, lead_count; ++ u32 *p_in = resp->p; ++ __be32 *p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ /* Ensure we have room for cookie, verifier, and devlist len, ++ * which we will backfill in after we encode as many devices as possible ++ */ ++ lead_count = 8 + sizeof(nfs4_verifier) + 4; ++ RESERVE_SPACE(lead_count); ++ /* skip past these values */ ++ p += XDR_QUADLEN(lead_count); ++ ADJUST_ARGS(); ++ ++ /* Iterate over as many device ids as possible on the xdr stream */ ++ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count); ++ if (nfserr) ++ goto out_err; ++ ++ /* Backfill in cookie, verf and number of devices encoded */ ++ p = p_in; ++ WRITE64(gdevl->gd_cookie); ++ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ WRITE32(dev_count); ++ ++ /* Skip over devices */ ++ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid)); ++ ADJUST_ARGS(); ++ ++ /* are we at the end of devices? */ ++ RESERVE_SPACE(4); ++ WRITE32(gdevl->gd_eof); ++ ADJUST_ARGS(); ++ ++ dprintk("%s: done.\n", __func__); ++ ++ nfserr = nfs_ok; ++out: ++ return nfserr; ++out_err: ++ p = p_in; ++ ADJUST_ARGS(); ++ goto out; ++} ++ ++/* For a given device id, have the file system retrieve and encode the ++ * associated device. For file layout, the encoding function is ++ * passed down to the file system. The file system then has the option ++ * of using this encoding function or one of its own. ++ * ++ * Note: the file system must return the XDR size of struct device_addr4 ++ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the ++ * gdir_mincount calculation. 
++ */ ++static __be32 ++nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ struct super_block *sb; ++ int maxcount = 0, type_notify_len = 12; ++ __be32 *p, *p_save = NULL, *p_in = resp->p; ++ struct exp_xdr_stream xdr; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = gdev->gd_sb; ++ ++ if (gdev->gd_maxcount != 0) { ++ /* FIXME: this will be bound by the session max response */ ++ maxcount = svc_max_payload(resp->rqstp); ++ if (maxcount > gdev->gd_maxcount) ++ maxcount = gdev->gd_maxcount; ++ ++ /* Ensure have room for type and notify field */ ++ maxcount -= type_notify_len; ++ if (maxcount < 0) { ++ nfserr = -ETOOSMALL; ++ goto toosmall; ++ } ++ } ++ ++ RESERVE_SPACE(4); ++ WRITE32(gdev->gd_layout_type); ++ ADJUST_ARGS(); ++ ++ /* If maxcount is 0 then just update notifications */ ++ if (gdev->gd_maxcount == 0) ++ goto handle_notifications; ++ ++ xdr.p = p_save = resp->p; ++ xdr.end = resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type, ++ &gdev->gd_devid); ++ if (nfserr) ++ goto err; ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ BUG_ON(xdr.p > xdr.end); ++ ++ /* Update the xdr stream with the number of bytes encoded ++ * by the file system. 
++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++handle_notifications: ++ /* Encode supported device notifications */ ++ RESERVE_SPACE(4); ++ if (sb->s_pnfs_op->set_device_notify) { ++ struct pnfs_devnotify_arg dn_args; ++ ++ dn_args.dn_layout_type = gdev->gd_layout_type; ++ dn_args.dn_devid = gdev->gd_devid; ++ dn_args.dn_notify_types = gdev->gd_notify_types; ++ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args); ++ if (nfserr) ++ goto err; ++ WRITE32(dn_args.dn_notify_types); ++ } else { ++ WRITE32(0); ++ } ++ ADJUST_ARGS(); ++ ++out: ++ return nfserrno(nfserr); ++toosmall: ++ dprintk("%s: maxcount too small\n", __func__); ++ RESERVE_SPACE(4); ++ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len); ++ ADJUST_ARGS(); ++ goto out; ++err: ++ /* Rewind to the beginning */ ++ p = p_in; ++ ADJUST_ARGS(); ++ if (nfserr == -ETOOSMALL) ++ goto toosmall; ++ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr); ++ goto out; ++} ++ ++static __be32 ++nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, ++ __be32 nfserr, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int maxcount, leadcount; ++ struct super_block *sb; ++ struct exp_xdr_stream xdr; ++ __be32 *p, *p_save, *p_start = resp->p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb; ++ maxcount = PAGE_SIZE; ++ if (maxcount > lgp->lg_maxcount) ++ maxcount = lgp->lg_maxcount; ++ ++ /* Check for space on xdr stream */ ++ leadcount = 36 + sizeof(stateid_opaque_t); ++ RESERVE_SPACE(leadcount); ++ /* encode layout metadata after file system encodes layout */ ++ p += XDR_QUADLEN(leadcount); ++ ADJUST_ARGS(); ++ ++ /* Ensure have room for ret_on_close, off, len, iomode, type */ ++ maxcount -= leadcount; ++ if (maxcount < 0) { ++ printk(KERN_ERR "%s: buffer too small\n", __func__); ++ nfserr = nfserr_toosmall; ++ goto err; ++ } ++ ++ /* Set xdr info so file system can encode layout */ ++ xdr.p = p_save = resp->p; ++ xdr.end = 
resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ /* Retrieve, encode, and merge layout; process stateid */ ++ nfserr = nfs4_pnfs_get_layout(lgp, &xdr); ++ if (nfserr) ++ goto err; ++ ++ /* Ensure file system returned enough bytes for the client ++ * to access. ++ */ ++ if (lgp->lg_seg.length < lgp->lg_minlength) { ++ nfserr = nfserr_badlayout; ++ goto err; ++ } ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ ++ /* Rewind to beginning and encode attrs */ ++ resp->p = p_start; ++ RESERVE_SPACE(4); ++ WRITE32(lgp->lg_roc); /* return on close */ ++ ADJUST_ARGS(); ++ nfsd4_encode_stateid(resp, &lgp->lg_sid); ++ RESERVE_SPACE(28); ++ /* Note: response logr_layout array count, always one for now */ ++ WRITE32(1); ++ WRITE64(lgp->lg_seg.offset); ++ WRITE64(lgp->lg_seg.length); ++ WRITE32(lgp->lg_seg.iomode); ++ WRITE32(lgp->lg_seg.layout_type); ++ ++ /* Update the xdr stream with the number of bytes written ++ * by the file system ++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++ return nfs_ok; ++err: ++ resp->p = p_start; ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lcp->res.lc_size_chg); ++ ADJUST_ARGS(); ++ if (lcp->res.lc_size_chg) { ++ RESERVE_SPACE(8); ++ WRITE64(lcp->res.lc_newsize); ++ ADJUST_ARGS(); ++ } ++out: ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lrp->lrs_present != 0); /* got stateid? 
*/ ++ ADJUST_ARGS(); ++ if (lrp->lrs_present) ++ nfsd4_encode_stateid(resp, &lrp->lr_sid); ++out: ++ return nfserr; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) + { +@@ -3129,11 +3659,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, +diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-31 20:41:19.203150982 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-31 20:42:05.565212801 -0400 +@@ -13,10 +13,15 @@ + #include + #include + #include ++#include + + #include "nfsd.h" + #include "cache.h" + ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++#include ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + /* + * We have a single directory with 9 nodes in it. 
+ */ +@@ -49,6 +54,9 @@ enum { + NFSD_Gracetime, + NFSD_RecoveryDir, + #endif ++#ifdef CONFIG_PNFSD ++ NFSD_pnfs_dlm_device, ++#endif + }; + + /* +@@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct fi + static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif ++#ifdef CONFIG_PNFSD ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size); ++#endif + + static ssize_t (*write_op[])(struct file *, char *, size_t) = { + [NFSD_Svc] = write_svc, +@@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device, ++#endif + }; + + static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +@@ -1349,6 +1363,68 @@ static ssize_t write_recoverydir(struct + + #endif + ++#ifdef CONFIG_PNFSD ++ ++static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf, ++ size_t size) ++{ ++ char *mesg = buf; ++ char *pnfs_dlm_device; ++ int max_size = NFSD_PNFS_DLM_DEVICE_MAX; ++ int len, ret = 0; ++ ++ if (size > 0) { ++ ret = -EINVAL; ++ if (size > max_size || buf[size-1] != '\n') ++ return ret; ++ buf[size-1] = 0; ++ ++ pnfs_dlm_device = mesg; ++ len = qword_get(&mesg, pnfs_dlm_device, size); ++ if (len <= 0) ++ return ret; ++ ++ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len); ++ } else ++ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT); ++ ++ return ret <= 0 ? 
ret : strlen(buf); ++} ++ ++/** ++ * write_pnfs_dlm_device - Set or report the current pNFS data server list ++ * ++ * Input: ++ * buf: ignored ++ * size: zero ++ * ++ * OR ++ * ++ * Input: ++ * buf: C string containing a block device name, ++ * a colon, and then a comma separated ++ * list of pNFS data server IPv4 addresses ++ * size: non-zero length of C string in @buf ++ * Output: ++ * On success: passed-in buffer filled with '\n'-terminated C ++ * string containing a block device name, a colon, and ++ * then a comma separated list of pNFS ++ * data server IPv4 addresses. ++ * return code is the size in bytes of the string ++ * On error: return code is a negative errno value ++ */ ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __write_pnfs_dlm_device(file, buf, size); ++ mutex_unlock(&nfsd_mutex); ++ return rv; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ + /*----------------------------------------------------------------------------*/ + /* + * populating the filesystem. 
+@@ -1383,6 +1459,10 @@ static int nfsd_fill_super(struct super_ + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops, ++ S_IWUSR|S_IRUSR}, ++#endif + /* last one */ {""} + }; + return simple_fill_super(sb, 0x6e667364, nfsd_files); +@@ -1421,6 +1501,9 @@ static int create_proc_exports_entry(voi + } + #endif + ++#if defined(CONFIG_SPNFS_BLOCK) ++int nfsd_bl_init(void); ++#endif + static int __init init_nfsd(void) + { + int retval; +@@ -1443,6 +1526,15 @@ static int __init init_nfsd(void) + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ retval = spnfs_init_proc(); ++ if (retval != 0) ++ goto out_free_idmap; ++#if defined(CONFIG_SPNFS_BLOCK) ++ nfsd_bl_init(); ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_all; +@@ -1465,7 +1557,22 @@ out_free_stat: + + static void __exit exit_nfsd(void) + { ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ remove_proc_entry("fs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/spnfs/layoutsegsize", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ /* Names match spnfs_init_proc(); the directory itself goes last. */ ++ remove_proc_entry("fs/spnfs/recall", NULL); ++ remove_proc_entry("fs/spnfs/getfh", NULL); ++ remove_proc_entry("fs/spnfs/config", NULL); ++ remove_proc_entry("fs/spnfs/ctl", NULL); ++ remove_proc_entry("fs/spnfs", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + nfsd_export_shutdown(); ++ nfsd4_pnfs_dlm_shutdown(); + nfsd_reply_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); +
remove_proc_entry("fs/nfs", NULL); +diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-31 20:41:19.204160960 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-31 20:42:05.565212801 -0400 +@@ -285,11 +285,17 @@ extern time_t nfsd4_grace; + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + ++#if defined(CONFIG_PNFSD) ++#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ ++ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES) ++#else /* CONFIG_PNFSD */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 ++#endif /* CONFIG_PNFSD */ + + #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ +- (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) ++ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \ ++ FATTR4_WORD2_LAYOUT_BLKSIZE) + + static inline u32 nfsd_suppattrs0(u32 minorversion) + { +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-31 20:42:05.566222921 -0400 +@@ -10,6 +10,7 @@ + #include + + #include ++#include + #include "nfsd.h" + #include "vfs.h" + #include "auth.h" +@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s + static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) + { + struct knfsd_fh *fh = &fhp->fh_handle; ++ int fsid_type; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; +@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct + return error; + if (fh->fh_auth_type != 0) + return error; +- len = key_len(fh->fh_fsid_type) / 4; ++ fsid_type = pnfs_fh_fsid_type(fh); ++ len = key_len(fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { +@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct + data_left -= len; + if (data_left < 0) + 
return error; +- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); ++ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth); + fid = (struct fid *)(fh->fh_auth + len); + } else { + __u32 tfh[2]; +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-31 20:42:05.567233002 -0400 +@@ -14,6 +14,7 @@ enum nfsd_fsid { + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, ++ FSID_MAX + }; + + enum fsid_source { +@@ -205,4 +206,42 @@ fh_unlock(struct svc_fh *fhp) + } + } + ++#if defined(CONFIG_PNFSD) ++ ++/* ++ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied ++ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how ++ * to handle a given stateid. ++ */ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return fh->fh_fsid_type >= FSID_MAX; ++} ++ ++static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh) ++{ ++ BUG_ON(fh->fh_version != 1); ++ BUG_ON(pnfs_fh_is_ds(fh)); ++ fh->fh_fsid_type += FSID_MAX; ++} ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ ++/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). 
*/ ++static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh) ++{ ++ int fsid_type = fh->fh_fsid_type; ++ ++ if (pnfs_fh_is_ds(fh)) ++ return fsid_type - FSID_MAX; ++ return fsid_type; ++} ++ + #endif /* _LINUX_NFSD_FH_INT_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-31 20:41:17.274232911 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-31 20:42:05.568144414 -0400 +@@ -115,7 +115,7 @@ struct svc_program nfsd_program = { + + }; + +-u32 nfsd_supported_minorversion; ++u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION; + + int nfsd_vers(int vers, enum vers_op change) + { +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-31 20:42:05.569090615 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-31 20:42:05.569090615 -0400 +@@ -0,0 +1,143 @@ ++/* ++ * Copyright (c) 2005 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#ifndef LINUX_NFSD_PNFSD_H ++#define LINUX_NFSD_PNFSD_H ++ ++#include ++#include ++ ++#include "state.h" ++#include "xdr4.h" ++ ++/* outstanding layout stateid */ ++struct nfs4_layout_state { ++ struct list_head ls_perfile; ++ struct list_head ls_layouts; /* list of nfs4_layouts */ ++ struct kref ls_ref; ++ struct nfs4_client *ls_client; ++ struct nfs4_file *ls_file; ++ stateid_t ls_stateid; ++}; ++ ++/* outstanding layout */ ++struct nfs4_layout { ++ struct list_head lo_perfile; /* hash by f_id */ ++ struct list_head lo_perclnt; /* hash by clientid */ ++ struct list_head lo_perstate; ++ struct nfs4_file *lo_file; /* backpointer */ ++ struct nfs4_client *lo_client; ++ struct nfs4_layout_state *lo_state; ++ struct nfsd4_layout_seg lo_seg; ++}; ++ ++struct pnfs_inval_state { ++ struct knfsd_fh mdsfh; /* needed only by invalidate all */ ++ stateid_t stid; ++ clientid_t clid; ++ u32 status; ++}; ++ ++/* pNFS Data Server state */ ++#define DS_STATEID_VALID 0 ++#define DS_STATEID_ERROR 1 ++#define DS_STATEID_NEW 2 ++ ++struct pnfs_ds_stateid { ++ struct list_head ds_hash; /* ds_stateid hash entry */ ++ struct list_head ds_perclid; /* per client hash entry */ ++ stateid_t ds_stid; ++ struct knfsd_fh 
ds_fh; ++ unsigned long ds_access; ++ u32 ds_status; /* from MDS */ ++ u32 ds_verifier[2]; /* from MDS */ ++ wait_queue_head_t ds_waitq; ++ unsigned long ds_flags; ++ struct kref ds_ref; ++ clientid_t ds_mdsclid; ++}; ++ ++struct pnfs_ds_clientid { ++ struct list_head dc_hash; /* mds_clid_hashtbl entry */ ++ struct list_head dc_stateid; /* ds_stateid head */ ++ struct list_head dc_permdsid; /* per mdsid hash entry */ ++ clientid_t dc_mdsclid; ++ struct kref dc_ref; ++ uint32_t dc_mdsid; ++}; ++ ++struct pnfs_mds_id { ++ struct list_head di_hash; /* mds_nodeid list entry */ ++ struct list_head di_mdsclid; /* mds_clientid head */ ++ uint32_t di_mdsid; ++ time_t di_mdsboot; /* mds boot time */ ++ struct kref di_ref; ++}; ++ ++/* notify device request (from exported filesystem) */ ++struct nfs4_notify_device { ++ struct nfsd4_pnfs_cb_dev_list *nd_list; ++ struct nfs4_client *nd_client; ++ struct list_head nd_perclnt; ++ ++ void *nd_args; /* nfsd internal */ ++}; ++ ++u64 find_create_sbid(struct super_block *); ++struct super_block *find_sbid_id(u64); ++__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *); ++int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *, ++ struct nfsd4_pnfs_layoutreturn *); ++int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *); ++int nfs4_pnfs_cb_change_state(struct pnfs_get_state *); ++void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++int put_layoutrecall(struct nfs4_layoutrecall *); ++void nomatching_layout(struct nfs4_layoutrecall *); ++void *layoutrecall_done(struct nfs4_layoutrecall *); ++int nfsd4_cb_layout(struct nfs4_layoutrecall *); ++int nfsd_layout_recall_cb(struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++int nfsd_device_notify_cb(struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++int nfsd4_cb_notify_device(struct nfs4_notify_device *); ++void pnfs_set_device_notify(clientid_t *, unsigned int types); ++void 
pnfs_clear_device_notify(struct nfs4_client *); ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++extern struct sockaddr pnfsd_lexp_addr; ++extern size_t pnfs_lexp_addr_len; ++ ++extern void pnfsd_lexp_init(struct inode *); ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#endif /* LINUX_NFSD_PNFSD_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-31 20:42:05.569090615 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-31 20:42:05.569090615 -0400 +@@ -0,0 +1,225 @@ ++/* ++ * linux/fs/nfsd/pnfs_lexp.c ++ * ++ * pNFS export of local filesystems. ++ * ++ * Export local file systems over the files layout type. ++ * The MDS (metadata server) functions also as a single DS (data server). ++ * This is mostly useful for development and debugging purposes. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * Copyright (C) 2008 Benny Halevy, ++ * ++ * Initial implementation was based on the pnfs-gfs2 patches done ++ * by David M. 
Richter ++ */ ++ ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++struct sockaddr pnfsd_lexp_addr; ++size_t pnfs_lexp_addr_len; ++ ++static int ++pnfsd_lexp_layout_type(struct super_block *sb) ++{ ++ int ret = LAYOUT_NFSV4_1_FILES; ++ dprintk("<-- %s: return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++pnfsd_lexp_get_device_iter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ ++ dprintk("<-- %s: return 0\n", __func__); ++ return 0; ++} ++ ++static int ++pnfsd_lexp_get_device_info(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_multipath fl_devices[1]; ++ u32 fl_stripe_indices[1] = { 0 }; ++ struct pnfs_filelayout_devaddr daddr; ++ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */ ++ char daddr_buf[8*4 + 2*3 + 10]; ++ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ memset(&fdev, '\0', sizeof(fdev)); ++ ++ if (devid->devid != 1) { ++ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 " ++ "(got: 0x%llx)\n", __func__, devid->devid); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ /* count the number of comma-delimited DS IPs */ ++ fdev.fl_device_length = 1; ++ fdev.fl_device_list = fl_devices; ++ ++ fdev.fl_stripeindices_length = fdev.fl_device_length; ++ fdev.fl_stripeindices_list = fl_stripe_indices; ++ ++ daddr.r_addr.data = daddr_buf; ++ daddr.r_addr.len = sizeof(daddr_buf); ++ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr); ++ if (err < 0) ++ goto out; ++ daddr.r_addr.len = err; ++ switch (pnfsd_lexp_addr.sa_family) 
{ ++ case AF_INET: ++ daddr.r_netid.data = "tcp"; ++ daddr.r_netid.len = 3; ++ break; ++ case AF_INET6: ++ daddr.r_netid.data = "tcp6"; ++ daddr.r_netid.len = 4; ++ break; ++ default: ++ BUG(); ++ } ++ fdev.fl_device_list[0].fl_multipath_length = 1; ++ fdev.fl_device_list[0].fl_multipath_list = &daddr; ++ ++ /* have nfsd encode the device info */ ++ err = filelayout_encode_devinfo(xdr, &fdev); ++out: ++ dprintk("<-- %s: return %d\n", __func__, err); ++ return err; ++} ++ ++static int get_stripe_unit(int blocksize) ++{ ++ if (blocksize < NFSSVC_MAXBLKSIZE) ++ blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize); ++ dprintk("%s: return %d\n", __func__, blocksize); ++ return blocksize; ++} ++ ++static enum nfsstat4 ++pnfsd_lexp_layout_get(struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ enum nfsstat4 rc = NFS4_OK; ++ struct pnfs_filelayout_layout *layout = NULL; ++ struct knfsd_fh *fhp = NULL; ++ ++ dprintk("--> %s: inode=%p\n", __func__, inode); ++ ++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ ++ layout = kzalloc(sizeof(*layout), GFP_KERNEL); ++ if (layout == NULL) { ++ rc = NFS4ERR_LAYOUTTRYLATER; /* rc is an nfsstat4, not an errno */ ++ goto error; ++ } ++ ++ /* Set file layout response args */ ++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; ++ layout->lg_stripe_type = STRIPE_SPARSE; ++ layout->lg_commit_through_mds = true; ++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); ++ layout->lg_fh_length = 1; ++ layout->device_id.sbid = arg->lg_sbid; ++ layout->device_id.devid = 1; /*FSFTEMP*/ ++ layout->lg_first_stripe_index = 0; /*FSFTEMP*/ ++ layout->lg_pattern_offset = 0; ++ ++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); ++ if (fhp == NULL) { ++ rc = NFS4ERR_LAYOUTTRYLATER; ++ goto error; ++ } ++ ++ memcpy(fhp, arg->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc
= filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ dprintk("<-- %s: return %d\n", __func__, rc); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++pnfsd_lexp_layout_commit(struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int ++pnfsd_lexp_layout_return(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh, ++ struct pnfs_get_state *p) ++{ ++ return 0; /* just use the current stateid */ ++} ++ ++static struct pnfs_export_operations pnfsd_lexp_ops = { ++ .layout_type = pnfsd_lexp_layout_type, ++ .get_device_info = pnfsd_lexp_get_device_info, ++ .get_device_iter = pnfsd_lexp_get_device_iter, ++ .layout_get = pnfsd_lexp_layout_get, ++ .layout_commit = pnfsd_lexp_layout_commit, ++ .layout_return = pnfsd_lexp_layout_return, ++ .get_state = pnfsd_lexp_get_state, ++}; ++ ++void ++pnfsd_lexp_init(struct inode *inode) ++{ ++ dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops); ++ inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c +--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-31 20:42:05.570119170 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-31 20:42:05.570119170 -0400 +@@ -0,0 +1,535 @@ ++/* ++ * fs/nfsd/spnfs_com.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * Based heavily on idmap.c ++ * ++ */ ++ ++/* ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Marius Aamodt Eriksen ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++static ssize_t spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t spnfs_pipe_downcall(struct file *, const char __user *, ++ size_t); ++static void spnfs_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops spnfs_upcall_ops = { ++ .upcall = spnfs_pipe_upcall, ++ .downcall = spnfs_pipe_downcall, ++ .destroy_msg = spnfs_pipe_destroy_msg, ++}; ++ ++/* evil global variable */ ++struct spnfs *global_spnfs; ++struct spnfs_config *spnfs_config; ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++int spnfs_use_layoutsegments; ++uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++/* ++ * Used by spnfs_enabled() ++ * Tracks if the subsystem has been initialized at some point. It doesn't ++ * matter if it's not currently initialized. ++ */ ++static int spnfs_enabled_at_some_point; ++ ++/* call this to start the ball rolling */ ++/* code it like we're going to avoid the global variable in the future */ ++int ++nfsd_spnfs_new(void) ++{ ++ struct spnfs *spnfs = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ if (global_spnfs != NULL) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL); ++ if (spnfs == NULL){ ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs, ++ &spnfs_upcall_ops, 0); ++ if (IS_ERR(spnfs->spnfs_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ ++ mutex_init(&spnfs->spnfs_lock); ++ mutex_init(&spnfs->spnfs_plock); ++ init_waitqueue_head(&spnfs->spnfs_wq); ++ ++ global_spnfs = spnfs; ++ spnfs_enabled_at_some_point = 1; ++ ++ return 0; 
++err: ++ rpc_put_mount(); ++ kfree(spnfs); ++ return rc; ++} ++ ++/* again, code it like we're going to remove the global variable */ ++void ++nfsd_spnfs_delete(void) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ ++ if (!spnfs) ++ return; ++ rpc_unlink(spnfs->spnfs_dentry); ++ rpc_put_mount(); ++ global_spnfs = NULL; ++ kfree(spnfs); ++} ++ ++/* RPC pipefs upcall/downcall routines */ ++/* looks like this code is invoked by the rpc_pipe code */ ++/* to handle upcalls on things we've queued elsewhere */ ++/* See nfs_idmap_id for an example of enqueueing */ ++static ssize_t ++spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, ++ char __user *dst, size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied; ++ ssize_t left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ return mlen; ++} ++ ++static ssize_t ++spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ struct spnfs *spnfs = (struct spnfs *)rpci->private; ++ struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im; ++ int ret; ++ ++ if (mlen != sizeof(struct spnfs_msg)) ++ return -ENOSPC; ++ ++ /* ++ * memdup_user() releases the buffer itself when the copy faults. ++ */ ++ im_in = memdup_user(src, mlen); ++ if (IS_ERR(im_in)) ++ return PTR_ERR(im_in); ++ ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ ret = mlen; ++ im->im_status = im_in->im_status; ++ /* If we got an error, terminate now, and wake up pending upcalls */ ++ if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) { ++ wake_up(&spnfs->spnfs_wq); ++ goto out; ++ } ++ ++ ret = -EINVAL; ++ /* Did we match the current upcall?
*/ ++ /* DMXXX: do not understand the comment above, from original code */ ++ /* DMXXX: when do we _not_ match the current upcall? */ ++ /* DMXXX: anyway, let's to a simplistic check */ ++ if (im_in->im_type == im->im_type) { ++ /* copy the response into the spnfs struct */ ++ memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res)); ++ ret = mlen; ++ } else ++ dprintk("spnfs: downcall type != upcall type\n"); ++ ++ ++ wake_up(&spnfs->spnfs_wq); ++/* DMXXX handle rval processing */ ++out: ++ mutex_unlock(&spnfs->spnfs_plock); ++ kfree(im_in); ++ return ret; ++} ++ ++static void ++spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ struct spnfs_msg *im = msg->data; ++ struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im); ++ ++ if (msg->errno >= 0) ++ return; ++ mutex_lock(&spnfs->spnfs_plock); ++ im->im_status = SPNFS_STATUS_FAIL; /* DMXXX */ ++ wake_up(&spnfs->spnfs_wq); ++ mutex_unlock(&spnfs->spnfs_plock); ++} ++ ++/* generic upcall. called by functions in spnfs_ops.c */ ++int ++spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg, ++ union spnfs_msg_res *res) ++{ ++ struct rpc_pipe_msg msg; ++ struct spnfs_msg *im; ++ DECLARE_WAITQUEUE(wq, current); ++ int ret = -EIO; ++ int rval; ++ ++ im = &spnfs->spnfs_im; ++ ++ mutex_lock(&spnfs->spnfs_lock); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ memset(im, 0, sizeof(*im)); ++ memcpy(im, upmsg, sizeof(*upmsg)); ++ ++ memset(&msg, 0, sizeof(msg)); ++ msg.data = im; ++ msg.len = sizeof(*im); ++ ++ add_wait_queue(&spnfs->spnfs_wq, &wq); ++ rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&spnfs->spnfs_plock); ++ schedule(); ++ current->state = TASK_RUNNING; ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ if (im->im_status & SPNFS_STATUS_SUCCESS) { ++ /* copy our result from the upcall */ ++ memcpy(res, 
&im->im_res, sizeof(*res)); ++ ret = 0; ++ } ++ ++out: ++ memset(im, 0, sizeof(*im)); ++ mutex_unlock(&spnfs->spnfs_plock); ++ mutex_unlock(&spnfs->spnfs_lock); ++ return(ret); ++} ++ ++/* ++ * This is used to determine if the spnfsd daemon has been started at ++ * least once since the system came up. This is used by the export ++ * mechanism to decide if spnfs is in use. ++ * ++ * Returns non-zero if the spnfsd has initialized the communication pipe ++ * at least once. ++ */ ++int spnfs_enabled(void) ++{ ++ return spnfs_enabled_at_some_point; ++} ++ ++#ifdef CONFIG_PROC_FS ++ ++/* ++ * procfs virtual files for user/kernel space communication: ++ * ++ * ctl - currently just an on/off switch...can be expanded ++ * getfh - fd to fh conversion ++ * recall - recall a layout from the command line, for example: ++ * echo > /proc/fs/spnfs/recall ++ * config - configuration info, e.g., stripe size, num ds, etc. ++ */ ++ ++/*-------------- start ctl -------------------------*/ ++static ssize_t ctl_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int cmd, rc; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (cmd) { ++ rc = nfsd_spnfs_new(); ++ if (rc != 0) ++ return rc; ++ } else ++ nfsd_spnfs_delete(); ++ ++ return count; ++} ++ ++static const struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++/*-------------- end ctl ---------------------------*/ ++ ++/*-------------- start config -------------------------*/ ++static ssize_t config_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ static struct spnfs_config cfg; /* fixed size: bound every copy */ ++ ++ if (count > sizeof(cfg) || copy_from_user(&cfg, buf, count)) ++ return count > sizeof(cfg) ? -EINVAL : -EFAULT; ++ ++ spnfs_config = &cfg; ++ return 0; ++} ++ ++static const struct file_operations config_ops = { ++ .write = config_write, ++}; ++/*-------------- end config ---------------------------*/ ++ ++/*-------------- start getfh -----------------------*/ ++static int
getfh_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); ++ if (file->private_data == NULL) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static ssize_t getfh_read(struct file *file, char __user *buf, size_t count, ++ loff_t *offset) ++{ ++ if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh))) ++ return -EFAULT; ++ ++ return count; ++} ++ ++static ssize_t getfh_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int fd; ++ ++ if (copy_from_user((int *)&fd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (spnfs_getfh(fd, file->private_data) != 0) ++ return -EIO; ++ ++ return count; ++} ++ ++static int getfh_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static const struct file_operations getfh_ops = { ++ .open = getfh_open, ++ .read = getfh_read, ++ .write = getfh_write, ++ .release = getfh_release, ++}; ++/*-------------- end getfh ------------------------*/ ++ ++ ++/*-------------- start recall layout --------------*/ ++static ssize_t recall_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char input[128]; ++ char *path, *str, *p; ++ int rc; ++ u64 off = 0, len = 0; ++ ++ if (count > 128) ++ return -EINVAL; ++ ++ if (copy_from_user(input, buf, count)) ++ return -EFAULT; ++ ++ /* assumes newline-terminated path */ ++ p = memchr(input, '\n', count); ++ if (p == NULL) ++ return -EINVAL; ++ *p = '\0'; ++ ++ /* ++ * Scan for path and, optionally, an offset and length ++ * of a layout segment to be recalled; if there are two ++ * fields, they're assumed to be path and offset. 
++ */ ++ p = input; ++ path = strsep(&p, " "); ++ if (path == NULL) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &off); ++ if (rc != 0) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &len); ++ if (rc != 0) ++ return -EINVAL; ++ } ++ } ++ ++ rc = spnfs_test_layoutrecall(path, off, len); ++ if (rc != 0) ++ return rc; ++ ++ return count; ++} ++ ++static const struct file_operations recall_ops = { ++ .write = recall_write, ++}; ++/*-------------- end recall layout --------------*/ ++ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++/*-------------- start layoutseg -------------------------*/ ++static ssize_t layoutseg_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[3]; ++ ++ if (copy_from_user(cmd, buf, 1)) ++ return -EFAULT; ++ if (cmd[0] == '0') ++ spnfs_use_layoutsegments = 0; ++ else ++ spnfs_use_layoutsegments = 1; ++ ++ return count; ++} ++ ++static const struct file_operations layoutseg_ops = { ++ .write = layoutseg_write, ++}; ++/*-------------- end layoutseg ---------------------------*/ ++ ++/*-------------- start layoutsegsize -------------------------*/ ++static ssize_t layoutsegsize_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[50] = ""; /* zeroed: input stays NUL-terminated */ ++ ++ if (copy_from_user(cmd, buf, min(count, sizeof(cmd) - 1))) ++ return -EFAULT; ++ layoutsegment_size = simple_strtoull(cmd, NULL, 10); ++ ++ return count; ++} ++ ++static const struct file_operations layoutsegsize_ops = { ++ .write = layoutsegsize_write, ++}; ++/*-------------- end layoutsegsize ---------------------------*/ ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++int ++spnfs_init_proc(void) ++{ ++ struct proc_dir_entry *entry; ++ ++ entry = proc_mkdir("fs/spnfs", NULL); ++ if (!entry) ++ return -ENOMEM; ++ ++ entry = create_proc_entry("fs/spnfs/ctl", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &ctl_ops; ++
entry = create_proc_entry("fs/spnfs/config", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &config_ops; ++ ++ entry = create_proc_entry("fs/spnfs/getfh", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &getfh_ops; ++ ++ entry = create_proc_entry("fs/spnfs/recall", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &recall_ops; ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++ entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutseg_ops; ++ ++ entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutsegsize_ops; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ return 0; ++} ++#endif /* CONFIG_PROC_FS */ +diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c +--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-31 20:42:05.571097807 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-31 20:42:05.572091128 -0400 +@@ -0,0 +1,878 @@ ++/* ++ * fs/nfsd/spnfs_ops.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * ++ */ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++/* comment out CONFIG_SPNFS_TEST for non-test behaviour */ ++/* #define CONFIG_SPNFS_TEST 1 */ ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++/* ++ * The functions that are called from elsewhere in the kernel ++ * to perform tasks in userspace ++ * ++ */ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++extern int spnfs_use_layoutsegments; ++extern uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++extern struct spnfs *global_spnfs; ++ ++int ++spnfs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++enum nfsstat4 ++spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *lg_arg, ++ struct nfsd4_pnfs_layoutget_res *lg_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct pnfs_filelayout_layout *flp = NULL; ++ int status, i; ++ enum nfsstat4 nfserr; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ 
im->im_type = SPNFS_TYPE_LAYOUTGET; ++ im->im_args.layoutget_args.inode = inode->i_ino; ++ im->im_args.layoutget_args.generation = inode->i_generation; ++ ++ /* call function to queue the msg for upcall */ ++ if (spnfs_upcall(spnfs, im, res) != 0) { ++ dprintk("failed spnfs upcall: layoutget\n"); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto layoutget_cleanup; ++ } ++ status = res->layoutget_res.status; ++ if (status != 0) { ++ /* FIXME? until user mode is fixed, translate system error */ ++ switch (status) { ++ case -E2BIG: ++ case -ETOOSMALL: ++ nfserr = NFS4ERR_TOOSMALL; ++ break; ++ case -ENOMEM: ++ case -EAGAIN: ++ case -EINTR: ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ break; ++ case -ENOENT: ++ nfserr = NFS4ERR_BADLAYOUT; ++ break; ++ default: ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ } ++ dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n", ++ status, nfserr); ++ goto layoutget_cleanup; ++ } ++ ++ lg_res->lg_return_on_close = 0; ++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ /* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */ ++ /* the amount requested by the client. 
*/
++	if (spnfs_use_layoutsegments) {
++		if (layoutsegment_size != 0)
++			lg_res->lg_seg.length = layoutsegment_size;
++	} else
++		lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#else
++	lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++
++	flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL);
++	if (flp == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	flp->device_id.sbid = lg_arg->lg_sbid;
++	flp->device_id.devid = res->layoutget_res.devid;
++	flp->lg_layout_type = 1; /* XXX */
++	flp->lg_stripe_type = res->layoutget_res.stripe_type;
++	flp->lg_commit_through_mds = 0;
++	flp->lg_stripe_unit = res->layoutget_res.stripe_size;
++	flp->lg_first_stripe_index = 0;
++	flp->lg_pattern_offset = 0;
++	flp->lg_fh_length = res->layoutget_res.stripe_count; /* from upcall reply */
++
++	flp->lg_fh_list = kcalloc(flp->lg_fh_length, sizeof(struct knfsd_fh),
++				  GFP_KERNEL);	/* NULL if count * size wraps */
++	if (flp->lg_fh_list == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	/*
++	 * FIX: Doing an extra copy here.  Should group res.flist's fh_len
++	 * and fh_val into a knfsd_fh structure.
++ */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len; ++ memcpy(&flp->lg_fh_list[i].fh_base, ++ res->layoutget_res.flist[i].fh_val, ++ res->layoutget_res.flist[i].fh_len); ++ } ++ ++ /* encode the layoutget body */ ++ nfserr = filelayout_encode_layout(xdr, flp); ++ ++layoutget_cleanup: ++ if (flp) { ++ if (flp->lg_fh_list) ++ kfree(flp->lg_fh_list); ++ kfree(flp); ++ } ++ kfree(im); ++ kfree(res); ++ ++ return nfserr; ++} ++ ++int ++spnfs_layoutcommit(void) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutreturn(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for ino = %lu\n", ++ __func__, inode->i_ino); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* XXX figure out how to get a sb since there's no inode ptr */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = offset; ++ lr.cbl_seg.length = len; ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ lr.cbl_layoutchanged = 0; ++ ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ ++ return 0; ++} ++ ++ ++int ++spnfs_test_layoutrecall(char *path, u64 offset, u64 len) ++{ ++ struct nameidata nd; ++ struct inode *inode; ++ int type, rc; ++ ++ dprintk("%s: path=%s, offset=%llu, len=%llu\n", ++ __func__, path, offset, len); ++ ++ if (strcmp(path, "all") == 0) { ++ inode = NULL; ++ type = RETURN_ALL; ++ } else { ++ rc = path_lookup(path, 0, &nd); ++ if (rc != 0) ++ return -ENOENT; 
++ ++ /* ++ * XXX todo: add a RETURN_FSID scenario here...maybe if ++ * inode is a dir... ++ */ ++ ++ inode = nd.path.dentry->d_inode; ++ type = RETURN_FILE; ++ } ++ ++ if (len == 0) ++ len = NFS4_MAX_UINT64; ++ ++ rc = spnfs_layoutrecall(inode, type, offset, len); ++ ++ if (type != RETURN_ALL) ++ path_put(&nd.path); ++ return rc; ++} ++ ++int ++spnfs_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *gd_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* XXX keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEITER; ++ im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie; ++ im->im_args.getdeviceiter_args.verf = gd_res->gd_verf; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceiter_out; ++ } ++ status = res->getdeviceiter_res.status; ++ ++ if (res->getdeviceiter_res.eof) ++ gd_res->gd_eof = 1; ++ else { ++ gd_res->gd_devid = res->getdeviceiter_res.devid; ++ gd_res->gd_cookie = res->getdeviceiter_res.cookie; ++ gd_res->gd_verf = res->getdeviceiter_res.verf; ++ gd_res->gd_eof = 0; ++ } ++ ++getdeviceiter_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++#ifdef CONFIG_SPNFS_TEST ++/* ++ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the ++ * 1024 encoded stripe indices. ++ * ++ * Skip the devaddr4 length and encode the indicies count (1024) in the ++ * rq_res.head and set the rq_res.head length. 
++ * ++ * Set the rq_res page_len to 4096 (for the 1024 stripe indices). ++ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the ++ * rq_res head to hold the rest of the getdeviceinfo return. ++ * ++ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and ++ * rq_respages[rq_resused] contains the rq_res.pages. ++ */ ++static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ struct nfsd4_compoundres *resp = info->resp; ++ struct svc_rqst *rqstp = resp->rqstp; ++ struct xdr_buf *xb = &resp->rqstp->rq_res; ++ __be32 *p; ++ ++ p = nfsd4_xdr_reserve_space(resp, 8); ++ p++; /* Fill in length later */ ++ *p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */ ++ resp->p = p; ++ ++ xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base; ++ xb->pages = &rqstp->rq_respages[rqstp->rq_resused]; ++ xb->page_base = 0; ++ xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */ ++ xb->tail[0].iov_base = resp->p; ++ resp->end = xb->head[0].iov_base + PAGE_SIZE; ++ xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p; ++ return 0; ++} ++/* ++ * Return a stripeindices of length 1024 to test ++ * the pNFS client multipage getdeviceinfo implementation. ++ * ++ * Encode a page of stripe indices. 
++ */ ++static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev, ++ struct spnfs_device *dev, ++ struct pnfs_devinfo_arg *info) ++{ ++ struct svc_rqst *rqstp = info->xdr.resp->rqstp; ++ __be32 *p; ++ int i, j = 0; ++ ++ p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]); ++ fldev->fl_stripeindices_length = 1024; ++ /* round-robin the data servers device index into the stripe indicie */ ++ for (i = 0; i < 1024; i++) { ++ *p++ = cpu_to_be32(j); ++ if (j < dev->dscount - 1) ++ j++; ++ else ++ j = 0; ++ } ++ fldev->fl_stripeindices_list = NULL; ++} ++#endif /* CONFIG_SPNFS_TEST */ ++ ++int ++spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct spnfs_device *dev; ++ struct pnfs_filelayout_device *fldev = NULL; ++ struct pnfs_filelayout_multipath *mp = NULL; ++ struct pnfs_filelayout_devaddr *fldap = NULL; ++ int status = 0, i, len; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEINFO; ++ /* XXX FIX: figure out what to do about fsid */ ++ im->im_args.getdeviceinfo_args.devid = devid->devid; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceinfo_out; ++ } ++ status = res->getdeviceinfo_res.status; ++ if (status != 0) ++ goto getdeviceinfo_out; ++ ++ dev = &res->getdeviceinfo_res.devinfo; ++ ++ /* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */ ++ fldev = kzalloc(sizeof(struct pnfs_filelayout_device), 
GFP_KERNEL);
++	if (fldev == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++
++	/* Stripe count is the same as data server count for our purposes. */
++	/* NOTE(review): dev->dscount is used as received in the upcall */
++	/* reply, with no range check against dslist[] visible here - confirm. */
++	fldev->fl_stripeindices_length = dev->dscount;
++	fldev->fl_device_length = dev->dscount;
++
++	/* Set stripe indices */
++#ifdef CONFIG_SPNFS_TEST
++	spnfs_set_test_indices(fldev, dev, info);
++	fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr;
++#else /* CONFIG_SPNFS_TEST */
++	fldev->fl_stripeindices_list =
++		kcalloc(fldev->fl_stripeindices_length, sizeof(u32),
++			GFP_KERNEL);
++	if (fldev->fl_stripeindices_list == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++	for (i = 0; i < fldev->fl_stripeindices_length; i++)
++		fldev->fl_stripeindices_list[i] = i;
++#endif /* CONFIG_SPNFS_TEST */
++
++	/*
++	 * Set the device's data server addresses.  No multipath for spnfs,
++	 * so mp length is always 1.  The list is zero-filled (kcalloc) so the
++	 * cleanup code below can tell which entries were actually set up.
++	 */
++	fldev->fl_device_list = kcalloc(fldev->fl_device_length,
++				sizeof(struct pnfs_filelayout_multipath),
++				GFP_KERNEL);
++	if (fldev->fl_device_list == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++	for (i = 0; i < fldev->fl_device_length; i++) {
++		mp = &fldev->fl_device_list[i];
++		mp->fl_multipath_length = 1;
++		fldap = kzalloc(sizeof(*fldap), GFP_KERNEL);
++		mp->fl_multipath_list = fldap;	/* owned by the list from here */
++		if (fldap == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++
++		/*
++		 * Copy the netid into the device address, for example: "tcp"
++		 */
++		len = strlen(dev->dslist[i].netid);
++		fldap->r_netid.data = kmalloc(len, GFP_KERNEL);
++		if (fldap->r_netid.data == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		memcpy(fldap->r_netid.data, dev->dslist[i].netid, len);
++		fldap->r_netid.len = len;
++
++		/*
++		 * Copy the network address into the device address,
++		 * for example: "10.35.9.16.08.01"
++		 */
++		len = strlen(dev->dslist[i].addr);
++		fldap->r_addr.data = kmalloc(len, GFP_KERNEL);
++		if (fldap->r_addr.data == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		memcpy(fldap->r_addr.data, dev->dslist[i].addr, len);
++		fldap->r_addr.len = len;
++	}
++
++	/* encode the device data */
++	status = filelayout_encode_devinfo(xdr, fldev);
++
++getdeviceinfo_out:
++	if (fldev) {
++		kfree(fldev->fl_stripeindices_list);
++		if (fldev->fl_device_list) {
++			for (i = 0; i < fldev->fl_device_length; i++) {
++				fldap =
++				    fldev->fl_device_list[i].fl_multipath_list;
++				/* entries are filled in order; NULL = setup stopped */
++				if (fldap == NULL)
++					break;
++				kfree(fldap->r_netid.data);
++				kfree(fldap->r_addr.data);
++				kfree(fldap);
++			}
++			kfree(fldev->fl_device_list);
++		}
++		kfree(fldev);
++	}
++
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++int
++spnfs_setattr(void)
++{
++	return 0;
++}
++
++int
++spnfs_open(struct inode *inode, struct nfsd4_open *open)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto open_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto open_out;
++	}
++
++	im->im_type = SPNFS_TYPE_OPEN;
++	im->im_args.open_args.inode = inode->i_ino;
++	im->im_args.open_args.generation = inode->i_generation;
++	im->im_args.open_args.create = open->op_create;
++	im->im_args.open_args.createmode = open->op_createmode;
++	im->im_args.open_args.truncate = open->op_truncate;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto open_out;
++	}
++	status = res->open_res.status;
++
++open_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++int
++spnfs_create(void)
++{
++	return 0;
++}
++
++/*
++ * Invokes the spnfsd with the inode number of the object to
remove. ++ * The file has already been removed on the MDS, so all the spnsfd ++ * daemon does is remove the stripes. ++ * Returns 0 on success otherwise error code ++ */ ++int ++spnfs_remove(unsigned long ino, unsigned long generation) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_REMOVE; ++ im->im_args.remove_args.inode = ino; ++ im->im_args.remove_args.generation = generation; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto remove_out; ++ } ++ status = res->remove_res.status; ++ ++remove_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++static int ++read_one(struct inode *inode, loff_t offset, size_t len, char *buf, ++ struct file **filp) ++{ ++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; ++ size_t iolen; ++ int completed = 0, ds, err; ++ ++ while (len > 0) { ++ tmp = offset; ++ soff = do_div(tmp, spnfs_config->stripe_size); ++ snum = tmp; ++ ds = do_div(tmp, spnfs_config->num_ds); ++ if (spnfs_config->dense_striping == 0) ++ soffset = offset; ++ else { ++ tmp = snum; ++ do_div(tmp, spnfs_config->num_ds); ++ soffset = tmp * spnfs_config->stripe_size + soff; ++ } ++ if (len < spnfs_config->stripe_size - soff) ++ iolen = len; ++ else ++ iolen = spnfs_config->stripe_size - soff; ++ ++ pos = soffset; ++ err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos); ++ if (err < 0) ++ return -EIO; ++ if (err == 0) ++ break; ++ filp[ds]->f_pos = pos; ++ iolen = err; ++ completed += iolen; ++ len -= iolen; ++ 
offset += iolen;
++		bufoffset += iolen;
++	}
++
++	return completed;
++}
++
++static __be32
++read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++     struct svc_rqst *rqstp)
++{
++	int i, vnum, err, bytecount = 0;
++	char path[128];
++	struct file *filp[SPNFS_MAX_DATA_SERVERS];
++	size_t iolen;
++	__be32 status = nfs_ok;
++
++	/*
++	 * XXX We should just be doing this at open time, but it gets
++	 * kind of messy storing this info in nfsd's state structures
++	 * and piggybacking its path through the various state handling
++	 * functions.  Revisit this.
++	 */
++	memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		snprintf(path, sizeof(path), "%s/%ld.%u", spnfs_config->ds_dir[i],
++			 inode->i_ino, inode->i_generation);
++		filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0);
++		if (IS_ERR(filp[i])) {	/* filp_open fails with ERR_PTR, not NULL */
++			status = nfserr_io;
++			goto read_out;
++		}
++		get_file(filp[i]);
++	}
++
++	for (vnum = 0 ; vnum < vlen ; vnum++) {
++		iolen = rqstp->rq_vec[vnum].iov_len;
++		err = read_one(inode, offset + bytecount, iolen,
++			       (char *)rqstp->rq_vec[vnum].iov_base, filp);
++		if (err < 0) {
++			status = nfserr_io;
++			goto read_out;
++		}
++		if (err < iolen) {	/* short read: report what we got */
++			bytecount += err;
++			goto read_out;
++		}
++		bytecount += rqstp->rq_vec[vnum].iov_len;
++	}
++
++read_out:
++	*lenp = bytecount;
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		if (filp[i] && !IS_ERR(filp[i])) { /* skip unopened/failed slots */
++			filp_close(filp[i], current->files);
++			fput(filp[i]);	/* drops the get_file() reference */
++		}
++	}
++	return status;
++}
++
++__be32
++spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++	   struct svc_rqst *rqstp)
++{
++	if (spnfs_config)
++		return read(inode, offset, lenp, vlen, rqstp);
++	else {
++		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++		return nfserr_notsupp;
++	}
++}
++
++static int
++write_one(struct inode *inode, loff_t offset, size_t len, char *buf,
++	  struct file **filp)
++{
++	loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
++	size_t iolen;
++	int 
completed = 0, ds, err;
++
++	while (len > 0) {
++		tmp = offset;
++		soff = do_div(tmp, spnfs_config->stripe_size);
++		snum = tmp;
++		ds = do_div(tmp, spnfs_config->num_ds);
++		if (spnfs_config->dense_striping == 0)
++			soffset = offset;
++		else {
++			tmp = snum;
++			do_div(tmp, spnfs_config->num_ds);
++			soffset = tmp * spnfs_config->stripe_size + soff;
++		}
++		if (len < spnfs_config->stripe_size - soff)
++			iolen = len;
++		else
++			iolen = spnfs_config->stripe_size - soff;
++
++		pos = soffset;
++		err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos);
++		if (err <= 0)	/* 0 bytes written = no progress; don't spin */
++			return -EIO;
++		filp[ds]->f_pos = pos;
++		iolen = err;
++		completed += iolen;
++		len -= iolen;
++		offset += iolen;
++		bufoffset += iolen;
++	}
++
++	return completed;
++}
++
++static __be32
++write(struct inode *inode, loff_t offset, size_t len, int vlen,
++      struct svc_rqst *rqstp)
++{
++	int i, vnum, err, bytecount = 0;
++	char path[128];
++	struct file *filp[SPNFS_MAX_DATA_SERVERS];
++	size_t iolen;
++	__be32 status = nfs_ok;
++
++	/*
++	 * XXX We should just be doing this at open time, but it gets
++	 * kind of messy storing this info in nfsd's state structures
++	 * and piggybacking its path through the various state handling
++	 * functions.  Revisit this.
++	 */
++	memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		snprintf(path, sizeof(path), "%s/%ld.%u", spnfs_config->ds_dir[i],
++			 inode->i_ino, inode->i_generation);
++		filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0);
++		if (IS_ERR(filp[i])) {	/* filp_open fails with ERR_PTR, not NULL */
++			status = nfserr_io;
++			goto write_out;
++		}
++		get_file(filp[i]);
++	}
++
++	for (vnum = 0; vnum < vlen; vnum++) {
++		iolen = rqstp->rq_vec[vnum].iov_len;
++		err = write_one(inode, offset + bytecount, iolen,
++				(char *)rqstp->rq_vec[vnum].iov_base, filp);
++		if (err != iolen) {
++			dprintk("spnfs_write: err=%d expected %Zd\n", err, iolen);
++			status = nfserr_io;
++			goto write_out;
++		}
++		bytecount += rqstp->rq_vec[vnum].iov_len;
++	}
++
++write_out:
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		if (filp[i] && !IS_ERR(filp[i])) { /* skip unopened/failed slots */
++			filp_close(filp[i], current->files);
++			fput(filp[i]);	/* drops the get_file() reference */
++		}
++	}
++
++	return status;
++}
++
++__be32
++spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen,
++	    struct svc_rqst *rqstp)
++{
++	if (spnfs_config)
++		return write(inode, offset, len, vlen, rqstp);
++	else {
++		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++		return nfserr_notsupp;
++	}
++}
++
++int
++spnfs_commit(void)
++{
++	return 0;
++}
++
++/*
++ * Return the state for this object.
++ * At this time simply return 0 to indicate success and use the existing state ++ */ ++int ++spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg) ++{ ++ return 0; ++} ++ ++/* ++ * Return the filehandle for the specified file descriptor ++ */ ++int ++spnfs_getfh(int fd, struct nfs_fh *fh) ++{ ++ struct file *file; ++ ++ file = fget(fd); ++ if (file == NULL) ++ return -EIO; ++ ++ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh)); ++ fput(file); ++ return 0; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-31 20:41:19.205016844 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-31 20:42:05.572091128 -0400 +@@ -242,6 +242,12 @@ struct nfs4_client { + u32 cl_cb_seq_nr; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ ++#if defined(CONFIG_PNFSD) ++ struct list_head cl_layouts; /* outstanding layouts */ ++ struct list_head cl_layoutrecalls; /* outstanding layoutrecall ++ callbacks */ ++ atomic_t cl_deviceref; /* Num outstanding devs */ ++#endif /* CONFIG_PNFSD */ + }; + + static inline void +@@ -342,12 +348,31 @@ struct nfs4_file { + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_stateids; + struct list_head fi_delegations; ++#if defined(CONFIG_PNFSD) ++ struct list_head fi_layouts; ++ struct list_head fi_layout_states; ++#endif /* CONFIG_PNFSD */ + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; ++#if defined(CONFIG_PNFSD) ++ /* used by layoutget / layoutrecall */ ++ struct nfs4_fsid fi_fsid; ++ u32 fi_fhlen; ++ u8 fi_fhval[NFS4_FHSIZE]; ++#endif /* CONFIG_PNFSD */ + }; + ++#if defined(CONFIG_PNFSD) ++/* pNFS Metadata server state */ ++ ++struct pnfs_ds_dev_entry { ++ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */ ++ u32 dd_dsid; ++}; ++#endif /* CONFIG_PNFSD 
*/ ++ + /* + * nfs4_stateid can either be an open stateid or (eventually) a lock stateid + * +@@ -370,6 +395,9 @@ struct nfs4_stateid { + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; ++#if defined(CONFIG_PNFSD) ++ struct list_head st_pnfs_ds_id; ++#endif /* CONFIG_PNFSD */ + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; +@@ -421,6 +449,34 @@ extern void nfsd4_recdir_purge_old(void) + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + extern void release_session_client(struct nfsd4_session *); ++extern void nfsd4_free_slab(struct kmem_cache **); ++extern struct nfs4_file *find_file(struct inode *); ++extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *); ++extern void put_nfs4_file(struct nfs4_file *); ++extern void get_nfs4_file(struct nfs4_file *); ++extern struct nfs4_client *find_confirmed_client(clientid_t *); ++extern struct nfs4_stateid *find_stateid(stateid_t *, int flags); ++extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *); ++extern __be32 nfs4_check_stateid(stateid_t *); ++extern void expire_client_lock(struct nfs4_client *); ++extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *); ++ ++#if defined(CONFIG_PNFSD) ++extern int nfsd4_init_pnfs_slabs(void); ++extern void nfsd4_free_pnfs_slabs(void); ++extern void pnfs_expire_client(struct nfs4_client *); ++extern void release_pnfs_ds_dev_list(struct nfs4_stateid *); ++extern void nfs4_pnfs_state_init(void); ++extern void nfs4_pnfs_state_shutdown(void); ++extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *); ++#else /* CONFIG_PNFSD */ ++static inline void nfsd4_free_pnfs_slabs(void) {} ++static inline int nfsd4_init_pnfs_slabs(void) { return 0; } ++static 
inline void pnfs_expire_client(struct nfs4_client *clp) {} ++static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {} ++static inline void nfs4_pnfs_state_shutdown(void) {} ++#endif /* CONFIG_PNFSD */ + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +@@ -434,4 +490,24 @@ nfs4_get_stateowner(struct nfs4_stateown + kref_get(&so->so_ref); + } + ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1 : NFS4_MAX_UINT64; ++} ++ + #endif /* NFSD4_STATE_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-31 20:41:17.275233561 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-31 20:42:05.573121119 -0400 +@@ -37,7 +37,12 @@ + #ifdef CONFIG_NFSD_V4 + #include + #include ++#include ++#include + #endif /* CONFIG_NFSD_V4 */ ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif + + #include "nfsd.h" + #include "vfs.h" +@@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, str + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, 0)) { ++ err = bl_layoutrecall(inode, RETURN_FILE, ++ iap->ia_size, inode->i_size - iap->ia_size); ++ } ++#endif /* CONFIG_SPNFS_BLOCK */ + } + + /* +@@ -1703,6 +1714,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru + struct inode *fdir, *tdir; + __be32 err; + int host_err; ++#ifdef CONFIG_SPNFS ++ unsigned long ino = 0; ++ unsigned long generation = 0; ++ unsigned int nlink = 0; ++#endif /* CONFIG_SPNFS */ + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) +@@ -1766,7 +1782,26 @@ nfsd_rename(struct svc_rqst *rqstp, stru + if (host_err) + goto out_dput_new; + 
++#ifdef CONFIG_SPNFS ++ /* ++ * if the target is a preexisting regular file, remember the ++ * inode number and generation so we can delete the stripes; ++ * save the link count as well so that the stripes only get ++ * get deleted when the last link is deleted ++ */ ++ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) { ++ ino = ndentry->d_inode->i_ino; ++ generation = ndentry->d_inode->i_generation; ++ nlink = ndentry->d_inode->i_nlink; ++ } ++#endif /* CONFIG_SPNFS */ ++ + host_err = vfs_rename(fdir, odentry, tdir, ndentry); ++#ifdef CONFIG_SPNFS ++ if (spnfs_enabled() && (!host_err && ino && nlink == 1)) ++ spnfs_remove(ino, generation); ++#endif /* CONFIG_SPNFS */ ++ + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) +@@ -1807,6 +1842,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + struct inode *dirp; + __be32 err; + int host_err; ++#if defined(CONFIG_SPNFS) ++ unsigned long ino; ++ unsigned long generation; ++ unsigned int nlink; ++#endif /* defined(CONFIG_SPNFS) */ + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) +@@ -1830,6 +1870,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + goto out; + } + ++#if defined(CONFIG_SPNFS) ++ /* ++ * Remember the inode number to communicate to the spnfsd ++ * for removal of stripes; save the link count as well so that ++ * the stripes only get get deleted when the last link is deleted ++ */ ++ ino = rdentry->d_inode->i_ino; ++ generation = rdentry->d_inode->i_generation; ++ nlink = rdentry->d_inode->i_nlink; ++#endif /* defined(CONFIG_SPNFS) */ ++ + if (!type) + type = rdentry->d_inode->i_mode & S_IFMT; + +@@ -1854,6 +1905,29 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + if (!host_err) + host_err = commit_metadata(fhp); + ++#if defined(CONFIG_SPNFS) ++ /* ++ * spnfs: notify spnfsd of removal to destroy stripes ++ */ ++/* ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (sb->s_export_op->spnfs_remove) { ++*/ ++ dprintk("%s check if spnfs_enabled\n", __FUNCTION__); ++ 
if (spnfs_enabled() && nlink == 1) { ++ BUG_ON(ino == 0); ++ dprintk("%s calling spnfs_remove inumber=%ld\n", ++ __FUNCTION__, ino); ++ if (spnfs_remove(ino, generation) == 0) { ++ dprintk("%s spnfs_remove success\n", __FUNCTION__); ++ } else { ++ /* XXX How do we make this atomic? */ ++ printk(KERN_WARNING "nfsd: pNFS could not " ++ "remove stripes for inode: %ld\n", ino); ++ } ++ } ++#endif /* defined(CONFIG_SPNFS) */ ++ + mnt_drop_write(fhp->fh_export->ex_path.mnt); + out_nfserr: + err = nfserrno(host_err); +diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-31 20:41:19.206170424 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-31 20:42:05.575139084 -0400 +@@ -37,6 +37,8 @@ + #ifndef _LINUX_NFSD_XDR4_H + #define _LINUX_NFSD_XDR4_H + ++#include ++ + #include "state.h" + #include "nfsd.h" + +@@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete { + u32 rca_one_fs; + }; + ++struct nfsd4_pnfs_getdevinfo { ++ struct nfsd4_pnfs_deviceid gd_devid; /* request */ ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxcount; /* request */ ++ u32 gd_notify_types;/* request */ ++ struct super_block *gd_sb; ++}; ++ ++struct nfsd4_pnfs_getdevlist { ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxdevices; /* request */ ++ u64 gd_cookie; /* request - response */ ++ u64 gd_verf; /* request - response */ ++ struct svc_fh *gd_fhp; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutget { ++ u64 lg_minlength; /* request */ ++ u32 lg_signal; /* request */ ++ u32 lg_maxcount; /* request */ ++ struct svc_fh *lg_fhp; /* request */ ++ stateid_t lg_sid; /* request/response */ ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_roc; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit { ++ struct nfsd4_pnfs_layoutcommit_arg args; ++ stateid_t lc_sid; /* request */ ++ struct nfsd4_pnfs_layoutcommit_res res; ++}; ++ ++enum layoutreturn_flags { ++ LR_FLAG_INTERN 
= 1 << 0, /* internal return */ ++ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */ ++}; ++ ++struct nfsd4_pnfs_layoutreturn { ++ struct nfsd4_pnfs_layoutreturn_arg args; ++ u32 lr_flags; ++ stateid_t lr_sid; /* request/response */ ++ u32 lrs_present; /* response */ ++}; ++ + struct nfsd4_op { + int opnum; + __be32 status; +@@ -426,6 +473,13 @@ struct nfsd4_op { + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; ++#if defined(CONFIG_PNFSD) ++ struct nfsd4_pnfs_getdevlist pnfs_getdevlist; ++ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo; ++ struct nfsd4_pnfs_layoutget pnfs_layoutget; ++ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit; ++ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn; ++#endif /* CONFIG_PNFSD */ + } u; + struct nfs4_replay * replay; + }; +diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-31 20:41:19.146161064 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-31 20:42:05.515139585 -0400 +@@ -36,6 +36,7 @@ + #include "internal.h" + #include "iostat.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_FILE + +@@ -388,12 +389,17 @@ static int nfs_write_begin(struct file * + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + ++ pnfs_update_layout(mapping->host, ++ nfs_file_open_context(file), ++ 0, NFS4_MAX_UINT64, IOMODE_RW, ++ &lseg); + start: + /* + * Prevent starvation issues if someone is doing a consistency +@@ -402,17 +408,22 @@ start: + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) +- return ret; ++ goto out;
+ + page = grab_cache_page_write_begin(mapping, index, flags); +- if (!page) +- return -ENOMEM; ++ if (!page) { ++ ret = -ENOMEM; ++ goto out; ++ } + *pagep = page; + +- ret = nfs_flush_incompatible(file, page); ++ ret = nfs_flush_incompatible(file, page, lseg); + if (ret) { + unlock_page(page); + page_cache_release(page); ++ *pagep = NULL; ++ *fsdata = NULL; ++ goto out; + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; +@@ -421,6 +432,12 @@ start: + if (!ret) + goto start; + } ++ if (!ret) ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); ++ out: ++ if (ret) { ++ put_lseg(lseg); ++ *fsdata = NULL; ++ } + return ret; + } + +@@ -430,6 +447,7 @@ static int nfs_write_end(struct file *fi + { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, +@@ -456,10 +474,17 @@ static int nfs_write_end(struct file *fi + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + +- status = nfs_updatepage(file, page, offset, copied); ++ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); ++ status = pnfs_write_end(file, page, pos, len, copied, lseg); ++ if (status) ++ goto out; ++ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); + ++ out: + unlock_page(page); + page_cache_release(page); ++ pnfs_write_end_cleanup(file, fsdata); ++ put_lseg(lseg); + + if (status < 0) + return status; +@@ -570,6 +595,8 @@ static int nfs_vm_page_mkwrite(struct vm + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + ++ /* XXX Do we want to call pnfs_update_layout here?
*/ ++ + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) +@@ -580,11 +607,11 @@ static int nfs_vm_page_mkwrite(struct vm + if (pagelen == 0) + goto out_unlock; + +- ret = nfs_flush_incompatible(filp, page); ++ ret = nfs_flush_incompatible(filp, page, NULL); + if (ret != 0) + goto out_unlock; + +- ret = nfs_updatepage(filp, page, 0, pagelen); ++ ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL); + out_unlock: + if (!ret) + return VM_FAULT_LOCKED; +diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-31 20:41:19.149170418 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-31 20:42:05.516222809 -0400 +@@ -48,6 +48,7 @@ + #include "internal.h" + #include "fscache.h" + #include "dns_resolve.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -278,7 +279,7 @@ nfs_fhget(struct super_block *sb, struct + */ + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { +- inode->i_fop = &nfs_file_operations; ++ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; + inode->i_data.a_ops = &nfs_file_aops; + inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; + } else if (S_ISDIR(inode->i_mode)) { +@@ -530,6 +531,68 @@ out: + return err; + } + ++static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ atomic_set(&l_ctx->count, 1); ++ l_ctx->lockowner = current->files; ++ l_ctx->pid = current->tgid; ++ INIT_LIST_HEAD(&l_ctx->list); ++} ++ ++static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) ++{ ++ struct nfs_lock_context *pos; ++ ++ list_for_each_entry(pos, &ctx->lock_context.list, list) { ++ if (pos->lockowner != current->files) ++ continue; ++ if (pos->pid != current->tgid) ++ continue; ++ atomic_inc(&pos->count); ++ return pos; ++ } ++ return NULL; ++} ++ ++struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context 
*ctx) ++{ ++ struct nfs_lock_context *res, *new = NULL; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ spin_unlock(&inode->i_lock); ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (new == NULL) ++ return NULL; ++ nfs_init_lock_context(new); ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ list_add_tail(&new->list, &ctx->lock_context.list); ++ new->open_context = ctx; ++ res = new; ++ new = NULL; ++ } ++ } ++ spin_unlock(&inode->i_lock); ++ kfree(new); ++ return res; ++} ++ ++void nfs_put_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ struct nfs_open_context *ctx = l_ctx->open_context; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) ++ return; ++ list_del(&l_ctx->list); ++ spin_unlock(&inode->i_lock); ++ kfree(l_ctx); ++} ++ + /** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context +@@ -566,11 +629,11 @@ static struct nfs_open_context *alloc_nf + path_get(&ctx->path); + ctx->cred = get_rpccred(cred); + ctx->state = NULL; +- ctx->lockowner = current->files; + ctx->flags = 0; + ctx->error = 0; + ctx->dir_cookie = 0; +- atomic_set(&ctx->count, 1); ++ nfs_init_lock_context(&ctx->lock_context); ++ ctx->lock_context.open_context = ctx; + } + return ctx; + } +@@ -578,15 +641,16 @@ static struct nfs_open_context *alloc_nf + struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) + { + if (ctx != NULL) +- atomic_inc(&ctx->count); ++ atomic_inc(&ctx->lock_context.count); + return ctx; + } ++EXPORT_SYMBOL(get_nfs_open_context); + + static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) + { + struct inode *inode = ctx->path.dentry->d_inode; + +- if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) ++ if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; 
+ list_del(&ctx->list); + spin_unlock(&inode->i_lock); +@@ -933,6 +997,7 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + } ++EXPORT_SYMBOL(nfs_fattr_init); + + struct nfs_fattr *nfs_alloc_fattr(void) + { +@@ -1142,6 +1207,14 @@ static int nfs_update_inode(struct inode + server->fsid = fattr->fsid; + + /* ++ * file needs layout commit, server attributes may be stale ++ */ ++ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) { ++ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n", ++ __func__, inode->i_sb->s_id, inode->i_ino); ++ return 0; ++ } ++ /* + * Update the read time so we don't revalidate too often. + */ + nfsi->read_cache_jiffies = fattr->time_start; +@@ -1340,9 +1413,10 @@ static int nfs_update_inode(struct inode + */ + void nfs4_clear_inode(struct inode *inode) + { ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + /* If we are holding a delegation, return it! 
*/ + nfs_inode_return_delegation_noreclaim(inode); +- /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); + } + #endif +@@ -1367,7 +1441,10 @@ struct inode *nfs_alloc_inode(struct sup + + void nfs_destroy_inode(struct inode *inode) + { +- kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); ++ struct nfs_inode *nfsi = NFS_I(inode); ++ ++ pnfs_destroy_layout(nfsi); ++ kmem_cache_free(nfs_inode_cachep, nfsi); + } + + static inline void nfs4_init_once(struct nfs_inode *nfsi) +@@ -1377,6 +1454,11 @@ static inline void nfs4_init_once(struct + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); ++#ifdef CONFIG_NFS_V4_1 ++ init_waitqueue_head(&nfsi->lo_waitq); ++ nfsi->pnfs_layout_suspend = 0; ++ nfsi->layout = NULL; ++#endif /* CONFIG_NFS_V4_1 */ + #endif + } + +@@ -1488,6 +1570,12 @@ static int __init init_nfs_fs(void) + if (err) + goto out0; + ++#ifdef CONFIG_NFS_V4_1 ++ err = pnfs_initialize(); ++ if (err) ++ goto out00; ++#endif /* CONFIG_NFS_V4_1 */ ++ + #ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); + #endif +@@ -1498,6 +1586,10 @@ out: + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++out00: ++ pnfs_uninitialize(); ++#endif /* CONFIG_NFS_V4_1 */ + nfs_destroy_directcache(); + out0: + nfs_destroy_writepagecache(); +@@ -1531,6 +1623,9 @@ static void __exit exit_nfs_fs(void) + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_uninitialize(); ++#endif + unregister_nfs_fs(); + nfs_fs_proc_exit(); + nfsiod_stop(); +diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-31 20:41:19.149170418 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-31 20:42:05.517099944 -0400 +@@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv + struct nfs_fattr *); + extern void nfs_mark_client_ready(struct nfs_client *clp, int state); + extern 
int nfs4_check_client_ready(struct nfs_client *clp); ++extern int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++ const struct sockaddr *sa2); ++extern int nfs4_set_client(struct nfs_server *server, ++ const char *hostname, ++ const struct sockaddr *addr, ++ const size_t addrlen, ++ const char *ip_addr, ++ rpc_authflavor_t authflavour, ++ int proto, const struct rpc_timeout *timeparms, ++ u32 minorversion); + #ifdef CONFIG_PROC_FS + extern int __init nfs_fs_proc_init(void); + extern void nfs_fs_proc_exit(void); +@@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead + extern struct rpc_procinfo nfs4_procedures[]; + #endif + ++extern int nfs4_recover_expired_lease(struct nfs_client *clp); ++ + /* proc.c */ + void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + +@@ -248,10 +260,31 @@ extern int nfs4_get_rootfh(struct nfs_se + #endif + + /* read.c */ ++extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); ++extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); + extern void nfs_read_prepare(struct rpc_task *task, void *calldata); + + /* write.c */ ++extern int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs); + extern void nfs_write_prepare(struct rpc_task *task, void *calldata); ++extern void nfs_mark_list_commit(struct list_head *head); + #ifdef CONFIG_MIGRATION + extern int nfs_migrate_page(struct 
address_space *, + struct page *, struct page *); +diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig +--- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-31 20:42:05.500123860 -0400 +@@ -79,10 +79,48 @@ config NFS_V4_1 + depends on NFS_V4 && EXPERIMENTAL + help + This option enables support for minor version 1 of the NFSv4 protocol +- (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. ++ (RFC5661) including support for the parallel NFS (pNFS) features ++ in the kernel's NFS client. + + Unless you're an NFS developer, say N. + ++config PNFS_FILE_LAYOUT ++ tristate "NFS client support for the pNFS nfs-files layout (DEVELOPER ONLY)" ++ depends on NFS_FS && NFS_V4_1 ++ default y ++ help ++ This option enables support for the pNFS nfs-files layout. ++ ++ Unless you're an NFS developer, say N. ++ ++config PNFS_OBJLAYOUT ++ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD ++ help ++ Say M here if you want your pNFS client to support the Objects Layout Driver. ++ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and ++ upper level driver (SCSI_OSD_ULD). ++ ++ If unsure, say N. ++ ++config PNFS_PANLAYOUT ++ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on PNFS_OBJLAYOUT ++ help ++ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver. ++ ++ If unsure, say N. ++ ++config PNFS_BLOCK ++ tristate "Provide a pNFS block client (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 ++ select MD ++ select BLK_DEV_DM ++ help ++ Say M or y here if you want your pNfs client to support the block protocol ++ ++ If unsure, say N. 
++ + config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP +diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile +--- linux-2.6.34.noarch/fs/nfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-31 20:42:05.501268752 -0400 +@@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x + delegation.o idmap.o \ + callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o ++nfs-$(CONFIG_NFS_V4_1) += pnfs.o + nfs-$(CONFIG_SYSCTL) += sysctl.o + nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o ++ ++obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o ++nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o ++ ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ ++obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-31 20:41:19.152180625 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-31 20:42:05.518232887 -0400 +@@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs3_dir_inode_operations, + .file_inode_ops = &nfs3_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-31 20:42:05.519163219 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-31 20:42:05.520222923 -0400 +@@ -0,0 +1,768 @@ ++/* ++ * linux/fs/nfs/nfs4filelayout.c ++ * ++ * Module for the pnfs nfs4 file layout driver. ++ * Defines all I/O and Policy interface operations, plus code ++ * to register itself with the pNFS client. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. 
++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfs4filelayout.h" ++#include "nfs4_fs.h" ++#include "internal.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Dean Hildebrand "); ++MODULE_DESCRIPTION("The NFSv4 file layout driver"); ++ ++/* Callback operations to the pNFS client */ ++struct pnfs_client_operations *pnfs_callback_ops; ++ ++/* Forward declaration */ ++struct layoutdriver_io_operations filelayout_io_operations; ++ ++int ++filelayout_initialize_mountpoint(struct nfs_server *nfss, ++ const struct nfs_fh *mntfh) ++{ ++ int status = nfs4_alloc_init_deviceid_cache(nfss->nfs_client, ++ nfs4_fl_free_deviceid_callback); ++ if (status) { ++ printk(KERN_WARNING "%s: deviceid cache could not be " ++ "initialized\n", __func__); ++ return status; ++ } ++ dprintk("%s: deviceid cache has been initialized successfully\n", ++ __func__); ++ return 0; ++} ++ ++/* Uninitialize a mountpoint by destroying its device list */ ++int ++filelayout_uninitialize_mountpoint(struct nfs_server *nfss) ++{ ++ dprintk("--> %s\n", __func__); ++ ++ if (nfss->pnfs_curr_ld && nfss->nfs_client->cl_devid_cache) ++ nfs4_put_deviceid_cache(nfss->nfs_client); ++ return 0; ++} ++ ++/* This function is used by the layout driver to calculate the ++ * offset of the file on the dserver based on whether the ++ * layout type is STRIPE_DENSE or STRIPE_SPARSE ++ */ ++static loff_t ++filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ ++ switch (flseg->stripe_type) { ++ case STRIPE_SPARSE: ++ return offset; ++ ++ case STRIPE_DENSE: ++ { ++ u32 stripe_width; ++ u64 tmp, off; ++ u32 unit = flseg->stripe_unit; ++ ++ stripe_width = unit * FILE_DSADDR(lseg)->stripe_count; ++ tmp = off = offset - flseg->pattern_offset; ++ do_div(tmp, 
stripe_width); ++ return tmp * unit + do_div(off, unit); ++ } ++ default: ++ BUG(); ++ } ++ ++ /* We should never get here... just to stop the gcc warning */ ++ return 0; ++} ++ ++/* ++ * Call ops for the async read/write cases ++ * In the case of dense layouts, the offset needs to be reset to its ++ * original value. ++ */ ++static void filelayout_read_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ if (rdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ rdata->args.offset, rdata->fldata.orig_offset); ++ rdata->args.offset = rdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ rdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_read_release(void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ put_lseg(rdata->pdata.lseg); ++ rdata->pdata.lseg = NULL; ++ rdata->pdata.call_ops->rpc_release(data); ++} ++ ++static void filelayout_write_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ if (wdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ wdata->args.offset, wdata->fldata.orig_offset); ++ wdata->args.offset = wdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_write_release(void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ put_lseg(wdata->pdata.lseg); ++ wdata->pdata.lseg = NULL; ++ wdata->pdata.call_ops->rpc_release(data); ++} ++ ++struct rpc_call_ops filelayout_read_call_ops = { ++ .rpc_call_prepare = nfs_read_prepare, ++ .rpc_call_done = filelayout_read_call_done, ++ .rpc_release = filelayout_read_release, ++}; ++ ++struct rpc_call_ops filelayout_write_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ 
.rpc_call_done = filelayout_write_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* Perform sync or async reads. ++ * ++ * An optimization for the NFS file layout driver ++ * allows the original read/write data structs to be passed in the ++ * last argument. ++ * ++ * TODO: join with write_pagelist? ++ */ ++static enum pnfs_try_status ++filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n", ++ __func__, data->inode->i_ino, nr_pages, ++ data->args.pgbase, (size_t)data->args.count, offset); ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s USE DS:ip %x %s\n", __func__, ++ htonl(ds->ds_ip_addr), ds->r_addr); ++ ++ /* just try the first data server for the index..*/ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ ++ /* ++ * Now get the file offset on the dserver ++ * Set the read offset to this offset, and ++ * save the original offset in orig_offset ++ * In the case of aync reads, the offset will be reset in the ++ * call_ops->rpc_call_done() routine. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* Perform an asynchronous read */ ++ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_read_call_ops); ++ ++ data->pdata.pnfs_error = 0; ++ ++ return PNFS_ATTEMPTED; ++} ++ ++/* Perform async writes. 
*/ ++static enum pnfs_try_status ++filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu %s\n", __func__, ++ data->inode->i_ino, sync, (size_t) data->args.count, offset, ++ htonl(ds->ds_ip_addr), ntohs(ds->ds_port), ds->r_addr); ++ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ /* ++ * Get the file offset on the dserver. Set the write offset to ++ * this offset and save the original offset. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* ++ * Perform an asynchronous write The offset will be reset in the ++ * call_ops->rpc_call_done() routine ++ */ ++ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_write_call_ops, sync); ++ ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++} ++ ++/* ++ * Create a filelayout layout structure and return it. The pNFS client ++ * will use the pnfs_layout_hdr type to refer to the layout for this ++ * inode from now on. ++ */ ++static struct pnfs_layout_hdr * ++filelayout_alloc_layout(struct inode *inode) ++{ ++ struct nfs4_filelayout *flp; ++ ++ dprintk("NFS_FILELAYOUT: allocating layout\n"); ++ flp = kzalloc(sizeof(struct nfs4_filelayout), GFP_KERNEL); ++ return flp ? 
&flp->fl_layout : NULL; ++} ++ ++/* Free a filelayout layout structure */ ++static void ++filelayout_free_layout(struct pnfs_layout_hdr *lo) ++{ ++ dprintk("NFS_FILELAYOUT: freeing layout\n"); ++ kfree(FILE_LO(lo)); ++} ++ ++/* ++ * filelayout_check_layout() ++ * ++ * Make sure layout segment parameters are sane WRT the device. ++ * ++ * Notes: ++ * 1) current code insists that # stripe index = # data servers in ds_list ++ * which is wrong. ++ * 2) pattern_offset is ignored and must == 0 which is wrong; ++ * 3) the pattern_offset needs to be a multiple of the stripe unit. ++ * 4) stripe unit is multiple of page size ++ */ ++ ++static int ++filelayout_check_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs4_filelayout_segment *fl = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ int status = -EINVAL; ++ struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo)); ++ ++ dprintk("--> %s\n", __func__); ++ /* find in list or get from server and reference the deviceid */ ++ dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, &fl->dev_id); ++ if (dsaddr == NULL) { ++ dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id); ++ if (dsaddr == NULL) { ++ dprintk("%s NO device for dev_id %s\n", ++ __func__, deviceid_fmt(&fl->dev_id)); ++ goto out; ++ } ++ } ++ if (fl->first_stripe_index < 0 || ++ fl->first_stripe_index >= dsaddr->stripe_count) { ++ dprintk("%s Bad first_stripe_index %d\n", ++ __func__, fl->first_stripe_index); ++ goto out_put; ++ } ++ ++ if (fl->pattern_offset != 0) { ++ dprintk("%s Unsupported no-zero pattern_offset %Ld\n", ++ __func__, fl->pattern_offset); ++ goto out_put; ++ } ++ ++ if (fl->stripe_unit % PAGE_SIZE) { ++ dprintk("%s Stripe unit (%u) not page aligned\n", ++ __func__, fl->stripe_unit); ++ goto out_put; ++ } ++ ++ /* XXX only support SPARSE packing.
Don't support use MDS open fh */ ++ if (!(fl->num_fh == 1 || fl->num_fh == dsaddr->ds_num)) { ++ dprintk("%s num_fh %u not equal to 1 or ds_num %u\n", ++ __func__, fl->num_fh, dsaddr->ds_num); ++ goto out_put; ++ } ++ ++ if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { ++ dprintk("%s Stripe unit (%u) not aligned with rsize %u " ++ "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, ++ nfss->wsize); ++ } ++ ++ nfs4_set_layout_deviceid(lseg, &dsaddr->deviceid); ++ ++ status = 0; ++out: ++ dprintk("--> %s returns %d\n", __func__, status); ++ return status; ++out_put: ++ nfs4_put_unset_layout_deviceid(lseg, &dsaddr->deviceid, ++ nfs4_fl_free_deviceid_callback); ++ goto out; ++} ++ ++static void _filelayout_free_lseg(struct pnfs_layout_segment *lseg); ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl); ++ ++/* Decode layout and store in layoutid. Overwrite any existing layout ++ * information for this file. ++ */ ++static int ++filelayout_set_layout(struct nfs4_filelayout *flo, ++ struct nfs4_filelayout_segment *fl, ++ struct nfs4_layoutget_res *lgr) ++{ ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t nfl_util; ++ int i; ++ ++ dprintk("%s: set_layout_map Begin\n", __func__); ++ ++ memcpy(&fl->dev_id, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ nfl_util = be32_to_cpup(p++); ++ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) ++ fl->commit_through_mds = 1; ++ if (nfl_util & NFL4_UFLG_DENSE) ++ fl->stripe_type = STRIPE_DENSE; ++ else ++ fl->stripe_type = STRIPE_SPARSE; ++ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; ++ ++ if (!flo->stripe_unit) ++ flo->stripe_unit = fl->stripe_unit; ++ else if (flo->stripe_unit != fl->stripe_unit) { ++ printk(KERN_NOTICE "%s: updating strip_unit from %u to %u\n", ++ __func__, flo->stripe_unit, fl->stripe_unit); ++ flo->stripe_unit = fl->stripe_unit; ++ } ++ ++ fl->first_stripe_index = be32_to_cpup(p++); ++ p = xdr_decode_hyper(p, &fl->pattern_offset); ++ 
fl->num_fh = be32_to_cpup(p++); ++ ++ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu dev_id %s\n", ++ __func__, nfl_util, fl->num_fh, fl->first_stripe_index, ++ fl->pattern_offset, deviceid_fmt(&fl->dev_id)); ++ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) { ++ fl->fh_array = vmalloc(fl->num_fh * sizeof(struct nfs_fh)); ++ if (fl->fh_array) ++ memset(fl->fh_array, 0, ++ fl->num_fh * sizeof(struct nfs_fh)); ++ } else { ++ fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh), ++ GFP_KERNEL); ++ } ++ if (!fl->fh_array) ++ return -ENOMEM; ++ ++ for (i = 0; i < fl->num_fh; i++) { ++ /* fh: length word, then handle bytes; length must fit in data[] */ ++ fl->fh_array[i].size = be32_to_cpup(p++); ++ if (sizeof(fl->fh_array[i].data) < fl->fh_array[i].size) { ++ printk(KERN_ERR "Too big fh %d received %d\n", ++ i, fl->fh_array[i].size); ++ /* Layout is now invalid, pretend it doesn't exist */ ++ filelayout_free_fh_array(fl); ++ fl->num_fh = 0; ++ break; ++ } ++ memcpy(fl->fh_array[i].data, p, fl->fh_array[i].size); ++ p += XDR_QUADLEN(fl->fh_array[i].size); ++ dprintk("DEBUG: %s: fh len %d\n", __func__, ++ fl->fh_array[i].size); ++ } ++ ++ return 0; ++} ++ ++static struct pnfs_layout_segment * ++filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(layoutid); ++ struct pnfs_layout_segment *lseg; ++ int rc; ++ ++ dprintk("--> %s\n", __func__); ++ lseg = kzalloc(sizeof(struct pnfs_layout_segment) + ++ sizeof(struct nfs4_filelayout_segment), GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ ++ rc = filelayout_set_layout(flo, LSEG_LD_DATA(lseg), lgr); ++ ++ if (rc != 0 || filelayout_check_layout(layoutid, lseg)) { ++ _filelayout_free_lseg(lseg); ++ lseg = NULL; ++ } ++ return lseg; ++} ++ ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) ++{ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) ++ vfree(fl->fh_array); ++ else ++ kfree(fl->fh_array); ++ ++ fl->fh_array = NULL; ++} ++ ++static void
++_filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ filelayout_free_fh_array(LSEG_LD_DATA(lseg)); ++ kfree(lseg); ++} ++ ++static void ++filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("--> %s\n", __func__); ++ nfs4_put_unset_layout_deviceid(lseg, lseg->deviceid, ++ nfs4_fl_free_deviceid_callback); ++ _filelayout_free_lseg(lseg); ++} ++ ++/* Allocate and initialize a clone of @old; returns NULL if allocation fails */ ++static struct nfs_write_data * ++filelayout_clone_write_data(struct nfs_write_data *old) ++{ ++ struct nfs_write_data *new; ++ ++ new = nfs_commitdata_alloc(); ++ if (!new) ++ goto out; ++ kref_init(&new->refcount); ++ new->parent = old; ++ kref_get(&old->refcount); ++ new->inode = old->inode; ++ new->cred = old->cred; ++ new->args.offset = 0; ++ new->args.count = 0; ++ new->res.count = 0; ++ new->res.fattr = &new->fattr; ++ nfs_fattr_init(&new->fattr); ++ new->res.verf = &new->verf; ++ new->args.context = get_nfs_open_context(old->args.context); ++ new->pdata.lseg = NULL; ++ new->pdata.call_ops = old->pdata.call_ops; ++ new->pdata.how = old->pdata.how; ++out: ++ return new; ++} ++ ++static void filelayout_commit_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static struct rpc_call_ops filelayout_commit_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ .rpc_call_done = filelayout_commit_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* ++ * Execute a COMMIT op to the MDS or to each data server on which a page ++ * in 'pages' exists. ++ * Invoke the pnfs_commit_complete callback.
++ */ ++enum pnfs_try_status ++filelayout_commit(struct nfs_write_data *data, int sync) ++{ ++ LIST_HEAD(head); ++ struct nfs_page *req; ++ loff_t file_offset = 0; ++ u16 idx, i; ++ struct list_head **ds_page_list = NULL; ++ u16 *indices_used; ++ int num_indices_seen = 0; ++ const struct rpc_call_ops *call_ops; ++ struct rpc_clnt *clnt; ++ struct nfs_write_data **clone_list = NULL; ++ struct nfs_write_data *dsdata; ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("%s data %p sync %d\n", __func__, data, sync); ++ ++ /* Alloc room for both in one go */ ++ ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) * ++ (sizeof(u16) + sizeof(struct list_head *)), ++ GFP_KERNEL); ++ if (!ds_page_list) ++ goto mem_error; ++ indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1); ++ /* ++ * Sort pages based on which ds to send to. ++ * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT. ++ * Note we are assuming there is only a single lseg in play. ++ * When that is not true, we could first sort on lseg, then ++ * sort within each as we do here. 
++ */ ++ while (!list_empty(&data->pages)) { ++ req = nfs_list_entry(data->pages.next); ++ nfs_list_remove_request(req); ++ if (!req->wb_lseg || ++ ((struct nfs4_filelayout_segment *) ++ LSEG_LD_DATA(req->wb_lseg))->commit_through_mds) ++ idx = NFS4_PNFS_MAX_MULTI_CNT; ++ else { ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset); ++ } ++ if (ds_page_list[idx]) { ++ /* Already seen this idx */ ++ list_add(&req->wb_list, ds_page_list[idx]); ++ } else { ++ /* New idx not seen so far */ ++ list_add_tail(&req->wb_list, &head); ++ indices_used[num_indices_seen++] = idx; ++ } ++ ds_page_list[idx] = &req->wb_list; ++ } ++ /* Once created, clone must be released via call_op */ ++ clone_list = kzalloc(num_indices_seen * ++ sizeof(struct nfs_write_data *), GFP_KERNEL); ++ if (!clone_list) ++ goto mem_error; ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ clone_list[i] = filelayout_clone_write_data(data); ++ if (!clone_list[i]) ++ goto mem_error; ++ } ++ clone_list[i] = data; ++ /* ++ * Now send off the RPCs to each ds. Note that it is important ++ * that any RPC to the MDS be sent last (or at least after all ++ * clones have been made.) 
++ */ ++ for (i = 0; i < num_indices_seen; i++) { ++ dsdata = clone_list[i]; ++ idx = indices_used[i]; ++ list_cut_position(&dsdata->pages, &head, ds_page_list[idx]); ++ if (idx == NFS4_PNFS_MAX_MULTI_CNT) { ++ call_ops = data->pdata.call_ops;; ++ clnt = NFS_CLIENT(dsdata->inode); ++ ds = NULL; ++ } else { ++ struct nfs_fh *fh; ++ ++ call_ops = &filelayout_commit_call_ops; ++ req = nfs_list_entry(dsdata->pages.next); ++ ds = nfs4_fl_prepare_ds(req->wb_lseg, idx); ++ if (!ds) { ++ /* Trigger retry of this chunk through MDS */ ++ dsdata->task.tk_status = -EIO; ++ data->pdata.call_ops->rpc_release(dsdata); ++ continue; ++ } ++ clnt = ds->ds_clp->cl_rpcclient; ++ dsdata->fldata.ds_nfs_client = ds->ds_clp; ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset); ++ if (fh) ++ dsdata->args.fh = fh; ++ } ++ dprintk("%s: Initiating commit: %llu USE DS:\n", ++ __func__, file_offset); ++ print_ds(ds); ++ ++ /* Send COMMIT to data server */ ++ nfs_initiate_commit(dsdata, clnt, call_ops, sync); ++ } ++ kfree(clone_list); ++ kfree(ds_page_list); ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++ ++ mem_error: ++ if (clone_list) { ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ if (!clone_list[i]) ++ break; ++ data->pdata.call_ops->rpc_release(clone_list[i]); ++ } ++ kfree(clone_list); ++ } ++ kfree(ds_page_list); ++ /* One of these will be empty, but doesn't hurt to do both */ ++ nfs_mark_list_commit(&head); ++ nfs_mark_list_commit(&data->pages); ++ data->pdata.call_ops->rpc_release(data); ++ return PNFS_ATTEMPTED; ++} ++ ++/* Return the stripesize for the specified file */ ++ssize_t ++filelayout_get_stripesize(struct pnfs_layout_hdr *lo) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(lo); ++ ++ return flo->stripe_unit; ++} ++ ++/* ++ * filelayout_pg_test(). 
Called by nfs_can_coalesce_requests() ++ * ++ * return 1 : coalesce page ++ * return 0 : don't coalesce page ++ */ ++int ++filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ u64 p_stripe, r_stripe; ++ ++ if (pgio->pg_boundary == 0) ++ return 1; ++ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; ++ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; ++ ++ do_div(p_stripe, pgio->pg_boundary); ++ do_div(r_stripe, pgio->pg_boundary); ++ ++ return (p_stripe == r_stripe); ++} ++ ++struct layoutdriver_io_operations filelayout_io_operations = { ++ .commit = filelayout_commit, ++ .read_pagelist = filelayout_read_pagelist, ++ .write_pagelist = filelayout_write_pagelist, ++ .alloc_layout = filelayout_alloc_layout, ++ .free_layout = filelayout_free_layout, ++ .alloc_lseg = filelayout_alloc_lseg, ++ .free_lseg = filelayout_free_lseg, ++ .initialize_mountpoint = filelayout_initialize_mountpoint, ++ .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, ++}; ++ ++struct layoutdriver_policy_operations filelayout_policy_operations = { ++ .flags = PNFS_USE_RPC_CODE, ++ .get_stripesize = filelayout_get_stripesize, ++ .pg_test = filelayout_pg_test, ++}; ++ ++struct pnfs_layoutdriver_type filelayout_type = { ++ .id = LAYOUT_NFSV4_1_FILES, ++ .name = "LAYOUT_NFSV4_1_FILES", ++ .ld_io_ops = &filelayout_io_operations, ++ .ld_policy_ops = &filelayout_policy_operations, ++}; ++ ++static int __init nfs4filelayout_init(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", ++ __func__); ++ ++ /* ++ * Need to register file_operations struct with global list to indicate ++ * that NFS4 file layout is a possible pNFS I/O module ++ */ ++ pnfs_callback_ops = pnfs_register_layoutdriver(&filelayout_type); ++ ++ return 0; ++} ++ ++static void __exit nfs4filelayout_exit(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", ++ __func__); ++ ++ /* Unregister NFS4 file layout driver 
with pNFS client*/ ++ pnfs_unregister_layoutdriver(&filelayout_type); ++} ++ ++module_init(nfs4filelayout_init); ++module_exit(nfs4filelayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-31 20:42:05.521233147 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-31 20:42:05.521233147 -0400 +@@ -0,0 +1,635 @@ ++/* ++ * linux/fs/nfs/nfs4filelayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 file layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * Garth Goodson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include "nfs4filelayout.h" ++#include "internal.h" ++#include "nfs4_fs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++DEFINE_SPINLOCK(nfs4_ds_cache_lock); ++static LIST_HEAD(nfs4_data_server_cache); ++ ++void ++print_ds(struct nfs4_pnfs_ds *ds) ++{ ++ if (ds == NULL) { ++ dprintk("%s NULL device \n", __func__); ++ return; ++ } ++ dprintk(" ip_addr %x\n", ntohl(ds->ds_ip_addr)); ++ dprintk(" port %hu\n", ntohs(ds->ds_port)); ++ dprintk(" client %p\n", ds->ds_clp); ++ dprintk(" ref count %d\n", atomic_read(&ds->ds_count)); ++ if (ds->ds_clp) ++ dprintk(" cl_exchange_flags %x\n", ++ ds->ds_clp->cl_exchange_flags); ++ dprintk(" ip:port %s\n", ds->r_addr); ++} ++ ++void ++print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ int i; ++ ++ dprintk("%s dsaddr->ds_num %d\n", __func__, ++ dsaddr->ds_num); ++ for (i = 0; i < dsaddr->ds_num; i++) ++ print_ds(dsaddr->ds_list[i]); ++} ++ ++/* Debugging function assuming a 64bit major/minor split of the deviceid */ ++char * ++deviceid_fmt(const struct pnfs_deviceid *dev_id) ++{ ++ static char buf[17]; ++ uint32_t *p = (uint32_t *)dev_id->data; ++ uint64_t major, minor; ++ ++ p = xdr_decode_hyper(p, &major); ++ p = xdr_decode_hyper(p, &minor); ++ ++ sprintf(buf, "%08llu %08llu", major, minor); ++ return buf; ++} ++ ++/* nfs4_ds_cache_lock is held */ ++static inline struct 
nfs4_pnfs_ds * ++_data_server_lookup(u32 ip_addr, u32 port) ++{ ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", ++ ntohl(ip_addr), ntohs(port)); ++ ++ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { ++ if (ds->ds_ip_addr == ip_addr && ++ ds->ds_port == port) { ++ return ds; ++ } ++ } ++ return NULL; ++} ++ ++/* Create an rpc to the data server defined in 'dev_list' */ ++static int ++nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) ++{ ++ struct nfs_server *tmp; ++ struct sockaddr_in sin; ++ struct rpc_clnt *mds_clnt = mds_srv->client; ++ struct nfs_client *clp = mds_srv->nfs_client; ++ struct sockaddr *mds_addr; ++ int err = 0; ++ ++ dprintk("--> %s ip:port %s au_flavor %d\n", __func__, ++ ds->r_addr, mds_clnt->cl_auth->au_flavor); ++ ++ sin.sin_family = AF_INET; ++ sin.sin_addr.s_addr = ds->ds_ip_addr; ++ sin.sin_port = ds->ds_port; ++ ++ /* ++ * If this DS is also the MDS, use the MDS session only if the ++ * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role. ++ */ ++ mds_addr = (struct sockaddr *)&clp->cl_addr; ++ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) { ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data " ++ "Server\n", ds->r_addr); ++ err = -ENODEV; ++ } else { ++ atomic_inc(&clp->cl_count); ++ ds->ds_clp = clp; ++ dprintk("%s Using MDS Session for DS\n", __func__); ++ } ++ goto out; ++ } ++ ++ /* Temporay server for nfs4_set_client */ ++ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); ++ if (!tmp) ++ goto out; ++ ++ /* ++ * Set a retrans, timeout interval, and authflavor equual to the MDS ++ * values. Use the MDS nfs_client cl_ipaddr field so as to use the ++ * same co_ownerid as the MDS. 
++ */ ++ err = nfs4_set_client(tmp, ++ mds_srv->nfs_client->cl_hostname, ++ (struct sockaddr *)&sin, ++ sizeof(struct sockaddr), ++ mds_srv->nfs_client->cl_ipaddr, ++ mds_clnt->cl_auth->au_flavor, ++ IPPROTO_TCP, ++ mds_clnt->cl_xprt->timeout, ++ 1 /* minorversion */); ++ if (err < 0) ++ goto out_free; ++ ++ clp = tmp->nfs_client; ++ ++ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */ ++ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp); ++ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS; ++ ++ err = nfs4_recover_expired_lease(clp); ++ if (!err) ++ err = nfs4_check_client_ready(clp); ++ if (err) ++ goto out_put; ++ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data Server\n", ++ ds->r_addr); ++ err = -ENODEV; ++ goto out_put; ++ } ++ /* ++ * Mask the (possibly) returned EXCHGID4_FLAG_USE_PNFS_MDS pNFS role ++ * The is_ds_only_session depends on this. ++ */ ++ clp->cl_exchange_flags &= ~EXCHGID4_FLAG_USE_PNFS_MDS; ++ /* ++ * Set DS lease equal to the MDS lease, renewal is scheduled in ++ * create_session ++ */ ++ spin_lock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; ++ spin_unlock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_last_renewal = jiffies; ++ ++ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); ++ ds->ds_clp = clp; ++ ++ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), ++ clp->cl_rpcclient); ++out_free: ++ kfree(tmp); ++out: ++ dprintk("%s Returns %d\n", __func__, err); ++ return err; ++out_put: ++ nfs_put_client(clp); ++ goto out_free; ++} ++ ++static void ++destroy_ds(struct nfs4_pnfs_ds *ds) ++{ ++ dprintk("--> %s\n", __func__); ++ print_ds(ds); ++ ++ if (ds->ds_clp) ++ nfs_put_client(ds->ds_clp); ++ kfree(ds); ++} ++ ++static void ++nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ struct nfs4_pnfs_ds *ds; ++ int i; ++ ++ dprintk("%s: device id=%s\n", 
__func__, ++ deviceid_fmt(&dsaddr->deviceid.de_id)); ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ ds = dsaddr->ds_list[i]; ++ if (ds != NULL) { ++ if (atomic_dec_and_lock(&ds->ds_count, ++ &nfs4_ds_cache_lock)) { ++ list_del_init(&ds->ds_node); ++ spin_unlock(&nfs4_ds_cache_lock); ++ destroy_ds(ds); ++ } ++ } ++ } ++ kfree(dsaddr->stripe_indices); ++ kfree(dsaddr); ++} ++ ++void ++nfs4_fl_free_deviceid_callback(struct kref *kref) ++{ ++ struct nfs4_deviceid *device = ++ container_of(kref, struct nfs4_deviceid, de_kref); ++ struct nfs4_file_layout_dsaddr *dsaddr = ++ container_of(device, struct nfs4_file_layout_dsaddr, deviceid); ++ ++ nfs4_fl_free_deviceid(dsaddr); ++} ++ ++static void ++nfs4_pnfs_ds_add(struct inode *inode, struct nfs4_pnfs_ds **dsp, ++ u32 ip_addr, u32 port, char *r_addr, int len) ++{ ++ struct nfs4_pnfs_ds *tmp_ds, *ds; ++ ++ *dsp = NULL; ++ ++ ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); ++ if (!ds) ++ return; ++ ++ spin_lock(&nfs4_ds_cache_lock); ++ tmp_ds = _data_server_lookup(ip_addr, port); ++ if (tmp_ds == NULL) { ++ ds->ds_ip_addr = ip_addr; ++ ds->ds_port = port; ++ strncpy(ds->r_addr, r_addr, len); ++ atomic_set(&ds->ds_count, 1); ++ INIT_LIST_HEAD(&ds->ds_node); ++ ds->ds_clp = NULL; ++ list_add(&ds->ds_node, &nfs4_data_server_cache); ++ *dsp = ds; ++ dprintk("%s add new data server ip 0x%x\n", __func__, ++ ds->ds_ip_addr); ++ spin_unlock(&nfs4_ds_cache_lock); ++ } else { ++ atomic_inc(&tmp_ds->ds_count); ++ *dsp = tmp_ds; ++ dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", ++ __func__, tmp_ds->ds_ip_addr, ++ atomic_read(&tmp_ds->ds_count)); ++ spin_unlock(&nfs4_ds_cache_lock); ++ kfree(ds); ++ } ++} ++ ++static struct nfs4_pnfs_ds * ++decode_and_add_ds(uint32_t **pp, struct inode *inode) ++{ ++ struct nfs4_pnfs_ds *ds = NULL; ++ char r_addr[29]; /* max size of ip/port string */ ++ int len; ++ u32 ip_addr, port; ++ int tmp[6]; ++ uint32_t *p = *pp; ++ ++ dprintk("%s enter\n", __func__); ++ /* check and skip r_netid */ 
++ len = be32_to_cpup(p++); ++ /* "tcp" */ ++ if (len != 3) { ++ printk("%s: ERROR: non TCP r_netid len %d\n", ++ __func__, len); ++ goto out_err; ++ } ++ /* ++ * Read the bytes into a temporary buffer ++ * XXX: should probably sanity check them ++ */ ++ tmp[0] = be32_to_cpup(p++); ++ ++ len = be32_to_cpup(p++); ++ if (len >= sizeof(r_addr)) { ++ printk("%s: ERROR: Device ip/port too long (%d)\n", ++ __func__, len); ++ goto out_err; ++ } ++ memcpy(r_addr, p, len); ++ p += XDR_QUADLEN(len); ++ *pp = p; ++ r_addr[len] = '\0'; ++ sscanf(r_addr, "%d.%d.%d.%d.%d.%d", &tmp[0], &tmp[1], ++ &tmp[2], &tmp[3], &tmp[4], &tmp[5]); ++ ip_addr = htonl((tmp[0]<<24) | (tmp[1]<<16) | (tmp[2]<<8) | (tmp[3])); ++ port = htons((tmp[4] << 8) | (tmp[5])); ++ ++ nfs4_pnfs_ds_add(inode, &ds, ip_addr, port, r_addr, len); ++ ++ dprintk("%s: addr:port string = %s\n", __func__, r_addr); ++ return ds; ++out_err: ++ dprintk("%s returned NULL\n", __func__); ++ return NULL; ++} ++ ++/* Decode opaque device data and return the result */ ++static struct nfs4_file_layout_dsaddr* ++decode_device(struct inode *ino, struct pnfs_device *pdev) ++{ ++ int i, dummy; ++ u32 cnt, num; ++ u8 *indexp; ++ uint32_t *p = (u32 *)pdev->area, *indicesp; ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ /* Get the stripe count (number of stripe index) */ ++ cnt = be32_to_cpup(p++); ++ dprintk("%s stripe count %d\n", __func__, cnt); ++ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { ++ printk(KERN_WARNING "%s: stripe count %d greater than " ++ "supported maximum %d\n", __func__, ++ cnt, NFS4_PNFS_MAX_STRIPE_CNT); ++ goto out_err; ++ } ++ ++ /* Check the multipath list count */ ++ indicesp = p; ++ p += XDR_QUADLEN(cnt << 2); ++ num = be32_to_cpup(p++); ++ dprintk("%s ds_num %u\n", __func__, num); ++ if (num > NFS4_PNFS_MAX_MULTI_CNT) { ++ printk(KERN_WARNING "%s: multipath count %d greater than " ++ "supported maximum %d\n", __func__, ++ num, NFS4_PNFS_MAX_MULTI_CNT); ++ goto out_err; ++ } ++ dsaddr = kzalloc(sizeof(*dsaddr) + ++ 
(sizeof(struct nfs4_pnfs_ds *) * (num - 1)), ++ GFP_KERNEL); ++ if (!dsaddr) ++ goto out_err; ++ ++ dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); ++ if (!dsaddr->stripe_indices) ++ goto out_err_free; ++ ++ dsaddr->stripe_count = cnt; ++ dsaddr->ds_num = num; ++ ++ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ ++ /* Go back an read stripe indices */ ++ p = indicesp; ++ indexp = &dsaddr->stripe_indices[0]; ++ for (i = 0; i < dsaddr->stripe_count; i++) { ++ dummy = be32_to_cpup(p++); ++ *indexp = dummy; /* bound by NFS4_PNFS_MAX_MULTI_CNT */ ++ indexp++; ++ } ++ /* Skip already read multipath list count */ ++ p++; ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ int j; ++ ++ dummy = be32_to_cpup(p++); /* multipath count */ ++ if (dummy > 1) { ++ printk(KERN_WARNING ++ "%s: Multipath count %d not supported, " ++ "skipping all greater than 1\n", __func__, ++ dummy); ++ } ++ for (j = 0; j < dummy; j++) { ++ if (j == 0) { ++ dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); ++ if (dsaddr->ds_list[i] == NULL) ++ goto out_err_free; ++ } else { ++ u32 len; ++ /* skip extra multipath */ ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ continue; ++ } ++ } ++ } ++ nfs4_init_deviceid_node(&dsaddr->deviceid); ++ ++ return dsaddr; ++ ++out_err_free: ++ nfs4_fl_free_deviceid(dsaddr); ++out_err: ++ dprintk("%s ERROR: returning NULL\n", __func__); ++ return NULL; ++} ++ ++/* ++ * Decode the opaque device specified in 'dev' ++ * and add it to the list of available devices. ++ * If the deviceid is already cached, nfs4_add_deviceid will return ++ * a pointer to the cached struct and throw away the new. 
++ */ ++static struct nfs4_file_layout_dsaddr* ++decode_and_add_device(struct inode *inode, struct pnfs_device *dev) ++{ ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ struct nfs4_deviceid *d; ++ ++ dsaddr = decode_device(inode, dev); ++ if (!dsaddr) { ++ printk(KERN_WARNING "%s: Could not decode or add device\n", ++ __func__); ++ return NULL; ++ } ++ ++ d = nfs4_add_get_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, ++ &dsaddr->deviceid); ++ ++ return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Retrieve the information for dev_id, add it to the list ++ * of available devices, and return it. ++ */ ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id) ++{ ++ struct pnfs_device *pdev = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ struct nfs4_file_layout_dsaddr *dsaddr = NULL; ++ int rc, i; ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s inode %p max_resp_sz %u max_pages %d\n", ++ __func__, inode, max_resp_sz, max_pages); ++ ++ pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); ++ if (pdev == NULL) ++ return NULL; ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(pdev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set pdev->area */ ++ pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!pdev->area) ++ goto out_free; ++ ++ memcpy(&pdev->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE); ++ pdev->layout_type = LAYOUT_NFSV4_1_FILES; ++ pdev->pages = pages; ++ pdev->pgbase = 0; ++ pdev->pglen = PAGE_SIZE * max_pages; ++ pdev->mincount = 0; ++ /* TODO: 
Update types when CB_NOTIFY_DEVICEID is available */ ++ pdev->dev_notify_types = 0; ++ ++ rc = pnfs_callback_ops->nfs_getdeviceinfo(server, pdev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ /* ++ * Found new device, need to decode it and then add it to the ++ * list of known devices for this mountpoint. ++ */ ++ dsaddr = decode_and_add_device(inode, pdev); ++out_free: ++ if (pdev->area != NULL) ++ vunmap(pdev->area); ++ for (i = 0; i < max_pages; i++) ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(pdev); ++ dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); ++ return dsaddr; ++} ++ ++struct nfs4_file_layout_dsaddr * ++nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ ++ d = nfs4_find_get_deviceid(clp->cl_devid_cache, id); ++ dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__, ++ deviceid_fmt(id), d); ++ return (d == NULL) ? NULL : ++ container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit ++ * Then: ((res + fsi) % dsaddr->stripe_count) ++ */ ++static inline u32 ++_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u64 tmp; ++ ++ tmp = offset - flseg->pattern_offset; ++ do_div(tmp, flseg->stripe_unit); ++ tmp += flseg->first_stripe_index; ++ return do_div(tmp, FILE_DSADDR(lseg)->stripe_count); ++} ++ ++u32 ++nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ u32 j; ++ ++ j = _nfs4_fl_calc_j_index(lseg, offset); ++ return FILE_DSADDR(lseg)->stripe_indices[j]; ++} ++ ++struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u32 i; ++ ++ if (flseg->stripe_type == STRIPE_SPARSE) { ++ if (flseg->num_fh == 1) ++ i = 0; ++ else if (flseg->num_fh == 0) ++ 
return NULL; ++ else ++ i = nfs4_fl_calc_ds_index(lseg, offset); ++ } else ++ i = _nfs4_fl_calc_j_index(lseg, offset); ++ return &flseg->fh_array[i]; ++} ++ ++struct nfs4_pnfs_ds * ++nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ dsaddr = FILE_DSADDR(lseg); ++ if (dsaddr->ds_list[ds_idx] == NULL) { ++ printk(KERN_ERR "%s: No data server for device id (%s)!!\n", ++ __func__, deviceid_fmt(&flseg->dev_id)); ++ return NULL; ++ } ++ ++ if (!dsaddr->ds_list[ds_idx]->ds_clp) { ++ int err; ++ ++ err = nfs4_pnfs_ds_create(PNFS_NFS_SERVER(lseg->layout), ++ dsaddr->ds_list[ds_idx]); ++ if (err) { ++ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n", ++ __func__, err); ++ return NULL; ++ } ++ } ++ dprintk("%s: dev_id=%s, ds_idx=%u\n", ++ __func__, deviceid_fmt(&flseg->dev_id), ds_idx); ++ ++ return dsaddr->ds_list[ds_idx]; ++} ++ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-31 20:42:05.520222923 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-31 20:42:05.520222923 -0400 +@@ -0,0 +1,96 @@ ++/* ++ * pnfs_nfs4filelayout.h ++ * ++ * NFSv4 file layout driver data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_NFS4FILELAYOUT_H ++#define FS_NFS_NFS4FILELAYOUT_H ++ ++#include ++#include ++ ++#define NFS4_PNFS_DEV_HASH_BITS 5 ++#define NFS4_PNFS_DEV_HASH_SIZE (1 << NFS4_PNFS_DEV_HASH_BITS) ++#define NFS4_PNFS_DEV_HASH_MASK (NFS4_PNFS_DEV_HASH_SIZE - 1) ++ ++#define NFS4_PNFS_MAX_STRIPE_CNT 4096 ++#define NFS4_PNFS_MAX_MULTI_CNT 64 /* 256 fit into a u8 stripe_index */ ++#define NFS4_PNFS_MAX_MULTI_DS 2 ++ ++#define FILE_DSADDR(lseg) (container_of(lseg->deviceid, \ ++ struct nfs4_file_layout_dsaddr, \ ++ deviceid)) ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++/* Individual ip address */ ++struct nfs4_pnfs_ds { ++ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ ++ u32 ds_ip_addr; ++ u32 ds_port; ++ struct nfs_client *ds_clp; ++ atomic_t ds_count; ++ char r_addr[29]; ++}; ++ ++struct nfs4_file_layout_dsaddr { ++ struct nfs4_deviceid deviceid; ++ u32 stripe_count; ++ u8 *stripe_indices; ++ u32 ds_num; ++ struct nfs4_pnfs_ds *ds_list[1]; ++}; ++ ++struct nfs4_pnfs_dev_hlist { ++ rwlock_t dev_lock; ++ struct hlist_head dev_list[NFS4_PNFS_DEV_HASH_SIZE]; ++}; ++ ++struct nfs4_filelayout_segment { ++ u32 stripe_type; ++ u32 commit_through_mds; ++ u32 stripe_unit; ++ u32 first_stripe_index; ++ u64 pattern_offset; ++ struct pnfs_deviceid dev_id; ++ unsigned int num_fh; ++ struct nfs_fh *fh_array; ++}; ++ ++struct nfs4_filelayout { ++ struct pnfs_layout_hdr fl_layout; ++ u32 stripe_unit; ++}; ++ ++extern struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset); ++ ++static inline struct nfs4_filelayout * ++FILE_LO(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct nfs4_filelayout, fl_layout); ++} ++ ++extern struct pnfs_client_operations *pnfs_callback_ops; ++ ++extern void nfs4_fl_free_deviceid_callback(struct kref *); ++extern void print_ds(struct nfs4_pnfs_ds *ds); ++char *deviceid_fmt(const struct pnfs_deviceid *dev_id); ++u32 
nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset); ++struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, ++ u32 ds_idx); ++extern struct nfs4_file_layout_dsaddr * ++nfs4_fl_find_get_deviceid(struct nfs_client *, struct pnfs_deviceid *dev_id); ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id); ++ ++#endif /* FS_NFS_NFS4FILELAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-31 20:41:19.154160465 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-31 20:42:05.519163219 -0400 +@@ -45,8 +45,28 @@ enum nfs4_client_state { + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, + NFS4CLNT_SESSION_RESET, +- NFS4CLNT_SESSION_DRAINING, + NFS4CLNT_RECALL_SLOT, ++ NFS4CLNT_LAYOUT_RECALL, ++}; ++ ++enum nfs4_session_state { ++ NFS4_SESSION_INITING, ++ NFS4_SESSION_DRAINING, ++}; ++ ++struct nfs4_minor_version_ops { ++ u32 minor_version; ++ ++ int (*call_sync)(struct nfs_server *server, ++ struct rpc_message *msg, ++ struct nfs4_sequence_args *args, ++ struct nfs4_sequence_res *res, ++ int cache_reply); ++ int (*validate_stateid)(struct nfs_delegation *, ++ const nfs4_stateid *); ++ const struct nfs4_state_recovery_ops *reboot_recovery_ops; ++ const struct nfs4_state_recovery_ops *nograce_recovery_ops; ++ const struct nfs4_state_maintenance_ops *state_renewal_ops; + }; + + /* +@@ -89,7 +109,6 @@ struct nfs_unique_id { + */ + struct nfs4_state_owner { + struct nfs_unique_id so_owner_id; +- struct nfs_client *so_client; + struct nfs_server *so_server; + struct rb_node so_client_node; + +@@ -99,7 +118,6 @@ struct nfs4_state_owner { + atomic_t so_count; + unsigned long so_flags; + struct list_head so_states; +- struct list_head so_delegations; + struct nfs_seqid_counter so_seqid; + struct rpc_sequence so_sequence; + }; +@@ -125,10 +143,20 @@ enum { + * LOCK: one nfs4_state (LOCK) to 
hold the lock stateid nfs4_state(OPEN) + */ + ++struct nfs4_lock_owner { ++ unsigned int lo_type; ++#define NFS4_ANY_LOCK_TYPE (0U) ++#define NFS4_FLOCK_LOCK_TYPE (1U << 0) ++#define NFS4_POSIX_LOCK_TYPE (1U << 1) ++ union { ++ fl_owner_t posix_owner; ++ pid_t flock_owner; ++ } lo_u; ++}; ++ + struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ +- fl_owner_t ls_owner; /* POSIX lock owner */ + #define NFS_LOCK_INITIALIZED 1 + int ls_flags; + struct nfs_seqid_counter ls_seqid; +@@ -136,6 +164,7 @@ struct nfs4_lock_state { + struct nfs_unique_id ls_id; + nfs4_stateid ls_stateid; + atomic_t ls_count; ++ struct nfs4_lock_owner ls_owner; + }; + + /* bits for nfs4_state->flags */ +@@ -219,22 +248,34 @@ extern int nfs4_open_revalidate(struct i + extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); + extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations *fs_locations, struct page *page); ++extern void nfs4_release_lockowner(const struct nfs4_lock_state *); + +-extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[]; +-extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[]; + #if defined(CONFIG_NFS_V4_1) +-extern int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return server->nfs_client->cl_session; ++} ++ ++extern int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); + extern void nfs4_destroy_session(struct nfs4_session *session); + extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); ++extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_create_session(struct nfs_client *); + 
extern int nfs4_proc_destroy_session(struct nfs4_session *); + extern int nfs4_init_session(struct nfs_server *server); + extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); + #else /* CONFIG_NFS_v4_1 */ +-static inline int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return NULL; ++} ++ ++static inline int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task) + { +@@ -247,12 +288,12 @@ static inline int nfs4_init_session(stru + } + #endif /* CONFIG_NFS_V4_1 */ + +-extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; ++extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; + + extern const u32 nfs4_fattr_bitmap[2]; + extern const u32 nfs4_statfs_bitmap[2]; + extern const u32 nfs4_pathconf_bitmap[2]; +-extern const u32 nfs4_fsinfo_bitmap[2]; ++extern const u32 nfs4_fsinfo_bitmap[3]; + extern const u32 nfs4_fs_locations_bitmap[2]; + + /* nfs4renewd.c */ +@@ -284,7 +325,7 @@ extern void nfs41_handle_sequence_flag_e + extern void nfs41_handle_recall_slot(struct nfs_client *clp); + extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); + extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); +-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); ++extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); + + extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); + extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); +@@ -293,6 +334,7 @@ extern void nfs_increment_lock_seqid(int + extern void nfs_release_seqid(struct nfs_seqid *seqid); + extern void nfs_free_seqid(struct nfs_seqid *seqid); + ++/* write.c */ + extern const 
nfs4_stateid zero_stateid; + + /* nfs4xdr.c */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-31 20:41:19.157140145 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-31 20:42:05.524099925 -0400 +@@ -49,12 +49,14 @@ + #include + #include + #include ++#include + + #include "nfs4_fs.h" + #include "delegation.h" + #include "internal.h" + #include "iostat.h" + #include "callback.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PROC + +@@ -67,7 +69,7 @@ struct nfs4_opendata; + static int _nfs4_proc_open(struct nfs4_opendata *data); + static int _nfs4_recover_proc_open(struct nfs4_opendata *data); + static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); ++static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, +@@ -125,11 +127,16 @@ const u32 nfs4_pathconf_bitmap[2] = { + 0 + }; + +-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE ++const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, ++#ifdef CONFIG_NFS_V4_1 ++ FATTR4_WORD1_FS_LAYOUT_TYPES, ++ FATTR4_WORD2_LAYOUT_BLKSIZE ++#else /* CONFIG_NFS_V4_1 */ + 0 ++#endif /* CONFIG_NFS_V4_1 */ + }; + + const u32 nfs4_fs_locations_bitmap[2] = { +@@ -356,7 +363,7 @@ static void nfs41_check_drain_session_co + { + struct rpc_task *task; + +- if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { ++ if 
(!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); + if (task) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +@@ -370,12 +377,11 @@ static void nfs41_check_drain_session_co + complete(&ses->complete); + } + +-static void nfs41_sequence_free_slot(const struct nfs_client *clp, +- struct nfs4_sequence_res *res) ++static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) + { + struct nfs4_slot_table *tbl; + +- tbl = &clp->cl_session->fc_slot_table; ++ tbl = &res->sr_session->fc_slot_table; + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { + /* just wake up the next guy waiting since + * we may have not consumed a slot after all */ +@@ -385,18 +391,17 @@ static void nfs41_sequence_free_slot(con + + spin_lock(&tbl->slot_tbl_lock); + nfs4_free_slot(tbl, res->sr_slotid); +- nfs41_check_drain_session_complete(clp->cl_session); ++ nfs41_check_drain_session_complete(res->sr_session); + spin_unlock(&tbl->slot_tbl_lock); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + } + +-static void nfs41_sequence_done(struct nfs_client *clp, +- struct nfs4_sequence_res *res, +- int rpc_status) ++static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) + { + unsigned long timestamp; + struct nfs4_slot_table *tbl; + struct nfs4_slot *slot; ++ struct nfs_client *clp; + + /* + * sr_status remains 1 if an RPC level error occurred. 
The server +@@ -411,13 +416,16 @@ static void nfs41_sequence_done(struct n + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) + goto out; + ++ tbl = &res->sr_session->fc_slot_table; ++ slot = tbl->slots + res->sr_slotid; ++ + /* Check the SEQUENCE operation status */ +- if (res->sr_status == 0) { +- tbl = &clp->cl_session->fc_slot_table; +- slot = tbl->slots + res->sr_slotid; ++ switch (res->sr_status) { ++ case 0: + /* Update the slot's sequence and clientid lease timer */ + ++slot->seq_nr; + timestamp = res->sr_renewal_time; ++ clp = res->sr_session->clp; + spin_lock(&clp->cl_lock); + if (time_before(clp->cl_last_renewal, timestamp)) + clp->cl_last_renewal = timestamp; +@@ -425,11 +433,39 @@ static void nfs41_sequence_done(struct n + /* Check sequence flags */ + if (atomic_read(&clp->cl_count) > 1) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); ++ break; ++ case -NFS4ERR_DELAY: ++ /* The server detected a resend of the RPC call and ++ * returned NFS4ERR_DELAY as per Section 2.10.6.2 ++ * of RFC5661. ++ */ ++ dprintk("%s: slot=%d seq=%d: Operation in progress\n", ++ __func__, res->sr_slotid, slot->seq_nr); ++ goto out_retry; ++ default: ++ /* Just update the slot sequence no. */ ++ ++slot->seq_nr; + } + out: + /* The session may be reset by one of the error handlers. 
*/ + dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); +- nfs41_sequence_free_slot(clp, res); ++ nfs41_sequence_free_slot(res); ++ return 1; ++out_retry: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ rpc_restart_call(task); ++ /* FIXME: rpc_restart_call() should be made to return success/fail */ ++ if (RPC_ASSASSINATED(task)) ++ goto out; ++ return 0; ++} ++ ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ if (res->sr_session == NULL) ++ return 1; ++ return nfs41_sequence_done(task, res); + } + + /* +@@ -480,12 +516,11 @@ static int nfs41_setup_sequence(struct n + if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) + return 0; + +- memset(res, 0, sizeof(*res)); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + tbl = &session->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && ++ if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + /* + * The state manager will wait until the slot table is empty. +@@ -525,6 +560,7 @@ static int nfs41_setup_sequence(struct n + res->sr_session = session; + res->sr_slotid = slotid; + res->sr_renewal_time = jiffies; ++ res->sr_status_flags = 0; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. 
+@@ -533,33 +569,36 @@ static int nfs41_setup_sequence(struct n + return 0; + } + +-int nfs4_setup_sequence(struct nfs_client *clp, ++int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + struct rpc_task *task) + { ++ struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + ++ if (ds_session) ++ session = ds_session; ++ if (session == NULL) { ++ args->sa_session = NULL; ++ res->sr_session = NULL; ++ goto out; ++ } ++ + dprintk("--> %s clp %p session %p sr_slotid %d\n", +- __func__, clp, clp->cl_session, res->sr_slotid); ++ __func__, session->clp, session, res->sr_slotid); + +- if (!nfs4_has_session(clp)) +- goto out; +- ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, ++ ret = nfs41_setup_sequence(session, args, res, cache_reply, + task); +- if (ret && ret != -EAGAIN) { +- /* terminate rpc task */ +- task->tk_status = ret; +- task->tk_action = NULL; +- } + out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; + } + + struct nfs41_call_sync_data { +- struct nfs_client *clp; ++ const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; + int cache_reply; +@@ -569,9 +608,9 @@ static void nfs41_call_sync_prepare(stru + { + struct nfs41_call_sync_data *data = calldata; + +- dprintk("--> %s data->clp->cl_session %p\n", __func__, +- data->clp->cl_session); +- if (nfs4_setup_sequence(data->clp, data->seq_args, ++ dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); ++ ++ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args, + data->seq_res, data->cache_reply, task)) + return; + rpc_call_start(task); +@@ -587,7 +626,7 @@ static void nfs41_call_sync_done(struct + { + struct nfs41_call_sync_data *data = calldata; + +- nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); ++ nfs41_sequence_done(task, 
data->seq_res); + } + + struct rpc_call_ops nfs41_call_sync_ops = { +@@ -600,8 +639,7 @@ struct rpc_call_ops nfs41_call_priv_sync + .rpc_call_done = nfs41_call_sync_done, + }; + +-static int nfs4_call_sync_sequence(struct nfs_client *clp, +- struct rpc_clnt *clnt, ++static int nfs4_call_sync_sequence(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, +@@ -611,13 +649,13 @@ static int nfs4_call_sync_sequence(struc + int ret; + struct rpc_task *task; + struct nfs41_call_sync_data data = { +- .clp = clp, ++ .seq_server = server, + .seq_args = args, + .seq_res = res, + .cache_reply = cache_reply, + }; + struct rpc_task_setup task_setup = { +- .rpc_client = clnt, ++ .rpc_client = server->client, + .rpc_message = msg, + .callback_ops = &nfs41_call_sync_ops, + .callback_data = &data +@@ -642,10 +680,15 @@ int _nfs4_call_sync_session(struct nfs_s + struct nfs4_sequence_res *res, + int cache_reply) + { +- return nfs4_call_sync_sequence(server->nfs_client, server->client, +- msg, args, res, cache_reply, 0); ++ return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); + } + ++#else ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ return 1; ++} + #endif /* CONFIG_NFS_V4_1 */ + + int _nfs4_call_sync(struct nfs_server *server, +@@ -659,18 +702,9 @@ int _nfs4_call_sync(struct nfs_server *s + } + + #define nfs4_call_sync(server, msg, args, res, cache_reply) \ +- (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ ++ (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ + &(res)->seq_res, (cache_reply)) + +-static void nfs4_sequence_done(const struct nfs_server *server, +- struct nfs4_sequence_res *res, int rpc_status) +-{ +-#ifdef CONFIG_NFS_V4_1 +- if (nfs4_has_session(server->nfs_client)) +- nfs41_sequence_done(server->nfs_client, res, rpc_status); +-#endif /* CONFIG_NFS_V4_1 */ +-} +- + static void 
update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) + { + struct nfs_inode *nfsi = NFS_I(dir); +@@ -745,19 +779,14 @@ static struct nfs4_opendata *nfs4_openda + p->o_arg.server = server; + p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; +- if (flags & O_EXCL) { +- if (nfs4_has_persistent_session(server->nfs_client)) { +- /* GUARDED */ +- p->o_arg.u.attrs = &p->attrs; +- memcpy(&p->attrs, attrs, sizeof(p->attrs)); +- } else { /* EXCLUSIVE4_1 */ +- u32 *s = (u32 *) p->o_arg.u.verifier.data; +- s[0] = jiffies; +- s[1] = current->pid; +- } +- } else if (flags & O_CREAT) { ++ if (flags & O_CREAT) { ++ u32 *s; ++ + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); ++ s = (u32 *) p->o_arg.u.verifier.data; ++ s[0] = jiffies; ++ s[1] = current->pid; + } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; +@@ -851,8 +880,10 @@ static void update_open_stateflags(struc + static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) + { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); +- memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); ++ memcpy(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)); ++ memcpy(state->open_stateid.u.data, stateid->u.data, ++ sizeof(state->open_stateid.u.data)); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); +@@ -880,7 +911,8 @@ static void __update_open_stateid(struct + */ + write_seqlock(&state->seqlock); + if (deleg_stateid != NULL) { +- memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, deleg_stateid->u.data, ++ sizeof(state->stateid.u.data)); + set_bit(NFS_DELEGATED_STATE, &state->flags); + } + if (open_stateid != NULL) +@@ -911,7 +943,8 @@ static int 
update_open_stateid(struct nf + + if (delegation == NULL) + delegation = &deleg_cur->stateid; +- else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) ++ else if (memcmp(deleg_cur->stateid.u.data, delegation->u.data, ++ NFS4_STATEID_SIZE) != 0) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); +@@ -973,7 +1006,8 @@ static struct nfs4_state *nfs4_try_open_ + break; + } + /* Save the delegation */ +- memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); ++ memcpy(stateid.u.data, delegation->stateid.u.data, ++ sizeof(stateid.u.data)); + rcu_read_unlock(); + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) +@@ -1127,10 +1161,13 @@ static int nfs4_open_recover(struct nfs4 + * Check if we need to update the current stateid. + */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && +- memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { ++ memcmp(state->stateid.u.data, state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)) != 0) { + write_seqlock(&state->seqlock); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, ++ state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)); + write_sequnlock(&state->seqlock); + } + return 0; +@@ -1199,8 +1236,8 @@ static int _nfs4_open_delegation_recall( + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; +- memcpy(opendata->o_arg.u.delegation.data, stateid->data, +- sizeof(opendata->o_arg.u.delegation.data)); ++ memcpy(opendata->o_arg.u.delegation.u.data, stateid->u.data, ++ sizeof(opendata->o_arg.u.delegation.u.data)); + ret = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return ret; +@@ -1258,8 +1295,8 @@ static void nfs4_open_confirm_done(struc + if (RPC_ASSASSINATED(task)) 
+ return; + if (data->rpc_status == 0) { +- memcpy(data->o_res.stateid.data, data->c_res.stateid.data, +- sizeof(data->o_res.stateid.data)); ++ memcpy(data->o_res.stateid.u.data, data->c_res.stateid.u.data, ++ sizeof(data->o_res.stateid.u.data)); + nfs_confirm_seqid(&data->owner->so_seqid, 0); + renew_lease(data->o_res.server, data->timestamp); + data->rpc_done = 1; +@@ -1356,13 +1393,13 @@ static void nfs4_open_prepare(struct rpc + } + /* Update sequence id. */ + data->o_arg.id = sp->so_owner_id.id; +- data->o_arg.clientid = sp->so_client->cl_clientid; ++ data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->o_arg.server->nfs_client, ++ if (nfs4_setup_sequence(data->o_arg.server, NULL, + &data->o_arg.seq_args, + &data->o_res.seq_res, 1, task)) + return; +@@ -1385,8 +1422,8 @@ static void nfs4_open_done(struct rpc_ta + + data->rpc_status = task->tk_status; + +- nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->o_res.seq_res)) ++ return; + + if (RPC_ASSASSINATED(task)) + return; +@@ -1539,9 +1576,8 @@ static int _nfs4_proc_open(struct nfs4_o + return 0; + } + +-static int nfs4_recover_expired_lease(struct nfs_server *server) ++int nfs4_recover_expired_lease(struct nfs_client *clp) + { +- struct nfs_client *clp = server->nfs_client; + unsigned int loop; + int ret; + +@@ -1557,6 +1593,7 @@ static int nfs4_recover_expired_lease(st + } + return ret; + } ++EXPORT_SYMBOL(nfs4_recover_expired_lease); + + /* + * OPEN_EXPIRED: +@@ -1646,7 +1683,7 @@ static int _nfs4_do_open(struct inode *d + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } +- status = nfs4_recover_expired_lease(server); ++ status = 
nfs4_recover_expired_lease(server->nfs_client); + if (status != 0) + goto err_put_state_owner; + if (path->dentry->d_inode != NULL) +@@ -1773,7 +1810,7 @@ static int _nfs4_do_setattr(struct inode + if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { + /* Use that stateid */ + } else if (state != NULL) { +- nfs4_copy_stateid(&arg.stateid, state, current->files); ++ nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); + } else + memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); + +@@ -1838,7 +1875,8 @@ static void nfs4_close_done(struct rpc_t + struct nfs4_state *state = calldata->state; + struct nfs_server *server = NFS_SERVER(calldata->inode); + +- nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + /* hmm. we are done with the inode, and in the process of freeing +@@ -1858,7 +1896,7 @@ static void nfs4_close_done(struct rpc_t + if (calldata->arg.fmode == 0) + break; + default: +- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) ++ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } + nfs_release_seqid(calldata->arg.seqid); +@@ -1903,7 +1941,7 @@ static void nfs4_close_prepare(struct rp + + nfs_fattr_init(calldata->res.fattr); + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, ++ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL, + &calldata->arg.seq_args, &calldata->res.seq_res, + 1, task)) + return; +@@ -2323,6 +2361,9 @@ nfs4_proc_setattr(struct dentry *dentry, + struct nfs4_state *state = NULL; + int status; + ++ if (pnfs_ld_layoutret_on_setattr(inode)) ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + nfs_fattr_init(fattr); + + /* Search for an existing open(O_WRITE) file */ +@@ -2648,8 +2689,9 @@ static int nfs4_proc_unlink_done(struct + { + struct nfs_removeres 
*res = task->tk_msg.rpc_resp; + +- nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); +- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) ++ if (!nfs4_sequence_done(task, &res->seq_res)) ++ return 0; ++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); + nfs_post_op_update_inode(dir, res->dir_attr); +@@ -3090,18 +3132,31 @@ static int nfs4_proc_pathconf(struct nfs + static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) + { + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; + + dprintk("--> %s\n", __func__); + +- nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; + +- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, server->nfs_client); ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } + + nfs_invalidate_atime(data->inode); +- if (task->tk_status > 0) ++ if (task->tk_status > 0 && client == server->nfs_client) + renew_lease(server, data->timestamp); + return 0; + } +@@ -3112,20 +3167,56 @@ static void nfs4_proc_read_setup(struct + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + } + ++static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ + static int 
nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs_client *client = server->nfs_client; + +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++#ifdef CONFIG_NFS_V4_1 ++ /* restore original count after retry? */ ++ if (data->pdata.orig_count) { ++ dprintk("%s: restoring original count %u\n", __func__, ++ data->pdata.orig_count); ++ data->args.count = data->pdata.orig_count; ++ } ++ ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } ++ ++ /* ++ * MDS write: renew lease ++ * DS write: update lastbyte written, mark for layout commit ++ */ + if (task->tk_status >= 0) { +- renew_lease(NFS_SERVER(inode), data->timestamp); +- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ if (client == server->nfs_client) { ++ renew_lease(server, data->timestamp); ++ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ } else ++ pnfs4_update_write_done(NFS_I(inode), data); + } + return 0; + } +@@ -3138,20 +3229,42 @@ static void nfs4_proc_write_setup(struct + data->res.server = server; + data->timestamp = jiffies; + ++#ifdef CONFIG_NFS_V4_1 ++ /* writes to DS use pnfs vector */ ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = 
&nfs4_procedures[NFSPROC4_CLNT_WRITE]; + } + + static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { +- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); ++ struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; ++ ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS commit\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } +- nfs_refresh_inode(inode, data->res.fattr); ++ if (client == server->nfs_client) ++ nfs_refresh_inode(inode, data->res.fattr); + return 0; + } + +@@ -3161,6 +3274,12 @@ static void nfs4_proc_commit_setup(struc + + data->args.bitmask = server->cache_consistency_bitmask; + data->res.server = server; ++#if defined(CONFIG_NFS_V4_1) ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + } + +@@ -3464,9 +3583,12 @@ static int nfs4_proc_set_acl(struct inod + } + + static int +-_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state) ++nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp) + { +- if (!clp || task->tk_status >= 0) ++ if (!clp) ++ clp = server->nfs_client; ++ ++ if (task->tk_status >= 0) + return 0; + switch(task->tk_status) { +
case -NFS4ERR_ADMIN_REVOKED: +@@ -3491,8 +3613,9 @@ _nfs4_async_handle_error(struct rpc_task + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: +- dprintk("%s ERROR %d, Reset session\n", __func__, +- task->tk_status); ++ dprintk("%s ERROR %d, Reset session. Exchangeid " ++ "flags 0x%x\n", __func__, task->tk_status, ++ clp->cl_exchange_flags); + nfs4_schedule_state_recovery(clp); + task->tk_status = 0; + return -EAGAIN; +@@ -3512,6 +3635,8 @@ _nfs4_async_handle_error(struct rpc_task + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; + do_state_recovery: ++ if (is_ds_only_client(clp)) ++ return 0; + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + nfs4_schedule_state_recovery(clp); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) +@@ -3520,12 +3645,6 @@ do_state_recovery: + return -EAGAIN; + } + +-static int +-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) +-{ +- return _nfs4_async_handle_error(task, server, server->nfs_client, state); +-} +- + int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, + unsigned short port, struct rpc_cred *cred, + struct nfs4_setclientid_res *res) +@@ -3641,8 +3760,8 @@ static void nfs4_delegreturn_done(struct + { + struct nfs4_delegreturndata *data = calldata; + +- nfs4_sequence_done(data->res.server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + switch (task->tk_status) { + case -NFS4ERR_STALE_STATEID: +@@ -3651,8 +3770,8 @@ static void nfs4_delegreturn_done(struct + renew_lease(data->res.server, data->timestamp); + break; + default: +- if (nfs4_async_handle_error(task, data->res.server, NULL) == +- -EAGAIN) { ++ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) ++ == -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } +@@ -3672,7 +3791,7 @@ static void nfs4_delegreturn_prepare(str 
+ + d_data = (struct nfs4_delegreturndata *)data; + +- if (nfs4_setup_sequence(d_data->res.server->nfs_client, ++ if (nfs4_setup_sequence(d_data->res.server, NULL, + &d_data->args.seq_args, + &d_data->res.seq_res, 1, task)) + return; +@@ -3892,15 +4011,16 @@ static void nfs4_locku_done(struct rpc_t + { + struct nfs4_unlockdata *calldata = data; + +- nfs4_sequence_done(calldata->server, &calldata->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + switch (task->tk_status) { + case 0: +- memcpy(calldata->lsp->ls_stateid.data, +- calldata->res.stateid.data, +- sizeof(calldata->lsp->ls_stateid.data)); ++ memcpy(calldata->lsp->ls_stateid.u.data, ++ calldata->res.stateid.u.data, ++ sizeof(calldata->lsp->ls_stateid.u. ++ data)); + renew_lease(calldata->server, calldata->timestamp); + break; + case -NFS4ERR_BAD_STATEID: +@@ -3909,7 +4029,7 @@ static void nfs4_locku_done(struct rpc_t + case -NFS4ERR_EXPIRED: + break; + default: +- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN) + nfs_restart_rpc(task, + calldata->server->nfs_client); + } +@@ -3927,7 +4047,7 @@ static void nfs4_locku_prepare(struct rp + return; + } + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence(calldata->server->nfs_client, ++ if (nfs4_setup_sequence(calldata->server, NULL, + &calldata->arg.seq_args, + &calldata->res.seq_res, 1, task)) + return; +@@ -4082,7 +4202,8 @@ static void nfs4_lock_prepare(struct rpc + } else + data->arg.new_lock_owner = 0; + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, ++ if (nfs4_setup_sequence(data->server, NULL, ++ &data->arg.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +@@ -4101,8 +4222,8 @@ static void nfs4_lock_done(struct rpc_ta + + dprintk("%s: begin!\n", __func__); + +- 
nfs4_sequence_done(data->server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) +@@ -4114,8 +4235,8 @@ static void nfs4_lock_done(struct rpc_ta + goto out; + } + if (data->rpc_status == 0) { +- memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, +- sizeof(data->lsp->ls_stateid.data)); ++ memcpy(data->lsp->ls_stateid.u.data, data->res.stateid.u.data, ++ sizeof(data->lsp->ls_stateid.u.data)); + data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); + } +@@ -4424,6 +4545,34 @@ out: + return err; + } + ++static void nfs4_release_lockowner_release(void *calldata) ++{ ++ kfree(calldata); ++} ++ ++const struct rpc_call_ops nfs4_release_lockowner_ops = { ++ .rpc_release = nfs4_release_lockowner_release, ++}; ++ ++void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) ++{ ++ struct nfs_server *server = lsp->ls_state->owner->so_server; ++ struct nfs_release_lockowner_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], ++ }; ++ ++ if (server->nfs_client->cl_mvops->minor_version != 0) ++ return; ++ args = kmalloc(sizeof(*args), GFP_NOFS); ++ if (!args) ++ return; ++ args->lock_owner.clientid = server->nfs_client->cl_clientid; ++ args->lock_owner.id = lsp->ls_id.id; ++ msg.rpc_argp = args; ++ rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); ++} ++ + #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" + + int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, +@@ -4526,7 +4675,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + nfs4_verifier verifier; + struct nfs41_exchange_id_args args = { + .client = clp, +- .flags = clp->cl_exchange_flags, ++ .flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R, + }; + struct nfs41_exchange_id_res res = { + .client = clp, +@@ -4574,6 
+4723,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + dprintk("<-- %s status= %d\n", __func__, status); + return status; + } ++EXPORT_SYMBOL(nfs4_proc_exchange_id); + + struct nfs4_get_lease_time_data { + struct nfs4_get_lease_time_args *args; +@@ -4611,7 +4761,8 @@ static void nfs4_get_lease_time_done(str + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); ++ if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) ++ return; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: +@@ -4805,13 +4956,6 @@ struct nfs4_session *nfs4_alloc_session( + if (!session) + return NULL; + +- /* +- * The create session reply races with the server back +- * channel probe. Mark the client NFS_CS_SESSION_INITING +- * so that the client back channel can find the +- * nfs_client struct +- */ +- clp->cl_cons_state = NFS_CS_SESSION_INITING; + init_completion(&session->complete); + + tbl = &session->fc_slot_table; +@@ -4824,6 +4968,8 @@ struct nfs4_session *nfs4_alloc_session( + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + ++ session->session_state = 1<<NFS4_SESSION_INITING; ++ + session->clp = clp; + return session; + } +@@ -5040,6 +5186,10 @@ int nfs4_init_session(struct nfs_server + if (!nfs4_has_session(clp)) + return 0; + ++ session = clp->cl_session; ++ if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) ++ return 0; ++ + rsize = server->rsize; + if (rsize == 0) + rsize = NFS_MAX_FILE_IO_SIZE; +@@ -5047,11 +5197,10 @@ int nfs4_init_session(struct nfs_server + if (wsize == 0) + wsize = NFS_MAX_FILE_IO_SIZE; + +- session = clp->cl_session; + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + +- ret = nfs4_recover_expired_lease(server); ++ ret = nfs4_recover_expired_lease(server->nfs_client); + if (!ret) + ret =
nfs4_check_client_ready(clp); + return ret; +@@ -5060,69 +5209,70 @@ int nfs4_init_session(struct nfs_server + /* + * Renew the cl_session lease. + */ +-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +-{ ++struct nfs4_sequence_data { ++ struct nfs_client *clp; + struct nfs4_sequence_args args; + struct nfs4_sequence_res res; +- +- struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], +- .rpc_argp = &args, +- .rpc_resp = &res, +- .rpc_cred = cred, +- }; +- +- args.sa_cache_this = 0; +- +- return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, +- &res, args.sa_cache_this, 1); +-} ++}; + + static void nfs41_sequence_release(void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); ++ kfree(calldata); ++} ++ ++static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; + } + + static void nfs41_sequence_call_done(struct rpc_task *task, void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + +- nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status); ++ if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) ++ return; + + if (task->tk_status < 0) { + dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; + +- if (_nfs4_async_handle_error(task, NULL, clp, NULL) +- == -EAGAIN) { +- nfs_restart_rpc(task, clp); ++ if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); + 
return; + } + } + dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); + out: +- kfree(task->tk_msg.rpc_argp); +- kfree(task->tk_msg.rpc_resp); +- + dprintk("<-- %s\n", __func__); + } + + static void nfs41_sequence_prepare(struct rpc_task *task, void *data) + { +- struct nfs_client *clp; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_args *args; + struct nfs4_sequence_res *res; + +- clp = (struct nfs_client *)data; + args = task->tk_msg.rpc_argp; + res = task->tk_msg.rpc_resp; + +- if (nfs4_setup_sequence(clp, args, res, 0, task)) ++ if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) + return; + rpc_call_start(task); + } +@@ -5133,32 +5283,67 @@ static const struct rpc_call_ops nfs41_s + .rpc_release = nfs41_sequence_release, + }; + +-static int nfs41_proc_async_sequence(struct nfs_client *clp, +- struct rpc_cred *cred) ++static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) + { +- struct nfs4_sequence_args *args; +- struct nfs4_sequence_res *res; ++ struct nfs4_sequence_data *calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], + .rpc_cred = cred, + }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = clp->cl_rpcclient, ++ .rpc_message = &msg, ++ .callback_ops = &nfs41_sequence_ops, ++ .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, ++ }; + + if (!atomic_inc_not_zero(&clp->cl_count)) +- return -EIO; +- args = kzalloc(sizeof(*args), GFP_NOFS); +- res = kzalloc(sizeof(*res), GFP_NOFS); +- if (!args || !res) { +- kfree(args); +- kfree(res); ++ return ERR_PTR(-EIO); ++ calldata = kmalloc(sizeof(*calldata), GFP_NOFS); ++ if (calldata == NULL) { + nfs_put_client(clp); +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + } +- res->sr_slotid = NFS4_MAX_SLOT_TABLE; +- msg.rpc_argp = args; +- msg.rpc_resp = res; ++ calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ msg.rpc_argp = &calldata->args; ++ 
msg.rpc_resp = &calldata->res; ++ calldata->clp = clp; ++ task_setup_data.callback_data = calldata; + +- return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, +- &nfs41_sequence_ops, (void *)clp); ++ return rpc_run_task(&task_setup_data); ++} ++ ++static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret = 0; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) ++ ret = PTR_ERR(task); ++ else ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; ++} ++ ++static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ ret = rpc_wait_for_completion_task(task); ++ if (!ret) ++ ret = task->tk_status; ++ rpc_put_task(task); ++out: ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; + } + + struct nfs4_reclaim_complete_data { +@@ -5172,13 +5357,31 @@ static void nfs4_reclaim_complete_prepar + struct nfs4_reclaim_complete_data *calldata = data; + + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +- if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, ++ if (nfs41_setup_sequence(calldata->clp->cl_session, ++ &calldata->arg.seq_args, + &calldata->res.seq_res, 0, task)) + return; + + rpc_call_start(task); + } + ++static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case 0: ++ case -NFS4ERR_COMPLETE_ALREADY: ++ case -NFS4ERR_WRONG_CRED: /* What to do here? 
*/ ++ break; ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; ++} ++ + static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) + { + struct nfs4_reclaim_complete_data *calldata = data; +@@ -5186,32 +5389,13 @@ static void nfs4_reclaim_complete_done(s + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(clp, res, task->tk_status); +- switch (task->tk_status) { +- case 0: +- case -NFS4ERR_COMPLETE_ALREADY: +- break; +- case -NFS4ERR_BADSESSION: +- case -NFS4ERR_DEADSESSION: +- /* +- * Handle the session error, but do not retry the operation, as +- * we have no way of telling whether the clientid had to be +- * reset before we got our reply. If reset, a new wave of +- * reclaim operations will follow, containing their own reclaim +- * complete. We don't want our retry to get on the way of +- * recovery by incorrectly indicating to the server that we're +- * done reclaiming state since the process had to be restarted. 
+- */ +- _nfs4_async_handle_error(task, NULL, clp, NULL); +- break; +- default: +- if (_nfs4_async_handle_error( +- task, NULL, clp, NULL) == -EAGAIN) { +- rpc_restart_call_prepare(task); +- return; +- } +- } ++ if (!nfs41_sequence_done(task, res)) ++ return; + ++ if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); ++ return; ++ } + dprintk("<-- %s\n", __func__); + } + +@@ -5268,6 +5452,404 @@ out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; + } ++ ++static void ++nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args, ++ &lgp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lgp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ pnfs_get_layout_done(lgp, task->tk_status); ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ lgp->status = task->tk_status; ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_layoutget_release(void *calldata) ++{ ++ struct nfs4_layoutget *lgp = calldata; ++ ++ dprintk("--> %s\n", __func__); ++ pnfs_layout_release(NFS_I(lgp->args.inode)->layout, NULL); ++ if (lgp->res.layout.buf != NULL) ++ free_page((unsigned long) lgp->res.layout.buf); ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_layoutget_call_ops = { ++ .rpc_call_prepare = nfs4_layoutget_prepare, ++ .rpc_call_done = 
nfs4_layoutget_done, ++ .rpc_release = nfs4_layoutget_release, ++}; ++ ++/* FIXME: We need to call nfs4_handle_exception ++ * and deal with retries. ++ * Currently we can't since we release lgp and its contents. ++ */ ++static int _nfs4_proc_layoutget(struct nfs4_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], ++ .rpc_argp = &lgp->args, ++ .rpc_resp = &lgp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutget_call_ops, ++ .callback_data = lgp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); ++ if (lgp->res.layout.buf == NULL) { ++ nfs4_layoutget_release(lgp); ++ return -ENOMEM; ++ } ++ ++ lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = lgp->status; ++ if (status != 0) ++ goto out; ++ status = pnfs_layout_process(lgp); ++out: ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, _nfs4_proc_layoutget(lgp), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data) ++{ ++ struct nfs4_layoutcommit_data *ldata = ++ (struct nfs4_layoutcommit_data *)data; ++ struct nfs_server *server = NFS_SERVER(ldata->args.inode); ++ ++ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args, ++ &ldata->res.seq_res, 
1, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void ++nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutcommit_data *data = ++ (struct nfs4_layoutcommit_data *)calldata; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ data->status = task->tk_status; ++} ++ ++static void nfs4_layoutcommit_release(void *lcdata) ++{ ++ struct nfs4_layoutcommit_data *data = ++ (struct nfs4_layoutcommit_data *)lcdata; ++ ++ put_rpccred(data->cred); ++ pnfs_cleanup_layoutcommit(lcdata); ++ /* Matched by get_layout in pnfs_layoutcommit_inode; data is read here, before lcdata is freed */ ++ put_layout(data->args.inode); ++ pnfs_layoutcommit_free(lcdata); ++} ++ ++static const struct rpc_call_ops nfs4_layoutcommit_ops = { ++ .rpc_call_prepare = nfs4_layoutcommit_prepare, ++ .rpc_call_done = nfs4_layoutcommit_done, ++ .rpc_release = nfs4_layoutcommit_release, ++}; ++ ++/* Execute a layoutcommit to the server */ ++static int ++_nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync) ++{ ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], ++ .rpc_argp = &data->args, ++ .rpc_resp = &data->res, ++ .rpc_cred = data->cred, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .task = &data->task, ++ .rpc_client = NFS_CLIENT(data->args.inode), ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutcommit_ops, ++ .callback_data = data, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ struct rpc_task *task; ++ int status = 0; ++ ++ dprintk("NFS: %4d initiating layoutcommit call. 
%llu@%llu lbw: %llu " ++ "type: %d issync %d\n", ++ data->task.tk_pid, ++ data->args.range.length, ++ data->args.range.offset, ++ data->args.lastbytewritten, ++ data->args.layout_type, issync); ++ ++ data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = data->status; ++out: ++ dprintk("%s: status %d\n", __func__, status); ++ rpc_put_task(task); ++ return status; /* 0 when !issync; otherwise the wait result or data->status */ ++} ++ ++int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync) ++{ ++ struct nfs4_exception exception = { }; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_proc_layoutcommit(data, issync), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void ++nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args, ++ &lrp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lrp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_layoutreturn_release(void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ struct pnfs_layout_hdr *lo = 
NFS_I(lrp->args.inode)->layout; ++ ++ dprintk("--> %s return_type %d lo %p\n", __func__, ++ lrp->args.return_type, lo); ++ ++ if (lrp->args.return_type == RETURN_FILE) { ++ if (!lrp->res.lrs_present) ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ pnfs_layout_release(lo, &lrp->args.range); ++ } ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { ++ .rpc_call_prepare = nfs4_layoutreturn_prepare, ++ .rpc_call_done = nfs4_layoutreturn_done, ++ .rpc_release = nfs4_layoutreturn_release, ++}; ++ ++int _nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync) ++{ ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], ++ .rpc_argp = &lrp->args, ++ .rpc_resp = &lrp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutreturn_call_ops, ++ .callback_data = lrp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = task->tk_status; ++out: ++ dprintk("<-- %s\n", __func__); ++ rpc_put_task(task); ++ return status; ++} ++ ++int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync) ++{ ++ struct nfs_server *server = NFS_SERVER(lrp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_proc_layoutreturn(lrp, issync), ++ &exception); ++ } while (exception.retry); ++ ++ return err; ++} ++ ++/* ++ * Retrieve the list of Data Server devices from the MDS. 
++ */ ++static int _nfs4_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_getdevicelist_args args = { ++ .fh = fh, ++ .layoutclass = server->pnfs_curr_ld->id, ++ }; ++ struct nfs4_getdevicelist_res res = { ++ .devlist = devlist, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], ++ .rpc_argp = &args, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int nfs4_proc_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_getdevicelist(server, fh, devlist), ++ &exception); ++ } while (exception.retry); ++ ++ dprintk("nfs4_pnfs_getdevlist: err=%d, num_devs=%u\n", ++ err, devlist->num_devs); ++ ++ return err; ++} ++ ++int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) ++{ ++ struct nfs4_getdeviceinfo_args args = { ++ .pdev = pdev, ++ }; ++ struct nfs4_getdeviceinfo_res res = { ++ .pdev = pdev, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], ++ .rpc_argp = &args, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ ++ return status; ++} ++ + #endif /* CONFIG_NFS_V4_1 */ + + struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { +@@ -5325,28 +5907,30 @@ struct nfs4_state_maintenance_ops nfs41_ + }; + #endif + +-/* +- * Per minor version reboot and network partition recovery ops +- */ +- +-struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = { +- &nfs40_reboot_recovery_ops, +-#if 
defined(CONFIG_NFS_V4_1) +- &nfs41_reboot_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { ++ .minor_version = 0, ++ .call_sync = _nfs4_call_sync, ++ .validate_stateid = nfs4_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs40_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs40_nograce_recovery_ops, ++ .state_renewal_ops = &nfs40_state_renewal_ops, + }; + +-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = { +- &nfs40_nograce_recovery_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_nograce_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { ++ .minor_version = 1, ++ .call_sync = _nfs4_call_sync_session, ++ .validate_stateid = nfs41_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs41_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs41_nograce_recovery_ops, ++ .state_renewal_ops = &nfs41_state_renewal_ops, + }; ++#endif + +-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { +- &nfs40_state_renewal_ops, ++const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { ++ [0] = &nfs_v4_0_minor_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_state_renewal_ops, ++ [1] = &nfs_v4_1_minor_ops, + #endif + }; + +@@ -5364,6 +5948,7 @@ const struct nfs_rpc_ops nfs_v4_clientop + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, + .file_inode_ops = &nfs4_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs4_proc_get_root, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c +--- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-31 20:42:05.526213255 -0400 +@@ -54,17 +54,17 @@ + void + nfs4_renew_state(struct work_struct *work) + { +- struct nfs4_state_maintenance_ops *ops; ++ const struct 
nfs4_state_maintenance_ops *ops; + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_renewd.work); + struct rpc_cred *cred; + long lease; + unsigned long last, now; + +- ops = nfs4_state_renewal_ops[clp->cl_minorversion]; ++ ops = clp->cl_mvops->state_renewal_ops; + dprintk("%s: start\n", __func__); + /* Are there any active superblocks? */ +- if (list_empty(&clp->cl_superblocks)) ++ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) + goto out; + spin_lock(&clp->cl_lock); + lease = clp->cl_lease_time; +diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-31 20:41:19.158078621 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-31 20:42:05.527232994 -0400 +@@ -48,11 +48,13 @@ + #include + #include + #include ++#include + + #include "nfs4_fs.h" + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include "pnfs.h" + + #define OPENOWNER_POOL_SIZE 8 + +@@ -126,6 +128,11 @@ static int nfs41_setup_state_renewal(str + int status; + struct nfs_fsinfo fsinfo; + ++ if (is_ds_only_client(clp)) { ++ nfs4_schedule_state_renewal(clp); ++ return 0; ++ } ++ + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ +@@ -145,7 +152,9 @@ static void nfs4_end_drain_session(struc + struct nfs4_session *ses = clp->cl_session; + int max_slots; + +- if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { ++ if (ses == NULL) ++ return; ++ if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + spin_lock(&ses->fc_slot_table.slot_tbl_lock); + max_slots = ses->fc_slot_table.max_slots; + while (max_slots--) { +@@ -167,7 +176,7 @@ static int nfs4_begin_drain_session(stru + struct nfs4_slot_table *tbl = &ses->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); ++ set_bit(NFS4_SESSION_DRAINING, 
&ses->session_state); + if (tbl->highest_used_slotid != -1) { + INIT_COMPLETION(ses->complete); + spin_unlock(&tbl->slot_tbl_lock); +@@ -371,7 +380,6 @@ nfs4_alloc_state_owner(void) + return NULL; + spin_lock_init(&sp->so_lock); + INIT_LIST_HEAD(&sp->so_states); +- INIT_LIST_HEAD(&sp->so_delegations); + rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); + sp->so_seqid.sequence = &sp->so_sequence; + spin_lock_init(&sp->so_sequence.lock); +@@ -384,7 +392,7 @@ static void + nfs4_drop_state_owner(struct nfs4_state_owner *sp) + { + if (!RB_EMPTY_NODE(&sp->so_client_node)) { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + rb_erase(&sp->so_client_node, &clp->cl_state_owners); +@@ -406,7 +414,6 @@ struct nfs4_state_owner *nfs4_get_state_ + new = nfs4_alloc_state_owner(); + if (new == NULL) + return NULL; +- new->so_client = clp; + new->so_server = server; + new->so_cred = cred; + spin_lock(&clp->cl_lock); +@@ -423,7 +430,7 @@ struct nfs4_state_owner *nfs4_get_state_ + + void nfs4_put_state_owner(struct nfs4_state_owner *sp) + { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + struct rpc_cred *cred = sp->so_cred; + + if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) +@@ -583,8 +590,24 @@ static void __nfs4_close(struct path *pa + if (!call_close) { + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); +- } else ++ } else { ++ u32 roc_iomode; ++ struct nfs_inode *nfsi = NFS_I(state->inode); ++ ++ if (has_layout(nfsi) && ++ (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) { ++ struct pnfs_layout_range range = { ++ .iomode = roc_iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ pnfs_return_layout(state->inode, &range, NULL, ++ RETURN_FILE, wait); ++ } ++ + nfs4_do_close(path, state, gfp_mask, wait); ++ } + } + + void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) +@@ -602,12 
+625,21 @@ void nfs4_close_sync(struct path *path, + * that is compatible with current->files + */ + static struct nfs4_lock_state * +-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *pos; + list_for_each_entry(pos, &state->lock_states, ls_locks) { +- if (pos->ls_owner != fl_owner) ++ if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) + continue; ++ switch (pos->ls_owner.lo_type) { ++ case NFS4_POSIX_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.posix_owner != fl_owner) ++ continue; ++ break; ++ case NFS4_FLOCK_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.flock_owner != fl_pid) ++ continue; ++ } + atomic_inc(&pos->ls_count); + return pos; + } +@@ -619,10 +651,10 @@ __nfs4_find_lock_state(struct nfs4_state + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *lsp; +- struct nfs_client *clp = state->owner->so_client; ++ struct nfs_client *clp = state->owner->so_server->nfs_client; + + lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) +@@ -633,7 +665,18 @@ static struct nfs4_lock_state *nfs4_allo + lsp->ls_seqid.sequence = &lsp->ls_sequence; + atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; +- lsp->ls_owner = fl_owner; ++ lsp->ls_owner.lo_type = type; ++ switch (lsp->ls_owner.lo_type) { ++ case NFS4_FLOCK_LOCK_TYPE: ++ lsp->ls_owner.lo_u.flock_owner = fl_pid; ++ break; ++ case NFS4_POSIX_LOCK_TYPE: ++ lsp->ls_owner.lo_u.posix_owner = fl_owner; ++ break; ++ default: ++ kfree(lsp); ++ return NULL; ++ } + spin_lock(&clp->cl_lock); + nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); + spin_unlock(&clp->cl_lock); +@@ -643,7 +686,7 @@ 
static struct nfs4_lock_state *nfs4_allo + + static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) + { +- struct nfs_client *clp = lsp->ls_state->owner->so_client; ++ struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); +@@ -657,13 +700,13 @@ static void nfs4_free_lock_state(struct + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) ++static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) + { + struct nfs4_lock_state *lsp, *new = NULL; + + for(;;) { + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, owner); ++ lsp = __nfs4_find_lock_state(state, owner, pid, type); + if (lsp != NULL) + break; + if (new != NULL) { +@@ -674,7 +717,7 @@ static struct nfs4_lock_state *nfs4_get_ + break; + } + spin_unlock(&state->state_lock); +- new = nfs4_alloc_lock_state(state, owner); ++ new = nfs4_alloc_lock_state(state, owner, pid, type); + if (new == NULL) + return NULL; + } +@@ -701,6 +744,8 @@ void nfs4_put_lock_state(struct nfs4_loc + if (list_empty(&state->lock_states)) + clear_bit(LK_STATE_IN_USE, &state->flags); + spin_unlock(&state->state_lock); ++ if (lsp->ls_flags & NFS_LOCK_INITIALIZED) ++ nfs4_release_lockowner(lsp); + nfs4_free_lock_state(lsp); + } + +@@ -728,7 +773,12 @@ int nfs4_set_lock_state(struct nfs4_stat + + if (fl->fl_ops != NULL) + return 0; +- lsp = nfs4_get_lock_state(state, fl->fl_owner); ++ if (fl->fl_flags & FL_POSIX) ++ lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); ++ else if (fl->fl_flags & FL_FLOCK) ++ lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); ++ else ++ return -EINVAL; + if (lsp == NULL) + return -ENOMEM; + fl->fl_u.nfs4_fl.owner = lsp; +@@ -740,7 +790,7 @@ int nfs4_set_lock_state(struct nfs4_stat + * 
Byte-range lock aware utility to initialize the stateid of read/write + * requests. + */ +-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) ++void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) + { + struct nfs4_lock_state *lsp; + int seq; +@@ -753,7 +803,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst + return; + + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, fl_owner); ++ lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); + if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + spin_unlock(&state->state_lock); +@@ -1031,8 +1081,8 @@ restart: + * Open state on this file cannot be recovered + * All we can do is revert to using the zero stateid. + */ +- memset(state->stateid.data, 0, +- sizeof(state->stateid.data)); ++ memset(state->stateid.u.data, 0, ++ sizeof(state->stateid.u.data)); + /* Mark the file as being 'closed' */ + state->state = 0; + break; +@@ -1041,11 +1091,11 @@ restart: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: +@@ -1120,8 +1170,7 @@ static void nfs4_state_end_reclaim_reboo + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return; + +- nfs4_reclaim_complete(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); + + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { + sp = rb_entry(pos, struct 
nfs4_state_owner, so_client_node); +@@ -1211,8 +1260,8 @@ restart: + static int nfs4_check_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_maintenance_ops *ops = +- nfs4_state_renewal_ops[clp->cl_minorversion]; ++ const struct nfs4_state_maintenance_ops *ops = ++ clp->cl_mvops->state_renewal_ops; + int status = -NFS4ERR_EXPIRED; + + /* Is the client already known to have an expired lease? */ +@@ -1235,8 +1284,8 @@ out: + static int nfs4_reclaim_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_recovery_ops *ops = +- nfs4_reboot_recovery_ops[clp->cl_minorversion]; ++ const struct nfs4_state_recovery_ops *ops = ++ clp->cl_mvops->reboot_recovery_ops; + int status = -ENOENT; + + cred = ops->get_clid_cred(clp); +@@ -1421,6 +1470,7 @@ static void nfs4_state_manager(struct nf + } + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); ++ pnfs_destroy_all_layouts(clp); + } + + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { +@@ -1444,7 +1494,7 @@ static void nfs4_state_manager(struct nf + /* First recover reboot state... */ + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->reboot_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + continue; +@@ -1458,7 +1508,7 @@ static void nfs4_state_manager(struct nf + /* Now recover expired state... 
*/ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_nograce_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->nograce_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || + test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) +diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-31 20:41:19.160150207 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-31 20:42:05.530092192 -0400 +@@ -50,8 +50,10 @@ + #include + #include + #include ++#include + #include "nfs4_fs.h" + #include "internal.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_XDR + +@@ -89,7 +91,7 @@ static int nfs4_stat_to_errno(int); + #define encode_getfh_maxsz (op_encode_hdr_maxsz) + #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +-#define nfs4_fattr_bitmap_maxsz 3 ++#define nfs4_fattr_bitmap_maxsz 4 + #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) + #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +@@ -111,7 +113,11 @@ static int nfs4_stat_to_errno(int); + #define encode_restorefh_maxsz (op_encode_hdr_maxsz) + #define decode_restorefh_maxsz (op_decode_hdr_maxsz) + #define encode_fsinfo_maxsz (encode_getattr_maxsz) +-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) ++/* The 5 accounts for the PNFS attributes, and assumes that at most three ++ * layout types will be returned. 
++ */ ++#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ ++ nfs4_fattr_bitmap_maxsz + 8 + 5) + #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) + #define decode_renew_maxsz (op_decode_hdr_maxsz) + #define encode_setclientid_maxsz \ +@@ -202,14 +208,17 @@ static int nfs4_stat_to_errno(int); + #define encode_link_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) + #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) ++#define encode_lockowner_maxsz (7) + #define encode_lock_maxsz (op_encode_hdr_maxsz + \ + 7 + \ +- 1 + encode_stateid_maxsz + 8) ++ 1 + encode_stateid_maxsz + 1 + \ ++ encode_lockowner_maxsz) + #define decode_lock_denied_maxsz \ + (8 + decode_lockowner_maxsz) + #define decode_lock_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +-#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) ++#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ ++ encode_lockowner_maxsz) + #define decode_lockt_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) + #define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ +@@ -217,6 +226,11 @@ static int nfs4_stat_to_errno(int); + 4) + #define decode_locku_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) ++#define encode_release_lockowner_maxsz \ ++ (op_encode_hdr_maxsz + \ ++ encode_lockowner_maxsz) ++#define decode_release_lockowner_maxsz \ ++ (op_decode_hdr_maxsz) + #define encode_access_maxsz (op_encode_hdr_maxsz + 1) + #define decode_access_maxsz (op_decode_hdr_maxsz + 2) + #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ +@@ -302,6 +316,35 @@ static int nfs4_stat_to_errno(int); + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) + #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) + #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) ++#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ ++ encode_verifier_maxsz) ++#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + \ ++ decode_verifier_maxsz + \ ++ 
XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ ++ NFS4_PNFS_DEVICEID4_SIZE)) ++#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ ++ XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE)) ++#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ ++ 4 /*layout type */ + \ ++ 4 /* opaque devaddr4 length */ +\ ++ 4 /* notification bitmap length */ + \ ++ 4 /* notification bitmap */) ++#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ ++ encode_stateid_maxsz) ++#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ ++ decode_stateid_maxsz + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) ++#define encode_layoutcommit_maxsz (18 + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ ++ op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz) ++#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz) ++#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz + \ ++ 1 /* FIXME: opaque lrf_body always empty at ++ *the moment */) ++#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ ++ 1 + decode_stateid_maxsz) + #else /* CONFIG_NFS_V4_1 */ + #define encode_sequence_maxsz 0 + #define decode_sequence_maxsz 0 +@@ -471,6 +514,12 @@ static int nfs4_stat_to_errno(int); + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_locku_maxsz) ++#define NFS4_enc_release_lockowner_sz \ ++ (compound_encode_hdr_maxsz + \ ++ encode_lockowner_maxsz) ++#define NFS4_dec_release_lockowner_sz \ ++ (compound_decode_hdr_maxsz + \ ++ decode_lockowner_maxsz) + #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ +@@ -685,6 +734,60 @@ static int nfs4_stat_to_errno(int); + #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) ++#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_getdevicelist_maxsz) ++#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + 
\ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_getdevicelist_maxsz) ++#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_getdeviceinfo_maxsz) ++#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_getdeviceinfo_maxsz) ++#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutget_maxsz) ++#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutget_maxsz) ++#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_layoutcommit_maxsz + \ ++ encode_getattr_maxsz) ++#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutcommit_maxsz + \ ++ decode_getattr_maxsz) ++#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutreturn_maxsz) ++#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutreturn_maxsz) ++#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_write_maxsz) ++#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_write_maxsz) ++#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_commit_maxsz) ++#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_commit_maxsz) + + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + +@@ -915,7 +1018,7 @@ static void encode_close(struct xdr_stre + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); 
+ *p++ = cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); +- xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_close_maxsz; + } +@@ -989,6 +1092,35 @@ static void encode_getattr_two(struct xd + hdr->replen += decode_getattr_maxsz; + } + ++static void ++encode_getattr_three(struct xdr_stream *xdr, ++ uint32_t bm0, uint32_t bm1, uint32_t bm2, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_GETATTR); ++ if (bm2) { ++ p = reserve_space(xdr, 16); ++ *p++ = cpu_to_be32(3); ++ *p++ = cpu_to_be32(bm0); ++ *p++ = cpu_to_be32(bm1); ++ *p = cpu_to_be32(bm2); ++ } else if (bm1) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(2); ++ *p++ = cpu_to_be32(bm0); ++ *p = cpu_to_be32(bm1); ++ } else { ++ p = reserve_space(xdr, 8); ++ *p++ = cpu_to_be32(1); ++ *p = cpu_to_be32(bm0); ++ } ++ hdr->nops++; ++ hdr->replen += decode_getattr_maxsz; ++} ++ + static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { + encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], +@@ -997,8 +1129,11 @@ static void encode_getfattr(struct xdr_s + + static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { +- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], +- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); ++ encode_getattr_three(xdr, ++ bitmask[0] & nfs4_fsinfo_bitmap[0], ++ bitmask[1] & nfs4_fsinfo_bitmap[1], ++ bitmask[2] & nfs4_fsinfo_bitmap[2], ++ hdr); + } + + static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +@@ -1042,6 +1177,17 @@ static inline uint64_t nfs4_lock_length( + return fl->fl_end - fl->fl_start + 1; + } + ++static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) ++{ ++ __be32 *p; ++ ++ p = 
reserve_space(xdr, 28); ++ p = xdr_encode_hyper(p, lowner->clientid); ++ *p++ = cpu_to_be32(16); ++ p = xdr_encode_opaque_fixed(p, "lock id:", 8); ++ xdr_encode_hyper(p, lowner->id); ++} ++ + /* + * opcode,type,reclaim,offset,length,new_lock_owner = 32 + * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 +@@ -1058,18 +1204,16 @@ static void encode_lock(struct xdr_strea + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); + if (args->new_lock_owner){ +- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); ++ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->open_stateid->u.data, ++ NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + } + else { + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); +- p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->lock_stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); + } + hdr->nops++; +@@ -1080,15 +1224,12 @@ static void encode_lockt(struct xdr_stre + { + __be32 *p; + +- p = reserve_space(xdr, 52); ++ p = reserve_space(xdr, 24); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + hdr->nops++; + 
hdr->replen += decode_lockt_maxsz; + } +@@ -1101,13 +1242,25 @@ static void encode_locku(struct xdr_stre + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->stateid->u.data, ++ NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + hdr->nops++; + hdr->replen += decode_locku_maxsz; + } + ++static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); ++ encode_lockowner(xdr, lowner); ++ hdr->nops++; ++ hdr->replen += decode_release_lockowner_maxsz; ++} ++ + static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) + { + int len = name->len; +@@ -1172,7 +1325,7 @@ static inline void encode_createmode(str + break; + default: + clp = arg->server->nfs_client; +- if (clp->cl_minorversion > 0) { ++ if (clp->cl_mvops->minor_version > 0) { + if (nfs4_has_persistent_session(clp)) { + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); +@@ -1251,7 +1404,7 @@ static inline void encode_claim_delegate + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + encode_string(xdr, name->len, name->name); + } + +@@ -1282,7 +1435,7 @@ static void encode_open_confirm(struct x + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = 
cpu_to_be32(arg->seqid->sequence->counter); + hdr->nops++; + hdr->replen += decode_open_confirm_maxsz; +@@ -1294,7 +1447,7 @@ static void encode_open_downgrade(struct + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_share_access(xdr, arg->fmode); + hdr->nops++; +@@ -1324,17 +1477,17 @@ static void encode_putrootfh(struct xdr_ + hdr->replen += decode_putrootfh_maxsz; + } + +-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) ++static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) + { + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, NFS4_STATEID_SIZE); + if (ctx->state != NULL) { +- nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); +- xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); ++ nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); ++ xdr_encode_opaque_fixed(p, stateid.u.data, NFS4_STATEID_SIZE); + } else +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + } + + static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +@@ -1344,7 +1497,7 @@ static void encode_read(struct xdr_strea + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); +@@ -1448,7 +1601,7 @@ encode_setacl(struct xdr_stream *xdr, st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 
++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); +@@ -1479,7 +1632,7 @@ static void encode_setattr(struct xdr_st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid.u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_setattr_maxsz; + encode_attrs(xdr, arg->iap, server); +@@ -1523,7 +1676,7 @@ static void encode_write(struct xdr_stre + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); +@@ -1542,7 +1695,7 @@ static void encode_delegreturn(struct xd + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + + *p++ = cpu_to_be32(OP_DELEGRETURN); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_delegreturn_maxsz; + } +@@ -1696,6 +1849,162 @@ static void encode_sequence(struct xdr_s + #endif /* CONFIG_NFS_V4_1 */ + } + ++#ifdef CONFIG_NFS_V4_1 ++static void ++encode_getdevicelist(struct xdr_stream *xdr, ++ const struct nfs4_getdevicelist_args *args, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ nfs4_verifier dummy = { ++ .data = "dummmmmy", ++ }; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_GETDEVICELIST); ++ *p++ = cpu_to_be32(args->layoutclass); ++ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); ++ xdr_encode_hyper(p, 0ULL); /* cookie */ ++ encode_nfs4_verifier(xdr, &dummy); ++ hdr->nops++; ++} ++ ++static void ++encode_getdeviceinfo(struct xdr_stream *xdr, ++ const struct nfs4_getdeviceinfo_args *args, ++ struct compound_hdr *hdr) ++{ ++ int has_bitmap = (args->pdev->dev_notify_types != 0); ++ 
int len = 16 + NFS4_PNFS_DEVICEID4_SIZE + (has_bitmap * 4); ++ __be32 *p; ++ ++ p = reserve_space(xdr, len); ++ *p++ = cpu_to_be32(OP_GETDEVICEINFO); ++ p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ *p++ = cpu_to_be32(args->pdev->layout_type); ++ *p++ = cpu_to_be32(args->pdev->pglen + len); /* gdia_maxcount */ ++ *p++ = cpu_to_be32(has_bitmap); /* bitmap length [01] */ ++ if (has_bitmap) ++ *p = cpu_to_be32(args->pdev->dev_notify_types); ++ hdr->nops++; ++} ++ ++static void ++encode_layoutget(struct xdr_stream *xdr, ++ const struct nfs4_layoutget_args *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(OP_LAYOUTGET); ++ *p++ = cpu_to_be32(0); /* Signal layout available */ ++ *p++ = cpu_to_be32(args->type); ++ *p++ = cpu_to_be32(args->range.iomode); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ p = xdr_encode_hyper(p, args->minlength); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, NFS4_STATEID_SIZE); ++ *p = cpu_to_be32(args->maxcount); ++ ++ dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", ++ __func__, ++ args->type, ++ args->range.iomode, ++ (unsigned long)args->range.offset, ++ (unsigned long)args->range.length, ++ args->maxcount); ++ hdr->nops++; ++ hdr->replen += decode_layoutget_maxsz; ++} ++ ++static int ++encode_layoutcommit(struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args, ++ struct compound_hdr *hdr) ++{ ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ __be32 *p; ++ ++ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__, ++ args->range.length, args->range.offset, args->lastbytewritten, ++ args->layout_type); ++ ++ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE); ++ *p++ = 
cpu_to_be32(OP_LAYOUTCOMMIT); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ *p++ = cpu_to_be32(0); /* reclaim */ ++ p = xdr_encode_opaque_fixed(p, args->stateid.u.data, NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(1); /* newoffset = TRUE */ ++ p = xdr_encode_hyper(p, args->lastbytewritten); ++ *p = cpu_to_be32(args->time_modify_changed != 0); ++ if (args->time_modify_changed) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(0); ++ *p++ = cpu_to_be32(args->time_modify.tv_sec); ++ *p = cpu_to_be32(args->time_modify.tv_nsec); ++ } ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(args->layout_type); ++ ++ if (ld_io_ops->encode_layoutcommit) { ++ ld_io_ops->encode_layoutcommit(NFS_I(args->inode)->layout, ++ xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ xdr_encode_opaque(p, NULL, 0); ++ } ++ ++ hdr->nops++; ++ hdr->replen += decode_layoutcommit_maxsz; ++ return 0; ++} ++ ++static void ++encode_layoutreturn(struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_LAYOUTRETURN); ++ *p++ = cpu_to_be32(args->reclaim); ++ *p++ = cpu_to_be32(args->layout_type); ++ *p++ = cpu_to_be32(args->range.iomode); ++ *p = cpu_to_be32(args->return_type); ++ if (args->return_type == RETURN_FILE) { ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ ++ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, ++ NFS4_STATEID_SIZE); ++ dprintk("%s: call %pF\n", __func__, ++ ld_io_ops->encode_layoutreturn); ++ if (ld_io_ops->encode_layoutreturn) { ++ ld_io_ops->encode_layoutreturn( ++ NFS_I(args->inode)->layout, xdr, args); ++ } 
else { ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(0); ++ } ++ } ++ hdr->nops++; ++ hdr->replen += decode_layoutreturn_maxsz; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" ENCODE ROUTINES. + */ +@@ -1704,7 +2013,7 @@ static u32 nfs4_xdr_minorversion(const s + { + #if defined(CONFIG_NFS_V4_1) + if (args->sa_session) +- return args->sa_session->clp->cl_minorversion; ++ return args->sa_session->clp->cl_mvops->minor_version; + #endif /* CONFIG_NFS_V4_1 */ + return 0; + } +@@ -2048,6 +2357,20 @@ static int nfs4_xdr_enc_locku(struct rpc + return 0; + } + ++static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = 0, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_release_lockowner(&xdr, &args->lock_owner, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ + /* + * Encode a READLINK request + */ +@@ -2330,7 +2653,7 @@ static int nfs4_xdr_enc_setclientid_conf + struct compound_hdr hdr = { + .nops = 0, + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2395,7 +2718,7 @@ static int nfs4_xdr_enc_exchange_id(stru + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2413,7 +2736,7 @@ static int nfs4_xdr_enc_create_session(s + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2431,7 +2754,7 @@ static int nfs4_xdr_enc_destroy_session( + { + struct 
xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = session->clp->cl_minorversion, ++ .minorversion = session->clp->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2469,7 +2792,7 @@ static int nfs4_xdr_enc_get_lease_time(s + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2499,6 +2822,159 @@ static int nfs4_xdr_enc_reclaim_complete + return 0; + } + ++/* ++ * Encode GETDEVICELIST request ++ */ ++static int ++nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_getdevicelist_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_getdevicelist(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode GETDEVICEINFO request ++ */ ++static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_getdeviceinfo_args *args) ++{ ++ struct xdr_stream xdr; ++ struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ int replen; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_getdeviceinfo(&xdr, args, &hdr); ++ ++ /* set up reply kvec. 
Subtract notification bitmap max size (8) ++ * so that notification bitmap is put in xdr_buf tail */ ++ replen = (RPC_REPHDRSIZE + auth->au_rslack + ++ NFS4_dec_getdeviceinfo_sz - 8) << 2; ++ xdr_inline_pages(&req->rq_rcv_buf, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", ++ __func__, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTGET request ++ */ ++static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutget_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutget(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTCOMMIT request ++ */ ++static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutcommit_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_layoutcommit(&xdr, args, &hdr); ++ encode_getfattr(&xdr, args->bitmask, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTRETURN request ++ */ ++static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutreturn_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ 
encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutreturn(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server WRITE request ++ */ ++static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_write(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server COMMIT request ++ */ ++static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_commit(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} + #endif /* CONFIG_NFS_V4_1 */ + + static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +@@ -2599,14 +3075,17 @@ static int decode_attr_bitmap(struct xdr + goto out_overflow; + bmlen = be32_to_cpup(p); + +- bitmap[0] = bitmap[1] = 0; ++ bitmap[0] = bitmap[1] = bitmap[2] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); +- if (bmlen > 1) +- bitmap[1] = be32_to_cpup(p); ++ if (bmlen > 1) { ++ bitmap[1] = be32_to_cpup(p++); ++ if (bmlen > 2) ++ bitmap[2] = be32_to_cpup(p); ++ } + } + return 0; + out_overflow: +@@ -2635,8 +3114,9 @@ static int decode_attr_supported(struct + decode_attr_bitmap(xdr, 
bitmask); + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else +- bitmask[0] = bitmask[1] = 0; +- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); ++ bitmask[0] = bitmask[1] = bitmask[2] = 0; ++ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, ++ bitmask[0], bitmask[1], bitmask[2]); + return 0; + } + +@@ -3565,7 +4045,7 @@ static int decode_opaque_fixed(struct xd + + static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) + { +- return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); ++ return decode_opaque_fixed(xdr, stateid->u.data, NFS4_STATEID_SIZE); + } + + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +@@ -3621,7 +4101,7 @@ out_overflow: + static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3647,7 +4127,7 @@ xdr_error: + static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3679,7 +4159,7 @@ xdr_error: + static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3705,7 +4185,7 @@ static int decode_getfattr(struct xdr_st + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}, ++ bitmap[3] = {0}, + type; + int status; + umode_t fmode = 0; +@@ -3824,24 +4304,101 @@ xdr_error: + return status; + } + +- +-static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * Decode potentially multiple layout types. 
Currently we only support ++ * one layout driver per file system. ++ */ ++static int decode_pnfs_list(struct xdr_stream *xdr, uint32_t *layoutclass) + { +- __be32 *savep; +- uint32_t attrlen, bitmap[2]; +- int status; ++ uint32_t *p; ++ int num; + +- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +- goto xdr_error; +- if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) +- goto xdr_error; +- if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) +- goto xdr_error; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ num = be32_to_cpup(p); + +- fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ /* pNFS is not supported by the underlying file system */ ++ if (num == 0) { ++ *layoutclass = 0; ++ return 0; ++ } + +- if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) +- goto xdr_error; ++ /* TODO: We will eventually support multiple layout drivers ? */ ++ if (num > 1) ++ printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " ++ "per filesystem not supported\n", __func__); ++ ++ /* Decode and set first layout type */ ++ p = xdr_inline_decode(xdr, num * 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ *layoutclass = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++/* ++ * The type of file system exported ++ */ ++static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *layoutclass) ++{ ++ int status = 0; ++ ++ dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); ++ if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) ++ return -EIO; ++ if (likely(bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES)) { ++ status = decode_pnfs_list(xdr, layoutclass); ++ bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; ++ } ++ return status; ++} ++ ++/* ++ * The prefered block size for layout directed io ++ */ ++static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *res) ++{ ++ __be32 *p; ++ ++ 
dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); ++ *res = 0; ++ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) { ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++ } ++ *res = be32_to_cpup(p); ++ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; ++ } ++ return 0; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ ++static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++{ ++ __be32 *savep; ++ uint32_t attrlen, bitmap[3]; ++ int status; ++ ++ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) ++ goto xdr_error; ++ ++ fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ ++ if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) ++ goto xdr_error; + if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0) + goto xdr_error; + if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0) +@@ -3850,6 +4407,14 @@ static int decode_fsinfo(struct xdr_stre + if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) + goto xdr_error; + fsinfo->wtpref = fsinfo->wtmax; ++#if defined(CONFIG_NFS_V4_1) ++ status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); ++ if (status) ++ goto xdr_error; ++ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); ++ if (status) ++ goto xdr_error; ++#endif /* CONFIG_NFS_V4_1 */ + + status = verify_attr_len(xdr, savep, attrlen); + xdr_error: +@@ -3973,6 +4538,11 @@ static int decode_locku(struct xdr_strea + return status; + } + ++static int decode_release_lockowner(struct xdr_stream *xdr) ++{ ++ return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); ++} ++ + static int decode_lookup(struct xdr_stream *xdr) + { + return decode_op_hdr(xdr, OP_LOOKUP); +@@ -4333,7 +4903,7 @@ static int decode_getacl(struct xdr_stre + { + __be32 *savep; + 
uint32_t attrlen, +- bitmap[2] = {0}; ++ bitmap[3] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + +@@ -4682,6 +5252,226 @@ out_overflow: + #endif /* CONFIG_NFS_V4_1 */ + } + ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * TODO: Need to handle case when EOF != true; ++ */ ++static int decode_getdevicelist(struct xdr_stream *xdr, ++ struct pnfs_devicelist *res) ++{ ++ __be32 *p; ++ int status, i; ++ struct nfs_writeverf verftemp; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICELIST); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 8 + 8 + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ ++ /* TODO: Skip cookie for now */ ++ p += 2; ++ ++ /* Read verifier */ ++ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); ++ ++ res->num_devs = be32_to_cpup(p); ++ ++ dprintk("%s: num_dev %d\n", __func__, res->num_devs); ++ ++ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) ++ return -NFS4ERR_REP_TOO_BIG; ++ ++ p = xdr_inline_decode(xdr, ++ res->num_devs * NFS4_PNFS_DEVICEID4_SIZE + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ for (i = 0; i < res->num_devs; i++) ++ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ res->eof = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_getdeviceinfo(struct xdr_stream *xdr, ++ struct pnfs_device *pdev) ++{ ++ __be32 *p; ++ uint32_t len, type; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICEINFO); ++ if (status) { ++ if (status == -ETOOSMALL) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->mincount = be32_to_cpup(p); ++ dprintk("%s: Min count too small. 
mincnt = %u\n", ++ __func__, pdev->mincount); ++ } ++ return status; ++ } ++ ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ type = be32_to_cpup(p++); ++ if (type != pdev->layout_type) { ++ dprintk("%s: layout mismatch req: %u pdev: %u\n", ++ __func__, pdev->layout_type, type); ++ return -EINVAL; ++ } ++ /* ++ * Get the length of the opaque device_addr4. xdr_read_pages places ++ * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) ++ * and places the remaining xdr data in xdr_buf->tail ++ */ ++ pdev->mincount = be32_to_cpup(p); ++ xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ ++ ++ /* At most one bitmap word */ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ len = be32_to_cpup(p); ++ if (len) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->dev_notify_types = be32_to_cpup(p); ++ } else ++ pdev->dev_notify_types = 0; ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, ++ struct nfs4_layoutget_res *res) ++{ ++ __be32 *p; ++ int status; ++ u32 layout_count, dummy; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTGET); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->return_on_close = be32_to_cpup(p++); ++ p = xdr_decode_opaque_fixed(p, res->stateid.u.data, NFS4_STATEID_SIZE); ++ layout_count = be32_to_cpup(p); ++ if (!layout_count) { ++ dprintk("%s: server responded with empty layout array\n", ++ __func__); ++ return -EINVAL; ++ } ++ ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ p = xdr_decode_hyper(p, &res->range.offset); ++ p = xdr_decode_hyper(p, &res->range.length); ++ res->range.iomode = be32_to_cpup(p++); ++ res->type = be32_to_cpup(p++); ++ ++ status = decode_opaque_inline(xdr, 
&res->layout.len, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ ++ dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", ++ __func__, ++ (unsigned long)res->range.offset, ++ (unsigned long)res->range.length, ++ res->range.iomode, ++ res->type, ++ res->layout.len); ++ ++ /* presumably, nfs4_proc_layoutget allocated a single page */ ++ if (res->layout.len > PAGE_SIZE) ++ return -ENOMEM; ++ memcpy(res->layout.buf, p, res->layout.len); ++ ++ /* FIXME: the whole layout array should be passed up to the pnfs ++ * client */ ++ if (layout_count > 1) { ++ dprintk("%s: server responded with %d layouts, dropping tail\n", ++ __func__, layout_count); ++ ++ while (--layout_count) { ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ status = decode_opaque_inline(xdr, &dummy, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ } ++ } ++ ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutreturn(struct xdr_stream *xdr, ++ struct nfs4_layoutreturn_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTRETURN); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->lrs_present = be32_to_cpup(p); ++ if (res->lrs_present) ++ status = decode_stateid(xdr, &res->stateid); ++ return status; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutcommit(struct xdr_stream *xdr, ++ struct rpc_rqst *req, ++ struct nfs4_layoutcommit_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->sizechanged = be32_to_cpup(p); ++ ++ if (res->sizechanged) { ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ xdr_decode_hyper(p, &res->newsize); ++ } ++ 
return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" DECODE ROUTINES. + */ +@@ -5259,6 +6049,19 @@ out: + return status; + } + ++static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (!status) ++ status = decode_release_lockowner(&xdr); ++ return status; ++} ++ + /* + * Decode READLINK response + */ +@@ -5696,6 +6499,186 @@ static int nfs4_xdr_dec_reclaim_complete + status = decode_reclaim_complete(&xdr, (void *)NULL); + return status; + } ++ ++/* ++ * Decode GETDEVICELIST response ++ */ ++static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_getdevicelist_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ dprintk("encoding getdevicelist!\n"); ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status != 0) ++ goto out; ++ status = decode_getdevicelist(&xdr, res->devlist); ++out: ++ return status; ++} ++ ++/* ++ * Decode GETDEVICEINFO response ++ */ ++static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_getdeviceinfo_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_getdeviceinfo(&xdr, res->pdev); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTGET response ++ */ ++static int 
nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_layoutget_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutget(&xdr, rqstp, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTRETURN response ++ */ ++static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_layoutreturn_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutreturn(&xdr, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTCOMMIT response ++ */ ++static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_layoutcommit_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutcommit(&xdr, rqstp, res); ++ if (status) ++ goto out; ++ decode_getfattr(&xdr, res->fattr, res->server, ++ !RPC_IS_ASYNC(rqstp->rq_task)); ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server WRITE response ++ */ ++static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream xdr; ++ 
struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_write(&xdr, res); ++ if (!status) ++ return res->count; ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server COMMIT response ++ */ ++static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_commit(&xdr, res); ++out: ++ return status; ++} + #endif /* CONFIG_NFS_V4_1 */ + + __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +@@ -5866,6 +6849,7 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), ++ PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + #if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), +@@ -5873,6 +6857,13 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), ++ PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), ++ PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), ++ PROC(LAYOUTGET, enc_layoutget, dec_layoutget), ++ PROC(LAYOUTCOMMIT, enc_layoutcommit, 
dec_layoutcommit), ++ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), ++ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite), ++ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit), + #endif /* CONFIG_NFS_V4_1 */ + }; + +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-31 20:42:05.532213157 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-31 20:42:05.532213157 -0400 +@@ -0,0 +1,11 @@ ++# ++# Makefile for the pNFS Objects Layout Driver kernel module ++# ++objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o ++ ++# ++# Panasas pNFS Layout Driver kernel module ++# ++panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o ++obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-31 20:42:05.533243491 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-31 20:42:05.534105468 -0400 +@@ -0,0 +1,1087 @@ ++/* ++ * objio_osd.c ++ * ++ * pNFS Objects layout implementation over open-osd initiator library ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++#define _LLU(x) ((unsigned long long)x) ++ ++enum { BIO_MAX_PAGES_KMALLOC = ++ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), ++}; ++ ++/* A per mountpoint struct currently for device cache */ ++struct objio_mount_type { ++ struct list_head dev_list; ++ spinlock_t dev_list_lock; ++}; ++ ++struct _dev_ent { ++ struct list_head list; ++ struct pnfs_deviceid d_id; ++ struct osd_dev *od; ++}; ++ ++static void _dev_list_remove_all(struct objio_mount_type *omt) ++{ ++ spin_lock(&omt->dev_list_lock); ++ ++ while (!list_empty(&omt->dev_list)) { ++ struct _dev_ent *de = list_entry(omt->dev_list.next, ++ struct _dev_ent, list); ++ ++ list_del_init(&de->list); ++ osduld_put_device(de->od); ++ kfree(de); ++ } ++ ++ spin_unlock(&omt->dev_list_lock); ++} ++ ++static struct osd_dev *___dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct list_head *le; ++ ++ list_for_each(le, &omt->dev_list) { ++ struct _dev_ent *de = list_entry(le, struct _dev_ent, list); ++ ++ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id))) ++ return de->od; ++ } ++ ++ return NULL; ++} ++ ++static struct osd_dev *_dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct osd_dev *od; ++ ++ spin_lock(&omt->dev_list_lock); ++ od = ___dev_list_find(omt, d_id); ++ spin_unlock(&omt->dev_list_lock); ++ return od; ++} ++ ++static int _dev_list_add(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id, struct osd_dev *od) ++{ ++ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL); ++ ++ if (!de) ++ return -ENOMEM; ++ ++ spin_lock(&omt->dev_list_lock); ++ ++ if (___dev_list_find(omt, d_id)) { ++ kfree(de); ++ goto out; ++ } ++ ++ de->d_id = *d_id; ++ de->od = od; ++ list_add(&de->list, &omt->dev_list); ++ ++out: ++ spin_unlock(&omt->dev_list_lock); ++ return 0; ++} ++ ++struct 
objio_segment { ++ struct pnfs_osd_layout *layout; ++ ++ unsigned mirrors_p1; ++ unsigned stripe_unit; ++ unsigned group_width; /* Data stripe_units without integrity comps */ ++ u64 group_depth; ++ unsigned group_count; ++ ++ unsigned num_comps; ++ /* variable length */ ++ struct osd_dev *ods[1]; ++}; ++ ++struct objio_state; ++typedef ssize_t (*objio_done_fn)(struct objio_state *ios); ++ ++struct objio_state { ++ /* Generic layer */ ++ struct objlayout_io_state ol_state; ++ ++ struct objio_segment *objio_seg; ++ ++ struct kref kref; ++ objio_done_fn done; ++ void *private; ++ ++ unsigned long length; ++ unsigned numdevs; /* Actually used devs in this IO */ ++ /* A per-device variable array of size numdevs */ ++ struct _objio_per_comp { ++ struct bio *bio; ++ struct osd_request *or; ++ unsigned long length; ++ u64 offset; ++ unsigned dev; ++ } per_dev[]; ++}; ++ ++/* Send and wait for a get_device_info of devices in the layout, ++ then look them up with the osd_initiator library */ ++static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg, unsigned comp) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ struct pnfs_osd_deviceaddr *deviceaddr; ++ struct pnfs_deviceid *d_id; ++ struct osd_dev *od; ++ struct osd_dev_info odi; ++ struct objio_mount_type *omt = PNFS_NFS_SERVER(pnfslay)->pnfs_ld_data; ++ int err; ++ ++ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id; ++ ++ od = _dev_list_find(omt, d_id); ++ if (od) ++ return od; ++ ++ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr); ++ if (unlikely(err)) { ++ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err); ++ return ERR_PTR(err); ++ } ++ ++ odi.systemid_len = deviceaddr->oda_systemid.len; ++ if (odi.systemid_len > sizeof(odi.systemid)) { ++ err = -EINVAL; ++ goto out; ++ } else if (odi.systemid_len) ++ memcpy(odi.systemid, deviceaddr->oda_systemid.data, ++ odi.systemid_len); ++ odi.osdname_len = deviceaddr->oda_osdname.len; 
++ odi.osdname = (u8 *)deviceaddr->oda_osdname.data; ++ ++ if (!odi.osdname_len && !odi.systemid_len) { ++ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", ++ __func__); ++ err = -ENODEV; ++ goto out; ++ } ++ ++ od = osduld_info_lookup(&odi); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ dprintk("%s: osduld_info_lookup => %d\n", __func__, err); ++ goto out; ++ } ++ ++ _dev_list_add(omt, d_id, od); ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ objlayout_put_deviceinfo(deviceaddr); ++ return err ? ERR_PTR(err) : od; ++} ++ ++static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ unsigned i, num_comps = layout->olo_num_comps; ++ int err; ++ ++ /* lookup all devices */ ++ for (i = 0; i < num_comps; i++) { ++ struct osd_dev *od; ++ ++ od = _device_lookup(pnfslay, objio_seg, i); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ goto out; ++ } ++ objio_seg->ods[i] = od; ++ } ++ objio_seg->num_comps = num_comps; ++ err = 0; ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ return err; ++} ++ ++static int _verify_data_map(struct pnfs_osd_layout *layout) ++{ ++ struct pnfs_osd_data_map *data_map = &layout->olo_map; ++ u64 stripe_length; ++ u32 group_width; ++ ++/* FIXME: Only raid0 for now. 
if not go through MDS */ ++ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { ++ printk(KERN_ERR "Only RAID_0 for now\n"); ++ return -ENOTSUPP; ++ } ++ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { ++ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", ++ data_map->odm_num_comps, data_map->odm_mirror_cnt); ++ return -EINVAL; ++ } ++ ++ if (data_map->odm_group_width) ++ group_width = data_map->odm_group_width; ++ else ++ group_width = data_map->odm_num_comps / ++ (data_map->odm_mirror_cnt + 1); ++ ++ stripe_length = (u64)data_map->odm_stripe_unit * group_width; ++ if (stripe_length >= (1ULL << 32)) { ++ printk(KERN_ERR "Total Stripe length(0x%llx)" ++ " >= 32bit is not supported\n", _LLU(stripe_length)); ++ return -ENOTSUPP; ++ } ++ ++ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { ++ printk(KERN_ERR "Stripe Unit(0x%llx)" ++ " must be Multples of PAGE_SIZE(0x%lx)\n", ++ _LLU(data_map->odm_stripe_unit), PAGE_SIZE); ++ return -ENOTSUPP; ++ } ++ ++ return 0; ++} ++ ++int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) ++{ ++ struct objio_segment *objio_seg; ++ int err; ++ ++ err = _verify_data_map(layout); ++ if (unlikely(err)) ++ return err; ++ ++ objio_seg = kzalloc(sizeof(*objio_seg) + ++ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]), ++ GFP_KERNEL); ++ if (!objio_seg) ++ return -ENOMEM; ++ ++ objio_seg->layout = layout; ++ err = objio_devices_lookup(pnfslay, objio_seg); ++ if (err) ++ goto free_seg; ++ ++ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1; ++ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit; ++ if (layout->olo_map.odm_group_width) { ++ objio_seg->group_width = layout->olo_map.odm_group_width; ++ objio_seg->group_depth = layout->olo_map.odm_group_depth; ++ objio_seg->group_count = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1 / ++ objio_seg->group_width; ++ } else { ++ 
objio_seg->group_width = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1; ++ objio_seg->group_depth = -1; ++ objio_seg->group_count = 1; ++ } ++ ++ *outp = objio_seg; ++ return 0; ++ ++free_seg: ++ dprintk("%s: Error: return %d\n", __func__, err); ++ kfree(objio_seg); ++ *outp = NULL; ++ return err; ++} ++ ++void objio_free_lseg(void *p) ++{ ++ struct objio_segment *objio_seg = p; ++ ++ kfree(objio_seg); ++} ++ ++int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct objio_segment *objio_seg = seg; ++ struct objio_state *ios; ++ const unsigned first_size = sizeof(*ios) + ++ objio_seg->num_comps * sizeof(ios->per_dev[0]); ++ const unsigned sec_size = objio_seg->num_comps * ++ sizeof(ios->ol_state.ioerrs[0]); ++ ++ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps); ++ ios = kzalloc(first_size + sec_size, GFP_KERNEL); ++ if (unlikely(!ios)) ++ return -ENOMEM; ++ ++ ios->objio_seg = objio_seg; ++ ios->ol_state.ioerrs = ((void *)ios) + first_size; ++ ios->ol_state.num_comps = objio_seg->num_comps; ++ ++ *outp = &ios->ol_state; ++ return 0; ++} ++ ++void objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ ++ kfree(ios); ++} ++ ++enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) ++{ ++ switch (oep) { ++ case OSD_ERR_PRI_NO_ERROR: ++ return (enum pnfs_osd_errno)0; ++ ++ case OSD_ERR_PRI_CLEAR_PAGES: ++ BUG_ON(1); ++ return 0; ++ ++ case OSD_ERR_PRI_RESOURCE: ++ return PNFS_OSD_ERR_RESOURCE; ++ case OSD_ERR_PRI_BAD_CRED: ++ return PNFS_OSD_ERR_BAD_CRED; ++ case OSD_ERR_PRI_NO_ACCESS: ++ return PNFS_OSD_ERR_NO_ACCESS; ++ case OSD_ERR_PRI_UNREACHABLE: ++ return PNFS_OSD_ERR_UNREACHABLE; ++ case OSD_ERR_PRI_NOT_FOUND: ++ return PNFS_OSD_ERR_NOT_FOUND; ++ case OSD_ERR_PRI_NO_SPACE: ++ return PNFS_OSD_ERR_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case OSD_ERR_PRI_EIO: ++ return PNFS_OSD_ERR_EIO; ++ 
} ++} ++ ++static void _clear_bio(struct bio *bio) ++{ ++ struct bio_vec *bv; ++ unsigned i; ++ ++ __bio_for_each_segment(bv, bio, i, 0) { ++ unsigned this_count = bv->bv_len; ++ ++ if (likely(PAGE_SIZE == this_count)) ++ clear_highpage(bv->bv_page); ++ else ++ zero_user(bv->bv_page, bv->bv_offset, this_count); ++ } ++} ++ ++static int _io_check(struct objio_state *ios, bool is_write) ++{ ++ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; ++ int lin_ret = 0; ++ int i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_sense_info osi; ++ struct osd_request *or = ios->per_dev[i].or; ++ int ret; ++ ++ if (!or) ++ continue; ++ ++ ret = osd_req_decode_sense(or, &osi); ++ if (likely(!ret)) ++ continue; ++ ++ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { ++ /* start read offset past end of file */ ++ BUG_ON(is_write); ++ _clear_bio(ios->per_dev[i].bio); ++ dprintk("%s: start read offset passed end of file " ++ "offset=0x%llx, length=0x%lx\n", __func__, ++ _LLU(ios->per_dev[i].offset), ++ ios->per_dev[i].length); ++ ++ continue; /* we recovered */ ++ } ++ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev, ++ osd_pri_2_pnfs_err(osi.osd_err_pri), ++ ios->per_dev[i].offset, ++ ios->per_dev[i].length, ++ is_write); ++ ++ if (osi.osd_err_pri >= oep) { ++ oep = osi.osd_err_pri; ++ lin_ret = ret; ++ } ++ } ++ ++ return lin_ret; ++} ++ ++/* ++ * Common IO state helpers. 
++ */ ++static void _io_free(struct objio_state *ios) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[i]; ++ ++ if (per_dev->or) { ++ osd_end_request(per_dev->or); ++ per_dev->or = NULL; ++ } ++ ++ if (per_dev->bio) { ++ bio_put(per_dev->bio); ++ per_dev->bio = NULL; ++ } ++ } ++} ++ ++struct osd_dev * _io_od(struct objio_state *ios, unsigned dev) ++{ ++ unsigned min_dev = ios->objio_seg->layout->olo_comps_index; ++ unsigned max_dev = min_dev + ios->ol_state.num_comps; ++ ++ BUG_ON(dev < min_dev || max_dev <= dev); ++ return ios->objio_seg->ods[dev - min_dev]; ++} ++ ++struct _striping_info { ++ u64 obj_offset; ++ u64 group_length; ++ u64 total_group_length; ++ u64 Major; ++ unsigned dev; ++ unsigned unit_off; ++}; ++ ++static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, ++ struct _striping_info *si) ++{ ++ u32 stripe_unit = ios->objio_seg->stripe_unit; ++ u32 group_width = ios->objio_seg->group_width; ++ u64 group_depth = ios->objio_seg->group_depth; ++ u32 U = stripe_unit * group_width; ++ ++ u64 T = U * group_depth; ++ u64 S = T * ios->objio_seg->group_count; ++ u64 M = div64_u64(file_offset, S); ++ ++ /* ++ G = (L - (M * S)) / T ++ H = (L - (M * S)) % T ++ */ ++ u64 LmodU = file_offset - M * S; ++ u32 G = div64_u64(LmodU, T); ++ u64 H = LmodU - G * T; ++ ++ u32 N = div_u64(H, U); ++ ++ div_u64_rem(file_offset, stripe_unit, &si->unit_off); ++ si->obj_offset = si->unit_off + (N * stripe_unit) + ++ (M * group_depth * stripe_unit); ++ ++ /* "H - (N * U)" is just "H % U" so it's bound to u32 */ ++ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; ++ si->dev *= ios->objio_seg->mirrors_p1; ++ ++ si->group_length = T - H; ++ si->total_group_length = T; ++ si->Major = M; ++} ++ ++static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, ++ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len) ++{ ++ unsigned pg = *cur_pg; ++ struct request_queue *q = ++ 
osd_request_queue(_io_od(ios, per_dev->dev)); ++ ++ per_dev->length += cur_len; ++ ++ if (per_dev->bio == NULL) { ++ unsigned stripes = ios->ol_state.num_comps / ++ ios->objio_seg->mirrors_p1; ++ unsigned pages_in_stripe = stripes * ++ (ios->objio_seg->stripe_unit / PAGE_SIZE); ++ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / ++ stripes; ++ ++ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); ++ if (unlikely(!per_dev->bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", bio_size); ++ return -ENOMEM; ++ } ++ } ++ ++ while (cur_len > 0) { ++ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); ++ unsigned added_len; ++ ++ BUG_ON(ios->ol_state.nr_pages <= pg); ++ cur_len -= pglen; ++ ++ added_len = bio_add_pc_page(q, per_dev->bio, ++ ios->ol_state.pages[pg], pglen, pgbase); ++ if (unlikely(pglen != added_len)) ++ return -ENOMEM; ++ pgbase = 0; ++ ++pg; ++ } ++ BUG_ON(cur_len); ++ ++ *cur_pg = pg; ++ return 0; ++} ++ ++static int _prepare_one_group(struct objio_state *ios, u64 length, ++ struct _striping_info *si, unsigned first_comp, ++ unsigned *last_pg) ++{ ++ unsigned stripe_unit = ios->objio_seg->stripe_unit; ++ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1; ++ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1; ++ unsigned dev = si->dev; ++ unsigned first_dev = dev - (dev % devs_in_group); ++ unsigned comp = first_comp + (dev - first_dev); ++ unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; ++ unsigned cur_pg = *last_pg; ++ int ret = 0; ++ ++ while (length) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[comp]; ++ unsigned cur_len, page_off = 0; ++ ++ if (!per_dev->length) { ++ per_dev->dev = dev; ++ if (dev < si->dev) { ++ per_dev->offset = si->obj_offset + stripe_unit - ++ si->unit_off; ++ cur_len = stripe_unit; ++ } else if (dev == si->dev) { ++ per_dev->offset = si->obj_offset; ++ cur_len = stripe_unit - si->unit_off; ++ page_off = si->unit_off & ~PAGE_MASK; ++ BUG_ON(page_off && ++ (page_off != ios->ol_state.pgbase)); ++ } else { /* dev > si->dev */ ++ per_dev->offset = si->obj_offset - si->unit_off; ++ cur_len = stripe_unit; ++ } ++ ++ if (max_comp < comp) ++ max_comp = comp; ++ ++ dev += mirrors_p1; ++ dev = (dev % devs_in_group) + first_dev; ++ } else { ++ cur_len = stripe_unit; ++ } ++ if (cur_len >= length) ++ cur_len = length; ++ ++ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, ++ cur_len); ++ if (unlikely(ret)) ++ goto out; ++ ++ comp += mirrors_p1; ++ comp = (comp % devs_in_group) + first_comp; ++ ++ length -= cur_len; ++ ios->length += cur_len; ++ } ++out: ++ ios->numdevs = max_comp + mirrors_p1; ++ *last_pg = cur_pg; ++ return ret; ++} ++ ++static int _io_rw_pagelist(struct objio_state *ios) ++{ ++ u64 length = ios->ol_state.count; ++ struct _striping_info si; ++ unsigned devs_in_group = ios->objio_seg->group_width * ++ ios->objio_seg->mirrors_p1; ++ unsigned first_comp = 0; ++ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps; ++ unsigned last_pg = 0; ++ int ret = 0; ++ ++ _calc_stripe_info(ios, ios->ol_state.offset, &si); ++ while (length) { ++ if (length < si.group_length) ++ si.group_length = length; ++ ++ ret = _prepare_one_group(ios, si.group_length, &si, first_comp, ++ &last_pg); ++ if (unlikely(ret)) ++ goto out; ++ ++ length -= si.group_length; ++ ++ si.group_length = si.total_group_length; ++ si.unit_off = 0; ++ ++si.Major; ++ si.obj_offset = si.Major * 
ios->objio_seg->stripe_unit * ++ ios->objio_seg->group_depth; ++ ++ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; ++ si.dev %= num_comps; ++ ++ first_comp += devs_in_group; ++ first_comp %= num_comps; ++ } ++ ++out: ++ if (!ios->length) ++ return ret; ++ ++ return 0; ++} ++ ++static ssize_t _sync_done(struct objio_state *ios) ++{ ++ struct completion *waiting = ios->private; ++ ++ complete(waiting); ++ return 0; ++} ++ ++static void _last_io(struct kref *kref) ++{ ++ struct objio_state *ios = container_of(kref, struct objio_state, kref); ++ ++ ios->done(ios); ++} ++ ++static void _done_io(struct osd_request *or, void *p) ++{ ++ struct objio_state *ios = p; ++ ++ kref_put(&ios->kref, _last_io); ++} ++ ++static ssize_t _io_exec(struct objio_state *ios) ++{ ++ DECLARE_COMPLETION_ONSTACK(wait); ++ ssize_t status = 0; /* sync status */ ++ unsigned i; ++ objio_done_fn saved_done_fn = ios->done; ++ bool sync = ios->ol_state.sync; ++ ++ if (sync) { ++ ios->done = _sync_done; ++ ios->private = &wait; ++ } ++ ++ kref_init(&ios->kref); ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_request *or = ios->per_dev[i].or; ++ ++ if (!or) ++ continue; ++ ++ kref_get(&ios->kref); ++ osd_execute_request_async(or, _done_io, ios); ++ } ++ ++ kref_put(&ios->kref, _last_io); ++ ++ if (sync) { ++ wait_for_completion(&wait); ++ status = saved_done_fn(ios); ++ } ++ ++ return status; ++} ++ ++/* ++ * read ++ */ ++static ssize_t _read_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, false); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) ++ status = ios->length; ++ else ++ status = ret; ++ ++ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct osd_request *or = NULL; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = per_dev->dev; ++ struct pnfs_osd_object_cred *cred = ++ 
&ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ int ret; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ ++err: ++ return ret; ++} ++ ++static ssize_t _read_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = _read_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _read_done; ++ return _io_exec(ios); /* In sync mode exec returns the io status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _read_exec(ios); ++} ++ ++/* ++ * write ++ */ ++static ssize_t _write_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, true); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) { ++ /* FIXME: should be based on the OSD's persistence model ++ * See OSD2r05 Section 4.13 Data persistence model */ ++ ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC; ++ status = ios->length; ++ } else { ++ status = ret; ++ } ++ ++ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int 
_write_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = ios->per_dev[cur_comp].dev; ++ unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1; ++ int ret; ++ ++ for (; cur_comp < last_comp; ++cur_comp, ++dev) { ++ struct osd_request *or = NULL; ++ struct pnfs_osd_object_cred *cred = ++ &ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ struct bio *bio; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ if (per_dev != master_dev) { ++ bio = bio_kmalloc(GFP_KERNEL, ++ master_dev->bio->bi_max_vecs); ++ if (unlikely(!bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", ++ master_dev->bio->bi_max_vecs); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ __bio_clone(bio, master_dev->bio); ++ bio->bi_bdev = NULL; ++ bio->bi_next = NULL; ++ per_dev->bio = bio; ++ per_dev->dev = dev; ++ per_dev->length = master_dev->length; ++ per_dev->offset = master_dev->offset; ++ } else { ++ bio = master_dev->bio; ++ /* FIXME: bio_set_dir() */ ++ bio->bi_rw |= (1 << BIO_RW); ++ } ++ ++ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ } ++ ++err: ++ return ret; ++} ++ ++static ssize_t _write_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = 
_write_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _write_done; ++ return _io_exec(ios); /* In sync mode exec returns the io->status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ /* TODO: ios->stable = stable; */ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _write_exec(ios); ++} ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++objlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zx\n", __func__, maxsz); ++ return maxsz; ++} ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++objlayout_get_blocksize(void) ++{ ++ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE; ++ ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations objlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. 
++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = objlayout_get_stripesize, ++ .get_blocksize = objlayout_get_blocksize, ++}; ++ ++static struct pnfs_layoutdriver_type objlayout_type = { ++ .id = LAYOUT_OSD2_OBJECTS, ++ .name = "LAYOUT_OSD2_OBJECTS", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &objlayout_policy_operations, ++}; ++ ++void *objio_init_mt(void) ++{ ++ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL); ++ ++ if (!omt) ++ return ERR_PTR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&omt->dev_list); ++ spin_lock_init(&omt->dev_list_lock); ++ return omt; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++ _dev_list_remove_all(mountid); ++ kfree(mountid); ++} ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++objlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++objlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", ++ __func__); ++} ++ ++module_init(objlayout_init); ++module_exit(objlayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-31 20:42:05.535059115 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-31 20:42:05.535059115 -0400 +@@ -0,0 +1,790 @@ ++/* ++ * objlayout.c ++ * ++ * pNFS layout driver for Panasas OSDs ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. 
++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pnfs_client_operations *pnfs_client_ops; ++ ++/* ++ * Create a objlayout layout structure for the given inode and return it. 
++ */ ++static struct pnfs_layout_hdr * ++objlayout_alloc_layout(struct inode *inode) ++{ ++ struct objlayout *objlay; ++ ++ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL); ++ if (objlay) { ++ spin_lock_init(&objlay->lock); ++ INIT_LIST_HEAD(&objlay->err_list); ++ } ++ dprintk("%s: Return %p\n", __func__, objlay); ++ return objlay ? &objlay->pnfs_layout : NULL; /* NULL on allocation failure */ ++} ++ ++/* ++ * Free an objlayout layout structure ++ */ ++static void ++objlayout_free_layout(struct pnfs_layout_hdr *lo) ++{ ++ struct objlayout *objlay = OBJLAYOUT(lo); ++ ++ dprintk("%s: objlay %p\n", __func__, objlay); ++ ++ WARN_ON(!list_empty(&objlay->err_list)); ++ kfree(objlay); ++} ++ ++/* ++ * Unmarshall layout and store it in pnfslay. ++ */ ++static struct pnfs_layout_segment * ++objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, ++ struct nfs4_layoutget_res *lgr) ++{ ++ int status; ++ void *layout = lgr->layout.buf; ++ struct pnfs_layout_segment *lseg; ++ struct objlayout_segment *objlseg; ++ struct pnfs_osd_layout *pnfs_osd_layout; ++ ++ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout); ++ ++ BUG_ON(!layout); ++ ++ status = -ENOMEM; ++ lseg = kzalloc(sizeof(*lseg) + sizeof(*objlseg) + ++ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL); ++ if (!lseg) ++ goto err; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout); ++ ++ status = objio_alloc_lseg(&objlseg->internal, pnfslay, lseg, ++ pnfs_osd_layout); ++ if (status) ++ goto err; ++ ++ dprintk("%s: Return %p\n", __func__, lseg); ++ return lseg; ++ ++ err: ++ kfree(lseg); ++ return ERR_PTR(status); ++} ++ ++/* ++ * Free a layout segment ++ */ ++static void ++objlayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ struct objlayout_segment *objlseg; ++ ++ dprintk("%s: freeing layout segment %p\n", __func__, lseg); ++ ++ if (unlikely(!lseg)) ++ return; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++
objio_free_lseg(objlseg->internal); ++ kfree(lseg); ++} ++ ++/* ++ * I/O Operations ++ */ ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1 : NFS4_MAX_UINT64; ++} ++ ++static struct objlayout_io_state * ++objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ++ struct page **pages, ++ unsigned pgbase, ++ unsigned nr_pages, ++ loff_t offset, ++ size_t count, ++ struct pnfs_layout_segment *lseg, ++ void *rpcdata) ++{ ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct objlayout_io_state *state; ++ u64 lseg_end_offset; ++ size_t size_nr_pages; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ if (objio_alloc_io_state(objlseg->internal, &state)) ++ return NULL; ++ ++ BUG_ON(offset < lseg->range.offset); ++ lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length); ++ BUG_ON(offset >= lseg_end_offset); ++ if (offset + count > lseg_end_offset) { ++ count = lseg->range.length - (offset - lseg->range.offset); ++ dprintk("%s: truncated count %Zd\n", __func__, count); ++ } ++ ++ if (pgbase > PAGE_SIZE) { ++ unsigned n = pgbase >> PAGE_SHIFT; ++ ++ pgbase &= ~PAGE_MASK; ++ pages += n; ++ nr_pages -= n; ++ } ++ ++ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ BUG_ON(nr_pages < size_nr_pages); ++ if (nr_pages > size_nr_pages) ++ nr_pages = size_nr_pages; ++ ++ INIT_LIST_HEAD(&state->err_list); ++ state->lseg = lseg; ++ state->rpcdata = rpcdata; ++ state->pages = pages; ++ state->pgbase = pgbase; ++ state->nr_pages = nr_pages; ++ state->offset = offset; ++ state->count = count; ++ state->sync = 0; ++ ++ return state; ++} ++ ++static void ++objlayout_free_io_state(struct objlayout_io_state *state) ++{ ++ dprintk("%s: freeing io_state\n", 
__func__); ++ if (unlikely(!state)) ++ return; ++ ++ objio_free_io_state(state); ++} ++ ++/* ++ * I/O done: free @state on success, else queue it on objlay->err_list ++ */ ++static void ++objlayout_iodone(struct objlayout_io_state *state) ++{ ++ dprintk("%s: state %p status\n", __func__, state); ++ ++ if (likely(state->status >= 0)) { ++ objlayout_free_io_state(state); ++ } else { ++ struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); ++ ++ spin_lock(&objlay->lock); ++ objlay->delta_space_valid = OBJ_DSU_INVALID; ++ list_add(&state->err_list, &objlay->err_list); /* (new entry, list head) */ ++ spin_unlock(&objlay->lock); ++ } ++} ++ ++/* ++ * objlayout_io_set_result - Set an osd_error code on a specific osd comp. ++ * ++ * The @index component IO failed (error returned from target). Register ++ * the error for later reporting at layout-return. ++ */ ++void ++objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, ++ int osd_error, u64 offset, u64 length, bool is_write) ++{ ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; ++ ++ BUG_ON(index >= state->num_comps); ++ if (osd_error) { ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(state->lseg); ++ struct pnfs_osd_layout *layout = ++ (typeof(layout))objlseg->pnfs_osd_layout; ++ ++ ioerr->oer_component = layout->olo_comps[index].oc_object_id; ++ ioerr->oer_comp_offset = offset; ++ ioerr->oer_comp_length = length; ++ ioerr->oer_iswrite = is_write; ++ ioerr->oer_errno = osd_error; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " ++ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", ++ __func__, index, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ } else { ++ /* User need not call if no error is reported */ ++ ioerr->oer_errno = 0; ++ } ++} ++ ++static void _rpc_commit_complete(struct work_struct *work) ++{
++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_client_ops->nfs_commit_complete(wdata); ++} ++ ++/* ++ * Commit data remotely on OSDs ++ */ ++enum pnfs_try_status ++objlayout_commit(struct nfs_write_data *wdata, int how) ++{ ++ int status = PNFS_ATTEMPTED; ++ ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ dprintk("%s: Return %d\n", __func__, status); ++ return status; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). ++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_read_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++} ++ ++void ++objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) ++{ ++ int eof = state->eof; ++ struct nfs_read_data *rdata; ++ ++ state->status = status; ++ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); ++ rdata = state->rpcdata; ++ rdata->task.tk_status = status; ++ if (status >= 0) { ++ rdata->res.count = status; ++ rdata->res.eof = eof; ++ } ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++ else { ++ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); ++ schedule_work(&rdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async reads. 
++ */ ++enum pnfs_try_status ++objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages) ++{ ++ loff_t offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct objlayout_io_state *state; ++ ssize_t status = 0; ++ loff_t eof; ++ ++ dprintk("%s: Begin inode %p offset %llu count %d\n", ++ __func__, rdata->inode, offset, (int)count); ++ ++ eof = i_size_read(rdata->inode); ++ if (unlikely(offset + count > eof)) { ++ if (offset >= eof) { ++ status = 0; ++ rdata->res.count = 0; ++ rdata->res.eof = 1; ++ goto out; ++ } ++ count = eof - offset; ++ } ++ ++ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, ++ rdata->args.pages, rdata->args.pgbase, ++ nr_pages, offset, count, ++ rdata->pdata.lseg, rdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->eof = state->offset + state->count >= eof; ++ ++ status = objio_read_pagelist(state); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ rdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). 
++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_write_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_client_ops->nfs_writelist_complete(wdata); ++} ++ ++void ++objlayout_write_done(struct objlayout_io_state *state, ssize_t status, ++ bool sync) ++{ ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s: Begin\n", __func__); ++ wdata = state->rpcdata; ++ state->status = status; ++ wdata->task.tk_status = status; ++ if (status >= 0) { ++ wdata->res.count = status; ++ wdata->verf.committed = state->committed; ++ dprintk("%s: Return status %d committed %d\n", ++ __func__, wdata->task.tk_status, ++ wdata->verf.committed); ++ } else ++ dprintk("%s: Return status %d\n", ++ __func__, wdata->task.tk_status); ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_client_ops->nfs_writelist_complete(wdata); ++ else { ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async writes. 
++ */ ++enum pnfs_try_status ++objlayout_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int how) ++{ ++ struct objlayout_io_state *state; ++ ssize_t status; ++ ++ dprintk("%s: Begin inode %p offset %llu count %u\n", ++ __func__, wdata->inode, wdata->args.offset, wdata->args.count); ++ ++ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, ++ wdata->args.pages, ++ wdata->args.pgbase, ++ nr_pages, ++ wdata->args.offset, ++ wdata->args.count, ++ wdata->pdata.lseg, wdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->sync = how & FLUSH_SYNC; ++ ++ status = objio_write_pagelist(state, how & FLUSH_STABLE); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ wdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++void ++objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct pnfs_osd_layoutupdate lou; ++ __be32 *start; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ spin_lock(&objlay->lock); ++ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); ++ lou.dsu_delta = objlay->delta_space_used; ++ objlay->delta_space_used = 0; ++ objlay->delta_space_valid = OBJ_DSU_INIT; ++ lou.olu_ioerr_flag = !list_empty(&objlay->err_list); ++ spin_unlock(&objlay->lock); ++ ++ start = xdr_reserve_space(xdr, 4); ++ BUG_ON(!start); /* *start is written below; same check as layoutreturn */ ++ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ ++ dprintk("%s: Return delta_space_used %lld err %d\n", __func__, ++ lou.dsu_delta, lou.olu_ioerr_flag); ++} ++ ++static int ++err_prio(u32 oer_errno) ++{ ++ switch (oer_errno) { ++ case 0: ++ return 0; ++ ++ case PNFS_OSD_ERR_RESOURCE: ++ return OSD_ERR_PRI_RESOURCE; ++ case PNFS_OSD_ERR_BAD_CRED: ++ return OSD_ERR_PRI_BAD_CRED; ++ case PNFS_OSD_ERR_NO_ACCESS: ++ return OSD_ERR_PRI_NO_ACCESS; ++ case PNFS_OSD_ERR_UNREACHABLE:
++ return OSD_ERR_PRI_UNREACHABLE; ++ case PNFS_OSD_ERR_NOT_FOUND: ++ return OSD_ERR_PRI_NOT_FOUND; ++ case PNFS_OSD_ERR_NO_SPACE: ++ return OSD_ERR_PRI_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case PNFS_OSD_ERR_EIO: ++ return OSD_ERR_PRI_EIO; ++ } ++} ++ ++static void ++merge_ioerr(struct pnfs_osd_ioerr *dest_err, ++ const struct pnfs_osd_ioerr *src_err) ++{ ++ u64 dest_end, src_end; ++ ++ if (!dest_err->oer_errno) { ++ *dest_err = *src_err; ++ /* accumulated device must be blank */ ++ memset(&dest_err->oer_component.oid_device_id, 0, ++ sizeof(dest_err->oer_component.oid_device_id)); ++ ++ return; ++ } ++ ++ if (dest_err->oer_component.oid_partition_id != ++ src_err->oer_component.oid_partition_id) ++ dest_err->oer_component.oid_partition_id = 0; ++ ++ if (dest_err->oer_component.oid_object_id != ++ src_err->oer_component.oid_object_id) ++ dest_err->oer_component.oid_object_id = 0; ++ ++ if (dest_err->oer_comp_offset > src_err->oer_comp_offset) ++ dest_err->oer_comp_offset = src_err->oer_comp_offset; ++ ++ dest_end = end_offset(dest_err->oer_comp_offset, ++ dest_err->oer_comp_length); ++ src_end = end_offset(src_err->oer_comp_offset, ++ src_err->oer_comp_length); ++ if (dest_end < src_end) ++ dest_end = src_end; ++ ++ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; ++ ++ if ((src_err->oer_iswrite == dest_err->oer_iswrite) && ++ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { ++ dest_err->oer_errno = src_err->oer_errno; ++ } else if (src_err->oer_iswrite) { ++ dest_err->oer_iswrite = true; ++ dest_err->oer_errno = src_err->oer_errno; ++ } ++} ++ ++static void ++encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr) ++{ ++ struct objlayout_io_state *state, *tmp; ++ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ ++ for (i = 0; i < state->num_comps; i++) { ++ struct pnfs_osd_ioerr *ioerr = 
&state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ merge_ioerr(&accumulated_err, ioerr); ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++ ++ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err)); ++} ++ ++void ++objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct objlayout_io_state *state, *tmp; ++ __be32 *start, *uninitialized_var(last_xdr); ++ ++ dprintk("%s: Begin\n", __func__); ++ start = xdr_reserve_space(xdr, 4); ++ BUG_ON(!start); ++ ++ spin_lock(&objlay->lock); ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ int res = 0; ++ ++ for (i = 0; i < state->num_comps && !res; i++) { ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ last_xdr = xdr->p; ++ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]); ++ } ++ if (unlikely(res)) { ++ /* no space for even one error descriptor */ ++ BUG_ON(last_xdr == start + 1); ++ ++ /* we've 
encountered a situation with lots and lots of ++ * errors and no space to encode them all. Use the last ++ * available slot to report the union of all the ++ * remaining errors. ++ */ ++ xdr_rewind_stream(xdr, last_xdr - ++ pnfs_osd_ioerr_xdr_sz() / 4); ++ encode_accumulated_error(objlay, xdr); ++ goto loop_done; ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++loop_done: ++ spin_unlock(&objlay->lock); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ dprintk("%s: Return\n", __func__); ++} ++ ++struct objlayout_deviceinfo { ++ struct page *page; ++ struct pnfs_osd_deviceaddr da; /* This must be last */ ++}; ++ ++/* Initialize and call nfs_getdeviceinfo, then decode and return a ++ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() ++ * should be called. ++ */ ++int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi; ++ struct pnfs_device pd; ++ struct super_block *sb; ++ struct page *page; ++ size_t sz; ++ u32 *p; ++ int err; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ pd.area = page_address(page); ++ ++ memcpy(&pd.dev_id, d_id, sizeof(*d_id)); ++ pd.layout_type = LAYOUT_OSD2_OBJECTS; ++ pd.dev_notify_types = 0; ++ pd.pages = &page; ++ pd.pgbase = 0; ++ pd.pglen = PAGE_SIZE; ++ pd.mincount = 0; ++ ++ sb = PNFS_INODE(pnfslay)->i_sb; ++ err = pnfs_client_ops->nfs_getdeviceinfo(PNFS_NFS_SERVER(pnfslay), &pd); ++ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); ++ if (err) ++ goto err_out; ++ ++ p = pd.area; ++ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p); ++ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL); ++ if (!odi) { ++ err = -ENOMEM; ++ goto err_out; ++ } ++ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); ++ odi->page = page; ++ *deviceaddr = &odi->da; ++ return 0; ++ ++err_out: ++ __free_page(page); ++ return err; ++} ++ ++void 
objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi = container_of(deviceaddr, ++ struct objlayout_deviceinfo, ++ da); ++ ++ __free_page(odi->page); ++ kfree(odi); ++} ++ ++/* ++ * Initialize a mountpoint by retrieving the list of ++ * available devices for it. ++ * Return the pnfs_mount_type structure so the ++ * pNFS_client can refer to the mount point later on. ++ */ ++static int ++objlayout_initialize_mountpoint(struct nfs_server *server, ++ const struct nfs_fh *mntfh) ++{ ++ void *data; ++ ++ data = objio_init_mt(); ++ if (IS_ERR(data)) { ++ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n", ++ __func__, PTR_ERR(data)); ++ return PTR_ERR(data); ++ } ++ server->pnfs_ld_data = data; ++ ++ dprintk("%s: Return data=%p\n", __func__, data); ++ return 0; ++} ++ ++/* ++ * Uninitialize a mountpoint ++ */ ++static int ++objlayout_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data); ++ objio_fini_mt(server->pnfs_ld_data); ++ return 0; ++} ++ ++struct layoutdriver_io_operations objlayout_io_operations = { ++ .commit = objlayout_commit, ++ .read_pagelist = objlayout_read_pagelist, ++ .write_pagelist = objlayout_write_pagelist, ++ .alloc_layout = objlayout_alloc_layout, ++ .free_layout = objlayout_free_layout, ++ .alloc_lseg = objlayout_alloc_lseg, ++ .free_lseg = objlayout_free_lseg, ++ .encode_layoutcommit = objlayout_encode_layoutcommit, ++ .encode_layoutreturn = objlayout_encode_layoutreturn, ++ .initialize_mountpoint = objlayout_initialize_mountpoint, ++ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, ++}; +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-31 20:42:05.535059115 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-31 20:42:05.535059115 -0400 +@@ -0,0 +1,171 @@ ++/* ++ * 
objlayout.h ++ * ++ * Data types and function declarations for interfacing with the ++ * pNFS standard object layout driver. ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */ ++ ++#ifndef _OBJLAYOUT_H ++#define _OBJLAYOUT_H ++ ++#include ++#include ++#include ++ ++/* ++ * in-core layout segment ++ */ ++struct objlayout_segment { ++ void *internal; /* for provider internal use */ ++ u8 pnfs_osd_layout[]; ++}; ++ ++/* ++ * per-inode layout ++ */ ++struct objlayout { ++ struct pnfs_layout_hdr pnfs_layout; ++ ++ /* for layout_commit */ ++ enum osd_delta_space_valid_enum { ++ OBJ_DSU_INIT = 0, ++ OBJ_DSU_VALID, ++ OBJ_DSU_INVALID, ++ } delta_space_valid; ++ s64 delta_space_used; /* consumed by write ops */ ++ ++ /* for layout_return */ ++ spinlock_t lock; ++ struct list_head err_list; ++}; ++ ++static inline struct objlayout * ++OBJLAYOUT(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct objlayout, pnfs_layout); ++} ++ ++/* ++ * per-I/O operation state ++ * embedded in objects provider io_state data structure ++ */ ++struct objlayout_io_state { ++ struct pnfs_layout_segment *lseg; ++ ++ struct page **pages; ++ unsigned pgbase; ++ unsigned nr_pages; ++ unsigned long count; ++ loff_t offset; ++ bool sync; ++ ++ void *rpcdata; ++ int status; /* res */ ++ int eof; /* res */ ++ int committed; /* res */ ++ ++ /* Error reporting (layout_return) */ ++ struct list_head err_list; ++ unsigned num_comps; ++ /* Pointer to array of error descriptors of size num_comps. ++ * It should contain as many entries as devices in the osd_layout ++ * that participate in the I/O. It is up to the io_engine to allocate ++ * needed space and set num_comps. 
++ */ ++ struct pnfs_osd_ioerr *ioerrs; ++}; ++ ++/* ++ * Raid engine I/O API ++ */ ++extern void *objio_init_mt(void); ++extern void objio_fini_mt(void *mt); ++ ++extern int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout); ++extern void objio_free_lseg(void *p); ++ ++extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp); ++extern void objio_free_io_state(struct objlayout_io_state *state); ++ ++extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); ++extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable); ++ ++/* ++ * callback API ++ */ ++extern void objlayout_io_set_result(struct objlayout_io_state *state, ++ unsigned index, int osd_error, ++ u64 offset, u64 length, bool is_write); ++ ++static inline void ++objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) ++{ ++ struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); ++ ++ /* If one of the I/Os errored out and the delta_space_used was ++ * invalid we render the complete report as invalid. Protocol mandate ++ * the DSU be accurate or not reported. 
++ */ ++ spin_lock(&objlay->lock); ++ if (objlay->delta_space_valid != OBJ_DSU_INVALID) { ++ objlay->delta_space_valid = OBJ_DSU_VALID; ++ objlay->delta_space_used += space_used; ++ } ++ spin_unlock(&objlay->lock); ++} ++ ++extern void objlayout_read_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++extern void objlayout_write_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++ ++extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr); ++extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); ++ ++/* ++ * exported generic objects function vectors ++ */ ++extern struct layoutdriver_io_operations objlayout_io_operations; ++extern struct pnfs_client_operations *pnfs_client_ops; ++ ++#endif /* _OBJLAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-31 20:42:05.536110535 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-31 20:42:05.536110535 -0400 +@@ -0,0 +1,734 @@ ++/* ++ * panfs_shim.c ++ * ++ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "objlayout.h" ++#include "panfs_shim.h" ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct panfs_export_operations *panfs_export_ops; ++ ++void * ++objio_init_mt(void) ++{ ++ return panfs_export_ops == NULL ? 
ERR_PTR(-EAGAIN) : NULL; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++} ++ ++static int ++panfs_shim_conv_raid01(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) { ++ hdr->type = PAN_AGG_RAID1; ++ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1; ++ } else if (layout->olo_num_comps > 1) { ++ hdr->type = PAN_AGG_RAID0; ++ hdr->hdr.raid0.num_comps = layout->olo_num_comps; ++ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit; ++ } else ++ hdr->type = PAN_AGG_SIMPLE; ++ return 0; ++} ++ ++static int ++panfs_shim_conv_raid5(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) ++ goto err; ++ ++ if (lo_map->odm_group_width || lo_map->odm_group_depth) { ++ if (!lo_map->odm_group_width || !lo_map->odm_group_depth) ++ goto err; ++ ++ hdr->type = PAN_AGG_GRP_RAID5_LEFT; ++ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit; ++ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width; ++ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth; ++ /* this is a guess, panasas server is not supposed to ++ hand out layout otherwise */ ++ hdr->hdr.grp_raid5_left.group_layout_policy = ++ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN; ++ } else { ++ hdr->type = PAN_AGG_RAID5_LEFT; ++ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.raid5_left.stripe_unit2 = ++ hdr->hdr.raid5_left.stripe_unit1 = ++ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit; ++ } ++ ++ return 0; ++err: ++ return -EINVAL; ++} ++ ++/* ++ * Convert a pnfs_osd data map into Panasas aggregation layout header ++ */ ++static int ++panfs_shim_conv_pnfs_osd_data_map( ++ struct pnfs_osd_layout
*layout, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ int status = -EINVAL; ++ struct pnfs_osd_data_map *lo_map = &layout->olo_map; ++ ++ if (!layout->olo_num_comps) { ++ dprintk("%s: !!layout.n_comps(%u)\n", __func__, ++ layout->olo_num_comps); ++ goto err; ++ } ++ ++ switch (lo_map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_0 " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid01(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_5: ++ if (!lo_map->odm_group_width) { ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width " ++ "layout.n_comps(%u)!=map.n_comps(%u) " ++ "|| comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ } else if ((layout->olo_num_comps != lo_map->odm_num_comps && ++ layout->olo_num_comps > lo_map->odm_group_width) || ++ (layout->olo_comps_index % lo_map->odm_group_width)){ ++ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ lo_map->odm_group_width, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid5(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_PQ: ++ default: ++ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__, ++ lo_map->odm_raid_algorithm); ++ goto err; ++ } ++ ++ return status; /* 0, or -EINVAL from the RAID converter */ ++ ++err: ++ return status; ++} ++ ++/* ++ * Convert pnfs_osd layout into Panasas map and caps type ++ */ ++int ++objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout)
++{ ++ int i, total_comps; ++ int status; ++ struct pnfs_osd_object_cred *lo_comp; ++ pan_size_t alloc_sz, local_sz; ++ pan_sm_map_cap_t *mcs = NULL; ++ u8 *buf; ++ pan_agg_comp_obj_t *pan_comp; ++ pan_sm_sec_t *pan_sec; ++ ++ status = -EINVAL; ++ if (layout->olo_num_comps < layout->olo_map.odm_group_width) { ++ total_comps = layout->olo_comps_index + layout->olo_num_comps; ++ } else { ++ /* allocate full map, otherwise SAM gets confused */ ++ total_comps = layout->olo_map.odm_num_comps; ++ } ++ alloc_sz = total_comps * ++ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t)); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p = layout->olo_comps[i].oc_cap.cred; ++ if (panfs_export_ops->sm_sec_t_get_size_otw( ++ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL)) ++ goto err; ++ alloc_sz += local_sz; ++ } ++ ++ status = -ENOMEM; ++ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL); ++ if (!mcs) ++ goto err; ++ buf = (u8 *)&mcs[1]; ++ ++ mcs->offset = lseg->range.offset; ++ mcs->length = lseg->range.length; ++#if 0 ++ /* FIXME: for now */ ++ mcs->expiration_time.ts_sec = 0; ++ mcs->expiration_time.ts_nsec = 0; ++#endif ++ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL; ++ status = panfs_shim_conv_pnfs_osd_data_map(layout, ++ &mcs->full_map.layout_hdr); ++ if (status) ++ goto err; ++ ++ mcs->full_map.components.size = total_comps; ++ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf; ++ buf += total_comps * sizeof(pan_agg_comp_obj_t); ++ ++ mcs->secs.size = total_comps; ++ mcs->secs.data = (pan_sm_sec_t *)buf; ++ buf += total_comps * sizeof(pan_sm_sec_t); ++ ++ lo_comp = layout->olo_comps; ++ pan_comp = mcs->full_map.components.data + layout->olo_comps_index; ++ pan_sec = mcs->secs.data + layout->olo_comps_index; ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p; ++ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id; ++ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id; ++ u64 dev_id = __be64_to_cpup( ++ (__be64 
*)oc_obj_id->oid_device_id.data + 1); ++ ++ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n", ++ __func__, i, ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data), ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1), ++ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id); ++ ++ if (i == 0) { ++ /* make up mgr_id to calm sam down */ ++ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0, ++ &obj_id->dev_id); ++ obj_id->grp_id = oc_obj_id->oid_partition_id; ++ obj_id->obj_id = oc_obj_id->oid_object_id; ++ } ++ ++ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) { ++ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n", ++ __func__, i, (u64)obj_id->grp_id, ++ lo_comp->oc_object_id.oid_partition_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) { ++ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n", ++ __func__, i, obj_id->obj_id, ++ lo_comp->oc_object_id.oid_object_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ pan_comp->dev_id = dev_id; ++ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) { ++ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n", ++ __func__, i, pan_comp->dev_id); ++ status = -EINVAL; ++ goto err; ++ } ++ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) { ++ dprintk("%s: degraded maps not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL; ++ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) { ++ dprintk("%s: cap key security not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ ++ p = lo_comp->oc_cap.cred; ++ panfs_export_ops->sm_sec_t_unmarshall( ++ (pan_sm_sec_otw_t *)&p, ++ pan_sec, ++ buf, ++ alloc_sz, ++ NULL, ++ &local_sz); ++ buf += local_sz; ++ alloc_sz -= local_sz; ++ ++ lo_comp++; ++ pan_comp++; ++ pan_sec++; ++ } ++ ++ *outp = mcs; ++ dprintk("%s:Return mcs=%p\n", __func__, mcs); ++ return 0; ++ ++err: ++
objio_free_lseg(mcs); ++ dprintk("%s:Error %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Free a Panasas map and caps type ++ */ ++void ++objio_free_lseg(void *p) ++{ ++ kfree(p); ++} ++ ++/* ++ * I/O routines ++ */ ++int ++objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct panfs_shim_io_state *p; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ p = kzalloc(sizeof(*p), GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ *outp = &p->ol_state; ++ return 0; ++} ++ ++/* ++ * Free an I/O state ++ */ ++void ++objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ int i; ++ ++ dprintk("%s: freeing io_state\n", __func__); ++ for (i = 0; i < state->ol_state.nr_pages; i++) ++ kunmap(state->ol_state.pages[i]); ++ ++ if (state->ucreds) ++ panfs_export_ops->ucreds_put(state->ucreds); ++ kfree(state->sg_list); ++ kfree(state); ++} ++ ++static int ++panfs_shim_pages_to_sg( ++ struct panfs_shim_io_state *state, ++ struct page **pages, ++ unsigned int pgbase, ++ unsigned nr_pages, ++ size_t count) ++{ ++ unsigned i, n; ++ pan_sg_entry_t *sg; ++ ++ dprintk("%s pgbase %u nr_pages %u count %d " ++ "pg0 %p flags 0x%x index %llu\n", ++ __func__, pgbase, nr_pages, (int)count, pages[0], ++ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index); ++ ++ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL); ++ if (sg == NULL) ++ return -ENOMEM; ++ ++ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n", ++ __func__, sg, pages, pgbase, nr_pages); ++ ++ for (i = 0; i < nr_pages; i++) { ++ sg[i].buffer = (char *)kmap(pages[i]) + pgbase; ++ n = PAGE_SIZE - pgbase; ++ pgbase = 0; ++ if (n > count) ++ n = count; ++ sg[i].chunk_size = n; ++ count -= n; ++ if (likely(count)) { ++ sg[i].next = &sg[i+1]; ++ } else { ++ /* we're done */ ++ sg[i].next = NULL; ++ break; ++ } ++ } ++ BUG_ON(count); ++ ++ state->sg_list = sg; ++ return 0; 
++} ++ ++/* ++ * Callback function for async reads ++ */ ++static void ++panfs_shim_read_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.read.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++ status = res_p->length; ++ WARN_ON(status < 0); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_read rc %d: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_read_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.read.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP, ++ &state->u.read.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? 
++ NULL : panfs_shim_read_done, ++ state, NULL, ++ &state->u.read.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_read_done(state, NULL, &state->u.read.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Callback function for async writes ++ */ ++static void ++panfs_shim_write_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.write.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++/* state->ol_state.committed = NFS_FILE_SYNC;*/ ++ state->ol_state.committed = NFS_UNSTABLE; ++ status = res_p->length; ++ WARN_ON(status < 0); ++ ++ objlayout_add_delta_space_used(&state->ol_state, ++ res_p->delta_capacity_used); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_write rc %u: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_write_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable /* unused, PanOSD writes are stable */) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ 
state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.write.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE, ++ &state->u.write.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? ++ NULL : panfs_shim_write_done, ++ state, ++ NULL, ++ &state->u.write.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_write_done(state, NULL, &state->u.write.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++int ++panfs_shim_register(struct panfs_export_operations *ops) ++{ ++ if (panfs_export_ops) { ++ printk(KERN_INFO ++ "%s: panfs already registered (panfs ops %p)\n", ++ __func__, panfs_export_ops); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: registering panfs ops %p\n", ++ __func__, ops); ++ ++ panfs_export_ops = ops; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_register); ++ ++int ++panfs_shim_unregister(void) ++{ ++ if (!panfs_export_ops) { ++ printk(KERN_INFO "%s: panfs is not registered\n", __func__); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: unregistering panfs ops %p\n", ++ __func__, panfs_export_ops); ++ ++ panfs_export_ops = NULL; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_unregister); ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++panlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *panlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)panlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: 
++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ n *= 8; /* FIXME: until we have 2-D coalescing */ ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zd\n", __func__, maxsz); ++ return maxsz; ++} ++ ++#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024) ++#define PANLAYOUT_DEF_STRIPE_WIDTH 9 ++#define PANLAYOUT_MAX_STRIPE_WIDTH 11 ++#define PANLAYOUT_MAX_GATHER_STRIPES 8 ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++panlayout_get_blocksize(void) ++{ ++ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) * ++ PANLAYOUT_DEF_STRIPE_UNIT * ++ PANLAYOUT_MAX_GATHER_STRIPES; ++ dprintk("%s: Return %Zd\n", __func__, sz); ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations panlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. ++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = panlayout_get_stripesize, ++ .get_blocksize = panlayout_get_blocksize, ++}; ++ ++#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS) ++ ++static struct pnfs_layoutdriver_type panlayout_type = { ++ .id = PNFS_LAYOUT_PANOSD, ++ .name = "PNFS_LAYOUT_PANOSD", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &panlayout_policy_operations, ++}; ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++panlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++panlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++} ++ 
++module_init(panlayout_init); ++module_exit(panlayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-31 20:42:05.537124598 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-31 20:42:05.537124598 -0400 +@@ -0,0 +1,482 @@ ++/* ++ * panfs_shim.h ++ * ++ * Data types and external function declarations for interfacing with ++ * panfs (Panasas DirectFlow) I/O stack ++ * ++ * Copyright (C) 2007 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#ifndef _PANLAYOUT_PANFS_SHIM_H ++#define _PANLAYOUT_PANFS_SHIM_H ++ ++typedef s8 pan_int8_t; ++typedef u8 pan_uint8_t; ++typedef s16 pan_int16_t; ++typedef u16 pan_uint16_t; ++typedef s32 pan_int32_t; ++typedef u32 pan_uint32_t; ++typedef s64 pan_int64_t; ++typedef u64 pan_uint64_t; ++ ++/* ++ * from pan_base_types.h ++ */ ++typedef pan_uint64_t pan_rpc_none_t; ++typedef pan_uint32_t pan_rpc_arrdim_t; ++typedef pan_uint32_t pan_status_t; ++typedef pan_uint8_t pan_otw_t; ++typedef pan_uint8_t pan_pad_t; ++ ++typedef pan_uint32_t pan_timespec_sec_t; ++typedef pan_uint32_t pan_timespec_nsec_t; ++ ++typedef struct pan_timespec_s pan_timespec_t; ++struct pan_timespec_s { ++ pan_timespec_sec_t ts_sec; ++ pan_timespec_nsec_t ts_nsec; ++}; ++ ++/* ++ * from pan_std_types.h ++ */ ++typedef pan_uint32_t pan_size_t; ++typedef int pan_bool_t; ++ ++/* ++ * from pan_common_error.h ++ */ ++#define PAN_SUCCESS ((pan_status_t)0) ++#define PAN_ERR_IN_PROGRESS ((pan_status_t)55) ++ ++/* ++ * from pan_sg.h ++ */ ++typedef struct pan_sg_entry_s pan_sg_entry_t; ++struct pan_sg_entry_s { ++ void *buffer; /* pointer to memory */ ++ pan_uint32_t chunk_size; /* size of each chunk (bytes) */ ++ pan_sg_entry_t *next; ++}; ++ ++/* ++ * from pan_storage.h ++ */ ++typedef pan_uint64_t pan_stor_dev_id_t; ++typedef pan_uint32_t pan_stor_obj_grp_id_t; ++typedef 
pan_uint64_t pan_stor_obj_uniq_t; ++typedef pan_uint32_t pan_stor_action_t; ++typedef pan_uint8_t pan_stor_cap_key_t[20]; ++ ++typedef pan_uint8_t pan_stor_key_type_t; ++typedef pan_uint64_t pan_stor_len_t; ++typedef pan_int64_t pan_stor_delta_len_t; ++typedef pan_uint64_t pan_stor_offset_t; ++typedef pan_uint16_t pan_stor_op_t; ++ ++typedef pan_uint16_t pan_stor_sec_level_t; ++ ++struct pan_stor_obj_id_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_obj_uniq_t obj_id; ++ pan_stor_obj_grp_id_t grp_id; ++}; ++ ++typedef struct pan_stor_obj_id_s pan_stor_obj_id_t; ++ ++#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U) ++#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U) ++#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U) ++#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U) ++#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U) ++#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U) ++#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U) ++#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U) ++ ++/* ++ * from pan_aggregation_map.h ++ */ ++typedef pan_uint8_t pan_agg_type_t; ++typedef pan_uint64_t pan_agg_map_version_t; ++typedef pan_uint8_t pan_agg_obj_state_t; ++typedef pan_uint8_t pan_agg_comp_state_t; ++typedef pan_uint8_t pan_agg_comp_flag_t; ++ ++#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00) ++#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01) ++#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02) ++#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03) ++#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04) ++#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05) ++#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06) ++#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07) ++#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00) ++#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01) ++#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02) ++#define 
PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03) ++#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00) ++#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01) ++#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02) ++#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04) ++ ++struct pan_aggregation_map_s { ++ pan_agg_map_version_t version; ++ pan_agg_obj_state_t avail_state; ++ pan_stor_obj_id_t obj_id; ++}; ++ ++typedef struct pan_aggregation_map_s pan_aggregation_map_t; ++ ++struct pan_agg_comp_obj_s { ++ pan_stor_dev_id_t dev_id; ++ pan_agg_comp_state_t avail_state; ++ pan_agg_comp_flag_t comp_flags; ++}; ++ ++typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t; ++ ++struct pan_agg_simple_header_s { ++ pan_uint8_t unused; ++}; ++ ++typedef struct pan_agg_simple_header_s pan_agg_simple_header_t; ++ ++struct pan_agg_raid1_header_s { ++ pan_uint16_t num_comps; ++}; ++ ++typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t; ++ ++struct pan_agg_raid0_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++}; ++ ++typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t; ++ ++struct pan_agg_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit0; ++ pan_uint32_t stripe_unit1; ++ pan_uint32_t stripe_unit2; ++}; ++ ++typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t; ++ ++typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t; ++ ++struct pan_agg_grp_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++ pan_uint16_t rg_width; ++ pan_uint16_t rg_depth; ++ pan_uint8_t group_layout_policy; ++}; ++ ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00) ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01) ++ ++#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00) ++#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02) ++#define PAN_AGG_RAID0 
((pan_agg_type_t) 0x03) ++#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04) ++#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06) ++#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06) ++ ++struct pan_agg_layout_hdr_s { ++ pan_agg_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_uint64_t null; ++ pan_agg_simple_header_t simple; ++ pan_agg_raid1_header_t raid1; ++ pan_agg_raid0_header_t raid0; ++ pan_agg_raid5_left_header_t raid5_left; ++ pan_agg_grp_raid5_left_header_t grp_raid5_left; ++ } hdr; ++}; ++ ++typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t; ++ ++struct pan_agg_comp_obj_a_s { ++ pan_rpc_arrdim_t size; ++ pan_agg_comp_obj_t *data; ++}; ++typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a; ++ ++struct pan_agg_full_map_s { ++ pan_aggregation_map_t map_hdr; ++ pan_agg_layout_hdr_t layout_hdr; ++ pan_agg_comp_obj_a components; ++}; ++ ++typedef struct pan_agg_full_map_s pan_agg_full_map_t; ++ ++/* ++ * from pan_obsd_rpc_types.h ++ */ ++typedef pan_uint8_t pan_obsd_security_key_a[16]; ++ ++typedef pan_uint8_t pan_obsd_capability_key_a[20]; ++ ++typedef pan_uint8_t pan_obsd_key_holder_id_t; ++ ++#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01) ++#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02) ++ ++struct pan_obsd_key_holder_s { ++ pan_obsd_key_holder_id_t select; ++ pan_pad_t pad[3]; ++ union { ++ pan_obsd_security_key_a basis_key; ++ pan_obsd_capability_key_a cap_key; ++ } key; ++}; ++ ++typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t; ++ ++/* ++ * from pan_sm_sec.h ++ */ ++typedef pan_uint8_t pan_sm_sec_type_t; ++typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t; ++ ++struct pan_obsd_capability_generic_otw_t_s { ++ pan_rpc_arrdim_t size; ++ pan_uint8_t *data; ++}; ++typedef struct pan_obsd_capability_generic_otw_t_s ++ pan_obsd_capability_generic_otw_t; ++ ++struct pan_sm_sec_obsd_s { ++ pan_obsd_key_holder_t key; ++ 
pan_obsd_capability_generic_otw_t cap_otw; ++ pan_sm_sec_otw_allo_mode_t allo_mode; ++}; ++ ++typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t; ++ ++struct pan_sm_sec_s { ++ pan_sm_sec_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_rpc_none_t none; ++ pan_sm_sec_obsd_t obsd; ++ } variant; ++}; ++ ++typedef struct pan_sm_sec_s pan_sm_sec_t; ++ ++struct pan_sm_sec_a_s { ++ pan_rpc_arrdim_t size; ++ pan_sm_sec_t *data; ++}; ++typedef struct pan_sm_sec_a_s pan_sm_sec_a; ++typedef pan_otw_t *pan_sm_sec_otw_t; ++ ++/* ++ * from pan_sm_types.h ++ */ ++typedef pan_uint64_t pan_sm_cap_handle_t; ++ ++struct pan_sm_map_cap_s { ++ pan_agg_full_map_t full_map; ++ pan_stor_offset_t offset; ++ pan_stor_len_t length; ++ pan_sm_sec_a secs; ++ pan_sm_cap_handle_t handle; ++ pan_timespec_t expiration_time; ++ pan_stor_action_t action_mask; ++ pan_uint32_t flags; ++}; ++ ++typedef struct pan_sm_map_cap_s pan_sm_map_cap_t; ++ ++/* ++ * from pan_sm_ops.h ++ */ ++typedef pan_rpc_none_t pan_sm_cache_ptr_t; ++ ++/* ++ * from pan_sam_api.h ++ */ ++typedef pan_uint32_t pan_sam_access_flags_t; ++ ++typedef struct pan_sam_dev_error_s pan_sam_dev_error_t; ++struct pan_sam_dev_error_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_op_t stor_op; ++ pan_status_t error; ++}; ++ ++typedef struct pan_sam_ext_status_s pan_sam_ext_status_t; ++struct pan_sam_ext_status_s { ++ pan_uint32_t available; ++ pan_uint32_t size; ++ pan_sam_dev_error_t *errors; ++}; ++ ++enum pan_sam_rpc_sec_sel_e { ++ PAN_SAM_RPC_SEC_DEFAULT, ++ PAN_SAM_RPC_SEC_ATLEAST, ++ PAN_SAM_RPC_SEC_EXACTLY ++}; ++typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t; ++ ++typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t; ++struct pan_sam_obj_sec_s { ++ pan_stor_sec_level_t min_security; ++ pan_sm_map_cap_t *map_ccaps; ++}; ++ ++typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t; ++struct pan_sam_rpc_sec_s { ++ pan_sam_rpc_sec_sel_t selector; ++}; ++ ++typedef struct pan_sam_read_args_s pan_sam_read_args_t; ++struct 
pan_sam_read_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ void *return_attr; ++ void *checksum; ++ pan_stor_offset_t offset; ++ pan_uint16_t sm_options; ++ void *callout; ++ void *callout_arg; ++}; ++ ++typedef struct pan_sam_read_res_s pan_sam_read_res_t; ++struct pan_sam_read_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ void *attr; ++ void *checksum; ++}; ++ ++typedef void (*pan_sam_read_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t status); ++ ++#define PAN_SAM_ACCESS_NONE 0x0000 ++#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020 ++ ++typedef struct pan_sam_write_args_s pan_sam_write_args_t; ++struct pan_sam_write_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ pan_stor_offset_t offset; ++ void *attr; ++ void *return_attr; ++}; ++ ++typedef struct pan_sam_write_res_s pan_sam_write_res_t; ++struct pan_sam_write_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ pan_stor_delta_len_t delta_capacity_used; ++ pan_bool_t parity_dirty; ++ void *attr; ++}; ++ ++typedef void (*pan_sam_write_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t status); ++ ++/* ++ * from pan_mgr_types.h ++ */ ++#define PAN_MGR_ID_TYPE_SHIFT 56 ++#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL) ++#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL) ++ ++typedef pan_uint16_t pan_mgr_type_t; ++typedef pan_uint64_t pan_mgr_id_t; ++ ++#define PAN_MGR_SM ((pan_mgr_type_t) 2U) ++#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U) ++ ++/* ++ * from pan_mgr_types_c.h ++ */ ++#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \ ++ pan_mgr_id_t _id1, _id2; \ ++\ ++ _id1 = (_mgr_type_); \ ++ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \ ++ _id1 &= PAN_MGR_ID_TYPE_MASK; \ ++ _id2 = (_mgr_uniq_); \ ++ _id2 &= PAN_MGR_ID_UNIQ_MASK; \ 
++ _id1 |= _id2; \ ++ *(_mgr_id_p_) = _id1; \ ++} ++ ++/* ++ * from pan_storage_c.h ++ */ ++#define pan_stor_is_device_id_an_obsd_id(_device_id_) \ ++ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \ ++ == PAN_MGR_OBSD) ++ ++/* ++ * pnfs_shim internal definitions ++ */ ++ ++struct panfs_shim_io_state { ++ struct objlayout_io_state ol_state; ++ ++ pan_sg_entry_t *sg_list; ++ pan_sam_obj_sec_t obj_sec; ++ void *ucreds; ++ union { ++ struct { ++ pan_sam_read_args_t args; ++ pan_sam_read_res_t res; ++ } read; ++ struct { ++ pan_sam_write_args_t args; ++ pan_sam_write_res_t res; ++ } write; ++ } u; ++}; ++ ++#endif /* _PANLAYOUT_PANFS_SHIM_H */ +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-31 20:42:05.538121971 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-31 20:42:05.538121971 -0400 +@@ -0,0 +1,435 @@ ++/* ++ * pnfs_osd_xdr.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* ++ * The following implementation is based on these Internet Drafts: ++ * ++ * draft-ietf-nfsv4-minorversion-21 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid) ++{ ++ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data)); ++ READ64(objid->oid_partition_id); ++ READ64(objid->oid_object_id); ++ return p; ++} ++ ++static inline u32 * ++pnfs_osd_xdr_decode_opaque_cred(u32 *p, ++ struct pnfs_osd_opaque_cred *opaque_cred) ++{ ++ READ32(opaque_cred->cred_len); ++ COPYMEM(opaque_cred->cred, opaque_cred->cred_len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct 
pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp, ++ u8 **credp) ++{ ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id); ++ READ32(comp->oc_osd_version); ++ READ32(comp->oc_cap_key_sec); ++ ++ cred = *credp; ++ comp->oc_cap_key.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len)); ++ comp->oc_cap.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len)); ++ *credp = cred; ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map) ++{ ++ READ32(data_map->odm_num_comps); ++ READ64(data_map->odm_stripe_unit); ++ READ32(data_map->odm_group_width); ++ READ32(data_map->odm_group_depth); ++ READ32(data_map->odm_mirror_cnt); ++ READ32(data_map->odm_raid_algorithm); ++ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " ++ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", ++ __func__, ++ data_map->odm_num_comps, ++ (unsigned long long)data_map->odm_stripe_unit, ++ data_map->odm_group_width, ++ data_map->odm_group_depth, ++ data_map->odm_mirror_cnt, ++ data_map->odm_raid_algorithm); ++ return p; ++} ++ ++struct pnfs_osd_layout * ++pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p) ++{ ++ int i; ++ u32 *start = p; ++ struct pnfs_osd_object_cred *comp; ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map); ++ READ32(layout->olo_comps_index); ++ READ32(layout->olo_num_comps); ++ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1); ++ comp = 
layout->olo_comps; ++ cred = (u8 *)(comp + layout->olo_num_comps); ++ dprintk("%s: comps_index=%u num_comps=%u\n", ++ __func__, layout->olo_comps_index, layout->olo_num_comps); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred); ++ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "key_len=%u cap_len=%u\n", ++ __func__, i, ++ _DEVID_LO(&comp->oc_object_id.oid_device_id), ++ _DEVID_HI(&comp->oc_object_id.oid_device_id), ++ comp->oc_object_id.oid_partition_id, ++ comp->oc_object_id.oid_object_id, ++ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); ++ comp++; ++ } ++ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__, ++ (char *)p - (char *)start, cred, (char *)cred - (char *)layout); ++ return layout; ++} ++ ++/* ++ * Get Device Information Decoding ++ * ++ * Note: since Device Information is currently done synchronously, most ++ * of the actual fields are left inside the rpc buffer and are only ++ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer ++ * should not be freed while the returned information is in use. 
++ */ ++ ++u32 *__xdr_read_calc_nfs4_string( ++ u32 *p, struct nfs4_string *str, u8 **freespace) ++{ ++ u32 len; ++ char *data; ++ bool need_copy; ++ ++ READ32(len); ++ data = (char *)p; ++ ++ if (data[len]) { /* Not null terminated we'll need extra space */ ++ data = *freespace; ++ *freespace += len + 1; ++ need_copy = true; ++ } else { ++ need_copy = false; ++ } ++ ++ if (str) { ++ str->len = len; ++ str->data = data; ++ if (need_copy) { ++ memcpy(data, p, len); ++ data[len] = 0; ++ } ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++u32 *__xdr_read_calc_u8_opaque( ++ u32 *p, struct nfs4_string *str) ++{ ++ u32 len; ++ ++ READ32(len); ++ ++ if (str) { ++ str->len = len; ++ str->data = (char *)p; ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetid { ++ * u32 oti_type; ++ * struct nfs4_string oti_scsi_device_id; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetid( ++ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace) ++{ ++ u32 oti_type; ++ ++ READ32(oti_type); ++ if (targetid) ++ targetid->oti_type = oti_type; ++ ++ switch (oti_type) { ++ case OBJ_TARGET_SCSI_NAME: ++ case OBJ_TARGET_SCSI_DEVICE_ID: ++ p = __xdr_read_calc_u8_opaque(p, ++ targetid ? &targetid->oti_scsi_device_id : NULL); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_net_addr { ++ * struct nfs4_string r_netid; ++ * struct nfs4_string r_addr; ++ * }; ++ */ ++u32 *__xdr_read_calc_net_addr( ++ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace) ++{ ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_netid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? 
&netaddr->r_addr : NULL, ++ freespace); ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetaddr { ++ * u32 ota_available; ++ * struct pnfs_osd_net_addr ota_netaddr; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetaddr( ++ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace) ++{ ++ u32 ota_available; ++ ++ READ32(ota_available); ++ if (targetaddr) ++ targetaddr->ota_available = ota_available; ++ ++ if (ota_available) { ++ p = __xdr_read_calc_net_addr(p, ++ targetaddr ? &targetaddr->ota_netaddr : NULL, ++ freespace); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++u32 *__xdr_read_calc_deviceaddr( ++ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace) ++{ ++ p = __xdr_read_calc_targetid(p, ++ deviceaddr ? &deviceaddr->oda_targetid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_targetaddr(p, ++ deviceaddr ? &deviceaddr->oda_targetaddr : NULL, ++ freespace); ++ ++ if (deviceaddr) ++ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun)); ++ else ++ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun)); ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? &deviceaddr->oda_systemid : NULL); ++ ++ if (deviceaddr) { ++ p = pnfs_osd_xdr_decode_object_cred(p, ++ &deviceaddr->oda_root_obj_cred, freespace); ++ } else { ++ *freespace += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? 
&deviceaddr->oda_osdname : NULL); ++ ++ return p; ++} ++ ++size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p) ++{ ++ u8 *null_freespace = NULL; ++ size_t sz; ++ ++ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace); ++ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace; ++ ++ return sz; ++} ++ ++void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p) ++{ ++ u8 *freespace = (u8 *)(deviceaddr + 1); ++ ++ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace); ++} ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou) ++{ ++ __be32 *p = xdr_reserve_space(xdr, 16); ++ ++ if (!p) ++ return -E2BIG; ++ ++ *p++ = cpu_to_be32(lou->dsu_valid); ++ if (lou->dsu_valid) ++ p = xdr_encode_hyper(p, lou->dsu_delta); ++ *p++ = cpu_to_be32(lou->olu_ioerr_flag); ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ */ ++static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p; ++ ++ p = xdr_reserve_space(xdr, 32); ++ if (!p) ++ return -E2BIG; ++ ++ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, ++ sizeof(object_id->oid_device_id.data)); ++ p = xdr_encode_hyper(p, object_id->oid_partition_id); ++ p = xdr_encode_hyper(p, object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, ++ struct pnfs_osd_ioerr *ioerr) ++{ ++ __be32 *p; ++ int ret; ++ ++ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component); ++ if (ret) ++ return ret; ++ ++ p = xdr_reserve_space(xdr, 24); ++ if 
(!p) ++ return -E2BIG; ++ ++ p = xdr_encode_hyper(p, ioerr->oer_comp_offset); ++ p = xdr_encode_hyper(p, ioerr->oer_comp_length); ++ *p++ = cpu_to_be32(ioerr->oer_iswrite); ++ *p = cpu_to_be32(ioerr->oer_errno); ++ ++ return 0; ++} +diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-31 20:41:19.162150222 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-31 20:42:05.539131687 -0400 +@@ -20,6 +20,7 @@ + #include + + #include "internal.h" ++#include "pnfs.h" + + static struct kmem_cache *nfs_page_cachep; + +@@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p) + struct nfs_page * + nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + +@@ -79,7 +81,11 @@ nfs_create_request(struct nfs_open_conte + req->wb_pgbase = offset; + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); ++ req->wb_lock_context = nfs_get_lock_context(ctx); + kref_init(&req->wb_kref); ++ req->wb_lseg = lseg; ++ if (lseg) ++ get_lseg(lseg); + return req; + } + +@@ -141,18 +147,26 @@ void nfs_clear_request(struct nfs_page * + { + struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; ++ struct nfs_lock_context *l_ctx = req->wb_lock_context; + + if (page != NULL) { + page_cache_release(page); + req->wb_page = NULL; + } ++ if (l_ctx != NULL) { ++ nfs_put_lock_context(l_ctx); ++ req->wb_lock_context = NULL; ++ } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } ++ if (req->wb_lseg != NULL) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } + } + +- + /** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release +@@ -231,11 +245,12 @@ void nfs_pageio_init(struct nfs_pageio_d + * Return 'true' 
if this is the case, else return 'false'. + */ + static int nfs_can_coalesce_requests(struct nfs_page *prev, +- struct nfs_page *req) ++ struct nfs_page *req, ++ struct nfs_pageio_descriptor *pgio) + { + if (req->wb_context->cred != prev->wb_context->cred) + return 0; +- if (req->wb_context->lockowner != prev->wb_context->lockowner) ++ if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) + return 0; + if (req->wb_context->state != prev->wb_context->state) + return 0; +@@ -245,6 +260,12 @@ static int nfs_can_coalesce_requests(str + return 0; + if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return 0; ++ if (req->wb_lseg != prev->wb_lseg) ++ return 0; ++#ifdef CONFIG_NFS_V4_1 ++ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) ++ return 0; ++#endif /* CONFIG_NFS_V4_1 */ + return 1; + } + +@@ -277,7 +298,7 @@ static int nfs_pageio_do_add_request(str + if (newlen > desc->pg_bsize) + return 0; + prev = nfs_list_entry(desc->pg_list.prev); +- if (!nfs_can_coalesce_requests(prev, req)) ++ if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + } else + desc->pg_base = req->wb_pgbase; +@@ -366,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for ++ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver + * + * Moves elements from one of the inode request lists. 
+ * If the number of requests is set to 0, the entire address_space +@@ -375,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs + */ + int nfs_scan_list(struct nfs_inode *nfsi, + struct list_head *dst, pgoff_t idx_start, +- unsigned int npages, int tag) ++ unsigned int npages, int tag, int *use_pnfs) + { + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; +@@ -406,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); + nfs_list_add_request(req, dst); ++ if (req->wb_lseg) ++ *use_pnfs = 1; + res++; + if (res == INT_MAX) + goto out; +diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-31 20:42:05.541150301 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-31 20:42:05.541150301 -0400 +@@ -0,0 +1,2037 @@ ++/* ++ * linux/fs/nfs/pnfs.c ++ * ++ * pNFS functions to call and manage layout drivers. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "internal.h" ++#include "nfs4_fs.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS ++ ++#define MIN_POOL_LC (4) ++ ++static int pnfs_initialized; ++ ++static void pnfs_free_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range); ++static inline void get_layout(struct pnfs_layout_hdr *lo); ++ ++/* Locking: ++ * ++ * pnfs_spinlock: ++ * protects pnfs_modules_tbl. ++ */ ++static spinlock_t pnfs_spinlock = __SPIN_LOCK_UNLOCKED(pnfs_spinlock); ++ ++/* ++ * pnfs_modules_tbl holds all pnfs modules ++ */ ++static struct list_head pnfs_modules_tbl; ++static struct kmem_cache *pnfs_cachep; ++static mempool_t *pnfs_layoutcommit_mempool; ++ ++static inline struct nfs4_layoutcommit_data *pnfs_layoutcommit_alloc(void) ++{ ++ struct nfs4_layoutcommit_data *p = ++ mempool_alloc(pnfs_layoutcommit_mempool, GFP_NOFS); ++ if (p) ++ memset(p, 0, sizeof(*p)); ++ ++ return p; ++} ++ ++void pnfs_layoutcommit_free(struct nfs4_layoutcommit_data *p) ++{ ++ mempool_free(p, pnfs_layoutcommit_mempool); ++} ++ ++/* ++ * struct pnfs_module - One per pNFS device module. 
++ */ ++struct pnfs_module { ++ struct pnfs_layoutdriver_type *pnfs_ld_type; ++ struct list_head pnfs_tblid; ++}; ++ ++int ++pnfs_initialize(void) ++{ ++ INIT_LIST_HEAD(&pnfs_modules_tbl); ++ ++ pnfs_cachep = kmem_cache_create("nfs4_layoutcommit_data", ++ sizeof(struct nfs4_layoutcommit_data), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (pnfs_cachep == NULL) ++ return -ENOMEM; ++ ++ pnfs_layoutcommit_mempool = mempool_create(MIN_POOL_LC, ++ mempool_alloc_slab, ++ mempool_free_slab, ++ pnfs_cachep); ++ if (pnfs_layoutcommit_mempool == NULL) { ++ kmem_cache_destroy(pnfs_cachep); ++ return -ENOMEM; ++ } ++ ++ pnfs_initialized = 1; ++ return 0; ++} ++ ++void pnfs_uninitialize(void) ++{ ++ mempool_destroy(pnfs_layoutcommit_mempool); ++ kmem_cache_destroy(pnfs_cachep); ++} ++ ++/* search pnfs_modules_tbl for right pnfs module */ ++static int ++find_pnfs(u32 id, struct pnfs_module **module) { ++ struct pnfs_module *local = NULL; ++ ++ dprintk("PNFS: %s: Searching for %u\n", __func__, id); ++ list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) { ++ if (local->pnfs_ld_type->id == id) { ++ *module = local; ++ return(1); ++ } ++ } ++ return 0; ++} ++ ++/* Set cred to indicate we require a layoutcommit ++ * If we don't even have a layout, we don't need to commit it. ++ */ ++void ++pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx) ++{ ++ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (has_layout(nfsi) && ++ !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state)) { ++ nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred); ++ __set_bit(NFS_INO_LAYOUTCOMMIT, ++ &nfsi->layout->state); ++ nfsi->change_attr++; ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s: Set layoutcommit\n", __func__); ++ return; ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Update last_write_offset for layoutcommit. 
++ * TODO: We should only use commited extents, but the current nfs ++ * implementation does not calculate the written range in nfs_commit_done. ++ * We therefore update this field in writeback_done. ++ */ ++void ++pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent) ++{ ++ loff_t end_pos; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (offset < nfsi->layout->write_begin_pos) ++ nfsi->layout->write_begin_pos = offset; ++ end_pos = offset + extent - 1; /* I'm being inclusive */ ++ if (end_pos > nfsi->layout->write_end_pos) ++ nfsi->layout->write_end_pos = end_pos; ++ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n", ++ __func__, ++ (unsigned long) extent, ++ (unsigned long) offset , ++ (unsigned long) nfsi->layout->write_begin_pos, ++ (unsigned long) nfsi->layout->write_end_pos); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Unitialize a mountpoint in a layout driver */ ++void ++unmount_pnfs_layoutdriver(struct nfs_server *nfss) ++{ ++ if (PNFS_EXISTS_LDIO_OP(nfss, uninitialize_mountpoint)) ++ nfss->pnfs_curr_ld->ld_io_ops->uninitialize_mountpoint(nfss); ++} ++ ++/* ++ * Set the server pnfs module to the first registered pnfs_type. ++ * Only one pNFS layout driver is supported. ++ */ ++void ++set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, ++ u32 id) ++{ ++ struct pnfs_module *mod = NULL; ++ ++ if (server->pnfs_curr_ld) ++ return; ++ ++ if (!find_pnfs(id, &mod)) { ++ request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); ++ find_pnfs(id, &mod); ++ } ++ ++ if (!mod) { ++ dprintk("%s: No pNFS module found for %u. ", __func__, id); ++ goto out_err; ++ } ++ ++ server->pnfs_curr_ld = mod->pnfs_ld_type; ++ if (mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint( ++ server, mntfh)) { ++ printk(KERN_ERR "%s: Error initializing mount point " ++ "for layout driver %u. 
", __func__, id); ++ goto out_err; ++ } ++ ++ dprintk("%s: pNFS module for %u set\n", __func__, id); ++ return; ++ ++out_err: ++ dprintk("Using NFSv4 I/O\n"); ++ server->pnfs_curr_ld = NULL; ++} ++ ++/* Allow I/O module to set its functions structure */ ++struct pnfs_client_operations* ++pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) ++{ ++ struct pnfs_module *pnfs_mod; ++ struct layoutdriver_io_operations *io_ops = ld_type->ld_io_ops; ++ ++ if (!pnfs_initialized) { ++ printk(KERN_ERR "%s Registration failure. " ++ "pNFS not initialized.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops || !io_ops->alloc_layout || !io_ops->free_layout) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_layout and free_layout.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->alloc_lseg || !io_ops->free_lseg) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_lseg and free_lseg.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->read_pagelist || !io_ops->write_pagelist || ++ !io_ops->commit) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "read_pagelist, write_pagelist, and commit.\n", ++ __func__); ++ return NULL; ++ } ++ ++ pnfs_mod = kmalloc(sizeof(struct pnfs_module), GFP_KERNEL); ++ if (pnfs_mod != NULL) { ++ dprintk("%s Registering id:%u name:%s\n", ++ __func__, ++ ld_type->id, ++ ld_type->name); ++ pnfs_mod->pnfs_ld_type = ld_type; ++ INIT_LIST_HEAD(&pnfs_mod->pnfs_tblid); ++ ++ spin_lock(&pnfs_spinlock); ++ list_add(&pnfs_mod->pnfs_tblid, &pnfs_modules_tbl); ++ spin_unlock(&pnfs_spinlock); ++ } ++ ++ return &pnfs_ops; ++} ++ ++/* Allow I/O module to set its functions structure */ ++void ++pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) ++{ ++ struct pnfs_module *pnfs_mod; ++ ++ if (find_pnfs(ld_type->id, &pnfs_mod)) { ++ dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); ++ spin_lock(&pnfs_spinlock); ++ list_del(&pnfs_mod->pnfs_tblid); ++ spin_unlock(&pnfs_spinlock); ++ 
kfree(pnfs_mod); ++ } ++} ++ ++/* ++ * pNFS client layout cache ++ */ ++#if defined(CONFIG_SMP) ++#define BUG_ON_UNLOCKED_INO(ino) \ ++ BUG_ON(!spin_is_locked(&ino->i_lock)) ++#define BUG_ON_UNLOCKED_LO(lo) \ ++ BUG_ON_UNLOCKED_INO(PNFS_INODE(lo)) ++#else /* CONFIG_SMP */ ++#define BUG_ON_UNLOCKED_INO(lo) do {} while (0) ++#define BUG_ON_UNLOCKED_LO(lo) do {} while (0) ++#endif /* CONFIG_SMP */ ++ ++static inline void ++get_layout(struct pnfs_layout_hdr *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ lo->refcount++; ++} ++ ++static inline void ++put_layout_locked(struct pnfs_layout_hdr *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ BUG_ON(lo->refcount <= 0); ++ ++ lo->refcount--; ++ if (!lo->refcount) { ++ struct layoutdriver_io_operations *io_ops = PNFS_LD_IO_OPS(lo); ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ dprintk("%s: freeing layout cache %p\n", __func__, lo); ++ WARN_ON(!list_empty(&lo->layouts)); ++ io_ops->free_layout(lo); ++ nfsi->layout = NULL; ++ } ++} ++ ++void ++put_layout(struct inode *inode) ++{ ++ spin_lock(&inode->i_lock); ++ put_layout_locked(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++ ++} ++ ++void ++pnfs_layout_release(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (range) ++ pnfs_free_layout(lo, range); ++ /* ++ * Matched in _pnfs_update_layout for layoutget ++ * and by get_layout in _pnfs_return_layout for layoutreturn ++ */ ++ put_layout_locked(lo); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ wake_up_all(&nfsi->lo_waitq); ++} ++ ++void ++pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_ANY, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ lo = nfsi->layout; ++ if (lo) { ++ pnfs_free_layout(lo, &range); ++ WARN_ON(!list_empty(&nfsi->layout->segs)); ++ 
WARN_ON(!list_empty(&nfsi->layout->layouts)); ++ ++ if (nfsi->layout->refcount != 1) ++ printk(KERN_WARNING "%s: layout refcount not=1 %d\n", ++ __func__, nfsi->layout->refcount); ++ WARN_ON(nfsi->layout->refcount != 1); ++ ++ /* Matched by refcount set to 1 in alloc_init_layout */ ++ put_layout_locked(lo); ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* ++ * Called by the state manager to remove all layouts established under an ++ * expired lease. ++ */ ++void ++pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++ struct pnfs_layout_hdr *lo; ++ ++ while (!list_empty(&clp->cl_layouts)) { ++ lo = list_entry(clp->cl_layouts.next, struct pnfs_layout_hdr, ++ layouts); ++ dprintk("%s freeing layout for inode %lu\n", __func__, ++ lo->inode->i_ino); ++ pnfs_destroy_layout(NFS_I(lo->inode)); ++ } ++} ++ ++static inline void ++init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) ++{ ++ INIT_LIST_HEAD(&lseg->fi_list); ++ kref_init(&lseg->kref); ++ lseg->valid = true; ++ lseg->layout = lo; ++} ++ ++static void ++destroy_lseg(struct kref *kref) ++{ ++ struct pnfs_layout_segment *lseg = ++ container_of(kref, struct pnfs_layout_segment, kref); ++ ++ dprintk("--> %s\n", __func__); ++ /* Matched by get_layout in pnfs_insert_layout */ ++ put_layout_locked(lseg->layout); ++ PNFS_LD_IO_OPS(lseg->layout)->free_lseg(lseg); ++} ++ ++static void ++put_lseg_locked(struct pnfs_layout_segment *lseg) ++{ ++ bool do_wake_up; ++ struct nfs_inode *nfsi; ++ ++ if (!lseg) ++ return; ++ ++ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, ++ atomic_read(&lseg->kref.refcount), lseg->valid); ++ do_wake_up = !lseg->valid; ++ nfsi = PNFS_NFS_INODE(lseg->layout); ++ kref_put(&lseg->kref, destroy_lseg); ++ if (do_wake_up) ++ wake_up(&nfsi->lo_waitq); ++} ++ ++void ++put_lseg(struct pnfs_layout_segment *lseg) ++{ ++ bool do_wake_up; ++ struct nfs_inode *nfsi; ++ ++ if (!lseg) ++ return; ++ ++ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, ++ 
atomic_read(&lseg->kref.refcount), lseg->valid); ++ do_wake_up = !lseg->valid; ++ nfsi = PNFS_NFS_INODE(lseg->layout); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ kref_put(&lseg->kref, destroy_lseg); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ if (do_wake_up) ++ wake_up(&nfsi->lo_waitq); ++} ++EXPORT_SYMBOL(put_lseg); ++ ++void get_lseg(struct pnfs_layout_segment *lseg) ++{ ++ kref_get(&lseg->kref); ++} ++EXPORT_SYMBOL(get_lseg); ++ ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end: NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1: NFS4_MAX_UINT64; ++} ++ ++/* ++ * is l2 fully contained in l1? ++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_contained(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (start1 <= start2) && (end1 >= end2); ++} ++ ++/* ++ * are l1 and l2 intersecting? 
++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_intersecting(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (end1 == NFS4_MAX_UINT64 || end1 > start2) && ++ (end2 == NFS4_MAX_UINT64 || end2 > start1); ++} ++ ++void ++pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, ++ const nfs4_stateid *stateid) ++{ ++ write_seqlock(&lo->seqlock); ++ memcpy(lo->stateid.u.data, stateid->u.data, sizeof(lo->stateid.u.data)); ++ write_sequnlock(&lo->seqlock); ++} ++ ++void ++pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ do { ++ seq = read_seqbegin(&lo->seqlock); ++ memcpy(dst->u.data, lo->stateid.u.data, ++ sizeof(lo->stateid.u.data)); ++ } while (read_seqretry(&lo->seqlock, seq)); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void ++pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, ++ struct nfs4_state *state) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ write_seqlock(&lo->seqlock); ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) ++ do { ++ seq = read_seqbegin(&state->seqlock); ++ memcpy(lo->stateid.u.data, state->stateid.u.data, ++ sizeof(state->stateid.u.data)); ++ } while (read_seqretry(&state->seqlock, seq)); ++ write_sequnlock(&lo->seqlock); ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++* Get layout from server. ++* for now, assume that whole file layouts are requested. 
++* arg->offset: 0 ++* arg->length: all ones ++*/ ++static int ++send_layoutget(struct inode *ino, ++ struct nfs_open_context *ctx, ++ struct pnfs_layout_range *range, ++ struct pnfs_layout_segment **lsegpp, ++ struct pnfs_layout_hdr *lo) ++{ ++ int status; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct nfs4_layoutget *lgp; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); ++ if (lgp == NULL) { ++ pnfs_layout_release(lo, NULL); ++ return -ENOMEM; ++ } ++ lgp->args.minlength = NFS4_MAX_UINT64; ++ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; ++ lgp->args.range.iomode = range->iomode; ++ lgp->args.range.offset = 0; ++ lgp->args.range.length = NFS4_MAX_UINT64; ++ lgp->args.type = server->pnfs_curr_ld->id; ++ lgp->args.inode = ino; ++ lgp->lsegpp = lsegpp; ++ ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) { ++ struct nfs_open_context *oldctx = ctx; ++ ++ if (!oldctx) { ++ ctx = nfs_find_open_context(ino, NULL, ++ (range->iomode == IOMODE_READ) ? 
++ FMODE_READ: FMODE_WRITE); ++ BUG_ON(!ctx); ++ } ++ /* Set the layout stateid from the open stateid */ ++ pnfs_layout_from_open_stateid(NFS_I(ino)->layout, ctx->state); ++ if (!oldctx) ++ put_nfs_open_context(ctx); ++ } ++ ++ /* Retrieve layout information from server */ ++ status = nfs4_proc_layoutget(lgp); ++ ++ dprintk("<-- %s status %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW false ++ */ ++static inline int ++should_free_lseg(struct pnfs_layout_segment *lseg, ++ struct pnfs_layout_range *range) ++{ ++ return (range->iomode == IOMODE_ANY || ++ lseg->range.iomode == range->iomode) && ++ lo_seg_intersecting(&lseg->range, range); ++} ++ ++static struct pnfs_layout_segment * ++has_layout_to_return(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *out = NULL, *lseg; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) ++ if (should_free_lseg(lseg, range)) { ++ out = lseg; ++ break; ++ } ++ ++ dprintk("%s:Return lseg=%p\n", __func__, out); ++ return out; ++} ++ ++static inline bool ++_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg) ++{ ++ return atomic_read(&lseg->kref.refcount) == 1; ++} ++ ++ ++static void ++pnfs_free_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *lseg, *next; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry_safe (lseg, next, &lo->segs, fi_list) { ++ if (!should_free_lseg(lseg, range) || ++ !_pnfs_can_return_lseg(lseg)) ++ continue; ++ dprintk("%s: freeing lseg %p 
iomode %d " ++ "offset %llu length %llu\n", __func__, ++ lseg, lseg->range.iomode, lseg->range.offset, ++ lseg->range.length); ++ list_del(&lseg->fi_list); ++ put_lseg_locked(lseg); ++ } ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp; ++ ++ clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ spin_lock(&clp->cl_lock); ++ list_del_init(&lo->layouts); ++ spin_unlock(&clp->cl_lock); ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ } ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++static bool ++pnfs_return_layout_barrier(struct nfs_inode *nfsi, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *lseg; ++ bool ret = false; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) { ++ if (!should_free_lseg(lseg, range)) ++ continue; ++ lseg->valid = false; ++ if (!_pnfs_can_return_lseg(lseg)) { ++ dprintk("%s: wait on lseg %p refcount %d\n", ++ __func__, lseg, ++ atomic_read(&lseg->kref.refcount)); ++ ret = true; ++ } ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s:Return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++return_layout(struct inode *ino, struct pnfs_layout_range *range, ++ enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo, ++ bool wait) ++{ ++ struct nfs4_layoutreturn *lrp; ++ struct nfs_server *server = NFS_SERVER(ino); ++ int status = -ENOMEM; ++ ++ dprintk("--> %s\n", __func__); ++ ++ BUG_ON(type != RETURN_FILE); ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (lrp == NULL) { ++ if (lo && (type == RETURN_FILE)) ++ pnfs_layout_release(lo, NULL); ++ goto out; ++ } ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = server->pnfs_curr_ld->id; ++ lrp->args.return_type = type; ++ lrp->args.range = *range; ++ lrp->args.inode = ino; ++ ++ status = nfs4_proc_layoutreturn(lrp, wait); ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++} ++ ++int ++_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, ++ const 
nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct pnfs_layout_hdr *lo = NULL; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_range arg; ++ int status = 0; ++ ++ dprintk("--> %s type %d\n", __func__, type); ++ ++ ++ arg.iomode = range ? range->iomode : IOMODE_ANY; ++ arg.offset = 0; ++ arg.length = NFS4_MAX_UINT64; ++ ++ if (type == RETURN_FILE) { ++ spin_lock(&ino->i_lock); ++ lo = nfsi->layout; ++ if (lo && !has_layout_to_return(lo, &arg)) { ++ lo = NULL; ++ } ++ if (!lo) { ++ spin_unlock(&ino->i_lock); ++ dprintk("%s: no layout segments to return\n", __func__); ++ goto out; ++ } ++ ++ /* Reference for layoutreturn matched in pnfs_layout_release */ ++ get_layout(lo); ++ ++ spin_unlock(&ino->i_lock); ++ ++ if (pnfs_return_layout_barrier(nfsi, &arg)) { ++ if (stateid) { /* callback */ ++ status = -EAGAIN; ++ goto out_put; ++ } ++ dprintk("%s: waiting\n", __func__); ++ wait_event(nfsi->lo_waitq, ++ !pnfs_return_layout_barrier(nfsi, &arg)); ++ } ++ ++ if (layoutcommit_needed(nfsi)) { ++ if (stateid && !wait) { /* callback */ ++ dprintk("%s: layoutcommit pending\n", __func__); ++ status = -EAGAIN; ++ goto out_put; ++ } ++ status = pnfs_layoutcommit_inode(ino, wait); ++ if (status) { ++ /* Return layout even if layoutcommit fails */ ++ dprintk("%s: layoutcommit failed, status=%d. 
" ++ "Returning layout anyway\n", ++ __func__, status); ++ } ++ } ++ ++ if (!stateid) ++ status = return_layout(ino, &arg, type, lo, wait); ++ else ++ pnfs_layout_release(lo, &arg); ++ } ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++out_put: ++ put_layout(ino); ++ goto out; ++} ++ ++/* ++ * cmp two layout segments for sorting into layout cache ++ */ ++static inline s64 ++cmp_layout(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ s64 d; ++ ++ /* higher offset > lower offset */ ++ d = l1->offset - l2->offset; ++ if (d) ++ return d; ++ ++ /* longer length > shorter length */ ++ d = l1->length - l2->length; ++ if (d) ++ return d; ++ ++ /* read > read/write */ ++ return (int)(l1->iomode == IOMODE_READ) - ++ (int)(l2->iomode == IOMODE_READ); ++} ++ ++static void ++pnfs_insert_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct pnfs_layout_segment *lp; ++ int found = 0; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ ++ spin_lock(&clp->cl_lock); ++ BUG_ON(!list_empty(&lo->layouts)); ++ list_add_tail(&lo->layouts, &clp->cl_layouts); ++ spin_unlock(&clp->cl_lock); ++ } ++ list_for_each_entry (lp, &lo->segs, fi_list) { ++ if (cmp_layout(&lp->range, &lseg->range) > 0) ++ continue; ++ list_add_tail(&lseg->fi_list, &lp->fi_list); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu before " ++ "lp %p iomode %d offset %llu length %llu\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length, ++ lp, lp->range.iomode, lp->range.offset, ++ lp->range.length); ++ found = 1; ++ break; ++ } ++ if (!found) { ++ list_add_tail(&lseg->fi_list, &lo->segs); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu at tail\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length); ++ } ++ 
get_layout(lo); ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++/* ++ * Each layoutdriver embeds pnfs_layout_hdr as the first field in it's ++ * per-layout type layout cache structure and returns it ZEROed ++ * from layoutdriver_io_ops->alloc_layout ++ */ ++static struct pnfs_layout_hdr * ++alloc_init_layout(struct inode *ino) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct layoutdriver_io_operations *io_ops; ++ ++ io_ops = NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops; ++ lo = io_ops->alloc_layout(ino); ++ if (!lo) { ++ printk(KERN_ERR ++ "%s: out of memory: io_ops->alloc_layout failed\n", ++ __func__); ++ return NULL; ++ } ++ lo->refcount = 1; ++ INIT_LIST_HEAD(&lo->layouts); ++ INIT_LIST_HEAD(&lo->segs); ++ seqlock_init(&lo->seqlock); ++ lo->inode = ino; ++ return lo; ++} ++ ++/* ++ * Retrieve and possibly allocate the inode layout ++ * ++ * ino->i_lock must be taken by the caller. ++ */ ++static struct pnfs_layout_hdr * ++pnfs_alloc_layout(struct inode *ino) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_hdr *new = NULL; ++ ++ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); ++ ++ BUG_ON_UNLOCKED_INO(ino); ++ if (likely(nfsi->layout)) ++ return nfsi->layout; ++ ++ spin_unlock(&ino->i_lock); ++ new = alloc_init_layout(ino); ++ spin_lock(&ino->i_lock); ++ ++ if (likely(nfsi->layout == NULL)) { /* Won the race? 
*/ ++ nfsi->layout = new; ++ } else if (new) { ++ /* Reference the layout accross i_lock release and grab */ ++ get_layout(nfsi->layout); ++ spin_unlock(&ino->i_lock); ++ NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops->free_layout(new); ++ spin_lock(&ino->i_lock); ++ put_layout_locked(nfsi->layout); ++ } ++ return nfsi->layout; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW true ++ */ ++static inline int ++has_matching_lseg(struct pnfs_layout_segment *lseg, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_range range1; ++ ++ if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) || ++ !lo_seg_intersecting(&lseg->range, range)) ++ return 0; ++ ++ /* range1 covers only the first byte in the range */ ++ range1 = *range; ++ range1.length = 1; ++ return lo_seg_contained(&lseg->range, &range1); ++} ++ ++/* ++ * lookup range in layout ++ */ ++static struct pnfs_layout_segment * ++pnfs_has_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range, ++ bool take_ref, ++ bool only_valid) ++{ ++ struct pnfs_layout_segment *lseg, *ret = NULL; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) { ++ if (has_matching_lseg(lseg, range) && ++ (lseg->valid || !only_valid)) { ++ ret = lseg; ++ if (take_ref) ++ get_lseg(ret); ++ break; ++ } ++ if (cmp_layout(range, &lseg->range) > 0) ++ break; ++ } ++ ++ dprintk("%s:Return lseg %p take_ref %d ref %d valid %d\n", ++ __func__, ret, take_ref, ++ ret ? atomic_read(&ret->kref.refcount) : 0, ++ ret ? ret->valid : 0); ++ return ret; ++} ++ ++/* Update the file's layout for the given range and iomode. ++ * Layout is retreived from the server if needed. ++ * If lsegpp is given, the appropriate layout segment is referenced and ++ * returned to the caller. 
++ */ ++void ++_pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, ++ u64 count, ++ enum pnfs_iomode iomode, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct pnfs_layout_range arg = { ++ .iomode = iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layout_segment *lseg = NULL; ++ bool take_ref = (lsegpp != NULL); ++ ++ if (take_ref) ++ *lsegpp = NULL; ++ spin_lock(&ino->i_lock); ++ lo = pnfs_alloc_layout(ino); ++ if (lo == NULL) { ++ dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); ++ goto out_unlock; ++ } ++ ++ /* Check to see if the layout for the given range already exists */ ++ lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref); ++ if (lseg && !lseg->valid) { ++ if (take_ref) ++ put_lseg_locked(lseg); ++ /* someone is cleaning the layout */ ++ lseg = NULL; ++ goto out_unlock; ++ } ++ ++ if (lseg) { ++ dprintk("%s: Using cached lseg %p for %llu@%llu iomode %d)\n", ++ __func__, ++ lseg, ++ arg.length, ++ arg.offset, ++ arg.iomode); ++ ++ goto out_unlock; ++ } ++ ++ /* if get layout already failed once goto out */ ++ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) { ++ if (unlikely(nfsi->pnfs_layout_suspend && ++ get_seconds() >= nfsi->pnfs_layout_suspend)) { ++ dprintk("%s: layout_get resumed\n", __func__); ++ clear_bit(lo_fail_bit(iomode), ++ &nfsi->layout->state); ++ nfsi->pnfs_layout_suspend = 0; ++ } else ++ goto out_unlock; ++ } ++ ++ /* Reference the layout for layoutget matched in pnfs_layout_release */ ++ get_layout(lo); ++ spin_unlock(&ino->i_lock); ++ ++ send_layoutget(ino, ctx, &arg, lsegpp, lo); ++out: /* nfsi->layout is still NULL here if pnfs_alloc_layout failed */ ++ dprintk("%s end, state 0x%lx lseg %p\n", __func__, ++ nfsi->layout ? nfsi->layout->state : 0UL, lseg); ++ return; ++out_unlock: ++ if (lsegpp) ++ *lsegpp = lseg; ++ spin_unlock(&ino->i_lock); ++ goto out; ++} ++ ++void ++pnfs_get_layout_done(struct nfs4_layoutget *lgp, int rpc_status) ++{ ++ struct 
pnfs_layout_segment *lseg = NULL; ++ struct nfs_inode *nfsi = NFS_I(lgp->args.inode); ++ time_t suspend = 0; ++ ++ dprintk("-->%s\n", __func__); ++ ++ lgp->status = rpc_status; ++ if (likely(!rpc_status)) { ++ if (unlikely(lgp->res.layout.len < 0)) { ++ printk(KERN_ERR ++ "%s: ERROR Returned layout size is ZERO\n", __func__); ++ lgp->status = -EIO; ++ } ++ goto out; ++ } ++ ++ dprintk("%s: ERROR retrieving layout %d\n", __func__, rpc_status); ++ switch (rpc_status) { ++ case -NFS4ERR_BADLAYOUT: ++ lgp->status = -ENOENT; ++ /* FALLTHROUGH */ ++ case -EACCES: /* NFS4ERR_ACCESS */ ++ /* transient error, don't mark with NFS_INO_LAYOUT_FAILED */ ++ goto out; ++ ++ case -NFS4ERR_LAYOUTTRYLATER: ++ case -NFS4ERR_RECALLCONFLICT: ++ case -NFS4ERR_OLD_STATEID: ++ case -EAGAIN: /* NFS4ERR_LOCKED */ ++ lgp->status = -NFS4ERR_DELAY; /* for nfs4_handle_exception */ ++ /* FALLTHROUGH */ ++ case -NFS4ERR_GRACE: ++ case -NFS4ERR_DELAY: ++ goto out; ++ ++ case -NFS4ERR_ADMIN_REVOKED: ++ case -NFS4ERR_DELEG_REVOKED: ++ /* The layout is expected to be returned at this point. 
++ * This should clear the layout stateid as well */ ++ suspend = get_seconds() + 1; ++ break; ++ ++ case -NFS4ERR_LAYOUTUNAVAILABLE: ++ lgp->status = -ENOTSUPP; ++ break; ++ ++ case -NFS4ERR_REP_TOO_BIG: ++ case -NFS4ERR_REP_TOO_BIG_TO_CACHE: ++ lgp->status = -E2BIG; ++ break; ++ ++ /* Leave the following errors untranslated */ ++ case -NFS4ERR_DEADSESSION: ++ case -NFS4ERR_DQUOT: ++ case -EINVAL: /* NFS4ERR_INVAL */ ++ case -EIO: /* NFS4ERR_IO */ ++ case -NFS4ERR_FHEXPIRED: ++ case -NFS4ERR_MOVED: ++ case -NFS4ERR_NOSPC: ++ case -ESERVERFAULT: /* NFS4ERR_SERVERFAULT */ ++ case -ESTALE: /* NFS4ERR_STALE */ ++ case -ETOOSMALL: /* NFS4ERR_TOOSMALL */ ++ break; ++ ++ /* The following errors are our fault and should never happen */ ++ case -NFS4ERR_BADIOMODE: ++ case -NFS4ERR_BADXDR: ++ case -NFS4ERR_REQ_TOO_BIG: ++ case -NFS4ERR_UNKNOWN_LAYOUTTYPE: ++ case -NFS4ERR_WRONG_TYPE: ++ lgp->status = -EINVAL; ++ /* FALLTHROUGH */ ++ case -NFS4ERR_BAD_STATEID: ++ case -NFS4ERR_NOFILEHANDLE: ++ case -ENOTSUPP: /* NFS4ERR_NOTSUPP */ ++ case -NFS4ERR_OPENMODE: ++ case -NFS4ERR_OP_NOT_IN_SESSION: ++ case -NFS4ERR_TOO_MANY_OPS: ++ dprintk("%s: error %d: should never happen\n", __func__, ++ rpc_status); ++ break; ++ ++ /* The following errors are the server's fault */ ++ default: ++ dprintk("%s: illegal error %d\n", __func__, rpc_status); ++ lgp->status = -EIO; ++ break; ++ } ++ ++ /* remember that get layout failed and suspend trying */ ++ nfsi->pnfs_layout_suspend = suspend; ++ set_bit(lo_fail_bit(lgp->args.range.iomode), ++ &nfsi->layout->state); ++ dprintk("%s: layout_get suspended until %ld\n", ++ __func__, suspend); ++out: ++ dprintk("%s end (err:%d) state 0x%lx lseg %p\n", ++ __func__, lgp->status, nfsi->layout->state, lseg); ++ return; ++} ++ ++int ++pnfs_layout_process(struct nfs4_layoutget *lgp) ++{ ++ struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; ++ struct nfs4_layoutget_res *res = &lgp->res; ++ struct pnfs_layout_segment *lseg; ++ struct inode *ino = 
PNFS_INODE(lo); ++ int status = 0; ++ ++ /* Inject layout blob into I/O device driver */ ++ lseg = PNFS_LD_IO_OPS(lo)->alloc_lseg(lo, res); ++ if (!lseg || IS_ERR(lseg)) { ++ if (!lseg) ++ status = -ENOMEM; ++ else ++ status = PTR_ERR(lseg); ++ dprintk("%s: Could not allocate layout: error %d\n", ++ __func__, status); ++ goto out; ++ } ++ ++ spin_lock(&ino->i_lock); ++ init_lseg(lo, lseg); ++ lseg->range = res->range; ++ if (lgp->lsegpp) { ++ get_lseg(lseg); ++ *lgp->lsegpp = lseg; ++ } ++ pnfs_insert_layout(lo, lseg); ++ ++ if (res->return_on_close) { ++ lo->roc_iomode |= res->range.iomode; ++ if (!lo->roc_iomode) ++ lo->roc_iomode = IOMODE_ANY; ++ } ++ ++ /* Done processing layoutget. Set the layout stateid */ ++ pnfs_set_layout_stateid(lo, &res->stateid); ++ spin_unlock(&ino->i_lock); ++out: ++ return status; ++} ++ ++void ++readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset, ++ size_t *count) ++{ ++ struct page *first, *last; ++ loff_t foff, i_size = i_size_read(inode); ++ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; ++ size_t range; ++ ++ ++ first = list_entry((pages)->prev, struct page, lru); ++ last = list_entry((pages)->next, struct page, lru); ++ ++ foff = (loff_t)first->index << PAGE_CACHE_SHIFT; ++ ++ range = (last->index - first->index) * PAGE_CACHE_SIZE; ++ if (last->index == end_index) ++ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; ++ else ++ range += PAGE_CACHE_SIZE; ++ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff, ++ range); ++ *offset = foff; ++ *count = range; ++} ++ ++void ++pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layoutdriver_type *ld; ++ ++ pgio->pg_test = NULL; ++ ++ lo = NFS_I(inode)->layout; ++ ld = NFS_SERVER(inode)->pnfs_curr_ld; ++ if (!pnfs_enabled_sb(NFS_SERVER(inode)) || !lo) ++ return; ++ ++ if (ld->ld_policy_ops) ++ pgio->pg_test = ld->ld_policy_ops->pg_test; ++} ++ ++static u32 
++pnfs_getboundary(struct inode *inode) ++{ ++ u32 stripe_size = 0; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct layoutdriver_policy_operations *policy_ops; ++ ++ if (!nfss->pnfs_curr_ld) ++ goto out; ++ ++ policy_ops = nfss->pnfs_curr_ld->ld_policy_ops; ++ if (!policy_ops || !policy_ops->get_stripesize) ++ goto out; ++ ++ /* The default is to not gather across stripes */ ++ if (pnfs_ld_gather_across_stripes(nfss->pnfs_curr_ld)) ++ goto out; ++ ++ spin_lock(&inode->i_lock); ++ if (NFS_I(inode)->layout) ++ stripe_size = policy_ops->get_stripesize(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++out: ++ return stripe_size; ++} ++ ++/* ++ * rsize is already set by caller to MDS rsize. ++ */ ++void ++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, ++ struct inode *inode, ++ struct nfs_open_context *ctx, ++ struct list_head *pages, ++ size_t *rsize) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ size_t count = 0; ++ loff_t loff; ++ ++ pgio->pg_iswrite = 0; ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ pgio->pg_lseg = NULL; ++ ++ if (!pnfs_enabled_sb(nfss)) ++ return; ++ ++ /* Calculate the total read-ahead count */ ++ readahead_range(inode, pages, &loff, &count); ++ ++ if (count > 0) { ++ _pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ, ++ &pgio->pg_lseg); ++ if (!pgio->pg_lseg) ++ return; ++ ++ *rsize = NFS_SERVER(inode)->ds_rsize; ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ if (pgio->pg_boundary) ++ pnfs_set_pg_test(inode, pgio); ++ } ++} ++ ++void ++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, ++ size_t *wsize) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ pgio->pg_iswrite = 1; ++ if (!pnfs_enabled_sb(server)) { ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ return; ++ } ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ pnfs_set_pg_test(inode, pgio); ++ *wsize = server->ds_wsize; ++} ++ ++/* Return I/O buffer size for a layout driver ++ * This value 
will determine what size reads and writes ++ * will be gathered into and sent to the data servers. ++ * blocksize must be a multiple of the page cache size. ++ */ ++unsigned int ++pnfs_getiosize(struct nfs_server *server) ++{ ++ if (!PNFS_EXISTS_LDPOLICY_OP(server, get_blocksize)) ++ return 0; ++ return server->pnfs_curr_ld->ld_policy_ops->get_blocksize(); ++} ++ ++void ++pnfs_set_ds_iosize(struct nfs_server *server) ++{ ++ unsigned dssize = pnfs_getiosize(server); ++ ++ /* Set buffer size for data servers */ ++ if (dssize > 0) { ++ server->ds_rsize = server->ds_wsize = ++ nfs_block_size(dssize, NULL); ++ } else { ++ server->ds_wsize = server->wsize; ++ server->ds_rsize = server->rsize; ++ } ++} ++ ++static int ++pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data) ++{ ++ put_lseg(pdata->lseg); ++ pdata->lseg = NULL; ++ pdata->call_ops->rpc_call_done(task, data); ++ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN) ++ return -EAGAIN; ++ if (pdata->pnfsflags & PNFS_NO_RPC) { ++ pdata->call_ops->rpc_release(data); ++ } else { ++ /* ++ * just restore original rpc call ops ++ * rpc_release will be called later by the rpc scheduling layer. ++ */ ++ task->tk_ops = pdata->call_ops; ++ } ++ return 0; ++} ++ ++/* Post-write completion function ++ * Invoked by all layout drivers when write_pagelist is done. ++ * ++ * NOTE: callers set data->pnfsflags PNFS_NO_RPC ++ * so that the NFS cleanup routines perform only the page cache ++ * cleanup. 
++ */ ++static void ++pnfs_write_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ struct pnfs_layout_range range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = wdata->args.offset; ++ range.length = wdata->args.count; ++ _pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true); ++ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode), ++ wdata->pdata.call_ops, wdata->pdata.how); ++} ++ ++static void ++pnfs_writeback_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ /* update last write offset and need layout commit ++ * for non-files layout types (files layout calls ++ * pnfs4_write_done for this) ++ */ ++ if ((pdata->pnfsflags & PNFS_NO_RPC) && ++ data->task.tk_status >= 0 && data->res.count > 0) { ++ struct nfs_inode *nfsi = NFS_I(data->inode); ++ ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++ } ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++ ++static void _pnfs_clear_lseg_from_pages(struct list_head *head) ++{ ++ struct nfs_page *req; ++ ++ list_for_each_entry(req, head, wb_list) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem write function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. ++ * ++ * TODO: Is wdata->how and wdata->args.stable always the same value? 
++ * TODO: It seems in NFS, the server may not do a stable write even ++ * though it was requested (and vice-versa?). To check, it looks ++ * in data->res.verf->committed. Do we need this ability ++ * for non-file layout drivers? ++ */ ++enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *wdata, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ struct inode *inode = wdata->inode; ++ enum pnfs_try_status trypnfs; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg; ++ ++ wdata->pdata.call_ops = call_ops; ++ wdata->pdata.pnfs_error = 0; ++ wdata->pdata.how = how; ++ ++ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, ++ inode->i_ino, wdata->args.count, wdata->args.offset, how); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ wdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ wdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->write_pagelist(wdata, ++ nfs_page_array_len(wdata->args.pgbase, wdata->args.count), ++ how); ++ ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ wdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&wdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* Post-read completion function. 
Invoked by all layout drivers when ++ * read_pagelist is done ++ */ ++static void ++pnfs_read_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ struct pnfs_layout_range range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = rdata->args.offset; ++ range.length = rdata->args.count; ++ _pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true); ++ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode), ++ rdata->pdata.call_ops); ++} ++ ++static void ++pnfs_read_done(struct nfs_read_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem read function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. 
++ */ ++enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *rdata, ++ const struct rpc_call_ops *call_ops) ++{ ++ struct inode *inode = rdata->inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg; ++ enum pnfs_try_status trypnfs; ++ ++ rdata->pdata.call_ops = call_ops; ++ rdata->pdata.pnfs_error = 0; ++ ++ dprintk("%s: Reading ino:%lu %u@%llu\n", ++ __func__, inode->i_ino, rdata->args.count, rdata->args.offset); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ rdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ rdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->read_pagelist(rdata, ++ nfs_page_array_len(rdata->args.pgbase, rdata->args.count)); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ rdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&rdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_READ); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* ++ * This gives the layout driver an opportunity to read in page "around" ++ * the data to be written. It returns 0 on success, otherwise an error code ++ * which will either be passed up to user, or ignored if ++ * some previous part of write succeeded. ++ * Note the range [pos, pos+len-1] is entirely within the page. 
++ */ ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata) ++{ ++ struct pnfs_fsdata *data; ++ int status = 0; ++ ++ dprintk("--> %s: pos=%llu len=%u\n", ++ __func__, (unsigned long long)pos, len); ++ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL); ++ if (!data) { ++ status = -ENOMEM; ++ goto out; ++ } ++ data->lseg = lseg; /* refcount passed into data to be managed there */ ++ status = NFS_SERVER(inode)->pnfs_curr_ld->ld_io_ops->write_begin( ++ lseg, page, pos, len, data); ++ if (status) { ++ kfree(data); ++ data = NULL; ++ } ++out: ++ *fsdata = data; ++ dprintk("<-- %s: status=%d\n", __func__, status); ++ return status; ++} ++ ++/* Return 0 on succes, negative on failure */ ++/* CAREFUL - what happens if copied < len??? */ ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status; ++ ++ status = nfss->pnfs_curr_ld->ld_io_ops->write_end(inode, page, ++ pos, len, copied, lseg); ++ return status; ++} ++ ++/* pNFS Commit callback function for all layout drivers */ ++static void ++pnfs_commit_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_RW, ++ .offset = data->args.offset, ++ .length = data->args.count, ++ }; ++ dprintk("%s: retrying\n", __func__); ++ _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, ++ true); ++ pnfs_initiate_commit(data, NFS_CLIENT(data->inode), ++ pdata->call_ops, pdata->how, 1); ++ } ++} ++ ++enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int sync) ++{ ++ struct inode *inode = 
data->inode; ++ struct nfs_server *nfss = NFS_SERVER(data->inode); ++ enum pnfs_try_status trypnfs; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ /* We need to account for possibility that ++ * each nfs_page can point to a different lseg (or be NULL). ++ * For the immediate case of whole-file-only layouts, we at ++ * least know there can be only a single lseg. ++ * We still have to account for the possibility of some being NULL. ++ * This will be done by passing the buck to the layout driver. ++ */ ++ data->pdata.call_ops = call_ops; ++ data->pdata.pnfs_error = 0; ++ data->pdata.how = sync; ++ data->pdata.lseg = NULL; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->commit(data, sync); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ data->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ _pnfs_clear_lseg_from_pages(&data->pages); ++ } else ++ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT); ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) ++{ ++ struct nfs_server *nfss = NFS_SERVER(data->args.inode); ++ ++ /* TODO: Maybe we should avoid this by allowing the layout driver ++ * to directly xdr its layout on the wire. ++ */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit) ++ nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit( ++ NFS_I(data->args.inode)->layout, ++ &data->args, data->status); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int ++pnfs_layoutcommit_setup(struct inode *inode, ++ struct nfs4_layoutcommit_data *data, ++ loff_t write_begin_pos, loff_t write_end_pos) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int result = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ data->args.inode = inode; ++ data->args.fh = NFS_FH(inode); ++ data->args.layout_type = nfss->pnfs_curr_ld->id; ++ data->res.fattr = &data->fattr; ++ nfs_fattr_init(&data->fattr); ++ ++ /* TODO: Need to determine the correct values */ ++ data->args.time_modify_changed = 0; ++ ++ /* Set values from inode so it can be reset ++ */ ++ data->args.range.iomode = IOMODE_RW; ++ data->args.range.offset = write_begin_pos; ++ data->args.range.length = write_end_pos - write_begin_pos + 1; ++ data->args.lastbytewritten = min(write_end_pos, ++ i_size_read(inode) - 1); ++ data->args.bitmask = nfss->attr_bitmask; ++ data->res.server = nfss; ++ ++ /* Call layout driver to set the arguments */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit) ++ result = nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit( ++ NFS_I(inode)->layout, &data->args); ++ ++ dprintk("<-- %s Status %d\n", __func__, result); ++ return result; ++} ++ ++/* Issue an async layoutcommit for an inode.
++ */ ++int ++pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ struct nfs4_layoutcommit_data *data; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ ++ int status = 0; ++ ++ dprintk("%s Begin (sync:%d)\n", __func__, sync); ++ ++ BUG_ON(!has_layout(nfsi)); ++ ++ data = pnfs_layoutcommit_alloc(); ++ if (!data) ++ return -ENOMEM; ++ ++ spin_lock(&inode->i_lock); ++ if (!layoutcommit_needed(nfsi)) { ++ spin_unlock(&inode->i_lock); ++ goto out_free; ++ } ++ ++ /* Clear layoutcommit properties in the inode so ++ * new lc info can be generated ++ */ ++ write_begin_pos = nfsi->layout->write_begin_pos; ++ write_end_pos = nfsi->layout->write_end_pos; ++ data->cred = nfsi->layout->cred; ++ nfsi->layout->write_begin_pos = 0; ++ nfsi->layout->write_end_pos = 0; ++ nfsi->layout->cred = NULL; ++ __clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state); ++ pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout); ++ ++ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */ ++ get_layout(NFS_I(inode)->layout); ++ ++ spin_unlock(&inode->i_lock); ++ ++ /* Set up layout commit args */ ++ status = pnfs_layoutcommit_setup(inode, data, write_begin_pos, ++ write_end_pos); ++ if (status) { ++ /* The layout driver failed to setup the layoutcommit */ ++ put_rpccred(data->cred); ++ put_layout(inode); ++ goto out_free; ++ } ++ status = nfs4_proc_layoutcommit(data, sync); ++out: ++ dprintk("%s end (err:%d)\n", __func__, status); ++ return status; ++out_free: ++ pnfs_layoutcommit_free(data); ++ goto out; ++} ++ ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata) ++{ ++ if (fsdata) { ++ /* lseg refcounting handled directly in nfs_write_end */ ++ kfree(fsdata); ++ } ++} ++ ++/* Callback operations for layout drivers.
++ */ ++struct pnfs_client_operations pnfs_ops = { ++ .nfs_getdevicelist = nfs4_proc_getdevicelist, ++ .nfs_getdeviceinfo = nfs4_proc_getdeviceinfo, ++ .nfs_readlist_complete = pnfs_read_done, ++ .nfs_writelist_complete = pnfs_writeback_done, ++ .nfs_commit_complete = pnfs_commit_done, ++}; ++ ++EXPORT_SYMBOL(pnfs_unregister_layoutdriver); ++EXPORT_SYMBOL(pnfs_register_layoutdriver); ++ ++ ++/* Device ID cache. Supports one layout type per struct nfs_client */ ++int ++nfs4_alloc_init_deviceid_cache(struct nfs_client *clp, ++ void (*free_callback)(struct kref *)) ++{ ++ struct nfs4_deviceid_cache *c; ++ ++ c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL); ++ if (!c) ++ return -ENOMEM; ++ spin_lock(&clp->cl_lock); ++ if (clp->cl_devid_cache != NULL) { ++ kref_get(&clp->cl_devid_cache->dc_kref); ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [kref [%d]]\n", __func__, ++ atomic_read(&clp->cl_devid_cache->dc_kref.refcount)); ++ kfree(c); ++ } else { ++ int i; ++ ++ spin_lock_init(&c->dc_lock); ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++) ++ INIT_HLIST_HEAD(&c->dc_deviceids[i]); ++ kref_init(&c->dc_kref); ++ c->dc_free_callback = free_callback; ++ clp->cl_devid_cache = c; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [new]\n", __func__); ++ } ++ return 0; ++} ++EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache); ++ ++void ++nfs4_init_deviceid_node(struct nfs4_deviceid *d) ++{ ++ INIT_HLIST_NODE(&d->de_node); ++ kref_init(&d->de_kref); ++} ++EXPORT_SYMBOL(nfs4_init_deviceid_node); ++ ++/* Called from layoutdriver_io_operations->alloc_lseg */ ++void ++nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = d; ++} ++EXPORT_SYMBOL(nfs4_set_layout_deviceid); ++ ++/* Called from layoutdriver_io_operations->free_lseg */ ++void ++nfs4_put_unset_layout_deviceid(struct pnfs_layout_segment *l, ++ struct nfs4_deviceid *d, ++ void (*free_callback)(struct kref 
*)) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = NULL; ++ kref_put(&d->de_kref, free_callback); ++} ++EXPORT_SYMBOL(nfs4_put_unset_layout_deviceid); ++ ++/* Find and reference a deviceid */ ++struct nfs4_deviceid * ++nfs4_find_get_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ if (!atomic_inc_not_zero(&d->de_kref.refcount)) { ++ goto fail; ++ } else { ++ rcu_read_unlock(); ++ return d; ++ } ++ } ++ } ++fail: ++ rcu_read_unlock(); ++ return NULL; ++} ++EXPORT_SYMBOL(nfs4_find_get_deviceid); ++ ++/* ++ * Add and kref_get a deviceid. ++ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new ++ */ ++struct nfs4_deviceid * ++nfs4_add_get_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(&new->de_id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ kref_get(&d->de_kref); ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [discard]\n", __func__); ++ c->dc_free_callback(&new->de_kref); ++ return d; ++ } ++ } ++ hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); ++ kref_get(&new->de_kref); ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [new]\n", __func__); ++ return new; ++} ++EXPORT_SYMBOL(nfs4_add_get_deviceid); ++ ++/* ++ * Remove the first deviceid from a hash bucket, or return 0 if bucket list ++ * is empty. 
++ */ ++static int ++nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash, ++ struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (id && memcmp(id, &d->de_id, NFS4_PNFS_DEVICEID4_SIZE)) ++ continue; ++ hlist_del_rcu(&d->de_node); ++ spin_unlock(&c->dc_lock); ++ synchronize_rcu(); ++ dprintk("%s [%d]\n", __func__, ++ atomic_read(&d->de_kref.refcount)); ++ kref_put(&d->de_kref, c->dc_free_callback); ++ return 1; ++ } ++ spin_unlock(&c->dc_lock); ++ return 0; ++} ++ ++void ++nfs4_delete_device(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ long hash = nfs4_deviceid_hash(id); ++ ++ nfs4_remove_deviceid(c, hash, id); ++} ++EXPORT_SYMBOL(nfs4_delete_device); ++ ++static void ++nfs4_free_deviceid_cache(struct kref *kref) ++{ ++ struct nfs4_deviceid_cache *cache = ++ container_of(kref, struct nfs4_deviceid_cache, dc_kref); ++ long i; ++ ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) ++ while (nfs4_remove_deviceid(cache, i, NULL)) ++ ; ++ kfree(cache); ++} ++ ++void ++nfs4_put_deviceid_cache(struct nfs_client *clp) ++{ ++ struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache; ++ int refcount; ++ ++ dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); ++ spin_lock(&clp->cl_lock); ++ refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount); ++ if (refcount == 1) ++ clp->cl_devid_cache = NULL; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [%d]\n", __func__, refcount); ++ kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache); ++} ++EXPORT_SYMBOL(nfs4_put_deviceid_cache); +diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h +--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-31 20:42:05.542222767 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-31 20:42:05.542222767 -0400 +@@ -0,0 +1,354 @@ ++/* ++ * fs/nfs/pnfs.h ++ 
* ++ * pNFS client data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_PNFS_H ++#define FS_NFS_PNFS_H ++ ++#include ++ ++#ifdef CONFIG_NFS_V4_1 ++ ++#include ++#include ++#include "iostat.h" ++ ++/* nfs4proc.c */ ++extern int nfs4_proc_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, ++ struct pnfs_device *dev); ++extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); ++extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, ++ int issync); ++extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait); ++ ++/* pnfs.c */ ++extern const nfs4_stateid zero_stateid; ++ ++void _pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp); ++ ++int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type, bool wait); ++void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); ++void unmount_pnfs_layoutdriver(struct nfs_server *); ++enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, ++ const struct rpc_call_ops *); ++int pnfs_initialize(void); ++void pnfs_uninitialize(void); ++void pnfs_layoutcommit_free(struct nfs4_layoutcommit_data *data); ++void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); ++int pnfs_layoutcommit_inode(struct inode *inode, int sync); ++void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent); ++void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx); ++unsigned int 
pnfs_getiosize(struct nfs_server *server); ++void pnfs_set_ds_iosize(struct nfs_server *server); ++enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, ++ struct nfs_open_context *, struct list_head *, ++ size_t *); ++void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, ++ size_t *); ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); ++void pnfs_get_layout_done(struct nfs4_layoutget *, int rpc_status); ++int pnfs_layout_process(struct nfs4_layoutget *lgp); ++void pnfs_layout_release(struct pnfs_layout_hdr *, struct pnfs_layout_range *range); ++void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, ++ const nfs4_stateid *stateid); ++void pnfs_destroy_layout(struct nfs_inode *); ++void pnfs_destroy_all_layouts(struct nfs_client *); ++void put_layout(struct inode *inode); ++void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo); ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata); ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ ++#define PNFS_EXISTS_LDIO_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_io_ops && \ ++ (srv)->pnfs_curr_ld->ld_io_ops->opname) ++#define PNFS_EXISTS_LDPOLICY_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops->opname) ++ ++#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" ++ ++static inline int lo_fail_bit(u32 iomode) ++{ ++ return iomode == IOMODE_RW ? 
++ NFS_INO_RW_LAYOUT_FAILED : NFS_INO_RO_LAYOUT_FAILED; ++} ++ ++/* Return true if a layout driver is being used for this mountpoint */ ++static inline int pnfs_enabled_sb(struct nfs_server *nfss) ++{ ++ return nfss->pnfs_curr_ld != NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || ++ !fsdata->bypass_eof; ++} ++ ++/* Should the pNFS client commit and return the layout upon a setattr */ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ if (!pnfs_enabled_sb(NFS_SERVER(inode))) ++ return false; ++ return NFS_SERVER(inode)->pnfs_curr_ld->ld_policy_ops->flags & ++ PNFS_LAYOUTRET_ON_SETATTR; ++} ++ ++/* Should the pNFS client commit and return the layout on close ++ */ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout->roc_iomode; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status = 0; ++ ++ *fsdata = lseg; ++ if (lseg && PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ status = _pnfs_write_begin(inode, page, pos, len, lseg, ++ (struct pnfs_fsdata **) fsdata); ++ return status; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_end)) ++ return _pnfs_write_end(inode, page, pos, len, copied, lseg); ++ else ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ 
if (PNFS_EXISTS_LDIO_OP(nfss, write_end_cleanup)) ++ nfss->pnfs_curr_ld->ld_io_ops->write_end_cleanup(filp, fsdata); ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ pnfs_free_fsdata(fsdata); ++ } ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss) && ++ (type != RETURN_FILE || has_layout(nfsi))) ++ return _pnfs_return_layout(ino, range, stateid, type, wait); ++ ++ return 0; ++} ++ ++static inline void pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss)) ++ _pnfs_update_layout(ino, ctx, pos, count, access_type, lsegpp); ++ else { ++ if (lsegpp) ++ *lsegpp = NULL; ++ } ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ if (pnfs_enabled_sb(nfss)) ++ return pnfs_ld_use_rpc_code(nfss->pnfs_curr_ld); ++ ++ return 1; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ return ((struct pnfs_fsdata *) fsdata)->lseg; ++ } ++ return fsdata; ++} ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++} ++ ++static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++} ++ ++static inline void get_lseg(struct 
pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void put_lseg(struct pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void ++pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ if (lsegpp) ++ *lsegpp = NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return 1; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ *fsdata = NULL; ++ return 0; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ return 1; ++} ++ ++static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ return 0; ++} ++ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ return false; ++} ++ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode 
*nfsi) ++{ ++ return 0; ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ return 0; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ return NULL; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++#endif /* FS_NFS_PNFS_H */ +diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-31 20:41:19.163155499 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-31 20:42:05.543103394 -0400 +@@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) +- goto out; ++ goto out_free; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); +@@ -455,7 +455,7 @@ nfs_proc_symlink(struct inode *dir, stru + */ + if (status == 0) + status = nfs_instantiate(dentry, fh, fattr); +- ++out_free: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); + out: +@@ -694,6 +694,7 @@ const struct nfs_rpc_ops nfs_v2_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, + .file_inode_ops = &nfs_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-31 20:41:19.164160482 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-31 20:42:05.544233042 -0400 +@@ -18,8 +18,12 @@ + #include + #include + #include ++#include ++#include + + #include ++#include ++#include "pnfs.h" + + #include "nfs4_fs.h" + #include "internal.h" +@@ -117,11 +121,14 @@ int nfs_readpage_async(struct nfs_open_c + LIST_HEAD(one_request); + struct nfs_page *new; + 
unsigned int len; ++ struct pnfs_layout_segment *lseg; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); +- new = nfs_create_request(ctx, inode, page, 0, len); ++ pnfs_update_layout(inode, ctx, 0, NFS4_MAX_UINT64, IOMODE_READ, &lseg); ++ new = nfs_create_request(ctx, inode, page, 0, len, lseg); ++ put_lseg(lseg); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); +@@ -155,24 +162,20 @@ static void nfs_readpage_release(struct + nfs_release_request(req); + } + +-/* +- * Set up the NFS read request struct +- */ +-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset) ++int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, +@@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_ + .flags = RPC_TASK_ASYNC | swap_flags, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->read_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_read); ++ ++int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) ++ return pnfs_get_read_status(data); ++ ++ return nfs_initiate_read(data, clnt, call_ops); ++} ++ ++/* ++ * Set up the NFS read request struct ++ */ ++static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + data->req = req; + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -190,27 +230,14 @@ static int nfs_read_rpcsetup(struct nfs_ + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops); + } + + static void +@@ -354,7 +381,14 @@ static void nfs_readpage_retry(struct rp + { + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; ++ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client; + ++#ifdef CONFIG_NFS_V4_1 ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + if (resp->eof || resp->count == argp->count) + return; + +@@ -368,7 +402,10 @@ static void nfs_readpage_retry(struct rp + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; +- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + } + + /* +@@ -409,13 +446,19 @@ static void nfs_readpage_release_partial + void nfs_read_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_read_data *data = calldata; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, + &data->args.seq_args, &data->res.seq_res, + 0, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_read_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_read_partial_ops = { +@@ -568,7 +611,8 
@@ readpage_async_filler(void *data, struct + if (len == 0) + return nfs_return_empty_page(page); + +- new = nfs_create_request(desc->ctx, inode, page, 0, len); ++ new = nfs_create_request(desc->ctx, inode, page, 0, len, ++ desc->pgio->pg_lseg); + if (IS_ERR(new)) + goto out_error; + +@@ -624,6 +668,9 @@ int nfs_readpages(struct file *filp, str + if (ret == 0) + goto read_complete; /* all pages were read */ + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize); ++#endif /* CONFIG_NFS_V4_1 */ + if (rsize < PAGE_CACHE_SIZE) + nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); + else +@@ -632,6 +679,7 @@ int nfs_readpages(struct file *filp, str + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + + nfs_pageio_complete(&pgio); ++ put_lseg(pgio.pg_lseg); + npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); + read_complete: +diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-31 20:41:19.165170508 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-31 20:42:05.545114737 -0400 +@@ -64,6 +64,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -676,6 +677,28 @@ static int nfs_show_options(struct seq_f + + return 0; + } ++#ifdef CONFIG_NFS_V4_1 ++void show_sessions(struct seq_file *m, struct nfs_server *server) ++{ ++ if (nfs4_has_session(server->nfs_client)) ++ seq_printf(m, ",sessions"); ++} ++#else ++void show_sessions(struct seq_file *m, struct nfs_server *server) {} ++#endif ++ ++#ifdef CONFIG_NFS_V4_1 ++void show_pnfs(struct seq_file *m, struct nfs_server *server) ++{ ++ seq_printf(m, ",pnfs="); ++ if (server->pnfs_curr_ld) ++ seq_printf(m, "%s", server->pnfs_curr_ld->name); ++ else ++ seq_printf(m, "not configured"); ++} ++#else /* CONFIG_NFS_V4_1 */ ++void 
show_pnfs(struct seq_file *m, struct nfs_server *server) {} ++#endif /* CONFIG_NFS_V4_1 */ + + /* + * Present statistical information for this VFS mountpoint +@@ -714,6 +737,8 @@ static int nfs_show_stats(struct seq_fil + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); ++ show_sessions(m, nfss); ++ show_pnfs(m, nfss); + } + #endif + +diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-31 20:41:19.166151095 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-31 20:42:05.546131839 -0400 +@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task + struct nfs_unlinkdata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->dir); + +- if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-31 20:41:17.273213379 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-31 20:42:05.548212682 -0400 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + +@@ -28,6 +29,7 @@ + #include "iostat.h" + #include "nfs4_fs.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PAGECACHE + +@@ -59,6 +61,7 @@ struct nfs_write_data *nfs_commitdata_al + } + return p; + } ++EXPORT_SYMBOL(nfs_commitdata_alloc); + + void nfs_commit_free(struct nfs_write_data *p) + { +@@ -66,6 +69,7 @@ void nfs_commit_free(struct nfs_write_da + kfree(p->pagevec); + mempool_free(p, nfs_commit_mempool); + } ++EXPORT_SYMBOL(nfs_commit_free); + + struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) + { +@@ -418,6 +422,17 @@ static void nfs_inode_remove_request(str + 
nfs_clear_request(req); + nfs_release_request(req); + } ++static void ++nfs_mark_request_nopnfs(struct nfs_page *req) ++{ ++ struct pnfs_layout_segment *lseg = req->wb_lseg; ++ ++ if (req->wb_lseg == NULL) ++ return; ++ req->wb_lseg = NULL; ++ put_lseg(lseg); ++ dprintk(" retry through MDS\n"); ++} + + static void + nfs_mark_request_dirty(struct nfs_page *req) +@@ -523,7 +538,7 @@ nfs_need_commit(struct nfs_inode *nfsi) + * The requests are *not* checked to ensure that they form a contiguous set. + */ + static int +-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) ++nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs) + { + struct nfs_inode *nfsi = NFS_I(inode); + int ret; +@@ -531,7 +546,8 @@ nfs_scan_commit(struct inode *inode, str + if (!nfs_need_commit(nfsi)) + return 0; + +- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); ++ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT, ++ use_pnfs); + if (ret > 0) + nfsi->ncommit -= ret; + if (nfs_need_commit(NFS_I(inode))) +@@ -560,7 +576,8 @@ static inline int nfs_scan_commit(struct + static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int bytes) ++ unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + unsigned int rqend; +@@ -585,8 +602,8 @@ static struct nfs_page *nfs_try_to_updat + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ +- if (offset > rqend +- || end < req->wb_offset) ++ if (offset > rqend || end < req->wb_offset || ++ req->wb_lseg != lseg) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) +@@ -634,16 +651,17 @@ out_err: + * already called nfs_flush_incompatible() if necessary. 
+ */ + static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, +- struct page *page, unsigned int offset, unsigned int bytes) ++ struct page *page, unsigned int offset, unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + +- req = nfs_try_to_update_request(inode, page, offset, bytes); ++ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg); + if (req != NULL) + goto out; +- req = nfs_create_request(ctx, inode, page, offset, bytes); ++ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); +@@ -656,23 +674,27 @@ out: + } + + static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, ++ void *fsdata) + { + struct nfs_page *req; + +- req = nfs_setup_write_request(ctx, page, offset, count); ++ req = nfs_setup_write_request(ctx, page, offset, count, lseg); + if (IS_ERR(req)) + return PTR_ERR(req); + nfs_mark_request_dirty(req); + /* Update file length */ +- nfs_grow_file(page, offset, count); ++ if (pnfs_grow_ok(lseg, fsdata)) ++ nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + return 0; + } + +-int nfs_flush_incompatible(struct file *file, struct page *page) ++int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_page *req; +@@ -689,7 +711,10 @@ int nfs_flush_incompatible(struct file * + req = nfs_page_find_request(page); + if (req == NULL) + return 0; +- do_flush = req->wb_page != page || req->wb_context != ctx; ++ do_flush = req->wb_page != page || req->wb_context != ctx || ++ 
req->wb_lock_context->lockowner != current->files || ++ req->wb_lock_context->pid != current->tgid || ++ req->wb_lseg != lseg; + nfs_release_request(req); + if (!do_flush) + return 0; +@@ -716,7 +741,8 @@ static int nfs_write_pageuptodate(struct + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ + int nfs_updatepage(struct file *file, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page->mapping->host; +@@ -741,7 +767,7 @@ int nfs_updatepage(struct file *file, st + offset = 0; + } + +- status = nfs_writepage_setup(ctx, page, offset, count); ++ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); + if (status < 0) + nfs_set_pageerror(page); + +@@ -771,25 +797,21 @@ static int flush_task_priority(int how) + return RPC_PRIORITY_NORMAL; + } + +-/* +- * Set up the argument/result storage required for the RPC call. 
+- */ +-static int nfs_write_rpcsetup(struct nfs_page *req, +- struct nfs_write_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset, +- int how) ++int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, +@@ -800,12 +822,62 @@ static int nfs_write_rpcsetup(struct nfs + }; + int ret = 0; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->write_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ if (how & FLUSH_SYNC) { ++ ret = rpc_wait_for_completion_task(task); ++ if (ret == 0) ++ ret = task->tk_status; ++ } ++ rpc_put_task(task); ++out: ++ return ret; ++} ++EXPORT_SYMBOL(nfs_initiate_write); ++ ++int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_write(data, clnt, call_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int nfs_write_rpcsetup(struct nfs_page *req, ++ struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset, ++ int how) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + data->req = req; + data->inode = inode = req->wb_context->path.dentry->d_inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -813,6 +885,7 @@ static int nfs_write_rpcsetup(struct nfs + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + if (how & FLUSH_STABLE) { + data->args.stable = NFS_DATA_SYNC; +@@ -825,30 +898,7 @@ static int nfs_write_rpcsetup(struct nfs + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- dprintk("NFS: %5u initiated write call " +- "(req %s/%lld, %u bytes @ offset %llu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) { +- ret = PTR_ERR(task); +- goto out; +- } +- if (how & FLUSH_SYNC) { +- ret = rpc_wait_for_completion_task(task); +- if (ret == 0) +- ret = task->tk_status; +- } +- rpc_put_task(task); +-out: +- return ret; ++ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); + } + + /* If a nfs_flush_* function fails, it should remove reqs from @head and +@@ -859,6 +909,7 @@ static void nfs_redirty_request(struct n + { + struct page *page = req->wb_page; + ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +@@ -971,6 +1022,10 @@ static void nfs_pageio_init_write(struct + { + size_t wsize = NFS_SERVER(inode)->wsize; + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_write(pgio, inode, &wsize); ++#endif /* CONFIG_NFS_V4_1 */ ++ + if (wsize < PAGE_CACHE_SIZE) + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); + else +@@ -1036,13 +1091,27 @@ out: + void nfs_write_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_write_data *data = calldata; +- struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(clp, &data->args.seq_args, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) { ++ /* retrying via MDS? 
*/ ++ data->pdata.orig_count = data->args.count; ++ data->args.count = NFS_SERVER(data->inode)->wsize; ++ dprintk("%s: trimmed count %u to wsize %u\n", __func__, ++ data->pdata.orig_count, data->args.count); ++ } else ++ data->pdata.orig_count = 0; ++ ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, ++ &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_write_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_write_partial_ops = { +@@ -1126,10 +1195,11 @@ int nfs_writeback_done(struct rpc_task * + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *clp = server->nfs_client; + int status; + +- dprintk("NFS: %5u nfs_writeback_done (status %d)\n", +- task->tk_pid, task->tk_status); ++ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n", ++ task->tk_pid, task->tk_status, resp->count); + + /* + * ->write_done will attempt to use post-op attributes to detect +@@ -1142,6 +1212,13 @@ int nfs_writeback_done(struct rpc_task * + if (status != 0) + return status; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); ++#ifdef CONFIG_NFS_V4_1 ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && task->tk_status >= 0) { +@@ -1158,7 +1235,7 @@ int nfs_writeback_done(struct rpc_task * + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", +- server->nfs_client->cl_hostname, ++ clp->cl_hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } +@@ -1168,6 +1245,9 @@ int nfs_writeback_done(struct rpc_task * + if (task->tk_status >= 
0 && resp->count < argp->count) { + static unsigned long complain; + ++ dprintk("NFS: short write:" ++ " (resp->count %u) < (argp->count = %u)\n", ++ resp->count, argp->count); + nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + + /* Has the server at least made some progress? */ +@@ -1184,7 +1264,10 @@ int nfs_writeback_done(struct rpc_task * + */ + argp->stable = NFS_FILE_SYNC; + } +- nfs_restart_rpc(task, server->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + return -EAGAIN; + } + if (time_before(complain, jiffies)) { +@@ -1228,40 +1311,73 @@ static void nfs_commitdata_release(void + nfs_commit_free(wdata); + } + +-/* +- * Set up the argument/result storage required for the RPC call. +- */ +-static int nfs_commit_rpcsetup(struct list_head *head, +- struct nfs_write_data *data, +- int how) ++int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct nfs_page *first = nfs_list_entry(head->next); +- struct inode *inode = first->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = first->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, +- .callback_ops = &nfs_commit_ops, ++ .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->commit_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_commit); ++ ++ ++int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs) ++{ ++ if (pnfs && ++ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. ++ */ ++static int nfs_commit_rpcsetup(struct list_head *head, ++ struct nfs_write_data *data, ++ int how, int pnfs) ++{ ++ struct nfs_page *first = nfs_list_entry(head->next); ++ struct inode *inode = first->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + list_splice_init(head, &data->pages); + + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = first->wb_context->cred; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ +@@ -1272,45 +1388,47 @@ static int nfs_commit_rpcsetup(struct li + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); ++ kref_init(&data->refcount); ++ data->parent = NULL; ++ data->args.context = first->wb_context; /* used by commit done */ + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->commit_setup(data, &msg); ++ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops, ++ how, pnfs); ++} + +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++/* Handle memory error during commit */ ++void nfs_mark_list_commit(struct list_head *head) ++{ ++ struct nfs_page *req; + +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ while (!list_empty(head)) { ++ req = nfs_list_entry(head->next); ++ nfs_list_remove_request(req); ++ nfs_mark_request_commit(req); ++ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); ++ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, ++ BDI_RECLAIMABLE); ++ nfs_clear_page_tag_locked(req); ++ } + } ++EXPORT_SYMBOL(nfs_mark_list_commit); + + /* + * Commit dirty pages + */ + static int +-nfs_commit_list(struct inode *inode, struct list_head *head, int how) ++nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs) + { + struct nfs_write_data *data; +- struct nfs_page *req; + + data = nfs_commitdata_alloc(); +- + if (!data) + goto out_bad; + + /* Set up the argument struct */ +- return nfs_commit_rpcsetup(head, data, how); ++ return nfs_commit_rpcsetup(head, data, how, pnfs); + out_bad: +- while (!list_empty(head)) { +- req = nfs_list_entry(head->next); +- nfs_list_remove_request(req); +- nfs_mark_request_commit(req); +- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); +- dec_bdi_stat(req->wb_page->mapping->backing_dev_info, +- BDI_RECLAIMABLE); +- nfs_clear_page_tag_locked(req); +- } ++ nfs_mark_list_commit(head); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; + } +@@ -1330,6 +1448,19 @@ static void nfs_commit_done(struct rpc_t + return; + } + ++static inline void nfs_commit_cleanup(struct kref *kref) ++{ ++ struct nfs_write_data *data; ++ ++ data = container_of(kref, struct nfs_write_data, refcount); ++ /* Clear lock only when all cloned commits are finished */ ++ if 
(data->parent) ++ kref_put(&data->parent->refcount, nfs_commit_cleanup); ++ else ++ nfs_commit_clear_lock(NFS_I(data->inode)); ++ nfs_commitdata_release(data); ++} ++ + static void nfs_commit_release(void *calldata) + { + struct nfs_write_data *data = calldata; +@@ -1347,6 +1478,11 @@ static void nfs_commit_release(void *cal + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { ++ if (req->wb_lseg) { ++ nfs_mark_request_nopnfs(req); ++ nfs_mark_request_dirty(req); ++ goto next; ++ } + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); +@@ -1363,12 +1499,12 @@ static void nfs_commit_release(void *cal + } + /* We have a mismatch. Write the page again */ + dprintk(" mismatch\n"); ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + next: + nfs_clear_page_tag_locked(req); + } +- nfs_commit_clear_lock(NFS_I(data->inode)); +- nfs_commitdata_release(calldata); ++ kref_put(&data->refcount, nfs_commit_cleanup); + } + + static const struct rpc_call_ops nfs_commit_ops = { +@@ -1384,21 +1520,22 @@ int nfs_commit_inode(struct inode *inode + LIST_HEAD(head); + int may_wait = how & FLUSH_SYNC; + int res = 0; ++ int use_pnfs = 0; + + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out_mark_dirty; + spin_lock(&inode->i_lock); +- res = nfs_scan_commit(inode, &head, 0, 0); ++ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs); + spin_unlock(&inode->i_lock); + if (res) { +- int error = nfs_commit_list(inode, &head, how); ++ int error = nfs_commit_list(inode, &head, how, use_pnfs); + if (error < 0) + return error; +- if (may_wait) ++ if (may_wait) { + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); +- else ++ } else + goto out_mark_dirty; + } else + nfs_commit_clear_lock(NFS_I(inode)); +@@ -1451,7 +1588,18 @@ static int nfs_commit_unstable_pages(str + + int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) + { +- 
return nfs_commit_unstable_pages(inode, wbc); ++ int ret; ++ ret = nfs_commit_unstable_pages(inode, wbc); ++ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) { ++ int err, sync = wbc->sync_mode; ++ ++ if (wbc->nonblocking || wbc->for_background) ++ sync = 0; ++ err = pnfs_layoutcommit_inode(inode, sync); ++ if (err < 0) ++ ret = err; ++ } ++ return ret; + } + + /* +@@ -1459,6 +1607,7 @@ int nfs_write_inode(struct inode *inode, + */ + int nfs_wb_all(struct inode *inode) + { ++ int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, +@@ -1466,7 +1615,8 @@ int nfs_wb_all(struct inode *inode) + .range_end = LLONG_MAX, + }; + +- return sync_inode(inode, &wbc); ++ ret = sync_inode(inode, &wbc); ++ return ret; + } + + int nfs_wb_page_cancel(struct inode *inode, struct page *page) +diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/include/linux/exportfs.h +--- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-31 20:42:05.577222704 -0400 +@@ -2,6 +2,7 @@ + #define LINUX_EXPORTFS_H 1 + + #include ++#include + + struct dentry; + struct inode; +@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_pare + struct fid *fid, int fh_len, int fh_type, + struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); + ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct pnfs_filelayout_device; ++struct pnfs_filelayout_layout; ++ ++extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev); ++extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct list_head; ++ ++extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes); ++ ++extern enum nfsstat4 
blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *layouts); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_PNFSD) ++#include ++ ++struct pnfsd_cb_operations; ++ ++struct pnfsd_cb_ctl { ++ spinlock_t lock; ++ struct module *module; ++ const struct pnfsd_cb_operations *cb_op; ++}; ++ ++/* in expfs.c so that file systems can depend on it */ ++extern struct pnfsd_cb_ctl pnfsd_cb_ctl; ++ ++static inline int ++pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ int ret = -ENOENT; ++ ++ spin_lock(&pnfsd_cb_ctl.lock); ++ if (!pnfsd_cb_ctl.cb_op) ++ goto out; ++ if (!try_module_get(pnfsd_cb_ctl.module)) ++ goto out; ++ ctl->cb_op = pnfsd_cb_ctl.cb_op; ++ ctl->module = pnfsd_cb_ctl.module; ++ ret = 0; ++out: ++ spin_unlock(&pnfsd_cb_ctl.lock); ++ return ret; ++} ++ ++static inline void ++pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ module_put(ctl->module); ++} ++#endif /* CONFIG_PNFSD */ + #endif /* LINUX_EXPORTFS_H */ +diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-31 20:42:05.576053304 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-31 20:42:05.576053304 -0400 +@@ -0,0 +1,141 @@ ++#ifndef _LINUX_EXP_XDR_H ++#define _LINUX_EXP_XDR_H ++ ++#include ++#include ++#include ++ ++struct exp_xdr_stream { ++ __be32 *p; ++ __be32 *end; ++}; ++ ++/** ++ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline size_t ++exp_xdr_qwords(__u32 nbytes) ++{ ++ return DIV_ROUND_UP(nbytes, 4); ++} ++ ++/** ++ * exp_xdr_qbytes - Calculate the number of bytes holding qwords ++ * @qwords: number of quad-words to encode ++ */ ++static inline size_t ++exp_xdr_qbytes(size_t qwords) ++{ ++ return qwords << 2; ++} ++ ++/** ++ * exp_xdr_reserve_space - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nbytes: 
number of bytes to reserve ++ * ++ * Checks that we have enough buffer space to encode 'nbytes' more ++ * bytes of data. If so, update the xdr stream. ++ */ ++static inline __be32 * ++exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes) ++{ ++ __be32 *p = xdr->p; ++ __be32 *q; ++ ++ /* align nbytes on the next 32-bit boundary */ ++ q = p + exp_xdr_qwords(nbytes); ++ if (unlikely(q > xdr->end || q < p)) ++ return NULL; ++ xdr->p = q; ++ return p; ++} ++ ++/** ++ * exp_xdr_reserve_qwords - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @qwords: number of quad words (u32's) to reserve ++ */ ++static inline __be32 * ++exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords) ++{ ++ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords)); ++} ++ ++/** ++ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u32(__be32 *p, __u32 val) ++{ ++ *p = cpu_to_be32(val); ++ return p + 1; ++} ++ ++/** ++ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u64(__be32 *p, __u64 val) ++{ ++ put_unaligned_be64(val, p); ++ return p + 2; ++} ++ ++/** ++ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the array of bytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ if (likely(nbytes != 0)) { ++ unsigned int qwords = exp_xdr_qwords(nbytes); ++ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes; ++ ++ memcpy(p, ptr, nbytes); ++ if (padding != 0) ++ memset((char *)p + nbytes, 0, padding); ++ p += qwords; ++ } ++ return p; ++} ++ ++/** ++ * exp_xdr_encode_opaque - Encode an opaque type onto a
xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the opaque array ++ * @nbytes: number of bytes to encode ++ * ++ * Encodes the 32-bit opaque size in bytes followed by the opaque value. ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ p = exp_xdr_encode_u32(p, nbytes); ++ return exp_xdr_encode_bytes(p, ptr, nbytes); ++} ++ ++/** ++ * exp_xdr_encode_opaque_len - Encode the opaque length onto a xdr stream ++ * @lenp: pointer to the opaque length destination ++ * @endp: pointer to the end of the opaque array ++ * ++ * Encodes the 32-bit opaque size in bytes given the start and end pointers ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp) ++{ ++ size_t nbytes = (char *)endp - (char *)(lenp + 1); ++ ++ exp_xdr_encode_u32(lenp, nbytes); ++ return lenp + 1 + exp_xdr_qwords(nbytes); ++} ++#endif /* _LINUX_EXP_XDR_H */ +diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-31 20:41:19.120034834 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-31 20:42:05.579212604 -0400 +@@ -387,6 +387,7 @@ struct inodes_stat_t { + #include + + struct export_operations; ++struct pnfs_export_operations; + struct hd_geometry; + struct iovec; + struct nameidata; +@@ -1329,6 +1330,7 @@ struct super_block { + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; ++ const struct pnfs_export_operations *s_pnfs_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; +diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h +--- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-31 20:42:05.581035627 -0400 +@@ -17,7 +17,10 @@ + + #define NFS4_BITMAP_SIZE 2 + #define
NFS4_VERIFIER_SIZE 8 +-#define NFS4_STATEID_SIZE 16 ++#define NFS4_CLIENTID_SIZE 8 ++#define NFS4_STATEID_SEQID_SIZE 4 ++#define NFS4_STATEID_OTHER_SIZE 12 ++#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE) + #define NFS4_FHSIZE 128 + #define NFS4_MAXPATHLEN PATH_MAX + #define NFS4_MAXNAMLEN NAME_MAX +@@ -119,6 +122,13 @@ + #define EXCHGID4_FLAG_MASK_A 0x40070003 + #define EXCHGID4_FLAG_MASK_R 0x80070003 + ++static inline bool ++is_ds_only_session(u32 exchange_flags) ++{ ++ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS; ++ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS; ++} ++ + #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 +@@ -166,8 +176,25 @@ struct nfs4_acl { + struct nfs4_ace aces[0]; + }; + ++struct nfs4_fsid { ++ u64 major; ++ u64 minor; ++}; ++ + typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; +-typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid; ++typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid; ++ ++struct nfs41_stateid { ++ __be32 seqid; ++ char other[NFS4_STATEID_OTHER_SIZE]; ++} __attribute__ ((packed)); ++ ++typedef struct { ++ union { ++ char data[NFS4_STATEID_SIZE]; ++ struct nfs41_stateid stateid; ++ } u; ++} nfs4_stateid; + + enum nfs_opnum4 { + OP_ACCESS = 3, +@@ -471,6 +498,8 @@ enum lock_type4 { + #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) + #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) + #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) ++#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) ++#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) + + #define NFSPROC4_NULL 0 + #define NFSPROC4_COMPOUND 1 +@@ -523,6 +552,7 @@ enum { + NFSPROC4_CLNT_GETACL, + NFSPROC4_CLNT_SETACL, + NFSPROC4_CLNT_FS_LOCATIONS, ++ NFSPROC4_CLNT_RELEASE_LOCKOWNER, + + /* nfs41 */ + NFSPROC4_CLNT_EXCHANGE_ID, +@@ -531,6 +561,13 @@ enum { + 
NFSPROC4_CLNT_SEQUENCE, + NFSPROC4_CLNT_GET_LEASE_TIME, + NFSPROC4_CLNT_RECLAIM_COMPLETE, ++ NFSPROC4_CLNT_LAYOUTGET, ++ NFSPROC4_CLNT_LAYOUTCOMMIT, ++ NFSPROC4_CLNT_LAYOUTRETURN, ++ NFSPROC4_CLNT_GETDEVICELIST, ++ NFSPROC4_CLNT_GETDEVICEINFO, ++ NFSPROC4_CLNT_PNFS_WRITE, ++ NFSPROC4_CLNT_PNFS_COMMIT, + }; + + /* nfs41 types */ +@@ -549,6 +586,43 @@ enum state_protect_how4 { + SP4_SSV = 2 + }; + ++enum pnfs_layouttype { ++ LAYOUT_NFSV4_1_FILES = 1, ++ LAYOUT_OSD2_OBJECTS = 2, ++ LAYOUT_BLOCK_VOLUME = 3, ++}; ++ ++/* used for both layout return and recall */ ++enum pnfs_layoutreturn_type { ++ RETURN_FILE = 1, ++ RETURN_FSID = 2, ++ RETURN_ALL = 3 ++}; ++ ++enum pnfs_iomode { ++ IOMODE_READ = 1, ++ IOMODE_RW = 2, ++ IOMODE_ANY = 3, ++}; ++ ++enum pnfs_notify_deviceid_type4 { ++ NOTIFY_DEVICEID4_CHANGE = 1 << 1, ++ NOTIFY_DEVICEID4_DELETE = 1 << 2, ++}; ++ ++#define NFL4_UFLG_MASK 0x0000003F ++#define NFL4_UFLG_DENSE 0x00000001 ++#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 ++#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0 ++ ++/* Encoded in the loh_body field of type layouthint4 */ ++enum filelayout_hint_care4 { ++ NFLH4_CARE_DENSE = NFL4_UFLG_DENSE, ++ NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS, ++ NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040, ++ NFLH4_CARE_STRIPE_COUNT = 0x00000080 ++}; ++ + #endif + #endif + +diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-31 20:42:05.583087731 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-31 20:42:05.583087731 -0400 +@@ -0,0 +1,329 @@ ++/* ++ * include/linux/nfs4_pnfs.h ++ * ++ * Common data structures needed by the pnfs client and pnfs layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef LINUX_NFS4_PNFS_H ++#define LINUX_NFS4_PNFS_H ++ ++#include ++ ++enum pnfs_try_status { ++ PNFS_ATTEMPTED = 0, ++ PNFS_NOT_ATTEMPTED = 1, ++}; ++ ++#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 ++ ++/* Per-layout driver specific registration structure */ ++struct pnfs_layoutdriver_type { ++ const u32 id; ++ const char *name; ++ struct layoutdriver_io_operations *ld_io_ops; ++ struct layoutdriver_policy_operations *ld_policy_ops; ++}; ++ ++struct pnfs_fsdata { ++ int bypass_eof; ++ struct pnfs_layout_segment *lseg; ++ void *private; ++}; ++ ++#if defined(CONFIG_NFS_V4_1) ++ ++static inline struct nfs_inode * ++PNFS_NFS_INODE(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_I(lo->inode); ++} ++ ++static inline struct inode * ++PNFS_INODE(struct pnfs_layout_hdr *lo) ++{ ++ return lo->inode; ++} ++ ++static inline struct nfs_server * ++PNFS_NFS_SERVER(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo)); ++} ++ ++static inline struct pnfs_layoutdriver_type * ++PNFS_LD(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo))->pnfs_curr_ld; ++} ++ ++static inline struct layoutdriver_io_operations * ++PNFS_LD_IO_OPS(struct pnfs_layout_hdr *lo) ++{ ++ return PNFS_LD(lo)->ld_io_ops; ++} ++ ++static inline struct layoutdriver_policy_operations * ++PNFS_LD_POLICY_OPS(struct pnfs_layout_hdr *lo) ++{ ++ return PNFS_LD(lo)->ld_policy_ops; ++} ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout != NULL; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return has_layout(nfsi) && ++ test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state); ++} ++ ++extern void put_lseg(struct pnfs_layout_segment *lseg); ++extern void get_lseg(struct pnfs_layout_segment *lseg); ++ ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return false; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ 
return 0; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++struct pnfs_layout_segment { ++ struct list_head fi_list; ++ struct pnfs_layout_range range; ++ struct kref kref; ++ bool valid; ++ struct pnfs_layout_hdr *layout; ++ struct nfs4_deviceid *deviceid; ++ u8 ld_data[]; /* layout driver private data */ ++}; ++ ++static inline void * ++LSEG_LD_DATA(struct pnfs_layout_segment *lseg) ++{ ++ return lseg->ld_data; ++} ++ ++/* Layout driver I/O operations. ++ * Either the pagecache or non-pagecache read/write operations must be implemented ++ */ ++struct layoutdriver_io_operations { ++ /* Functions that use the pagecache. ++ * If use_pagecache == 1, then these functions must be implemented. ++ */ ++ /* read and write pagelist should return just 0 (to indicate that ++ * the layout code has taken control) or 1 (to indicate that the ++ * layout code wishes to fall back to normal nfs.) If 0 is returned, ++ * information can be passed back through nfs_data->res and ++ * nfs_data->task.tk_status, and the appropriate pnfs done function ++ * MUST be called. ++ */ ++ enum pnfs_try_status ++ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages); ++ enum pnfs_try_status ++ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how); ++ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page, ++ loff_t pos, unsigned count, ++ struct pnfs_fsdata *fsdata); ++ int (*write_end)(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ void (*write_end_cleanup)(struct file *filp, ++ struct pnfs_fsdata *fsdata); ++ ++ /* Consistency ops */ ++ /* 2 problems: ++ * 1) the page list contains nfs_pages, NOT pages ++ * 2) currently the NFS code doesn't create a page array (as it does with read/write) ++ */ ++ enum pnfs_try_status ++ (*commit) (struct nfs_write_data *nfs_data, int how); ++ ++ /* Layout information. 
For each inode, alloc_layout is executed once to retrieve an ++ * inode specific layout structure. Each subsequent layoutget operation results in ++ * a set_layout call to set the opaque layout in the layout driver.*/ ++ struct pnfs_layout_hdr * (*alloc_layout) (struct inode *inode); ++ void (*free_layout) (struct pnfs_layout_hdr *); ++ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); ++ void (*free_lseg) (struct pnfs_layout_segment *lseg); ++ ++ int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_args *args); ++ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args); ++ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_args *args, ++ int status); ++ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args); ++ ++ /* Registration information for a new mounted file system ++ */ ++ int (*initialize_mountpoint) (struct nfs_server *, ++ const struct nfs_fh * mntfh); ++ int (*uninitialize_mountpoint) (struct nfs_server *server); ++}; ++ ++enum layoutdriver_policy_flags { ++ /* Should the full nfs rpc cleanup code be used after io */ ++ PNFS_USE_RPC_CODE = 1 << 0, ++ ++ /* Should the NFS req. gather algorithm cross stripe boundaries? */ ++ PNFS_GATHER_ACROSS_STRIPES = 1 << 1, ++ ++ /* Should the pNFS client commit and return the layout upon a setattr */ ++ PNFS_LAYOUTRET_ON_SETATTR = 1 << 3, ++}; ++ ++struct layoutdriver_policy_operations { ++ unsigned flags; ++ ++ /* The stripe size of the file system */ ++ ssize_t (*get_stripesize) (struct pnfs_layout_hdr *layoutid); ++ ++ /* test for nfs page cache coalescing */ ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++ ++ /* Retrieve the block size of the file system.
++ * If gather_across_stripes == 1, then the file system will gather ++ * requests into the block size. ++ * TODO: Where will the layout driver get this info? It is hard ++ * coded in PVFS2. ++ */ ++ ssize_t (*get_blocksize) (void); ++}; ++ ++/* Should the full nfs rpc cleanup code be used after io */ ++static inline int ++pnfs_ld_use_rpc_code(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_USE_RPC_CODE; ++} ++ ++/* Should the NFS req. gather algorithm cross stripe boundaries? */ ++static inline int ++pnfs_ld_gather_across_stripes(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_GATHER_ACROSS_STRIPES; ++} ++ ++struct pnfs_device { ++ struct pnfs_deviceid dev_id; ++ unsigned int layout_type; ++ unsigned int mincount; ++ struct page **pages; ++ void *area; ++ unsigned int pgbase; ++ unsigned int pglen; ++ unsigned int dev_notify_types; ++}; ++ ++struct pnfs_devicelist { ++ unsigned int eof; ++ unsigned int num_devs; ++ struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; ++}; ++ ++/* ++ * Device ID RCU cache. A device ID is unique per client ID and layout type. 
++ */ ++#define NFS4_DEVICE_ID_HASH_BITS 5 ++#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) ++#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) ++ ++static inline u32 ++nfs4_deviceid_hash(struct pnfs_deviceid *id) ++{ ++ unsigned char *cptr = (unsigned char *)id->data; ++ unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE; ++ u32 x = 0; ++ ++ while (nbytes--) { ++ x *= 37; ++ x += *cptr++; ++ } ++ return x & NFS4_DEVICE_ID_HASH_MASK; ++} ++ ++struct nfs4_deviceid_cache { ++ spinlock_t dc_lock; ++ struct kref dc_kref; ++ void (*dc_free_callback)(struct kref *); ++ struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; ++}; ++ ++/* Device ID cache node */ ++struct nfs4_deviceid { ++ struct hlist_node de_node; ++ struct pnfs_deviceid de_id; ++ struct kref de_kref; ++}; ++ ++extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_put_deviceid_cache(struct nfs_client *); ++extern void nfs4_init_deviceid_node(struct nfs4_deviceid *); ++extern struct nfs4_deviceid *nfs4_find_get_deviceid( ++ struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++extern struct nfs4_deviceid *nfs4_add_get_deviceid(struct nfs4_deviceid_cache *, ++ struct nfs4_deviceid *); ++extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *); ++extern void nfs4_put_unset_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_delete_device(struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++ ++/* pNFS client callback functions. ++ * These operations allow the layout driver to access pNFS client ++ * specific information or call pNFS client->server operations. 
++ * E.g., getdeviceinfo, I/O callbacks, etc ++ */ ++struct pnfs_client_operations { ++ int (*nfs_getdevicelist) (struct nfs_server *, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++ int (*nfs_getdeviceinfo) (struct nfs_server *, ++ struct pnfs_device *dev); ++ ++ /* Post read callback. */ ++ void (*nfs_readlist_complete) (struct nfs_read_data *nfs_data); ++ ++ /* Post write callback. */ ++ void (*nfs_writelist_complete) (struct nfs_write_data *nfs_data); ++ ++ /* Post commit callback. */ ++ void (*nfs_commit_complete) (struct nfs_write_data *nfs_data); ++ void (*nfs_return_layout) (struct inode *); ++}; ++ ++extern struct pnfs_client_operations pnfs_ops; ++ ++extern struct pnfs_client_operations *pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); ++extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); ++ ++#define NFS4_PNFS_MAX_LAYOUTS 4 ++#define NFS4_PNFS_PRIVATE_LAYOUT 0x80000000 ++ ++#endif /* LINUX_NFS4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-31 20:42:05.596098115 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-31 20:42:05.596098115 -0400 +@@ -0,0 +1,101 @@ ++#ifndef NFSD4_BLOCK ++#define NFSD4_BLOCK ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_BLOCK_SUCCESS 1 ++#define PNFS_BLOCK_FAILURE 0 ++ ++#define PNFS_BLOCK_CTL_START 1 ++#define PNFS_BLOCK_CTL_STOP 2 ++#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current ++ * version from kernel via an upcall. 
++ */ ++ ++#define PNFS_UPCALL_MSG_STOP 0 ++#define PNFS_UPCALL_MSG_GETSIG 1 ++#define PNFS_UPCALL_MSG_GETSLICE 2 ++#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume ++#define PNFS_UPCALL_MSG_DMGET 4 ++#define PNFS_UPCALL_MSG_VERS 5 ++ ++#define PNFS_UPCALL_VERS 8 ++ ++typedef struct stripe_dev { ++ int major, ++ minor, ++ offset; ++} stripe_dev_t; ++ ++typedef struct bl_comm_res { ++ int res_status; ++ union { ++ struct { ++ long long start, ++ length; ++ } slice; ++ struct { ++ int num_stripes, ++ stripe_size; ++ stripe_dev_t devs[]; ++ } stripe; ++ struct { ++ long long sector; ++ int offset, ++ len; ++ char sig[]; ++ } sig; ++ int vers, ++ dm_vol; ++ } u; ++} bl_comm_res_t; ++ ++typedef struct bl_comm_msg { ++ int msg_type, ++ msg_status; ++ union { ++ dev_t msg_dev; ++ int msg_vers; ++ } u; ++ bl_comm_res_t *msg_res; ++} bl_comm_msg_t; ++ ++#ifdef __KERNEL__ ++ ++typedef struct bl_comm { ++ /* ---- protects access to this structure ---- */ ++ struct mutex lock; ++ /* ---- protects access to rpc pipe ---- */ ++ struct mutex pipe_lock; ++ struct dentry *pipe_dentry; ++ wait_queue_head_t pipe_wq; ++ bl_comm_msg_t msg; ++} bl_comm_t; ++ ++int pnfs_block_enabled(struct inode *, int); ++int bl_layout_type(struct super_block *sb); ++int bl_getdeviceiter(struct super_block *, u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int bl_layoutcommit(struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++int bl_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len); ++int bl_init_proc(void); ++int bl_upcall(bl_comm_t *, 
bl_comm_msg_t *, bl_comm_res_t **); ++ ++extern bl_comm_t *bl_comm_global; // Ugly... ++#endif /* __KERNEL__ */ ++ ++#endif /* NFSD4_BLOCK */ ++ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-31 20:42:05.597097942 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-31 20:42:05.597097942 -0400 +@@ -0,0 +1,345 @@ ++/* ++ * include/linux/nfsd4_spnfs.h ++ * ++ * spNFS - simple pNFS implementation with userspace daemon ++ * ++ */ ++ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++******************************************************************************/ ++ ++#ifndef NFS_SPNFS_H ++#define NFS_SPNFS_H ++ ++ ++#ifdef __KERNEL__ ++#include "exportfs.h" ++#include "sunrpc/svc.h" ++#include "nfsd/nfsfh.h" ++#else ++#include ++#endif /* __KERNEL__ */ ++ ++#define SPNFS_STATUS_INVALIDMSG 0x01 ++#define SPNFS_STATUS_AGAIN 0x02 ++#define SPNFS_STATUS_FAIL 0x04 ++#define SPNFS_STATUS_SUCCESS 0x08 ++ ++#define SPNFS_TYPE_LAYOUTGET 0x01 ++#define SPNFS_TYPE_LAYOUTCOMMIT 0x02 ++#define SPNFS_TYPE_LAYOUTRETURN 0x03 ++#define SPNFS_TYPE_GETDEVICEITER 0x04 ++#define SPNFS_TYPE_GETDEVICEINFO 0x05 ++#define SPNFS_TYPE_SETATTR 0x06 ++#define SPNFS_TYPE_OPEN 0x07 ++#define SPNFS_TYPE_CLOSE 0x08 ++#define SPNFS_TYPE_CREATE 0x09 ++#define SPNFS_TYPE_REMOVE 0x0a ++#define SPNFS_TYPE_COMMIT 0x0b ++#define SPNFS_TYPE_READ 0x0c ++#define SPNFS_TYPE_WRITE 0x0d ++ ++#define SPNFS_MAX_DEVICES 1 ++#define SPNFS_MAX_DATA_SERVERS 16 ++#define SPNFS_MAX_IO 512 ++ ++/* layout */ ++struct spnfs_msg_layoutget_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_filelayout_list { ++ u_int32_t fh_len; ++ unsigned char fh_val[128]; /* DMXXX fix this const */ ++}; ++ ++struct spnfs_msg_layoutget_res { ++ int status; ++ u_int64_t devid; ++ u_int64_t stripe_size; ++ u_int32_t stripe_type; ++ u_int32_t stripe_count; ++ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++/* layoutcommit */ ++struct spnfs_msg_layoutcommit_args { ++ unsigned long inode; ++ unsigned long generation; ++ u_int64_t file_size; ++}; ++ ++struct spnfs_msg_layoutcommit_res { ++ int status; ++}; ++ ++/* layoutreturn */ ++/* No op for the daemon */ ++/* ++struct spnfs_msg_layoutreturn_args { ++}; ++ ++struct spnfs_msg_layoutreturn_res { ++}; ++*/ ++ ++/* getdeviceiter */ ++struct spnfs_msg_getdeviceiter_args { ++ unsigned long inode; ++ u_int64_t cookie; ++ u_int64_t verf; ++}; ++ ++struct spnfs_msg_getdeviceiter_res { ++ int status; ++ u_int64_t devid; ++ 
u_int64_t cookie; ++ u_int64_t verf; ++ u_int32_t eof; ++}; ++ ++/* getdeviceinfo */ ++struct spnfs_data_server { ++ u_int32_t dsid; ++ char netid[5]; ++ char addr[29]; ++}; ++ ++struct spnfs_device { ++ u_int64_t devid; ++ int dscount; ++ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++struct spnfs_msg_getdeviceinfo_args { ++ u_int64_t devid; ++}; ++ ++struct spnfs_msg_getdeviceinfo_res { ++ int status; ++ struct spnfs_device devinfo; ++}; ++ ++/* setattr */ ++struct spnfs_msg_setattr_args { ++ unsigned long inode; ++ unsigned long generation; ++ int file_size; ++}; ++ ++struct spnfs_msg_setattr_res { ++ int status; ++}; ++ ++/* open */ ++struct spnfs_msg_open_args { ++ unsigned long inode; ++ unsigned long generation; ++ int create; ++ int createmode; ++ int truncate; ++}; ++ ++struct spnfs_msg_open_res { ++ int status; ++}; ++ ++/* close */ ++/* No op for daemon */ ++struct spnfs_msg_close_args { ++ int x; ++}; ++ ++struct spnfs_msg_close_res { ++ int y; ++}; ++ ++/* create */ ++/* ++struct spnfs_msg_create_args { ++ int x; ++}; ++ ++struct spnfs_msg_create_res { ++ int y; ++}; ++*/ ++ ++/* remove */ ++struct spnfs_msg_remove_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_msg_remove_res { ++ int status; ++}; ++ ++/* commit */ ++/* ++struct spnfs_msg_commit_args { ++ int x; ++}; ++ ++struct spnfs_msg_commit_res { ++ int y; ++}; ++*/ ++ ++/* read */ ++struct spnfs_msg_read_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++}; ++ ++struct spnfs_msg_read_res { ++ int status; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++/* write */ ++struct spnfs_msg_write_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++struct spnfs_msg_write_res { ++ int status; ++}; ++ ++/* bundle args and responses */ ++union spnfs_msg_args { ++ struct spnfs_msg_layoutget_args layoutget_args; ++ struct 
spnfs_msg_layoutcommit_args layoutcommit_args; ++/* ++ struct spnfs_msg_layoutreturn_args layoutreturn_args; ++*/ ++ struct spnfs_msg_getdeviceiter_args getdeviceiter_args; ++ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args; ++ struct spnfs_msg_setattr_args setattr_args; ++ struct spnfs_msg_open_args open_args; ++ struct spnfs_msg_close_args close_args; ++/* ++ struct spnfs_msg_create_args create_args; ++*/ ++ struct spnfs_msg_remove_args remove_args; ++/* ++ struct spnfs_msg_commit_args commit_args; ++*/ ++ struct spnfs_msg_read_args read_args; ++ struct spnfs_msg_write_args write_args; ++}; ++ ++union spnfs_msg_res { ++ struct spnfs_msg_layoutget_res layoutget_res; ++ struct spnfs_msg_layoutcommit_res layoutcommit_res; ++/* ++ struct spnfs_msg_layoutreturn_res layoutreturn_res; ++*/ ++ struct spnfs_msg_getdeviceiter_res getdeviceiter_res; ++ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res; ++ struct spnfs_msg_setattr_res setattr_res; ++ struct spnfs_msg_open_res open_res; ++ struct spnfs_msg_close_res close_res; ++/* ++ struct spnfs_msg_create_res create_res; ++*/ ++ struct spnfs_msg_remove_res remove_res; ++/* ++ struct spnfs_msg_commit_res commit_res; ++*/ ++ struct spnfs_msg_read_res read_res; ++ struct spnfs_msg_write_res write_res; ++}; ++ ++/* a spnfs message, args and response */ ++struct spnfs_msg { ++ unsigned char im_type; ++ unsigned char im_status; ++ union spnfs_msg_args im_args; ++ union spnfs_msg_res im_res; ++}; ++ ++/* spnfs configuration info */ ++struct spnfs_config { ++ unsigned char dense_striping; ++ int stripe_size; ++ int num_ds; ++ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */ ++}; ++ ++#if defined(__KERNEL__) && defined(CONFIG_SPNFS) ++ ++#include ++ ++/* pipe mgmt structure. 
messages flow through here */ ++struct spnfs { ++ struct dentry *spnfs_dentry; /* dentry for pipe */ ++ wait_queue_head_t spnfs_wq; ++ struct spnfs_msg spnfs_im; /* spnfs message */ ++ struct mutex spnfs_lock; /* Serializes upcalls */ ++ struct mutex spnfs_plock; ++}; ++ ++struct nfsd4_open; ++ ++int spnfs_layout_type(struct super_block *); ++enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int spnfs_layoutcommit(void); ++int spnfs_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int spnfs_getdeviceiter(struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++int spnfs_setattr(void); ++int spnfs_open(struct inode *, struct nfsd4_open *); ++int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *); ++int spnfs_remove(unsigned long, unsigned long); ++__be32 spnfs_read(struct inode *, loff_t, unsigned long *, ++ int, struct svc_rqst *); ++__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *); ++int spnfs_getfh(int, struct nfs_fh *); ++int spnfs_test_layoutrecall(char *, u64, u64); ++int spnfs_layoutrecall(struct inode *, int, u64, u64); ++ ++int nfsd_spnfs_new(void); ++void nfsd_spnfs_delete(void); ++int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *); ++int spnfs_enabled(void); ++int spnfs_init_proc(void); ++ ++extern struct spnfs_config *spnfs_config; ++ ++#endif /* __KERNEL__ && CONFIG_SPNFS */ ++ ++#endif /* NFS_SPNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h +--- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-31 20:42:05.591097762 
-0400 +@@ -29,6 +29,7 @@ + #ifdef __KERNEL__ + + #include ++#include + + /* + * Largest number of bytes we need to allocate for an NFS +diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h +--- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-31 20:42:05.591097762 -0400 +@@ -32,6 +32,8 @@ + #define NFSDDBG_REPCACHE 0x0080 + #define NFSDDBG_XDR 0x0100 + #define NFSDDBG_LOCKD 0x0200 ++#define NFSDDBG_PNFS 0x0400 ++#define NFSDDBG_FILELAYOUT 0x0800 + #define NFSDDBG_ALL 0x7FFF + #define NFSDDBG_NOCHANGE 0xFFFF + +diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h +--- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-31 20:42:05.592118086 -0400 +@@ -100,6 +100,7 @@ struct svc_export { + uid_t ex_anon_uid; + gid_t ex_anon_gid; + int ex_fsid; ++ int ex_pnfs; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + int ex_nflavors; +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-31 20:42:05.592118086 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-31 20:42:05.592118086 -0400 +@@ -0,0 +1,132 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef NFSD_NFS4LAYOUTXDR_H ++#define NFSD_NFS4LAYOUTXDR_H ++ ++#include ++#include ++ ++/* the nfsd4_pnfs_devlist dev_addr for the file layout type */ ++struct pnfs_filelayout_devaddr { ++ struct xdr_netobj r_netid; ++ struct xdr_netobj r_addr; ++}; ++ ++/* list of multipath servers */ ++struct pnfs_filelayout_multipath { ++ u32 fl_multipath_length; ++ struct pnfs_filelayout_devaddr *fl_multipath_list; ++}; ++ ++struct pnfs_filelayout_device { ++ u32 fl_stripeindices_length; ++ u32 *fl_stripeindices_list; ++ u32 fl_device_length; ++ struct pnfs_filelayout_multipath *fl_device_list; ++}; ++ ++struct pnfs_filelayout_layout { ++ u32 lg_layout_type; /* response */ ++ u32 lg_stripe_type; /* response */ ++ u32 lg_commit_through_mds; /* response */ ++ u64 lg_stripe_unit; /* response */ ++ u64 lg_pattern_offset; /* response */ ++ u32 lg_first_stripe_index; /* response */ ++ struct nfsd4_pnfs_deviceid device_id; /* response */ ++ u32 lg_fh_length; /* response */ ++ struct knfsd_fh *lg_fh_list; /* response */ ++}; ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++enum pnfs_block_extent_state4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, ++ PNFS_BLOCK_NONE_DATA = 3 ++}; ++ ++enum pnfs_block_volume_type4 { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, ++ PNFS_BLOCK_VOLUME_SLICE = 1, ++ PNFS_BLOCK_VOLUME_CONCAT = 2, ++ PNFS_BLOCK_VOLUME_STRIPE = 3, ++}; ++typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4; ++ ++enum bl_cache_state { ++ BLOCK_LAYOUT_NEW = 0, ++ BLOCK_LAYOUT_CACHE = 1, ++ BLOCK_LAYOUT_UPDATE = 2, ++}; ++ ++typedef struct pnfs_blocklayout_layout { ++ struct list_head bll_list; ++ struct nfsd4_pnfs_deviceid bll_vol_id; ++ u64 bll_foff; // file offset ++ u64 bll_len; ++ u64 bll_soff; // storage offset ++ int bll_recalled; ++ enum pnfs_block_extent_state4 bll_es; ++ enum bl_cache_state bll_cache_state; ++} pnfs_blocklayout_layout_t; ++ ++typedef struct 
pnfs_blocklayout_devinfo { ++ struct list_head bld_list; ++ pnfs_block_volume_type4 bld_type; ++ struct nfsd4_pnfs_deviceid bld_devid; ++ int bld_index_loc; ++ union { ++ struct { ++ u64 bld_offset; ++ u32 bld_sig_len, ++ *bld_sig; ++ } simple; ++ struct { ++ u64 bld_start, ++ bld_len; ++ u32 bld_index; /* Index of Simple Volume */ ++ } slice; ++ struct { ++ u32 bld_stripes; ++ u64 bld_chunk_size; ++ u32 *bld_stripe_indexs; ++ } stripe; ++ } u; ++} pnfs_blocklayout_devinfo_t; ++ ++#endif /* NFSD_NFS4LAYOUTXDR_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-31 20:42:05.593020723 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-31 20:42:05.593020723 -0400 +@@ -0,0 +1,54 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++#include ++ ++/* ++ * Length of comma separated pnfs data server IPv4 addresses. Enough room for ++ * 32 addresses. ++ */ ++#define NFSD_DLM_DS_LIST_MAX 512 ++/* ++ * Length of colon separated pnfs dlm device of the form ++ * disk_name:comma separated data server IPv4 address ++ */ ++#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1) ++ ++#ifdef CONFIG_PNFSD ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++extern const struct pnfs_export_operations pnfs_dlm_export_ops; ++ ++int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len); ++ ++void nfsd4_pnfs_dlm_shutdown(void); ++ ++ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen); ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-31 20:42:05.594107962 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-31 20:42:05.594107962 -0400 +@@ -0,0 +1,271 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef _LINUX_NFSD_NFSD4_PNFS_H ++#define _LINUX_NFSD_NFSD4_PNFS_H ++ ++#include ++#include ++#include ++ ++struct nfsd4_pnfs_deviceid { ++ u64 sbid; /* per-superblock unique ID */ ++ u64 devid; /* filesystem-wide unique device ID */ ++}; ++ ++struct nfsd4_pnfs_dev_iter_res { ++ u64 gd_cookie; /* request/response */ ++ u64 gd_verf; /* request/response */ ++ u64 gd_devid; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++/* Arguments for set_device_notify */ ++struct pnfs_devnotify_arg { ++ struct nfsd4_pnfs_deviceid dn_devid; /* request */ ++ u32 dn_layout_type; /* request */ ++ u32 dn_notify_types; /* request/response */ ++}; ++ ++struct nfsd4_layout_seg { ++ u64 clientid; ++ u32 layout_type; ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++/* Used by layout_get to encode layout (loc_body var in spec) ++ * Args: ++ * minlength - min number of accessible bytes given by layout ++ * fsid - Major part of struct pnfs_deviceid. File system uses this ++ * to build the deviceid returned in the layout.
++ * fh - fs can modify the file handle for use on data servers ++ * seg - layout info requested and layout info returned ++ * xdr - xdr info ++ * return_on_close - true if layout to be returned on file close ++ */ ++ ++struct nfsd4_pnfs_layoutget_arg { ++ u64 lg_minlength; ++ u64 lg_sbid; ++ const struct knfsd_fh *lg_fh; ++}; ++ ++struct nfsd4_pnfs_layoutget_res { ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_return_on_close; ++}; ++ ++struct nfsd4_pnfs_layoutcommit_arg { ++ struct nfsd4_layout_seg lc_seg; /* request */ ++ u32 lc_reclaim; /* request */ ++ u32 lc_newoffset; /* request */ ++ u64 lc_last_wr; /* request */ ++ struct nfstime4 lc_mtime; /* request */ ++ u32 lc_up_len; /* layout length */ ++ void *lc_up_layout; /* decoded by callback */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit_res { ++ u32 lc_size_chg; /* boolean for response */ ++ u64 lc_newsize; /* response */ ++}; ++ ++#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */ ++ ++struct nfsd4_pnfs_layoutreturn_arg { ++ u32 lr_return_type; /* request */ ++ struct nfsd4_layout_seg lr_seg; /* request */ ++ u32 lr_reclaim; /* request */ ++ u32 lrf_body_len; /* request */ ++ void *lrf_body; /* request */ ++ void *lr_cookie; /* fs private */ ++}; ++ ++/* pNFS Metadata to Data server state communication */ ++struct pnfs_get_state { ++ u32 dsid; /* request */ ++ u64 ino; /* request */ ++ nfs4_stateid stid; /* request;response */ ++ nfs4_clientid clid; /* response */ ++ u32 access; /* response */ ++ u32 stid_gen; /* response */ ++ u32 verifier[2]; /* response */ ++}; ++ ++/* ++ * pNFS export operations vector. ++ * ++ * The filesystem must implement the following methods: ++ * layout_type ++ * get_device_info ++ * layout_get ++ * ++ * All other methods are optional and can be set to NULL if not implemented. ++ */ ++struct pnfs_export_operations { ++ /* Returns the supported pnfs_layouttype4.
*/ ++ int (*layout_type) (struct super_block *); ++ ++ /* Encode device info onto the xdr stream. */ ++ int (*get_device_info) (struct super_block *, ++ struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++ ++ /* Retrieve all available devices via an iterator. ++ * arg->cookie == 0 indicates the beginning of the list, ++ * otherwise arg->verf is used to verify that the list hasn't changed ++ * while retrieved. ++ * ++ * On output, the filesystem sets the devid based on the current cookie ++ * and sets res->cookie and res->verf corresponding to the next entry. ++ * When the last entry in the list is retrieved, res->eof is set to 1. ++ */ ++ int (*get_device_iter) (struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++ ++ int (*set_device_notify) (struct super_block *, ++ struct pnfs_devnotify_arg *); ++ ++ /* Retrieve and encode a layout for inode onto the xdr stream. ++ * arg->minlength is the minimum number of accessible bytes required ++ * by the client. ++ * The maximum number of bytes to encode the layout is given by ++ * the xdr stream end pointer. ++ * arg->fsid contains the major part of struct pnfs_deviceid. ++ * The file system uses this to build the deviceid returned ++ * in the layout. ++ * res->seg - layout segment requested and layout info returned. 
++ * res->fh can be modified the file handle for use on data servers ++ * res->return_on_close - true if layout to be returned on file close ++ * ++ * return one of the following nfs errors: ++ * NFS_OK Success ++ * NFS4ERR_ACCESS Permission error ++ * NFS4ERR_BADIOMODE Server does not support requested iomode ++ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules ++ * NFS4ERR_INVAL Parameter other than layout is invalid ++ * NFS4ERR_IO I/O error ++ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later ++ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file ++ * NFS4ERR_LOCKED Lock conflict ++ * NFS4ERR_NOSPC Out-of-space error occurred ++ * NFS4ERR_RECALLCONFLICT Layout currently unavailable due to ++ * a conflicting CB_LAYOUTRECALL ++ * NFS4ERR_SERVERFAULT Server went berserk ++ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout ++ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file) ++ */ ++ enum nfsstat4 (*layout_get) (struct inode *, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++ ++ /* Commit changes to layout */ ++ int (*layout_commit) (struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++ ++ /* Returns the layout */ ++ int (*layout_return) (struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++ ++ /* Can layout segments be merged for this layout type?
*/ ++ int (*can_merge_layouts) (u32 layout_type); ++ ++ /* pNFS Files layout specific operations */ ++ ++ /* Get the write verifier for DS (called on MDS only) */ ++ void (*get_verifier) (struct super_block *, u32 *p); ++ /* Call fs on DS only */ ++ int (*get_state) (struct inode *, struct knfsd_fh *, ++ struct pnfs_get_state *); ++}; ++ ++struct nfsd4_pnfs_cb_layout { ++ u32 cbl_recall_type; /* request */ ++ struct nfsd4_layout_seg cbl_seg; /* request */ ++ u32 cbl_layoutchanged; /* request */ ++ nfs4_stateid cbl_sid; /* request */ ++ struct nfs4_fsid cbl_fsid; ++ void *cbl_cookie; /* fs private */ ++}; ++ ++/* layoutrecall request (from exported filesystem) */ ++struct nfs4_layoutrecall { ++ struct kref clr_ref; ++ struct nfsd4_pnfs_cb_layout cb; /* request */ ++ struct list_head clr_perclnt; /* on cl_layoutrecalls */ ++ struct nfs4_client *clr_client; ++ struct nfs4_file *clr_file; ++ struct timespec clr_time; /* last activity */ ++ struct super_block *clr_sb; /* We might not have a file */ ++ struct nfs4_layoutrecall *parent; /* The initiating recall */ ++ ++ void *clr_args; /* nfsd internal */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_item { ++ u32 cbd_notify_type; /* request */ ++ u32 cbd_layout_type; /* request */ ++ struct nfsd4_pnfs_deviceid cbd_devid; /* request */ ++ u32 cbd_immediate; /* request */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_list { ++ u32 cbd_len; /* request */ ++ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */ ++}; ++ ++/* ++ * callbacks provided by the nfsd ++ */ ++struct pnfsd_cb_operations { ++ /* Generic callbacks */ ++ int (*cb_layout_recall) (struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++ int (*cb_device_notify) (struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++ ++ /* pNFS Files layout specific callbacks */ ++ ++ /* Callback from fs on MDS only */ ++ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *); ++ /* Callback from fs on DS only */ ++ int (*cb_change_state) (struct 
pnfs_get_state *); ++}; ++ ++#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h +--- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-31 20:42:05.594107962 -0400 +@@ -29,6 +29,7 @@ + /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ + #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ + #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ ++#define NFSCTL_FD2FH 9 /* get a fh from a fd */ + + /* SVC */ + struct nfsctl_svc { +@@ -71,6 +72,11 @@ struct nfsctl_fsparm { + int gd_maxlen; + }; + ++/* FD2FH */ ++struct nfsctl_fd2fh { ++ int fd; ++}; ++ + /* + * This is the argument union. + */ +@@ -82,6 +88,7 @@ struct nfsctl_arg { + struct nfsctl_export u_export; + struct nfsctl_fdparm u_getfd; + struct nfsctl_fsparm u_getfs; ++ struct nfsctl_fd2fh u_fd2fh; + /* + * The following dummy member is needed to preserve binary compatibility + * on platforms where alignof(void*)>alignof(int). 
It's needed because +@@ -95,6 +102,7 @@ struct nfsctl_arg { + #define ca_export u.u_export + #define ca_getfd u.u_getfd + #define ca_getfs u.u_getfs ++#define ca_fd2fh u.u_fd2fh + }; + + union nfsctl_res { +diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-31 20:41:19.168160480 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-31 20:42:05.584098019 -0400 +@@ -72,13 +72,20 @@ struct nfs_access_entry { + int mask; + }; + ++struct nfs_lock_context { ++ atomic_t count; ++ struct list_head list; ++ struct nfs_open_context *open_context; ++ fl_owner_t lockowner; ++ pid_t pid; ++}; ++ + struct nfs4_state; + struct nfs_open_context { +- atomic_t count; ++ struct nfs_lock_context lock_context; + struct path path; + struct rpc_cred *cred; + struct nfs4_state *state; +- fl_owner_t lockowner; + fmode_t mode; + + unsigned long flags; +@@ -97,6 +104,27 @@ struct nfs_delegation; + + struct posix_acl; + ++struct pnfs_layout_hdr { ++ int refcount; ++ struct list_head layouts; /* other client layouts */ ++ struct list_head segs; /* layout segments list */ ++ int roc_iomode;/* return on close iomode, 0=none */ ++ seqlock_t seqlock; /* Protects the stateid */ ++ nfs4_stateid stateid; ++ unsigned long state; ++#define NFS_INO_RO_LAYOUT_FAILED 0 /* ro layoutget failed stop trying */ ++#define NFS_INO_RW_LAYOUT_FAILED 1 /* rw layoutget failed stop trying */ ++#define NFS_INO_LAYOUTCOMMIT 2 /* LAYOUTCOMMIT needed */ ++ ++ struct rpc_cred *cred; /* layoutcommit credential */ ++ /* DH: These vars keep track of the maximum write range ++ * so the values can be used for layoutcommit. 
++ */ ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ struct inode *inode; ++}; ++ + /* + * nfs fs inode data in memory + */ +@@ -181,6 +209,13 @@ struct nfs_inode { + struct nfs_delegation *delegation; + fmode_t delegation_state; + struct rw_semaphore rwsem; ++ ++ /* pNFS layout information */ ++#if defined(CONFIG_NFS_V4_1) ++ wait_queue_head_t lo_waitq; ++ struct pnfs_layout_hdr *layout; ++ time_t pnfs_layout_suspend; ++#endif /* CONFIG_NFS_V4_1 */ + #endif /* CONFIG_NFS_V4*/ + #ifdef CONFIG_NFS_FSCACHE + struct fscache_cookie *fscache; +@@ -353,6 +388,8 @@ extern void nfs_setattr_update_inode(str + extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); + extern void put_nfs_open_context(struct nfs_open_context *ctx); + extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); ++extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); ++extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); + extern u64 nfs_compat_user_ino64(u64 fileid); + extern void nfs_fattr_init(struct nfs_fattr *fattr); + +@@ -481,8 +518,12 @@ extern void nfs_unblock_sillyrename(stru + extern int nfs_congestion_kb; + extern int nfs_writepage(struct page *page, struct writeback_control *wbc); + extern int nfs_writepages(struct address_space *, struct writeback_control *); +-extern int nfs_flush_incompatible(struct file *file, struct page *page); +-extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); ++struct pnfs_layout_segment; ++extern int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg); ++extern int nfs_updatepage(struct file *, struct page *, ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata); + extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); + + /* +@@ -604,6 +645,8 @@ extern void * nfs_root_data(void); + 
#define NFSDBG_CLIENT 0x0200 + #define NFSDBG_MOUNT 0x0400 + #define NFSDBG_FSCACHE 0x0800 ++#define NFSDBG_PNFS 0x1000 ++#define NFSDBG_PNFS_LD 0x2000 + #define NFSDBG_ALL 0xFFFF + + #ifdef __KERNEL__ +diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-31 20:41:19.168160480 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-31 20:42:05.586087719 -0400 +@@ -15,6 +15,7 @@ struct nlm_host; + struct nfs4_sequence_args; + struct nfs4_sequence_res; + struct nfs_server; ++struct nfs4_minor_version_ops; + + /* + * The nfs_client identifies our client state to the server. +@@ -70,11 +71,7 @@ struct nfs_client { + */ + char cl_ipaddr[48]; + unsigned char cl_id_uniquifier; +- int (* cl_call_sync)(struct nfs_server *server, +- struct rpc_message *msg, +- struct nfs4_sequence_args *args, +- struct nfs4_sequence_res *res, +- int cache_reply); ++ const struct nfs4_minor_version_ops *cl_mvops; + #endif /* CONFIG_NFS_V4 */ + + #ifdef CONFIG_NFS_V4_1 +@@ -85,6 +82,8 @@ struct nfs_client { + /* The flags used for obtaining the clientid during EXCHANGE_ID */ + u32 cl_exchange_flags; + struct nfs4_session *cl_session; /* sharred session */ ++ struct list_head cl_layouts; ++ struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ + #endif /* CONFIG_NFS_V4_1 */ + + #ifdef CONFIG_NFS_FSCACHE +@@ -92,6 +91,16 @@ struct nfs_client { + #endif + }; + ++static inline bool ++is_ds_only_client(struct nfs_client *clp) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ return is_ds_only_session(clp->cl_exchange_flags); ++#else ++ return false; ++#endif ++} ++ + /* + * NFS client parameters stored in the superblock. 
+ */ +@@ -136,7 +145,7 @@ struct nfs_server { + #endif + + #ifdef CONFIG_NFS_V4 +- u32 attr_bitmask[2];/* V4 bitmask representing the set ++ u32 attr_bitmask[3];/* V4 bitmask representing the set + of attributes supported on this + filesystem */ + u32 cache_consistency_bitmask[2]; +@@ -148,6 +157,15 @@ struct nfs_server { + that are supported on this + filesystem */ + #endif ++ ++#ifdef CONFIG_NFS_V4_1 ++ u32 pnfs_blksize; /* layout_blksize attr */ ++ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ ++ void *pnfs_ld_data; /* Per-mount data */ ++ unsigned int ds_rsize; /* Data server read size */ ++ unsigned int ds_wsize; /* Data server write size */ ++#endif /* CONFIG_NFS_V4_1 */ ++ + void (*destroy)(struct nfs_server *); + + atomic_t active; /* Keep trace of any activity to this server */ +diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h +--- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-31 20:42:05.587097913 -0400 +@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, ++ NFSIOS_PNFS_READ, ++ NFSIOS_PNFS_WRITE, ++ NFSIOS_PNFS_COMMIT, + __NFSIOS_COUNTSMAX, + }; + +diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h +--- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-31 20:42:05.588097898 -0400 +@@ -39,6 +39,7 @@ struct nfs_page { + struct list_head wb_list; /* Defines state of page: */ + struct page *wb_page; /* page to read in/write out */ + struct nfs_open_context *wb_context; /* File state context info */ ++ struct nfs_lock_context *wb_lock_context; /* lock context info */ + atomic_t wb_complete; /* i/os we're waiting for */ + pgoff_t wb_index; /* Offset >> PAGE_CACHE_SHIFT */ + 
unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ +@@ -47,6 +48,7 @@ struct nfs_page { + struct kref wb_kref; /* reference count */ + unsigned long wb_flags; + struct nfs_writeverf wb_verf; /* Commit cookie */ ++ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */ + }; + + struct nfs_pageio_descriptor { +@@ -60,6 +62,12 @@ struct nfs_pageio_descriptor { + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); + int pg_ioflags; + int pg_error; ++ struct pnfs_layout_segment *pg_lseg; ++#ifdef CONFIG_NFS_V4_1 ++ int pg_iswrite; ++ int pg_boundary; ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++#endif /* CONFIG_NFS_V4_1 */ + }; + + #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) +@@ -68,13 +76,15 @@ extern struct nfs_page *nfs_create_reque + struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int count); ++ unsigned int count, ++ struct pnfs_layout_segment *lseg); + extern void nfs_clear_request(struct nfs_page *req); + extern void nfs_release_request(struct nfs_page *req); + + + extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, +- pgoff_t idx_start, unsigned int npages, int tag); ++ pgoff_t idx_start, unsigned int npages, int tag, ++ int *use_pnfs); + extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), +diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-31 20:41:19.169171911 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-31 20:42:05.590087729 -0400 +@@ -3,6 +3,8 @@ + + #include + #include ++#include ++#include + + /* + * To change the maximum rsize and wsize supported by the NFS client, adjust +@@ -10,7 +12,7 @@ + * support a megabyte or more. 
The default is left at 4096 bytes, which is + * reasonable for NFS over UDP. + */ +-#define NFS_MAX_FILE_IO_SIZE (1048576U) ++#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U) + #define NFS_DEF_FILE_IO_SIZE (4096U) + #define NFS_MIN_FILE_IO_SIZE (1024U) + +@@ -113,6 +115,10 @@ struct nfs_fsinfo { + __u32 dtpref; /* pref. readdir transfer size */ + __u64 maxfilesize; + __u32 lease_time; /* in seconds */ ++#if defined(CONFIG_NFS_V4_1) ++ __u32 layouttype; /* supported pnfs layout driver */ ++ __u32 blksize; /* preferred pnfs io block size */ ++#endif + }; + + struct nfs_fsstat { +@@ -185,6 +191,125 @@ struct nfs4_get_lease_time_res { + struct nfs4_sequence_res lr_seq_res; + }; + ++#define PNFS_LAYOUT_MAXSIZE 4096 ++#define NFS4_PNFS_DEVICEID4_SIZE 16 ++ ++struct pnfs_deviceid { ++ char data[NFS4_PNFS_DEVICEID4_SIZE]; ++}; ++ ++struct nfs4_layoutdriver_data { ++ __u32 len; ++ void *buf; ++}; ++ ++struct pnfs_layout_range { ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++struct nfs4_layoutget_args { ++ __u32 type; ++ struct pnfs_layout_range range; ++ __u64 minlength; ++ __u32 maxcount; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutget_res { ++ __u32 return_on_close; ++ struct pnfs_layout_range range; ++ __u32 type; ++ nfs4_stateid stateid; ++ struct nfs4_layoutdriver_data layout; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_layoutget { ++ struct nfs4_layoutget_args args; ++ struct nfs4_layoutget_res res; ++ struct pnfs_layout_segment **lsegpp; ++ int status; ++}; ++ ++struct nfs4_layoutcommit_args { ++ nfs4_stateid stateid; ++ __u64 lastbytewritten; ++ __u32 time_modify_changed; ++ struct timespec time_modify; ++ const u32 *bitmask; ++ struct nfs_fh *fh; ++ struct inode *inode; ++ ++ /* Values set by layout driver */ ++ struct pnfs_layout_range range; ++ __u32 layout_type; ++ void *layoutdriver_data; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutcommit_res { ++ __u32 sizechanged; ++ 
__u64 newsize; ++ struct nfs_fattr *fattr; ++ const struct nfs_server *server; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_layoutcommit_data { ++ struct rpc_task task; ++ struct rpc_cred *cred; ++ struct nfs_fattr fattr; ++ struct nfs4_layoutcommit_args args; ++ struct nfs4_layoutcommit_res res; ++ int status; ++}; ++ ++struct nfs4_layoutreturn_args { ++ __u32 reclaim; ++ __u32 layout_type; ++ __u32 return_type; ++ struct pnfs_layout_range range; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutreturn_res { ++ struct nfs4_sequence_res seq_res; ++ u32 lrs_present; ++ nfs4_stateid stateid; ++}; ++ ++struct nfs4_layoutreturn { ++ struct nfs4_layoutreturn_args args; ++ struct nfs4_layoutreturn_res res; ++ struct rpc_cred *cred; ++ int rpc_status; ++}; ++ ++struct nfs4_getdevicelist_args { ++ const struct nfs_fh *fh; ++ u32 layoutclass; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_getdevicelist_res { ++ struct pnfs_devicelist *devlist; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_getdeviceinfo_args { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_getdeviceinfo_res { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_res seq_res; ++}; ++ + /* + * Arguments to the open call. 
+ */ +@@ -196,8 +321,10 @@ struct nfs_openargs { + __u64 clientid; + __u64 id; + union { +- struct iattr * attrs; /* UNCHECKED, GUARDED */ +- nfs4_verifier verifier; /* EXCLUSIVE */ ++ struct { ++ struct iattr * attrs; /* UNCHECKED, GUARDED */ ++ nfs4_verifier verifier; /* EXCLUSIVE */ ++ }; + nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ + fmode_t delegation_type; /* CLAIM_PREVIOUS */ + } u; +@@ -313,6 +440,10 @@ struct nfs_lockt_res { + struct nfs4_sequence_res seq_res; + }; + ++struct nfs_release_lockowner_args { ++ struct nfs_lowner lock_owner; ++}; ++ + struct nfs4_delegreturnargs { + const struct nfs_fh *fhandle; + const nfs4_stateid *stateid; +@@ -332,6 +463,7 @@ struct nfs4_delegreturnres { + struct nfs_readargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + unsigned int pgbase; +@@ -352,6 +484,7 @@ struct nfs_readres { + struct nfs_writeargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + enum nfs3_stable_how stable; +@@ -846,7 +979,7 @@ struct nfs4_server_caps_arg { + }; + + struct nfs4_server_caps_res { +- u32 attr_bitmask[2]; ++ u32 attr_bitmask[3]; + u32 acl_bitmask; + u32 has_links; + u32 has_symlinks; +@@ -961,6 +1094,27 @@ struct nfs_page; + + #define NFS_PAGEVEC_SIZE (8U) + ++#if defined(CONFIG_NFS_V4_1) ++/* pnfsflag values */ ++#define PNFS_NO_RPC 0x0001 /* non rpc result callback switch */ ++ ++/* pnfs-specific data needed for read, write, and commit calls */ ++struct pnfs_call_data { ++ struct pnfs_layout_segment *lseg; ++ const struct rpc_call_ops *call_ops; ++ u32 orig_count; /* for retry via MDS */ ++ int pnfs_error; ++ u8 pnfsflags; ++ u8 how; /* for FLUSH_STABLE */ ++}; ++ ++/* files layout-type specific data for read, write, and commit */ ++struct pnfs_fl_call_data { ++ struct nfs_client *ds_nfs_client; ++ __u64 orig_offset; ++}; ++#endif /* CONFIG_NFS_V4_1 */ ++ 
+ struct nfs_read_data { + int flags; + struct rpc_task task; +@@ -976,10 +1130,16 @@ struct nfs_read_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + + struct nfs_write_data { ++ struct kref refcount; /* For pnfs commit splitting */ ++ struct nfs_write_data *parent; /* For pnfs commit splitting */ + int flags; + struct rpc_task task; + struct inode *inode; +@@ -995,6 +1155,10 @@ struct nfs_write_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + +@@ -1008,6 +1172,7 @@ struct nfs_rpc_ops { + const struct dentry_operations *dentry_ops; + const struct inode_operations *dir_inode_ops; + const struct inode_operations *file_inode_ops; ++ const struct file_operations *file_ops; + + int (*getroot) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); +@@ -1072,6 +1237,7 @@ struct nfs_rpc_ops { + extern const struct nfs_rpc_ops nfs_v2_clientops; + extern const struct nfs_rpc_ops nfs_v3_clientops; + extern const struct nfs_rpc_ops nfs_v4_clientops; ++extern const struct nfs_rpc_ops pnfs_v4_clientops; + extern struct rpc_version nfs_version2; + extern struct rpc_version nfs_version3; + extern struct rpc_version nfs_version4; +diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-31 20:42:05.598087997 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-31 20:42:05.599087710 -0400 +@@ -0,0 +1,57 @@ ++#ifndef _PANFS_SHIM_API_H ++#define _PANFS_SHIM_API_H ++ ++/* ++ * imported panfs 
functions ++ */ ++struct panfs_export_operations { ++ int (*convert_rc)(pan_status_t rc); ++ ++ int (*sm_sec_t_get_size_otw)( ++ pan_sm_sec_otw_t *var, ++ pan_size_t *core_sizep, ++ pan_size_t *wire_size, ++ void *buf_end); ++ ++ int (*sm_sec_t_unmarshall)( ++ pan_sm_sec_otw_t *in, ++ pan_sm_sec_t *out, ++ void *buf, ++ pan_size_t size, ++ pan_size_t *otw_consumed, ++ pan_size_t *in_core_consumed); ++ ++ int (*ucreds_get)(void **ucreds_pp); ++ ++ void (*ucreds_put)(void *ucreds); ++ ++ int (*sam_read)( ++ pan_sam_access_flags_t flags, ++ pan_sam_read_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_read_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p); ++ ++ int (*sam_write)( ++ pan_sam_access_flags_t flags, ++ pan_sam_write_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_write_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p); ++}; ++ ++extern int ++panfs_shim_register(struct panfs_export_operations *ops); ++ ++extern int ++panfs_shim_unregister(void); ++ ++#endif /* _PANFS_SHIM_API_H */ +diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-31 20:42:05.600025088 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-31 20:42:05.600025088 -0400 +@@ -0,0 +1,439 @@ ++/* ++ * pnfs_osd_xdr.h ++ * ++ * pNFS-osd on-the-wire data structures ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. 
++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#ifndef __PNFS_OSD_XDR_H__ ++#define __PNFS_OSD_XDR_H__ ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_OSD_OSDNAME_MAXSIZE 256 ++ ++/* ++ * START OF "GENERIC" DECODE ROUTINES. ++ * These may look a little ugly since they are imported from a "generic" ++ * set of XDR encode/decode routines which are intended to be shared by ++ * all of our NFSv4 implementations (OpenBSD, MacOS X...). 
++ * ++ * If the pain of reading these is too great, it should be a straightforward ++ * task to translate them into Linux-specific versions which are more ++ * consistent with the style used in NFSv2/v3... ++ */ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (u64)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++ ++/* ++ * draft-ietf-nfsv4-minorversion-22 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* Layout Structure */ ++ ++enum pnfs_osd_raid_algorithm4 { ++ PNFS_OSD_RAID_0 = 1, ++ PNFS_OSD_RAID_4 = 2, ++ PNFS_OSD_RAID_5 = 3, ++ PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ ++}; ++ ++/* struct pnfs_osd_data_map4 { ++ * uint32_t odm_num_comps; ++ * length4 odm_stripe_unit; ++ * uint32_t odm_group_width; ++ * uint32_t odm_group_depth; ++ * uint32_t odm_mirror_cnt; ++ * pnfs_osd_raid_algorithm4 odm_raid_algorithm; ++ * }; ++ */ ++struct pnfs_osd_data_map { ++ u32 odm_num_comps; ++ u64 odm_stripe_unit; ++ u32 odm_group_width; ++ u32 odm_group_depth; ++ u32 odm_mirror_cnt; ++ u32 odm_raid_algorithm; ++}; ++ ++static inline int ++pnfs_osd_data_map_xdr_sz(void) ++{ ++ return 1 + 2 + 1 + 1 + 1 + 1; ++} ++ ++static inline size_t ++pnfs_osd_data_map_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_data_map); ++} ++ ++/* struct pnfs_osd_objid4 { ++ * deviceid4 oid_device_id; ++ * uint64_t oid_partition_id; ++ * uint64_t oid_object_id; ++ * }; ++ */ ++struct pnfs_osd_objid { ++ struct pnfs_deviceid oid_device_id; ++ u64 oid_partition_id; ++ u64 oid_object_id; ++}; ++ ++/* For printout. 
I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */ ++#define _DEVID_LO(oid_device_id) \ ++ (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data) ++ ++#define _DEVID_HI(oid_device_id) \ ++ (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1) ++ ++static inline int ++pnfs_osd_objid_xdr_sz(void) ++{ ++ return (NFS4_PNFS_DEVICEID4_SIZE / 4) + 2 + 2; ++} ++ ++static inline size_t ++pnfs_osd_objid_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_objid); ++} ++ ++enum pnfs_osd_version { ++ PNFS_OSD_MISSING = 0, ++ PNFS_OSD_VERSION_1 = 1, ++ PNFS_OSD_VERSION_2 = 2 ++}; ++ ++struct pnfs_osd_opaque_cred { ++ u32 cred_len; ++ u8 *cred; ++}; ++ ++static inline int ++pnfs_osd_opaque_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ READ32(n); ++ p += XDR_QUADLEN(n); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_opaque_cred_incore_sz(u32 *p) ++{ ++ u32 n; ++ ++ READ32(n); ++ return XDR_QUADLEN(n) * 4; ++} ++ ++enum pnfs_osd_cap_key_sec { ++ PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ PNFS_OSD_CAP_KEY_SEC_SSV = 1, ++}; ++ ++/* struct pnfs_osd_object_cred4 { ++ * pnfs_osd_objid4 oc_object_id; ++ * pnfs_osd_version4 oc_osd_version; ++ * pnfs_osd_cap_key_sec4 oc_cap_key_sec; ++ * opaque oc_capability_key<>; ++ * opaque oc_capability<>; ++ * }; ++ */ ++struct pnfs_osd_object_cred { ++ struct pnfs_osd_objid oc_object_id; ++ u32 oc_osd_version; ++ u32 oc_cap_key_sec; ++ struct pnfs_osd_opaque_cred oc_cap_key; ++ struct pnfs_osd_opaque_cred oc_cap; ++}; ++ ++static inline int ++pnfs_osd_object_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_object_cred_incore_sz(u32 *p) ++{ ++ size_t sz = sizeof(struct pnfs_osd_object_cred); ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ sz += pnfs_osd_opaque_cred_incore_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ sz += 
pnfs_osd_opaque_cred_incore_sz(p); ++ return sz; ++} ++ ++/* struct pnfs_osd_layout4 { ++ * pnfs_osd_data_map4 olo_map; ++ * uint32_t olo_comps_index; ++ * pnfs_osd_object_cred4 olo_components<>; ++ * }; ++ */ ++struct pnfs_osd_layout { ++ struct pnfs_osd_data_map olo_map; ++ u32 olo_comps_index; ++ u32 olo_num_comps; ++ struct pnfs_osd_object_cred *olo_comps; ++}; ++ ++static inline int ++pnfs_osd_layout_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ while ((int)(n--) > 0) ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_layout_incore_sz(u32 *p) ++{ ++ u32 n; ++ size_t sz; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ sz = sizeof(struct pnfs_osd_layout); ++ while ((int)(n--) > 0) { ++ sz += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ return sz; ++} ++ ++/* Device Address */ ++ ++enum pnfs_osd_targetid_type { ++ OBJ_TARGET_ANON = 1, ++ OBJ_TARGET_SCSI_NAME = 2, ++ OBJ_TARGET_SCSI_DEVICE_ID = 3, ++}; ++ ++/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) { ++ * case OBJ_TARGET_SCSI_NAME: ++ * string oti_scsi_name<>; ++ * ++ * case OBJ_TARGET_SCSI_DEVICE_ID: ++ * opaque oti_scsi_device_id<>; ++ * ++ * default: ++ * void; ++ * }; ++ * ++ * union pnfs_osd_targetaddr4 switch (bool ota_available) { ++ * case TRUE: ++ * netaddr4 ota_netaddr; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_deviceaddr4 { ++ * pnfs_osd_targetid4 oda_targetid; ++ * pnfs_osd_targetaddr4 oda_targetaddr; ++ * uint64_t oda_lun; ++ * opaque oda_systemid<>; ++ * pnfs_osd_object_cred4 oda_root_obj_cred; ++ * opaque oda_osdname<>; ++ * }; ++ */ ++struct pnfs_osd_targetid { ++ u32 oti_type; ++ struct nfs4_string oti_scsi_device_id; ++}; ++ ++enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 }; ++ ++/* struct netaddr4 { ++ * // see struct rpcb in RFC1833 ++ * string r_netid<>; // network id 
++ * string r_addr<>; // universal address ++ * }; ++ */ ++struct pnfs_osd_net_addr { ++ struct nfs4_string r_netid; ++ struct nfs4_string r_addr; ++}; ++ ++struct pnfs_osd_targetaddr { ++ u32 ota_available; ++ struct pnfs_osd_net_addr ota_netaddr; ++}; ++ ++enum { ++ NETWORK_ID_MAX = 16 / 4, ++ UNIVERSAL_ADDRESS_MAX = 64 / 4, ++ PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX, ++}; ++ ++struct pnfs_osd_deviceaddr { ++ struct pnfs_osd_targetid oda_targetid; ++ struct pnfs_osd_targetaddr oda_targetaddr; ++ u8 oda_lun[8]; ++ struct nfs4_string oda_systemid; ++ struct pnfs_osd_object_cred oda_root_obj_cred; ++ struct nfs4_string oda_osdname; ++}; ++ ++enum { ++ ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4, ++ PNFS_OSD_DEVICEADDR_MAX = ++ PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX + ++ 2 /*oda_lun*/ + ++ 1 + OSD_SYSTEMID_LEN + ++ 1 + ODA_OSDNAME_MAX, ++}; ++ ++/* LAYOUTCOMMIT: layoutupdate */ ++ ++/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) { ++ * case TRUE: ++ * int64_t dsu_delta; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_layoutupdate4 { ++ * pnfs_osd_deltaspaceused4 olu_delta_space_used; ++ * bool olu_ioerr_flag; ++ * }; ++ */ ++struct pnfs_osd_layoutupdate { ++ u32 dsu_valid; ++ s64 dsu_delta; ++ u32 olu_ioerr_flag; ++}; ++ ++/* LAYOUTRETURN: I/O Error Report */ ++ ++enum pnfs_osd_errno { ++ PNFS_OSD_ERR_EIO = 1, ++ PNFS_OSD_ERR_NOT_FOUND = 2, ++ PNFS_OSD_ERR_NO_SPACE = 3, ++ PNFS_OSD_ERR_BAD_CRED = 4, ++ PNFS_OSD_ERR_NO_ACCESS = 5, ++ PNFS_OSD_ERR_UNREACHABLE = 6, ++ PNFS_OSD_ERR_RESOURCE = 7 ++}; ++ ++/* struct pnfs_osd_ioerr4 { ++ * pnfs_osd_objid4 oer_component; ++ * length4 oer_comp_offset; ++ * length4 oer_comp_length; ++ * bool oer_iswrite; ++ * pnfs_osd_errno4 oer_errno; ++ * }; ++ */ ++struct pnfs_osd_ioerr { ++ struct pnfs_osd_objid oer_component; ++ u64 oer_comp_offset; ++ u64 oer_comp_length; ++ u32 oer_iswrite; ++ u32 oer_errno; ++}; ++ ++static inline unsigned ++pnfs_osd_ioerr_xdr_sz(void)
++{ ++ return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1; ++} ++ ++/* OSD XDR API */ ++ ++/* Layout helpers */ ++extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout( ++ struct pnfs_osd_layout *layout, u32 *p); ++ ++extern int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *layout); ++ ++/* Device Info helpers */ ++ ++/* First pass calculate total size for space needed */ ++extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p); ++ ++/* Note: some strings pointed to inside @deviceaddr might point ++ * to space inside @p. @p should stay valid while @deviceaddr ++ * is in use. ++ * It is assumed that @deviceaddr points to bigger memory of size ++ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz() ++ */ ++extern void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p); ++ ++/* For Servers */ ++extern int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr); ++ ++/* layoutupdate (layout_commit) xdr helpers */ ++extern int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou); ++extern __be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p); ++ ++/* osd_ioerror encoding/decoding (layout_return) */ ++extern int ++pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr); ++extern __be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p); ++ ++#endif /* __PNFS_OSD_XDR_H__ */ +diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h +--- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-31 20:42:05.601087875 -0400 +@@ -8,6 +8,7 @@ + #ifndef __LINUX_POSIX_ACL_H + #define __LINUX_POSIX_ACL_H + ++#include + #include + + #define ACL_UNDEFINED_ID (-1) +diff -up 
linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h +--- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-31 20:42:05.602100892 -0400 +@@ -14,6 +14,8 @@ + /* size of an XDR encoding unit in bytes, i.e. 32bit */ + #define XDR_UNIT (4) + ++#include ++ + /* spec defines authentication flavor as an unsigned 32 bit integer */ + typedef u32 rpc_authflavor_t; + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-31 20:42:05.603108001 -0400 +@@ -3,6 +3,7 @@ + + #ifdef __KERNEL__ + ++#include + #include + + struct rpc_pipe_msg { +@@ -11,6 +12,10 @@ struct rpc_pipe_msg { + size_t len; + size_t copied; + int errno; ++#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */ ++#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */ ++#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA ++ u8 flags; + }; + + struct rpc_pipe_ops { +diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-31 20:42:05.603108001 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-31 20:42:05.603108001 -0400 +@@ -0,0 +1,111 @@ ++/* ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. 
++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#ifndef _SIMPLE_RPC_PIPEFS_H_ ++#define _SIMPLE_RPC_PIPEFS_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#define payload_of(headerp) ((void *)(headerp + 1)) ++ ++/* ++ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs. ++ * Messages may simply be the header itself, although having an optional ++ * data payload follow the header allows much more flexibility. 
++ * ++ * Messages are created using pipefs_alloc_init_msg() and ++ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an ++ * (optional) data payload. ++ * ++ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data ++ * can be accessed using: struct foo *foop = payload_of(msg) ++ */ ++struct pipefs_hdr { ++ u32 msgid; ++ u8 type; ++ u8 flags; ++ u16 totallen; /* length of entire message, including hdr itself */ ++ u32 status; ++}; ++ ++/* ++ * struct pipefs_list -- a type of list used for tracking callers who've made an ++ * upcall and are blocked waiting for a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply(). ++ */ ++struct pipefs_list { ++ struct list_head list; ++ spinlock_t list_lock; ++}; ++ ++ ++/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions. */ ++extern struct dentry *pipefs_mkpipe(const char *name, ++ const struct rpc_pipe_ops *ops, ++ int wait_for_open); ++extern void pipefs_closepipe(struct dentry *pipe); ++extern void pipefs_init_list(struct pipefs_list *list); ++extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen); ++extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, ++ u8 flags, void *data, ++ u16 datalen, u16 padlen); ++extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list ++ *uplist, u8 upflags, ++ u32 timeout); ++extern int pipefs_queue_upcall_noreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, u8 upflags); ++extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist); ++extern struct pipefs_hdr *pipefs_readmsg(struct file *filp, ++ const char __user *src, size_t len); ++extern ssize_t pipefs_generic_upcall(struct file *filp, ++ struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen); ++extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg); 
++ ++#endif /* _SIMPLE_RPC_PIPEFS_H_ */ +diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h +--- linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-31 20:42:05.604049784 -0400 +@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con + + return buf; + } ++ ++/* ++ * Print a network address in a universal format (see rfc1833 and nfsv4.1) ++ */ ++static inline int __svc_print_netaddr(struct sockaddr *addr, ++ struct xdr_netobj *na) ++{ ++ u16 port; ++ ssize_t len; ++ ++ switch (addr->sa_family) { ++ case AF_INET: { ++ struct sockaddr_in *sin = (struct sockaddr_in *)addr; ++ port = ntohs(sin->sin_port); ++ ++ len = snprintf(na->data, na->len, "%pI4.%u.%u", ++ &sin->sin_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ case AF_INET6: { ++ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; ++ port = ntohs(sin6->sin6_port); ++ ++ len = snprintf(na->data, na->len, "%pI6.%u.%u", ++ &sin6->sin6_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ default: ++ snprintf(na->data, na->len, "unknown address type: %d", ++ addr->sa_family); ++ len = -EINVAL; ++ break; ++ } ++ return len; ++} + #endif /* SUNRPC_SVC_XPRT_H */ +diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-31 20:41:19.173118431 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-31 20:42:05.605107904 -0400 +@@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) + return p + 2; + } + ++static inline __be32 * ++xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len) ++{ ++ memcpy(ptr, p, len); ++ return p + XDR_QUADLEN(len); ++} ++ + /* + * Adjust kvec to reflect end of xdr'ed data (RPC client XDR) + */ +@@ -197,6 +204,7 @@ struct xdr_stream { + + extern void xdr_init_encode(struct 
xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); + extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); ++extern __be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q); + extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, + unsigned int base, unsigned int len); + extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs +--- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-31 20:42:05.605107904 -0400 ++++ linux-2.6.34.noarch/localversion-pnfs 2010-08-31 20:42:05.605107904 -0400 +@@ -0,0 +1 @@ ++-pnfs +diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile +--- linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-31 20:42:05.606020148 -0400 +@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt + svc.o svcsock.o svcauth.o svcauth_unix.o \ + addr.o rpcb_clnt.o timer.o xdr.o \ + sunrpc_syms.o cache.o rpc_pipe.o \ +- svc_xprt.o ++ svc_xprt.o simple_rpc_pipefs.o + sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o + sunrpc-$(CONFIG_PROC_FS) += stats.o + sunrpc-$(CONFIG_SYSCTL) += sysctl.o +diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c +--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-31 20:42:05.606020148 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-31 20:42:05.607108065 -0400 +@@ -0,0 +1,424 @@ ++/* ++ * net/sunrpc/simple_rpc_pipefs.c ++ * ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. 
++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs ++ * filesystem. ++ * ++ * If @wait_for_open is non-zero and an upcall is later queued but the userland ++ * end of the pipe has not yet been opened, the upcall will remain queued until ++ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE. 
++ */ ++struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops, ++ int wait_for_open) ++{ ++ struct dentry *dir, *pipe; ++ struct vfsmount *mnt; ++ ++ mnt = rpc_get_mount(); ++ if (IS_ERR(mnt)) { ++ pipe = ERR_CAST(mnt); ++ goto out; ++ } ++ dir = mnt->mnt_root; ++ if (!dir) { ++ pipe = ERR_PTR(-ENOENT); ++ goto out; ++ } ++ pipe = rpc_mkpipe(dir, name, NULL, ops, ++ wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0); ++out: ++ return pipe; ++} ++EXPORT_SYMBOL(pipefs_mkpipe); ++ ++/* ++ * Shutdown a pipe made by pipefs_mkpipe(). ++ * XXX: do we need to retain an extra reference on the mount? ++ */ ++void pipefs_closepipe(struct dentry *pipe) ++{ ++ rpc_unlink(pipe); ++ rpc_put_mount(); ++} ++EXPORT_SYMBOL(pipefs_closepipe); ++ ++/* ++ * Initialize a struct pipefs_list -- which are a way to keep track of callers ++ * who're blocked having made an upcall and are awaiting a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_find_upcall_msgid() for how ++ * to use them. ++ */ ++inline void pipefs_init_list(struct pipefs_list *list) ++{ ++ INIT_LIST_HEAD(&list->list); ++ spin_lock_init(&list->list_lock); ++} ++EXPORT_SYMBOL(pipefs_init_list); ++ ++/* ++ * Alloc/init a generic pipefs message header and copy into its message body ++ * an arbitrary data payload. ++ * ++ * struct pipefs_hdr's are meant to serve as generic, general-purpose message ++ * headers for easy rpc_pipefs I/O. When an upcall is made, the ++ * struct pipefs_hdr is assigned to a struct rpc_pipe_msg and delivered ++ * therein. --And yes, the naming can seem a little confusing at first: ++ * ++ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a ++ * struct pipefs_hdr (possibly with an attached message body). A ++ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real" ++ * message is delivered and processed. 
++ */ ++struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen, u16 padlen) ++{ ++ u16 totallen; ++ struct pipefs_hdr *msg = NULL; ++ ++ totallen = sizeof(*msg) + datalen + padlen; ++ if (totallen > PAGE_SIZE) { ++ msg = ERR_PTR(-E2BIG); ++ goto out; ++ } ++ ++ msg = kzalloc(totallen, GFP_KERNEL); ++ if (!msg) { ++ msg = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ msg->msgid = msgid; ++ msg->type = type; ++ msg->flags = flags; ++ msg->totallen = totallen; ++ memcpy(payload_of(msg), data, datalen); ++out: ++ return msg; ++} ++EXPORT_SYMBOL(pipefs_alloc_init_msg_padded); ++ ++/* ++ * See the description of pipefs_alloc_init_msg_padded(). ++ */ ++struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen) ++{ ++ return pipefs_alloc_init_msg_padded(msgid, type, flags, data, ++ datalen, 0); ++} ++EXPORT_SYMBOL(pipefs_alloc_init_msg); ++ ++ ++static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ memset(rpcmsg, 0, sizeof(*rpcmsg)); ++ rpcmsg->data = msg; ++ rpcmsg->len = msg->totallen; ++ rpcmsg->flags = upflags; ++} ++ ++static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg, ++ u8 upflags) ++{ ++ struct rpc_pipe_msg *rpcmsg; ++ ++ rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL); ++ if (!rpcmsg) ++ return ERR_PTR(-ENOMEM); ++ ++ pipefs_init_rpcmsg(rpcmsg, msg, upflags); ++ return rpcmsg; ++} ++ ++ ++/* represents an upcall that'll block and wait for a reply */ ++struct pipefs_upcall { ++ u32 msgid; ++ struct rpc_pipe_msg rpcmsg; ++ struct list_head list; ++ wait_queue_head_t waitq; ++ struct pipefs_hdr *reply; ++}; ++ ++ ++static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ upcall->reply = NULL; ++ upcall->msgid = msg->msgid; ++ INIT_LIST_HEAD(&upcall->list); ++ init_waitqueue_head(&upcall->waitq); ++ pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags); 
++} ++ ++static int __pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_upcall *upcall, ++ struct pipefs_list *uplist, ++ u32 timeout) ++{ ++ int err = 0; ++ DECLARE_WAITQUEUE(wq, current); ++ ++ add_wait_queue(&upcall->waitq, &wq); ++ spin_lock(&uplist->list_lock); ++ list_add(&upcall->list, &uplist->list); ++ spin_unlock(&uplist->list_lock); ++ ++ err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg); ++ if (err < 0) ++ goto out; ++ ++ if (timeout) { ++ /* retval of 0 means timer expired */ ++ err = schedule_timeout_uninterruptible(timeout); ++ if (err == 0 && upcall->reply == NULL) ++ err = -ETIMEDOUT; ++ } else { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ } ++ ++out: ++ spin_lock(&uplist->list_lock); ++ list_del_init(&upcall->list); ++ spin_unlock(&uplist->list_lock); ++ remove_wait_queue(&upcall->waitq, &wq); ++ return err; ++} ++ ++/* ++ * Queue a pipefs msg for an upcall to userspace, place the calling thread ++ * on @uplist, and block the thread to wait for a reply. If @timeout is ++ * nonzero, the thread will be blocked for at most @timeout jiffies. ++ * ++ * (To convert time units into jiffies, consider the functions ++ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and ++ * timespec_to_jiffies().) ++ * ++ * Once a reply is received by your downcall handler, call ++ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall, ++ * assign the reply, and wake the waiting thread. ++ * ++ * This function's return value pointer may be an error and should be checked ++ * with IS_ERR() before attempting to access the reply message. ++ * ++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() ++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG ++ * flag is set in @upflags. See also rpc_pipe_fs.h. 
++ */ ++struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list *uplist, ++ u8 upflags, u32 timeout) ++{ ++ int err = 0; ++ struct pipefs_upcall upcall; ++ ++ pipefs_init_upcall_waitreply(&upcall, msg, upflags); ++ err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout); ++ if (err < 0) { ++ kfree(upcall.reply); ++ upcall.reply = ERR_PTR(err); ++ } ++ ++ return upcall.reply; ++} ++EXPORT_SYMBOL(pipefs_queue_upcall_waitreply); ++ ++/* ++ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e., ++ * no reply is expected). ++ * ++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() ++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG ++ * flag is set in @upflags. See also rpc_pipe_fs.h. ++ */ ++int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg, ++ u8 upflags) ++{ ++ int err = 0; ++ struct rpc_pipe_msg *rpcmsg; ++ ++ upflags |= PIPEFS_AUTOFREE_RPCMSG; ++ rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags); ++ if (IS_ERR(rpcmsg)) { ++ err = PTR_ERR(rpcmsg); ++ goto out; ++ } ++ err = rpc_queue_upcall(pipe->d_inode, rpcmsg); ++out: ++ return err; ++} ++EXPORT_SYMBOL(pipefs_queue_upcall_noreply); ++ ++ ++static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid, ++ struct pipefs_list *uplist) ++{ ++ struct pipefs_upcall *upcall; ++ ++ spin_lock(&uplist->list_lock); ++ list_for_each_entry(upcall, &uplist->list, list) ++ if (upcall->msgid == msgid) ++ goto out; ++ upcall = NULL; ++out: ++ spin_unlock(&uplist->list_lock); ++ return upcall; ++} ++ ++/* ++ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall ++ * message and have determined that it is a reply to a waiting upcall, ++ * you can use this function to find the appropriate upcall, assign the result, ++ * and wake the upcall thread. ++ * ++ * The reply message must have the same msgid as the original upcall message's. 
++ * ++ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg(). ++ */ ++int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist) ++{ ++ int err = 0; ++ struct pipefs_upcall *upcall; ++ ++ upcall = pipefs_find_upcall_msgid(reply->msgid, uplist); ++ if (!upcall) { ++ printk(KERN_ERR "%s: ERROR: have reply but no matching upcall " ++ "for msgid %d\n", __func__, reply->msgid); ++ err = -ENOENT; ++ goto out; ++ } ++ upcall->reply = reply; ++ wake_up(&upcall->waitq); ++out: ++ return err; ++} ++EXPORT_SYMBOL(pipefs_assign_upcall_reply); ++ ++/* ++ * Generic method to read-in and return a newly-allocated message which begins ++ * with a struct pipefs_hdr. ++ */ ++struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src, ++ size_t len) ++{ ++ int err = 0, hdrsize; ++ struct pipefs_hdr *msg = NULL; ++ ++ hdrsize = sizeof(*msg); ++ if (len < hdrsize) { ++ printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n", ++ __func__, (int) len, hdrsize); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ msg = kzalloc(len, GFP_KERNEL); ++ if (!msg) { ++ err = -ENOMEM; ++ goto out; ++ } ++ if (copy_from_user(msg, src, len)) ++ err = -EFAULT; ++out: ++ if (err) { ++ kfree(msg); ++ msg = ERR_PTR(err); ++ } ++ return msg; ++} ++EXPORT_SYMBOL(pipefs_readmsg); ++ ++/* ++ * Generic rpc_pipe_ops->upcall() handler implementation. ++ * ++ * Don't call this directly: to make an upcall, use ++ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply(). 
++ */ ++ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen) ++{ ++ char *data; ++ ssize_t len, left; ++ ++ data = (char *)rpcmsg->data + rpcmsg->copied; ++ len = rpcmsg->len - rpcmsg->copied; ++ if (len > buflen) ++ len = buflen; ++ ++ left = copy_to_user(dst, data, len); ++ if (left < 0) { ++ rpcmsg->errno = left; ++ return left; ++ } ++ ++ len -= left; ++ rpcmsg->copied += len; ++ rpcmsg->errno = 0; ++ return len; ++} ++EXPORT_SYMBOL(pipefs_generic_upcall); ++ ++/* ++ * Generic rpc_pipe_ops->destroy_msg() handler implementation. ++ * ++ * Items are only freed if @rpcmsg->flags has been set appropriately. ++ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h. ++ */ ++void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg) ++{ ++ if (rpcmsg->flags & PIPEFS_AUTOFREE_UPCALL_MSG) ++ kfree(rpcmsg->data); ++ if (rpcmsg->flags & PIPEFS_AUTOFREE_RPCMSG) ++ kfree(rpcmsg); ++} ++EXPORT_SYMBOL(pipefs_generic_destroy_msg); +diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-31 20:41:19.188144022 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-31 20:42:05.607108065 -0400 +@@ -395,24 +395,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf, + { + struct kvec *tail; + size_t copy; +- char *p; + unsigned int pglen = buf->page_len; ++ unsigned int tailbuf_len; + + tail = buf->tail; + BUG_ON (len > pglen); + ++ tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; ++ + /* Shift the tail first */ +- if (tail->iov_len != 0) { +- p = (char *)tail->iov_base + len; ++ if (tailbuf_len != 0) { ++ unsigned int free_space = tailbuf_len - tail->iov_len; ++ ++ if (len < free_space) ++ free_space = len; ++ tail->iov_len += free_space; ++ ++ copy = len; + if (tail->iov_len > len) { +- copy = tail->iov_len - len; +- memmove(p, tail->iov_base, copy); ++ char *p = (char *)tail->iov_base + len; ++ memmove(p, tail->iov_base, 
tail->iov_len - len); + } else +- buf->buflen -= len; +- /* Copy from the inlined pages into the tail */ +- copy = len; +- if (copy > tail->iov_len) + copy = tail->iov_len; ++ /* Copy from the inlined pages into the tail */ + _copy_from_pages((char *)tail->iov_base, + buf->pages, buf->page_base + pglen - len, + copy); +@@ -496,6 +501,27 @@ __be32 * xdr_reserve_space(struct xdr_st + EXPORT_SYMBOL_GPL(xdr_reserve_space); + + /** ++ * xdr_rewind_stream - rewind a stream back to some checkpoint ++ * @xdr: pointer to xdr_stream ++ * @q: some checkpoint at historical place of @xdr ++ * ++ * Restors an xdr stream to some historical point. @q must be ++ * a logical xdr point in the past that was sampled by @q = @xdr->p. ++ */ ++__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q) ++{ ++ size_t nbytes = (xdr->p - q) << 2; ++ ++ BUG_ON(xdr->p < q); ++ BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len); ++ xdr->p = q; ++ xdr->iov->iov_len -= nbytes; ++ xdr->buf->len -= nbytes; ++ return q; ++} ++EXPORT_SYMBOL_GPL(xdr_rewind_stream); ++ ++/** + * xdr_write_pages - Insert a list of pages into an XDR buffer for sending + * @xdr: pointer to xdr_stream + * @pages: list of pages From cf0a5bb309bf8f67c9d4549718137d08d20da726 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Sat, 4 Sep 2010 09:23:12 -0400 Subject: [PATCH 19/20] Removed localversion-nfs file Signed-off-by: Steve Dickson --- pnfs-all-2.6.35-2010-08-24-f13.patch | 393 +++++++++++++-------------- 1 file changed, 194 insertions(+), 199 deletions(-) diff --git a/pnfs-all-2.6.35-2010-08-24-f13.patch b/pnfs-all-2.6.35-2010-08-24-f13.patch index 17d1c844d..7d82d9fa4 100644 --- a/pnfs-all-2.6.35-2010-08-24-f13.patch +++ b/pnfs-all-2.6.35-2010-08-24-f13.patch @@ -1,6 +1,6 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c ---- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-08-31 20:41:16.924243041 -0400 -+++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 
2010-08-31 20:42:05.486160576 -0400 +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-09-04 09:20:04.110038647 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-09-04 09:21:44.875202803 -0400 @@ -13,6 +13,7 @@ #include #include @@ -11,7 +11,7 @@ diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arc #include "os.h" diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c --- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/block/genhd.c 2010-08-31 20:42:05.487160201 -0400 ++++ linux-2.6.34.noarch/block/genhd.c 2010-09-04 09:21:44.875202803 -0400 @@ -1009,6 +1009,7 @@ static void disk_release(struct device * struct class block_class = { .name = "block", @@ -21,8 +21,8 @@ diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd. static char *block_devnode(struct device *dev, mode_t *mode) { diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt ---- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-31 20:42:05.486160576 -0400 -+++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-08-31 20:42:05.486160576 -0400 +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-09-04 09:21:44.876222743 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-09-04 09:21:44.876222743 -0400 @@ -0,0 +1,211 @@ +(c) 2007 Network Appliance Inc. + @@ -236,8 +236,8 @@ diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6. 
+ + diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c ---- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-08-31 20:41:17.063232968 -0400 -+++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-08-31 20:42:05.488160560 -0400 +--- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-09-04 09:20:04.252180557 -0400 ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-09-04 09:21:44.877242928 -0400 @@ -657,6 +657,12 @@ static int dev_create(struct dm_ioctl *p return r; } @@ -292,7 +292,7 @@ diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/driv int r; diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c --- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-08-31 20:42:05.489160594 -0400 ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-09-04 09:21:44.879035601 -0400 @@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct put_device(&class_to_shost(dev)->shost_gendev); } @@ -304,7 +304,7 @@ diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drive }; diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h --- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-08-31 20:42:05.492243039 -0400 ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-09-04 09:21:44.879035601 -0400 @@ -36,13 +36,9 @@ #include #include @@ -360,8 +360,8 @@ diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/ + #endif diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c ---- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-08-31 20:42:05.493222759 -0400 -+++ linux-2.6.34.noarch/fs/exofs/export.c 2010-08-31 20:42:05.493222759 -0400 +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-09-04 09:21:44.880171068 -0400 ++++ 
linux-2.6.34.noarch/fs/exofs/export.c 2010-09-04 09:21:44.880171068 -0400 @@ -0,0 +1,396 @@ +/* + * export.c - Implementation of the pnfs_export_operations @@ -761,7 +761,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs +} diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c --- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-08-31 20:42:05.494222756 -0400 ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-09-04 09:21:44.881160952 -0400 @@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) if (unlikely(wait_obj_created(oi))) goto fail; @@ -781,7 +781,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/ * Fill in an inode read from the OSD and set it up for use diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild --- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-08-31 20:42:05.490222933 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-09-04 09:21:44.882160660 -0400 @@ -13,4 +13,5 @@ # @@ -790,7 +790,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/K obj-$(CONFIG_EXOFS_FS) += exofs.o diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig --- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-08-31 20:42:05.491232880 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-09-04 09:21:44.883039027 -0400 @@ -1,6 +1,7 @@ config EXOFS_FS tristate "exofs: OSD based file system support" @@ -801,7 +801,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/ as its backing storage. 
diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c --- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exofs/super.c 2010-08-31 20:42:05.496073173 -0400 ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-09-04 09:21:44.883039027 -0400 @@ -621,6 +621,7 @@ static int exofs_fill_super(struct super sb->s_fs_info = sbi; sb->s_op = &exofs_sops; @@ -812,7 +812,7 @@ diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/ EXOFS_ERR("ERROR: exofs_iget failed\n"); diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c --- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-08-31 20:42:05.497212975 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-09-04 09:21:44.884180594 -0400 @@ -16,6 +16,13 @@ #include #include @@ -829,7 +829,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exp diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile --- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-08-31 20:42:05.496073173 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-09-04 09:21:44.885160697 -0400 @@ -3,4 +3,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs.o @@ -840,8 +840,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/ex +exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o +exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c ---- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-31 20:42:05.497212975 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-31 20:42:05.498113655 -0400 +--- 
linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-09-04 09:21:44.885160697 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-09-04 09:21:44.885160697 -0400 @@ -0,0 +1,158 @@ +/* + * linux/fs/nfsd/nfs4blocklayoutxdr.c @@ -1002,8 +1002,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34. +} +EXPORT_SYMBOL_GPL(blocklayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c ---- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-31 20:42:05.498113655 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-31 20:42:05.498113655 -0400 +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-09-04 09:21:44.886051895 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-09-04 09:21:44.886051895 -0400 @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -1224,8 +1224,8 @@ diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.n +} +EXPORT_SYMBOL(filelayout_encode_layout); diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c ---- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-31 20:42:05.499125509 -0400 -+++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-31 20:42:05.499125509 -0400 +--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-09-04 09:21:44.887054758 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-09-04 09:21:44.887054758 -0400 @@ -0,0 +1,289 @@ +/* + * pnfs_osd_xdr_enc.c @@ -1518,7 +1518,7 @@ diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.no +EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c --- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 
-0400 -+++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-08-31 20:42:05.500123860 -0400 ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-09-04 09:21:44.888035389 -0400 @@ -19,6 +19,7 @@ #include #include @@ -1539,7 +1539,7 @@ diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gf sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig --- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/Kconfig 2010-08-31 20:42:05.490222933 -0400 ++++ linux-2.6.34.noarch/fs/Kconfig 2010-09-04 09:21:44.889035490 -0400 @@ -224,6 +224,31 @@ config LOCKD_V4 config EXPORTFS tristate @@ -1573,8 +1573,8 @@ diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig tristate select FS_POSIX_ACL diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-31 20:42:05.503222878 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-31 20:42:05.503222878 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-09-04 09:21:44.890035431 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-09-04 09:21:44.890035431 -0400 @@ -0,0 +1,66 @@ +#include +#include @@ -1643,8 +1643,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.or + return; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-31 20:42:05.504232855 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-31 20:42:05.504232855 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-09-04 09:21:44.891045310 
-0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-09-04 09:21:44.891045310 -0400 @@ -0,0 +1,1160 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c @@ -2807,8 +2807,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34. +module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-31 20:42:05.506119071 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-31 20:42:05.506119071 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-09-04 09:21:44.892025716 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-09-04 09:21:44.892025716 -0400 @@ -0,0 +1,335 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdev.c @@ -3146,8 +3146,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6. 
+ goto out; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-31 20:42:05.506119071 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-31 20:42:05.506119071 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-09-04 09:21:44.893035500 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-09-04 09:21:44.893035500 -0400 @@ -0,0 +1,120 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c @@ -3270,8 +3270,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.3 + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h ---- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-31 20:42:05.505169618 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-31 20:42:05.505169618 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-09-04 09:21:44.894045279 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-09-04 09:21:44.894045279 -0400 @@ -0,0 +1,302 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -3576,8 +3576,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34. 
+ +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c ---- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-31 20:42:05.507113260 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-08-31 20:42:05.508119925 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-09-04 09:21:44.895035248 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-09-04 09:21:44.895035248 -0400 @@ -0,0 +1,948 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h @@ -4528,8 +4528,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noar + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile ---- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-31 20:42:05.502212803 -0400 -+++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-08-31 20:42:05.502212803 -0400 +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-09-04 09:21:44.895035248 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-09-04 09:21:44.896025369 -0400 @@ -0,0 +1,6 @@ +# +# Makefile for the pNFS block layout driver kernel module @@ -4539,7 +4539,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarc + extents.o block-device-discovery-pipe.o diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h --- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-08-31 20:42:05.508119925 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-09-04 09:21:44.896025369 -0400 @@ -111,6 +111,13 @@ extern int nfs41_validate_delegation_sta #define RCA4_TYPE_MASK_RDATA_DLG 0 @@ -4596,7 +4596,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/c extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct 
cb_getattrres *res); diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c --- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-08-31 20:42:05.509093330 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-09-04 09:21:44.897056128 -0400 @@ -8,10 +8,15 @@ #include #include @@ -5079,7 +5079,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/ return status; diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c --- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-08-31 20:42:05.510143651 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-09-04 09:21:44.898072186 -0400 @@ -22,6 +22,8 @@ #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) @@ -5281,8 +5281,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/n .process_op = (callback_process_op_t)nfs4_callback_sequence, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c ---- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-08-31 20:41:19.144140225 -0400 -+++ linux-2.6.34.noarch/fs/nfs/client.c 2010-08-31 20:42:05.511222861 -0400 +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-09-04 09:20:05.988202702 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-09-04 09:21:44.900025165 -0400 @@ -39,6 +39,7 @@ #include #include @@ -5491,8 +5491,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/cli goto error; diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c ---- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-08-31 20:42:05.550110844 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-08-31 20:42:05.550110844 -0400 +--- 
linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-09-04 09:21:44.900025165 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-09-04 09:21:44.901035455 -0400 @@ -0,0 +1,292 @@ +#if defined(CONFIG_SPNFS_BLOCK) + @@ -5787,8 +5787,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/b +} +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c ---- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-08-31 20:42:05.551222888 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-08-31 20:42:05.551222888 -0400 +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-09-04 09:21:44.902035254 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-09-04 09:21:44.902035254 -0400 @@ -0,0 +1,1672 @@ +/* + * bl_ops.c @@ -7463,8 +7463,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/b + +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c ---- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-08-31 20:41:19.144140225 -0400 -+++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-08-31 20:42:05.512106042 -0400 +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-09-04 09:20:05.988202702 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-09-04 09:21:44.903025737 -0400 @@ -104,7 +104,8 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) @@ -7541,7 +7541,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs rcu_read_unlock(); diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h --- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-08-31 20:42:05.513114811 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-09-04 09:21:44.904035627 -0400 @@ -34,9 +34,7 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct 
rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -7554,8 +7554,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c ---- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-08-31 20:41:19.196140434 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-08-31 20:42:05.553222784 -0400 +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-09-04 09:20:06.039203080 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-09-04 09:21:44.905045348 -0400 @@ -17,11 +17,19 @@ #include #include @@ -7733,7 +7733,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/e svcauth_unix_purge(); diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/direct.c --- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-08-31 20:42:05.514196343 -0400 ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-09-04 09:21:44.906025356 -0400 @@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea .rpc_release = nfs_direct_read_release, }; @@ -7979,7 +7979,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/dir user_addr += bytes; diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig --- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-08-31 20:42:05.549222922 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-09-04 09:21:44.907035472 -0400 @@ -79,3 +79,52 @@ config NFSD_V4 available from http://linux-nfs.org/. @@ -8035,7 +8035,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kc + If unsure, say N. 
diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile --- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-08-31 20:42:05.549222922 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-09-04 09:21:44.907035472 -0400 @@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ @@ -8045,8 +8045,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/M +nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o +nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-31 20:41:19.197150385 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-08-31 20:42:05.554114789 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-09-04 09:20:06.040212867 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-09-04 09:21:44.908055511 -0400 @@ -40,7 +40,6 @@ #define NFSPROC4_CB_NULL 0 @@ -8586,8 +8586,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/ +} +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-31 20:42:05.556172071 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-31 20:42:05.556172071 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-09-04 09:21:44.910025108 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-09-04 09:21:44.910025108 -0400 @@ -0,0 +1,1679 @@ +/****************************************************************************** + * @@ -10269,8 +10269,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfs + return status; +} diff -up 
linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-31 20:42:05.557222774 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-31 20:42:05.557222774 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-09-04 09:21:44.911025728 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-09-04 09:21:44.911025728 -0400 @@ -0,0 +1,461 @@ +/****************************************************************************** + * @@ -10734,8 +10734,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/n +}; +EXPORT_SYMBOL(pnfs_dlm_export_ops); diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-31 20:42:05.558141620 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-31 20:42:05.558141620 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-09-04 09:21:44.912035398 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-09-04 09:21:44.912035398 -0400 @@ -0,0 +1,620 @@ +/* +* linux/fs/nfsd/nfs4pnfsds.c @@ -11358,8 +11358,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nf + +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-31 20:41:19.198160463 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-08-31 20:42:05.559129617 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-09-04 09:20:06.041223204 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-09-04 09:21:44.913035888 -0400 @@ -34,10 +34,14 @@ */ #include @@ -11834,8 +11834,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 
linux-2.6.34.noarch/fs/nfsd/nfs4state.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-08-31 20:41:19.200150153 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-08-31 20:42:05.561202607 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-09-04 09:20:06.043212709 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-09-04 09:21:44.916015197 -0400 @@ -42,6 +42,8 @@ #include "xdr4.h" #include "vfs.h" @@ -12351,8 +12351,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c ---- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-31 20:41:19.202150173 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-08-31 20:42:05.563232916 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-09-04 09:20:06.045212665 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-09-04 09:21:44.918025318 -0400 @@ -47,9 +47,14 @@ #include #include @@ -12971,8 +12971,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/ [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c ---- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-08-31 20:41:19.203150982 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-08-31 20:42:05.565212801 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-09-04 09:20:06.047233081 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-09-04 09:21:44.920025397 -0400 @@ -13,10 +13,15 @@ #include #include @@ -13149,8 +13149,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/n remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h ---- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-08-31 20:41:19.204160960 -0400 -+++ 
linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-08-31 20:42:05.565212801 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-09-04 09:20:06.047233081 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-09-04 09:21:44.920025397 -0400 @@ -285,11 +285,17 @@ extern time_t nfsd4_grace; #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ NFSD4_SUPPORTED_ATTRS_WORD0 @@ -13172,7 +13172,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfs { diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c --- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-08-31 20:42:05.566222921 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-09-04 09:21:44.921045937 -0400 @@ -10,6 +10,7 @@ #include @@ -13210,7 +13210,7 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nf __u32 tfh[2]; diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h --- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-08-31 20:42:05.567233002 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-09-04 09:21:44.922035547 -0400 @@ -14,6 +14,7 @@ enum nfsd_fsid { FSID_UUID8, FSID_UUID16, @@ -13263,8 +13263,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nf + #endif /* _LINUX_NFSD_FH_INT_H */ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c ---- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-08-31 20:41:17.274232911 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-08-31 20:42:05.568144414 -0400 +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-09-04 09:20:04.514160362 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-09-04 09:21:44.923045353 -0400 @@ -115,7 +115,7 @@ struct svc_program nfsd_program = { }; @@ -13275,8 +13275,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 
linux-2.6.34.noarch/fs/nfsd/n int nfsd_vers(int vers, enum vers_op change) { diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h ---- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-08-31 20:42:05.569090615 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-08-31 20:42:05.569090615 -0400 +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-09-04 09:21:44.923045353 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-09-04 09:21:44.923045353 -0400 @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2005 The Regents of the University of Michigan. @@ -13422,8 +13422,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pn + +#endif /* LINUX_NFSD_PNFSD_H */ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c ---- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-31 20:42:05.569090615 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-31 20:42:05.569090615 -0400 +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-09-04 09:21:44.924046083 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-09-04 09:21:44.924046083 -0400 @@ -0,0 +1,225 @@ +/* + * linux/fs/nfsd/pnfs_lexp.c @@ -13651,8 +13651,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nf + inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; +} diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c ---- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-31 20:42:05.570119170 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-08-31 20:42:05.570119170 -0400 +--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig 2010-09-04 09:21:44.925035828 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c 2010-09-04 09:21:44.925035828 -0400 @@ -0,0 +1,535 @@ +/* + * fs/nfsd/spnfs_com.c @@ -14190,8 +14190,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfs +} +#endif /* CONFIG_PROC_FS */ diff -up 
linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c ---- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-31 20:42:05.571097807 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-08-31 20:42:05.572091128 -0400 +--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-09-04 09:21:44.926030099 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-09-04 09:21:44.926030099 -0400 @@ -0,0 +1,878 @@ +/* + * fs/nfsd/spnfs_ops.c @@ -15072,8 +15072,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfs + return 0; +} diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h ---- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-08-31 20:41:19.205016844 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-08-31 20:42:05.572091128 -0400 +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-09-04 09:20:06.048233523 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-09-04 09:21:44.927025219 -0400 @@ -242,6 +242,12 @@ struct nfs4_client { u32 cl_cb_seq_nr; struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ @@ -15190,8 +15190,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/st + #endif /* NFSD4_STATE_H */ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c ---- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-08-31 20:41:17.275233561 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-08-31 20:42:05.573121119 -0400 +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-09-04 09:20:04.515160297 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-09-04 09:21:44.929025356 -0400 @@ -37,7 +37,12 @@ #ifdef CONFIG_NFSD_V4 #include @@ -15318,8 +15318,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs. 
out_nfserr: err = nfserrno(host_err); diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h ---- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-08-31 20:41:19.206170424 -0400 -+++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-08-31 20:42:05.575139084 -0400 +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-09-04 09:20:06.049232898 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-09-04 09:21:44.930035442 -0400 @@ -37,6 +37,8 @@ #ifndef _LINUX_NFSD_XDR4_H #define _LINUX_NFSD_XDR4_H @@ -15396,8 +15396,8 @@ diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr struct nfs4_replay * replay; }; diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c ---- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-08-31 20:41:19.146161064 -0400 -+++ linux-2.6.34.noarch/fs/nfs/file.c 2010-08-31 20:42:05.515139585 -0400 +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-09-04 09:20:05.990223533 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-09-04 09:21:44.930035442 -0400 @@ -36,6 +36,7 @@ #include "internal.h" #include "iostat.h" @@ -15515,8 +15515,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file. 
if (!ret) return VM_FAULT_LOCKED; diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c ---- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-08-31 20:41:19.149170418 -0400 -+++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-08-31 20:42:05.516222809 -0400 +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-09-04 09:20:05.993222927 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-09-04 09:21:44.932035441 -0400 @@ -48,6 +48,7 @@ #include "internal.h" #include "fscache.h" @@ -15730,8 +15730,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inod nfs_fs_proc_exit(); nfsiod_stop(); diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h ---- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-08-31 20:41:19.149170418 -0400 -+++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-08-31 20:42:05.517099944 -0400 +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-09-04 09:20:05.993222927 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-09-04 09:21:44.933035332 -0400 @@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv struct nfs_fattr *); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); @@ -15792,7 +15792,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/i struct page *, struct page *); diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig --- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-08-31 20:42:05.500123860 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-09-04 09:21:44.933035332 -0400 @@ -79,10 +79,48 @@ config NFS_V4_1 depends on NFS_V4 && EXPERIMENTAL help @@ -15845,7 +15845,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kcon depends on NFS_FS=y && IP_PNP diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile --- linux-2.6.34.noarch/fs/nfs/Makefile.orig 
2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-08-31 20:42:05.501268752 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-09-04 09:21:44.934046035 -0400 @@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ @@ -15860,8 +15860,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Mak +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c ---- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-08-31 20:41:19.152180625 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-08-31 20:42:05.518232887 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-09-04 09:20:05.996242985 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-09-04 09:21:44.935035426 -0400 @@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, @@ -15871,8 +15871,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/n .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-31 20:42:05.519163219 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-08-31 20:42:05.520222923 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-09-04 09:21:44.936035595 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-09-04 09:21:44.936035595 -0400 @@ -0,0 +1,768 @@ +/* + * linux/fs/nfs/nfs4filelayout.c @@ -16643,8 +16643,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 
linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-08-31 20:42:05.521233147 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-08-31 20:42:05.521233147 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-09-04 09:21:44.937035580 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-09-04 09:21:44.937035580 -0400 @@ -0,0 +1,635 @@ +/* + * linux/fs/nfs/nfs4filelayoutdev.c @@ -17282,8 +17282,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch +} + diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h ---- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-31 20:42:05.520222923 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-08-31 20:42:05.520222923 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-09-04 09:21:44.938035519 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-09-04 09:21:44.938035519 -0400 @@ -0,0 +1,96 @@ +/* + * pnfs_nfs4filelayout.h @@ -17382,8 +17382,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h ---- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-31 20:41:19.154160465 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-08-31 20:42:05.519163219 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-09-04 09:20:05.998222938 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-09-04 09:21:44.939035693 -0400 @@ -45,8 +45,28 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, @@ -17532,8 +17532,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nf /* nfs4xdr.c */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c ---- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-08-31 
20:41:19.157140145 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-08-31 20:42:05.524099925 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-09-04 09:20:06.001202714 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-09-04 09:21:44.942015100 -0400 @@ -49,12 +49,14 @@ #include #include @@ -19198,7 +19198,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/n .setattr = nfs4_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c --- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-08-31 20:42:05.526213255 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-09-04 09:21:44.944045456 -0400 @@ -54,17 +54,17 @@ void nfs4_renew_state(struct work_struct *work) @@ -19221,8 +19221,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c ---- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-08-31 20:41:19.158078621 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-08-31 20:42:05.527232994 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-09-04 09:20:06.002213222 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-09-04 09:21:44.945035417 -0400 @@ -48,11 +48,13 @@ #include #include @@ -19545,8 +19545,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/ test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c ---- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-31 20:41:19.160150207 -0400 -+++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-08-31 20:42:05.530092192 -0400 +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-09-04 
09:20:06.004212730 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-09-04 09:21:44.948015074 -0400 @@ -50,8 +50,10 @@ #include #include @@ -21056,8 +21056,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nf }; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild ---- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-31 20:42:05.532213157 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-08-31 20:42:05.532213157 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-09-04 09:21:44.950025182 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-09-04 09:21:44.950025182 -0400 @@ -0,0 +1,11 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module @@ -21071,8 +21071,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs +panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o +obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-31 20:42:05.533243491 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-31 20:42:05.534105468 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-09-04 09:21:44.951035482 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-09-04 09:21:44.951035482 -0400 @@ -0,0 +1,1087 @@ +/* + * objio_osd.c @@ -22162,8 +22162,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noar +module_init(objlayout_init); +module_exit(objlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-31 20:42:05.535059115 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-08-31 20:42:05.535059115 -0400 +--- 
linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-09-04 09:21:44.952035857 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-09-04 09:21:44.952035857 -0400 @@ -0,0 +1,790 @@ +/* + * objlayout.c @@ -22956,8 +22956,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noar + .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, +}; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h ---- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-31 20:42:05.535059115 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-08-31 20:42:05.535059115 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-09-04 09:21:44.953025191 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-09-04 09:21:44.953025191 -0400 @@ -0,0 +1,171 @@ +/* + * objlayout.h @@ -23131,8 +23131,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noar + +#endif /* _OBJLAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-31 20:42:05.536110535 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-31 20:42:05.536110535 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-09-04 09:21:44.954045432 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-09-04 09:21:44.954045432 -0400 @@ -0,0 +1,734 @@ +/* + * panfs_shim.c @@ -23869,8 +23869,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noa +module_init(panlayout_init); +module_exit(panlayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h ---- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-31 20:42:05.537124598 -0400 -+++ 
linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-31 20:42:05.537124598 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-09-04 09:21:44.955035904 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-09-04 09:21:44.955035904 -0400 @@ -0,0 +1,482 @@ +/* + * panfs_shim.h @@ -24355,8 +24355,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noa + +#endif /* _PANLAYOUT_PANFS_SHIM_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c ---- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-31 20:42:05.538121971 -0400 -+++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-31 20:42:05.538121971 -0400 +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-09-04 09:21:44.956036011 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-09-04 09:21:44.956036011 -0400 @@ -0,0 +1,435 @@ +/* + * pnfs_osd_xdr.c @@ -24794,8 +24794,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6. 
+ return 0; +} diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c ---- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-08-31 20:41:19.162150222 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-08-31 20:42:05.539131687 -0400 +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-09-04 09:20:06.006202442 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-09-04 09:21:44.957035861 -0400 @@ -20,6 +20,7 @@ #include @@ -24918,8 +24918,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/p if (res == INT_MAX) goto out; diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c ---- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-08-31 20:42:05.541150301 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-08-31 20:42:05.541150301 -0400 +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-09-04 09:21:44.959025145 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-09-04 09:21:44.959025145 -0400 @@ -0,0 +1,2037 @@ +/* + * linux/fs/nfs/pnfs.c @@ -26959,8 +26959,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs. +} +EXPORT_SYMBOL(nfs4_put_deviceid_cache); diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h ---- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-08-31 20:42:05.542222767 -0400 -+++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-08-31 20:42:05.542222767 -0400 +--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-09-04 09:21:44.960025819 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-09-04 09:21:44.960025819 -0400 @@ -0,0 +1,354 @@ +/* + * fs/nfs/pnfs.h @@ -27317,8 +27317,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs. 
+ +#endif /* FS_NFS_PNFS_H */ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c ---- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-08-31 20:41:19.163155499 -0400 -+++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-08-31 20:42:05.543103394 -0400 +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-09-04 09:20:06.007232858 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-09-04 09:21:44.961035556 -0400 @@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru fattr = nfs_alloc_fattr(); status = -ENOMEM; @@ -27346,8 +27346,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc. .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c ---- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-08-31 20:41:19.164160482 -0400 -+++ linux-2.6.34.noarch/fs/nfs/read.c 2010-08-31 20:42:05.544233042 -0400 +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-09-04 09:20:06.008232903 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-09-04 09:21:44.962035703 -0400 @@ -18,8 +18,12 @@ #include #include @@ -27562,8 +27562,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read. 
nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c ---- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-08-31 20:41:19.165170508 -0400 -+++ linux-2.6.34.noarch/fs/nfs/super.c 2010-08-31 20:42:05.545114737 -0400 +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-09-04 09:20:06.009232934 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-09-04 09:21:44.963035469 -0400 @@ -64,6 +64,7 @@ #include "iostat.h" #include "internal.h" @@ -27611,8 +27611,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/supe #endif diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c ---- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-08-31 20:41:19.166151095 -0400 -+++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-08-31 20:42:05.546131839 -0400 +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-09-04 09:20:06.010203248 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-09-04 09:21:44.964036069 -0400 @@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task struct nfs_unlinkdata *data = calldata; struct nfs_server *server = NFS_SERVER(data->dir); @@ -27623,8 +27623,8 @@ diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unl return; rpc_call_start(task); diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c ---- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-08-31 20:41:17.273213379 -0400 -+++ linux-2.6.34.noarch/fs/nfs/write.c 2010-08-31 20:42:05.548212682 -0400 +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-09-04 09:20:04.513160311 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-09-04 09:21:44.966025174 -0400 @@ -20,6 +20,7 @@ #include #include @@ -28313,7 +28313,7 @@ diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/writ int nfs_wb_page_cancel(struct inode *inode, struct page *page) diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig 
linux-2.6.34.noarch/include/linux/exportfs.h --- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-08-31 20:42:05.577222704 -0400 ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-09-04 09:21:44.967035352 -0400 @@ -2,6 +2,7 @@ #define LINUX_EXPORTFS_H 1 @@ -28386,8 +28386,8 @@ diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/i +#endif /* CONFIG_PNFSD */ #endif /* LINUX_EXPORTFS_H */ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h ---- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-08-31 20:42:05.576053304 -0400 -+++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-08-31 20:42:05.576053304 -0400 +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-09-04 09:21:44.969025737 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-09-04 09:21:44.969025737 -0400 @@ -0,0 +1,141 @@ +#ifndef _LINUX_EXP_XDR_H +#define _LINUX_EXP_XDR_H @@ -28531,8 +28531,8 @@ diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/in +} +#endif /* _LINUX_EXP_XDR_H */ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h ---- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-08-31 20:41:19.120034834 -0400 -+++ linux-2.6.34.noarch/include/linux/fs.h 2010-08-31 20:42:05.579212604 -0400 +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-09-04 09:20:05.965243003 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-09-04 09:21:44.971015113 -0400 @@ -387,6 +387,7 @@ struct inodes_stat_t { #include @@ -28551,7 +28551,7 @@ diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include struct dentry *s_root; diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h --- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ 
linux-2.6.34.noarch/include/linux/nfs4.h 2010-08-31 20:42:05.581035627 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-09-04 09:21:44.973025301 -0400 @@ -17,7 +17,10 @@ #define NFS4_BITMAP_SIZE 2 @@ -28681,8 +28681,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/inclu #endif diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h ---- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-31 20:42:05.583087731 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-08-31 20:42:05.583087731 -0400 +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-09-04 09:21:44.974035325 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-09-04 09:21:44.974035325 -0400 @@ -0,0 +1,329 @@ +/* + * include/linux/nfs4_pnfs.h @@ -29014,8 +29014,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/ + +#endif /* LINUX_NFS4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h ---- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-08-31 20:42:05.596098115 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-08-31 20:42:05.596098115 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-09-04 09:21:44.976025566 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-09-04 09:21:44.976025566 -0400 @@ -0,0 +1,101 @@ +#ifndef NFSD4_BLOCK +#define NFSD4_BLOCK @@ -29119,8 +29119,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarc +#endif /* NFSD4_BLOCK */ + diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h ---- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-31 20:42:05.597097942 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-08-31 20:42:05.597097942 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 
2010-09-04 09:21:44.977035317 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-09-04 09:21:44.977035317 -0400 @@ -0,0 +1,345 @@ +/* + * include/linux/nfsd4_spnfs.h @@ -29469,7 +29469,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarc +#endif /* NFS_SPNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h --- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-08-31 20:42:05.591097762 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-09-04 09:21:44.978015841 -0400 @@ -29,6 +29,7 @@ #ifdef __KERNEL__ @@ -29480,7 +29480,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch * Largest number of bytes we need to allocate for an NFS diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h --- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-08-31 20:42:05.591097762 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-09-04 09:21:44.978015841 -0400 @@ -32,6 +32,8 @@ #define NFSDDBG_REPCACHE 0x0080 #define NFSDDBG_XDR 0x0100 @@ -29492,7 +29492,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h --- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-08-31 20:42:05.592118086 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-09-04 09:21:44.979055116 -0400 @@ -100,6 +100,7 @@ struct svc_export { uid_t ex_anon_uid; gid_t ex_anon_gid; @@ -29502,8 +29502,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 
linux-2.6.34.noarc struct nfsd4_fs_locations ex_fslocs; int ex_nflavors; diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-31 20:42:05.592118086 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-31 20:42:05.592118086 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-09-04 09:21:44.979055116 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-09-04 09:21:44.980035474 -0400 @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29638,8 +29638,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.3 + +#endif /* NFSD_NFS4LAYOUTXDR_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-31 20:42:05.593020723 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-31 20:42:05.593020723 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-09-04 09:21:44.980035474 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-09-04 09:21:44.980035474 -0400 @@ -0,0 +1,54 @@ +/****************************************************************************** + * @@ -29696,8 +29696,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34. 
+ +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h ---- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-31 20:42:05.594107962 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-31 20:42:05.594107962 -0400 +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-09-04 09:21:44.981055721 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-09-04 09:21:44.981055721 -0400 @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. @@ -29972,7 +29972,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.n +#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h --- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-08-31 20:42:05.594107962 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-09-04 09:21:44.982035422 -0400 @@ -29,6 +29,7 @@ /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ @@ -30010,8 +30010,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noar union nfsctl_res { diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h ---- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-08-31 20:41:19.168160480 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-08-31 20:42:05.584098019 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-09-04 09:20:06.012232950 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-09-04 09:21:44.983045467 -0400 @@ -72,13 +72,20 @@ struct nfs_access_entry { int mask; }; @@ -30111,8 +30111,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 
linux-2.6.34.noarch/inc #ifdef __KERNEL__ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h ---- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-31 20:41:19.168160480 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-08-31 20:42:05.586087719 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-09-04 09:20:06.012232950 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-09-04 09:21:44.985025570 -0400 @@ -15,6 +15,7 @@ struct nlm_host; struct nfs4_sequence_args; struct nfs4_sequence_res; @@ -30187,7 +30187,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/ atomic_t active; /* Keep trace of any activity to this server */ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h --- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-08-31 20:42:05.587097913 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-09-04 09:21:44.986035288 -0400 @@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { NFSIOS_SHORTREAD, NFSIOS_SHORTWRITE, @@ -30200,7 +30200,7 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h --- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-08-31 20:42:05.588097898 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-09-04 09:21:44.987025532 -0400 @@ -39,6 +39,7 @@ struct nfs_page { struct list_head wb_list; /* Defines state of page: */ struct page *wb_page; /* page to read in/write out */ @@ -30249,8 +30249,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/i struct inode *inode, int (*doio)(struct inode *, 
struct list_head *, unsigned int, size_t, int), diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h ---- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-08-31 20:41:19.169171911 -0400 -+++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-08-31 20:42:05.590087729 -0400 +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-09-04 09:20:06.013233555 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-09-04 09:21:44.989035583 -0400 @@ -3,6 +3,8 @@ #include @@ -30528,8 +30528,8 @@ diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/in extern struct rpc_version nfs_version3; extern struct rpc_version nfs_version4; diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h ---- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-08-31 20:42:05.598087997 -0400 -+++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-08-31 20:42:05.599087710 -0400 +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-09-04 09:21:44.990025422 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-09-04 09:21:44.991025218 -0400 @@ -0,0 +1,57 @@ +#ifndef _PANFS_SHIM_API_H +#define _PANFS_SHIM_API_H @@ -30589,8 +30589,8 @@ diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.no + +#endif /* _PANFS_SHIM_API_H */ diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h ---- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-31 20:42:05.600025088 -0400 -+++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-08-31 20:42:05.600025088 -0400 +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-09-04 09:21:44.992035338 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-09-04 09:21:44.992035338 -0400 @@ -0,0 +1,439 @@ +/* + * pnfs_osd_xdr.h @@ -31033,7 +31033,7 @@ diff -up 
linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noar +#endif /* __PNFS_OSD_XDR_H__ */ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h --- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-08-31 20:42:05.601087875 -0400 ++++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-09-04 09:21:44.993025468 -0400 @@ -8,6 +8,7 @@ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H @@ -31044,7 +31044,7 @@ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/ #define ACL_UNDEFINED_ID (-1) diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h --- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-08-31 20:42:05.602100892 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-09-04 09:21:44.994025129 -0400 @@ -14,6 +14,8 @@ /* size of an XDR encoding unit in bytes, i.e. 
32bit */ #define XDR_UNIT (4) @@ -31056,7 +31056,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.n diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h --- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-31 20:42:05.603108001 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-09-04 09:21:44.995045529 -0400 @@ -3,6 +3,7 @@ #ifdef __KERNEL__ @@ -31077,8 +31077,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.3 struct rpc_pipe_ops { diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h ---- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-31 20:42:05.603108001 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-31 20:42:05.603108001 -0400 +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-09-04 09:21:44.995045529 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-09-04 09:21:44.995045529 -0400 @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2008 The Regents of the University of Michigan. 
@@ -31193,7 +31193,7 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux +#endif /* _SIMPLE_RPC_PIPEFS_H_ */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h --- linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-31 20:42:05.604049784 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h 2010-09-04 09:21:44.996061803 -0400 @@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con return buf; @@ -31237,8 +31237,8 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.n +} #endif /* SUNRPC_SVC_XPRT_H */ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h ---- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-31 20:41:19.173118431 -0400 -+++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-08-31 20:42:05.605107904 -0400 +--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig 2010-09-04 09:20:06.017243774 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h 2010-09-04 09:21:44.997045653 -0400 @@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) return p + 2; } @@ -31261,14 +31261,9 @@ diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); -diff -up linux-2.6.34.noarch/localversion-pnfs.orig linux-2.6.34.noarch/localversion-pnfs ---- linux-2.6.34.noarch/localversion-pnfs.orig 2010-08-31 20:42:05.605107904 -0400 -+++ linux-2.6.34.noarch/localversion-pnfs 2010-08-31 20:42:05.605107904 -0400 -@@ -0,0 +1 @@ -+-pnfs diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile --- 
linux-2.6.34.noarch/net/sunrpc/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-08-31 20:42:05.606020148 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/Makefile 2010-09-04 09:21:44.998058968 -0400 @@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ @@ -31279,8 +31274,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/su sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c ---- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-31 20:42:05.606020148 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-31 20:42:05.607108065 -0400 +--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-09-04 09:21:44.999045582 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-09-04 09:21:44.999045582 -0400 @@ -0,0 +1,424 @@ +/* + * net/sunrpc/simple_rpc_pipefs.c @@ -31707,8 +31702,8 @@ diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.no +} +EXPORT_SYMBOL(pipefs_generic_destroy_msg); diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c ---- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-08-31 20:41:19.188144022 -0400 -+++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-08-31 20:42:05.607108065 -0400 +--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig 2010-09-04 09:20:06.031222775 -0400 ++++ linux-2.6.34.noarch/net/sunrpc/xdr.c 2010-09-04 09:21:45.000045387 -0400 @@ -395,24 +395,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf, { struct kvec *tail; From 9e6bd6253b21f27fbc50f0788b1dd4a32ecae976 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Thu, 30 Sep 2010 10:58:43 -0400 Subject: [PATCH 20/20] - Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-09-14 Signed-off-by: Steve Dickson --- 
kernel.spec | 9 +- pnfs-all-2.6.35-2010-09-14-f13.patch | 31775 +++++++++++++++++++++++++ 2 files changed, 31781 insertions(+), 3 deletions(-) create mode 100644 pnfs-all-2.6.35-2010-09-14-f13.patch diff --git a/kernel.spec b/kernel.spec index 2eabb4dab..da80a8709 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) # -%define buildid .pnfs34.2010.08.24 +%define buildid .pnfs35.2010.09.14 ################################################################### # The buildid can also be specified on the rpmbuild command line @@ -818,7 +818,7 @@ Patch12581: xen-use-percpu-interrupts-for-ipis-and-virqs.patch Patch30000: nfs-35-fc.patch Patch30001: nfsd-35-fc.patch -Patch30002: pnfs-all-2.6.35-2010-08-24-f13.patch +Patch30002: pnfs-all-2.6.35-2010-09-14-f13.patch Patch30003: linux-2.6-pnfs-compile.patch Patch30004: linux-2.6.35-inline.patch @@ -1551,7 +1551,7 @@ ApplyPatch xen-use-percpu-interrupts-for-ipis-and-virqs.patch ApplyPatch nfs-35-fc.patch ApplyPatch nfsd-35-fc.patch -ApplyPatch pnfs-all-2.6.35-2010-08-24-f13.patch +ApplyPatch pnfs-all-2.6.35-2010-09-14-f13.patch ApplyPatch linux-2.6-pnfs-compile.patch ApplyPatch linux-2.6.35-inline.patch # END OF PATCH APPLICATIONS @@ -2175,6 +2175,9 @@ fi %changelog +* Thu Sep 30 2010 Steve Dickson +- Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-09-14 + * Mon Sep 27 2010 Ben Skeggs 2.6.34.7-58 - nouveau: better handling of certain GPU errors diff --git a/pnfs-all-2.6.35-2010-09-14-f13.patch b/pnfs-all-2.6.35-2010-09-14-f13.patch new file mode 100644 index 000000000..2d6f9a09b --- /dev/null +++ b/pnfs-all-2.6.35-2010-09-14-f13.patch @@ -0,0 +1,31775 @@ +diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c +--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig 2010-09-30 10:14:57.591122000 -0400 ++++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c 2010-09-30 10:17:08.383984000 -0400 
+@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include "init.h" + #include "kern_constants.h" + #include "os.h" +diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c +--- linux-2.6.34.noarch/block/genhd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/block/genhd.c 2010-09-30 10:17:08.410985000 -0400 +@@ -1009,6 +1009,7 @@ static void disk_release(struct device * + struct class block_class = { + .name = "block", + }; ++EXPORT_SYMBOL(block_class); + + static char *block_devnode(struct device *dev, mode_t *mode) + { +diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt +--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig 2010-09-30 10:17:08.376984000 -0400 ++++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt 2010-09-30 10:17:08.378989000 -0400 +@@ -0,0 +1,211 @@ ++(c) 2007 Network Appliance Inc. ++ ++spNFS ++----- ++ ++An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS). ++ ++A file system is mounted by the clients from the MDS, and all file data ++is striped across the DSs. ++ ++Identify the machines that will be filling each of these roles. ++ ++The spnfs kernel will be installed on all machines: clients, the MDS and DSs. ++ ++ ++Building and installing the spNFS kernel ++---------------------------------------- ++ ++Get the spNFS kernel from: ++ ++ git://linux-nfs.org/~bhalevy/linux-pnfs.git ++ ++Use the pnfs-all-latest branch and add these options to your .config file ++ ++ CONFIG_NETWORK_FILESYSTEMS=y ++ CONFIG_NFS_FS=m ++ CONFIG_NFS_V4=y ++ CONFIG_NFS_V4_1=y ++ CONFIG_PNFS=y ++ CONFIG_NFSD=m ++ CONFIG_PNFSD=y ++ # CONFIG_PNFSD_LOCAL_EXPORT is not set ++ CONFIG_SPNFS=y ++ ++By default, spNFS uses whole-file layouts. Layout segments can be enabled ++by adding: ++ ++ CONFIG_SPNFS_LAYOUTSEGMENTS=y ++ ++to your .config file. 
++ ++Building and installation of kernel+modules is as usual. ++This kernel should be installed and booted on the client, MDS and DSs. ++ ++Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it ++takes over the pnfs export interface. ++ ++Building nfs-utils ++------------------ ++ ++Get the nfs-utils package containing spnfsd from: ++ ++ git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git ++ ++Follow the standard instructions for building nfs-utils. ++ ++After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd ++daemon will only be needed on the MDS. ++ ++ ++Installation ++------------ ++ ++The nfs-utils package contains a default spnfsd.conf file in ++utils/spnfsd/spnfsd.conf. Copy this file to /etc/spnfsd.conf. ++ ++By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under ++this directory, mount points must be created for each DS to ++be used for pNFS data stripes. These mount points are named by the IP address ++of the corresponding DS. In the sample spnfsd.conf, there are two ++DSs defined (172.16.28.134 and 172.16.28.141). ++ ++Following the sample spnfsd.conf, ++ ++ mkdir /spnfs ++ ++on the MDS (corresponding to DS-Mount-Directory). Then ++ ++ mkdir /spnfs/172.16.28.134 ++ mkdir /spnfs/172.16.28.141 ++ ++to create the mount points for the DSs. ++ ++On the DSs, choose a directory where data stripes will be created by the MDS. ++For the sample file, this directory is /pnfs, so on each DS execute: ++ ++ mkdir /pnfs ++ ++This directory is specified in the spnfsd.conf file by the DS*_ROOT option ++(where * is replaced by the DS number). DS_ROOT is specified relative to ++the directory being exported by the DSs. In our example, our DSs are exporting ++the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have ++the following entry in /etc/exports: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check) ++ ++N.B.
If we had created a /exports directory and a /pnfs directory under ++/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs ++(not /exports/pnfs). ++ ++It may be useful to add entries to /etc/fstab on the MDS to automatically ++mount the DS_ROOT file systems. For this example, our MDS fstab would ++contain: ++ ++ 172.16.28.134:/pnfs /spnfs/172.16.28.134 nfs defaults 1 2 ++ 172.16.28.141:/pnfs /spnfs/172.16.28.141 nfs defaults 1 2 ++ ++The DS mounts must be performed manually or via fstab at this time (automatic ++mounting, directory creation, etc. are on the todo list). To perform I/O ++through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction ++will eventually be removed). ++ ++ ++On the MDS, choose a file system to use with spNFS and export it, e.g.: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs) ++ ++Make sure nfsd and all supporting processes are running on the MDS and DSs. ++ ++ ++Running ++------- ++ ++If rpc_pipefs is not already mounted (if you're running idmapd it probably is), ++you may want to add the following line to /etc/fstab: ++ ++ rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0 ++ ++to automatically mount rpc_pipefs. ++ ++With spnfsd.conf configured for your environment and the mounts mounted as ++described above, spnfsd can now be started. ++ ++On the MDS, execute spnfsd: ++ ++ spnfsd ++ ++The executable is located in the directory where it was built, and ++may also have been installed elsewhere depending on how you built nfs-utils. ++It will run in the foreground by default, and in fact will do so despite ++any options suggesting the contrary (it's still a debugging build). ++ ++On the client, make sure the nfslayoutdriver module is loaded: ++ ++ modprobe nfslayoutdriver ++ ++Then mount the file system from the MDS: ++ ++ mount -t nfs4 -o minorversion=1 mds:/ /mnt ++ ++I/O through the MDS is now supported.
To use it, do not load the ++nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1 ++(NFSv2 and v3 are not yet supported). ++ ++You may now use spNFS by performing file system activities in /mnt. ++If you create files in /mnt, you should see stripe files corresponding to ++new files being created on the DSs. The current implementation names the ++stripe files based on the inode number of the file on the MDS. For example, ++if you create a file foo in /mnt and do an 'ls -li /mnt/foo': ++ ++ # ls -li foo ++ 1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo ++ ++You should see stripe files on each DS under /pnfs (per the sample) named ++1233. The file /pnfs/1233 on DS1 will contain the first stripe-size bytes ++of data written to foo, DS2 will contain the next stripe-size bytes, etc. ++Removing /mnt/foo will remove the corresponding stripe files on the DSs. ++Other file system operations should behave (mostly :-) as expected. ++ ++ ++Layout Segments ++--------------- ++ ++If the kernel is compiled to support layout segments, there will ++be two files created under /proc/fs/spnfs for controlling layout ++segment functionality.
++ ++To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.: ++ ++ echo 1 > /proc/fs/spnfs/layoutseg ++ ++Layout segments can be disabled (returning to whole-file layouts) by ++writing a '0' to /proc/fs/spnfs/layoutseg: ++ ++ echo 0 > /proc/fs/spnfs/layoutseg ++ ++When layout segments are enabled, the size of the layouts returned can ++be specified by writing a decimal number (ascii representation) to ++/proc/fs/spnfs/layoutsegsize: ++ ++ echo 1024 > /proc/fs/spnfs/layoutsegsize ++ ++The value'0' has a special meaning--it causes the server to return a ++layout that is exactly the size requested by the client: ++ ++ echo 0 > /proc/fs/spnfs/layoutsegsize ++ ++ ++Troubleshooting ++--------------- ++ ++If you see data being written to the files on the MDS rather than ++the stripe files, make sure the nfslayoutdriver is loaded on the client ++(see above). ++ ++If you get a "permission denied" error, make sure mountd is running on the mds ++(it occasionally fails to start). ++ ++Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com ++ ++ +diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c +--- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig 2010-09-30 10:15:01.214222000 -0400 ++++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c 2010-09-30 10:17:08.417985000 -0400 +@@ -657,6 +657,12 @@ static int dev_create(struct dm_ioctl *p + return r; + } + ++int dm_dev_create(struct dm_ioctl *param) ++{ ++ return dev_create(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_create); ++ + /* + * Always use UUID for lookups if it's present, otherwise use name or dev. + */ +@@ -751,6 +757,12 @@ static int dev_remove(struct dm_ioctl *p + return 0; + } + ++int dm_dev_remove(struct dm_ioctl *param) ++{ ++ return dev_remove(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_remove); ++ + /* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. 
+@@ -923,6 +935,12 @@ static int do_resume(struct dm_ioctl *pa + return r; + } + ++int dm_do_resume(struct dm_ioctl *param) ++{ ++ return do_resume(param); ++} ++EXPORT_SYMBOL(dm_do_resume); ++ + /* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. +@@ -1200,6 +1218,12 @@ out: + return r; + } + ++int dm_table_load(struct dm_ioctl *param, size_t param_size) ++{ ++ return table_load(param, param_size); ++} ++EXPORT_SYMBOL(dm_table_load); ++ + static int table_clear(struct dm_ioctl *param, size_t param_size) + { + int r; +diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c +--- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/drivers/scsi/hosts.c 2010-09-30 10:17:08.422988000 -0400 +@@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct + put_device(&class_to_shost(dev)->shost_gendev); + } + +-static struct class shost_class = { ++struct class shost_class = { + .name = "scsi_host", + .dev_release = scsi_host_cls_release, + }; +diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h +--- linux-2.6.34.noarch/fs/exofs/exofs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/exofs.h 2010-09-30 10:17:08.444986000 -0400 +@@ -36,13 +36,9 @@ + #include + #include + #include ++#include + #include "common.h" + +-/* FIXME: Remove once pnfs hits mainline +- * #include +- */ +-#include "pnfs.h" +- + #define EXOFS_ERR(fmt, a...) 
printk(KERN_ERR "exofs: " fmt, ##a) + + #ifdef CONFIG_EXOFS_DEBUG +@@ -103,6 +99,7 @@ struct exofs_sb_info { + struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ ++ spinlock_t i_layout_lock; /* lock for layout/return/recall */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ +@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si + */ + #define OBJ_2BCREATED 0 /* object will be created soon*/ + #define OBJ_CREATED 1 /* object has been created on the osd*/ ++/* Below are not used atomic but reuse the same i_flags */ ++#define OBJ_LAYOUT_IS_GIVEN 2 /* inode has given layouts to clients*/ ++#define OBJ_IN_LAYOUT_RECALL 3 /* inode is in the middle of a layout recall*/ + + static inline int obj_2bcreated(struct exofs_i_info *oi) + { +@@ -304,4 +304,20 @@ extern const struct inode_operations exo + extern const struct inode_operations exofs_symlink_inode_operations; + extern const struct inode_operations exofs_fast_symlink_inode_operations; + ++/* export.c */ ++typedef int (exofs_recall_fn)(struct inode *inode); ++#ifdef CONFIG_PNFSD ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo); ++void exofs_init_export(struct super_block *sb); ++#else ++static inline int exofs_inode_recall_layout(struct inode *inode, ++ enum pnfs_iomode iomode, exofs_recall_fn todo) ++{ ++ return todo(inode); ++} ++ ++static inline void exofs_init_export(struct super_block *sb) {} ++#endif ++ + #endif +diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c +--- linux-2.6.34.noarch/fs/exofs/export.c.orig 2010-09-30 10:17:08.447987000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/export.c 2010-09-30 10:17:08.449986000 -0400 +@@ -0,0 +1,396 @@ ++/* ++ * export.c - Implementation of the 
pnfs_export_operations ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Boaz Harrosh ++ * ++ * This file is part of exofs. ++ * ++ * exofs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation. Since it is based on ext2, and the only ++ * valid version of GPL for the Linux kernel is version 2, the only valid ++ * version of GPL for exofs is version 2. ++ * ++ * exofs is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with exofs; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include "exofs.h" ++ ++static int exofs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_OSD2_OBJECTS; ++} ++ ++static void set_dev_id(struct pnfs_deviceid *pnfs_devid, u64 sbid, u64 devid) ++{ ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)pnfs_devid; ++ ++ dev_id->sbid = sbid; ++ dev_id->devid = devid; ++} ++ ++static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode, ++ u64 offset, u64 length, void *cookie) ++{ ++ struct nfsd4_pnfs_cb_layout cbl; ++ struct pnfsd_cb_ctl cb_ctl; ++ int status; ++ ++ memset(&cb_ctl, 0, sizeof(cb_ctl)); ++ status = pnfsd_get_cb_op(&cb_ctl); ++ if (unlikely(status)) { ++ EXOFS_ERR("%s: nfsd unloaded!! 
inode (0x%lx) status=%d\n", ++ __func__, inode->i_ino, status); ++ goto err; ++ } ++ ++ memset(&cbl, 0, sizeof(cbl)); ++ cbl.cbl_recall_type = RETURN_FILE; ++ cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS; ++ cbl.cbl_seg.iomode = iomode; ++ cbl.cbl_seg.offset = offset; ++ cbl.cbl_seg.length = length; ++ cbl.cbl_cookie = cookie; ++ ++ status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl); ++ pnfsd_put_cb_op(&cb_ctl); ++ ++err: ++ return status; ++} ++ ++static enum nfsstat4 exofs_layout_get( ++ struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; ++ struct exofs_layout *el = &sbi->layout; ++ struct pnfs_osd_object_cred *creds = NULL; ++ struct pnfs_osd_layout layout; ++ __be32 *start; ++ bool in_recall; ++ int i, err; ++ enum nfsstat4 nfserr; ++ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ res->lg_seg.iomode = IOMODE_RW; ++ res->lg_return_on_close = true; /* TODO: unused but will be soon */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL); ++ if (!creds) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto out; ++ } ++ ++ /* Fill in a pnfs_osd_layout struct */ ++ layout.olo_map = sbi->data_map; ++ ++ for (i = 0; i < el->s_numdevs; i++) { ++ struct pnfs_osd_object_cred *cred = &creds[i]; ++ osd_id id = exofs_oi_objno(oi); ++ unsigned dev = exofs_layout_od_id(el, id, i); ++ ++ set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid, ++ dev); ++ cred->oc_object_id.oid_partition_id = el->s_pid; ++ cred->oc_object_id.oid_object_id = id; ++ cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ? 
++ PNFS_OSD_VERSION_1 : ++ PNFS_OSD_VERSION_2; ++ cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE; ++ ++ cred->oc_cap_key.cred_len = 0; ++ cred->oc_cap_key.cred = NULL; ++ ++ cred->oc_cap.cred_len = OSD_CAP_LEN; ++ cred->oc_cap.cred = oi->i_cred; ++ } ++ ++ layout.olo_comps_index = 0; ++ layout.olo_num_comps = el->s_numdevs; ++ layout.olo_comps = creds; ++ ++ err = pnfs_osd_xdr_encode_layout(xdr, &layout); ++ if (err) { ++ nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */ ++ goto out; ++ } ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ if (!in_recall) { ++ __set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ nfserr = NFS4_OK; ++ } else { ++ nfserr = NFS4ERR_RECALLCONFLICT; ++ } ++ spin_unlock(&oi->i_layout_lock); ++ ++out: ++ kfree(creds); ++ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n", ++ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start)); ++ return nfserr; ++} ++ ++/* NOTE: inode mutex must NOT be held */ ++static int exofs_layout_commit( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct timespec mtime; ++ loff_t i_size; ++ int in_recall; ++ ++ /* In case of a recall we ignore the new size and mtime since they ++ * are going to be changed again by truncate, and since we cannot take ++ * the inode lock in that case. ++ */ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ if (in_recall) { ++ EXOFS_DBGMSG("(0x%lx) commit was called during recall\n", ++ inode->i_ino); ++ return 0; ++ } ++ ++ /* NOTE: I would love to call inode_setattr here ++ * but i cannot since this will cause an eventual vmtruncate, ++ * which will cause a layout_recall. So open code the i_size ++ * and mtime/atime changes under i_mutex. 
++ */ ++ mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL); ++ ++ if (args->lc_mtime.seconds) { ++ mtime.tv_sec = args->lc_mtime.seconds; ++ mtime.tv_nsec = args->lc_mtime.nseconds; ++ ++ /* layout commit may only make time bigger, since there might ++ * be reordering of the notifications and it might arrive after ++ * A local change. ++ * TODO: if mtime > ctime then we know set_attr did an mtime ++ * in the future. and we can let this update through ++ */ ++ if (0 <= timespec_compare(&mtime, &inode->i_mtime)) ++ mtime = inode->i_mtime; ++ } else { ++ mtime = current_fs_time(inode->i_sb); ++ } ++ ++ /* TODO: Will below work? since mark_inode_dirty has it's own ++ * Time handling ++ */ ++ inode->i_atime = inode->i_mtime = mtime; ++ ++ i_size = i_size_read(inode); ++ if (args->lc_newoffset) { ++ loff_t new_size = args->lc_last_wr + 1; ++ ++ if (i_size < new_size) { ++ i_size_write(inode, i_size = new_size); ++ res->lc_size_chg = 1; ++ res->lc_newsize = new_size; ++ } ++ } ++ /* TODO: else { i_size = osd_get_object_length() } */ ++ ++/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */ ++ ++ mark_inode_dirty_sync(inode); ++ ++ mutex_unlock(&inode->i_mutex); ++ EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n", ++ inode->i_ino, i_size, args->lc_last_wr); ++ return 0; ++} ++ ++static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr) ++{ ++ EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ ioerr->oer_errno, ioerr->oer_iswrite, ++ _LLU(ioerr->oer_component.oid_object_id), ++ _LLU(ioerr->oer_comp_offset), ++ _LLU(ioerr->oer_comp_length)); ++} ++ ++static int exofs_layout_return( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ __be32 *p = args->lrf_body; ++ unsigned len = exp_xdr_qwords(args->lrf_body_len); ++ ++ EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n", ++ inode->i_ino, args->lr_cookie, len); ++ ++ while (len >= pnfs_osd_ioerr_xdr_sz()) { ++ 
struct pnfs_osd_ioerr ioerr; ++ ++ p = pnfs_osd_xdr_decode_ioerr(&ioerr, p); ++ len -= pnfs_osd_ioerr_xdr_sz(); ++ exofs_handle_error(&ioerr); ++ } ++ ++ if (args->lr_cookie) { ++ struct exofs_i_info *oi = exofs_i(inode); ++ bool in_recall; ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ __clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ /* TODO: how to communicate cookie with the waiter */ ++ if (in_recall) ++ wake_up(&oi->i_wq); /* wakeup any recalls */ ++ } ++ ++ return 0; ++} ++ ++int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct exofs_sb_info *sbi = sb->s_fs_info; ++ struct pnfs_osd_deviceaddr devaddr; ++ const struct osd_dev_info *odi; ++ u64 devno = devid->devid; ++ __be32 *start; ++ int err; ++ ++ memset(&devaddr, 0, sizeof(devaddr)); ++ ++ if (unlikely(devno >= sbi->layout.s_numdevs)) ++ return -ENODEV; ++ ++ odi = osduld_device_info(sbi->layout.s_ods[devno]); ++ ++ devaddr.oda_systemid.len = odi->systemid_len; ++ devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */ ++ ++ devaddr.oda_osdname.len = odi->osdname_len ; ++ devaddr.oda_osdname.data = (void *)odi->osdname;/* !const cast */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ err = -E2BIG; ++ goto err; ++ } ++ ++ err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr); ++ if (err) ++ goto err; ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n", ++ exp_xdr_qbytes(xdr->p - start), devno, odi->osdname); ++ return 0; ++ ++err: ++ EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n", ++ err, exp_xdr_qbytes(xdr->p - start)); ++ return err; ++} ++ ++struct pnfs_export_operations exofs_pnfs_ops = { ++ .layout_type = exofs_layout_type, ++ .layout_get = exofs_layout_get, ++ .layout_commit = 
exofs_layout_commit, ++ .layout_return = exofs_layout_return, ++ .get_device_info = exofs_get_device_info, ++}; ++ ++static bool is_layout_returned(struct exofs_i_info *oi) ++{ ++ bool layout_given; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ return !layout_given; ++} ++ ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ int layout_given; ++ int error = 0; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ __set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ if (!layout_given) ++ goto exec; ++ ++ for (;;) { ++ EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n", ++ inode->i_ino); ++ error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64, ++ &oi->i_wq); ++ switch (error) { ++ case 0: ++ case -EAGAIN: ++ break; ++ case -ENOENT: ++ goto exec; ++ default: ++ goto err; ++ } ++ ++ error = wait_event_interruptible(oi->i_wq, ++ is_layout_returned(oi)); ++ if (error) ++ goto err; ++ } ++ ++exec: ++ error = todo(inode); ++ ++err: ++ spin_lock(&oi->i_layout_lock); ++ __clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error); ++ return error; ++} ++ ++void exofs_init_export(struct super_block *sb) ++{ ++ sb->s_pnfs_op = &exofs_pnfs_ops; ++} +diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c +--- linux-2.6.34.noarch/fs/exofs/inode.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/inode.c 2010-09-30 10:17:08.454986000 -0400 +@@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode) + if (unlikely(wait_obj_created(oi))) + goto fail; + +- ret = _do_truncate(inode); ++ ret = exofs_inode_recall_layout(inode, IOMODE_ANY, _do_truncate); + if 
(ret) + goto fail; + +@@ -964,6 +964,7 @@ static void __oi_init(struct exofs_i_inf + { + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; ++ spin_lock_init(&oi->i_layout_lock); + } + /* + * Fill in an inode read from the OSD and set it up for use +diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild +--- linux-2.6.34.noarch/fs/exofs/Kbuild.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kbuild 2010-09-30 10:17:08.434986000 -0400 +@@ -13,4 +13,5 @@ + # + + exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o ++exofs-$(CONFIG_PNFSD) += export.o + obj-$(CONFIG_EXOFS_FS) += exofs.o +diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig +--- linux-2.6.34.noarch/fs/exofs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/Kconfig 2010-09-30 10:17:08.438994000 -0400 +@@ -1,6 +1,7 @@ + config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD ++ select EXPORTFS_OSD_LAYOUT if PNFSD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. 
+diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c +--- linux-2.6.34.noarch/fs/exofs/super.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exofs/super.c 2010-09-30 10:17:08.465986000 -0400 +@@ -621,6 +621,7 @@ static int exofs_fill_super(struct super + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; ++ exofs_init_export(sb); + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); +diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c +--- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/expfs.c 2010-09-30 10:17:08.489990000 -0400 +@@ -16,6 +16,13 @@ + #include + #include + ++#if defined(CONFIG_PNFSD) ++struct pnfsd_cb_ctl pnfsd_cb_ctl = { ++ .lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock) ++}; ++EXPORT_SYMBOL(pnfsd_cb_ctl); ++#endif /* CONFIG_PNFSD */ ++ + #define dprintk(fmt, args...) 
do{}while(0) + + +diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile +--- linux-2.6.34.noarch/fs/exportfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/Makefile 2010-09-30 10:17:08.484990000 -0400 +@@ -3,4 +3,7 @@ + + obj-$(CONFIG_EXPORTFS) += exportfs.o + +-exportfs-objs := expfs.o ++exportfs-y := expfs.o ++exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o ++exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o ++exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-09-30 10:17:08.492991000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-09-30 10:17:08.494987000 -0400 +@@ -0,0 +1,158 @@ ++/* ++ * linux/fs/nfsd/nfs4blocklayoutxdr.c ++ * ++ * ++ * Created by Rick McNeal on 3/31/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. 
++ * ++ */ ++#include ++#include ++#include ++#include ++ ++static int ++bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 12 + 4 + bld->u.simple.bld_sig_len); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u32(p, 1); ++ p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset); ++ exp_xdr_encode_opaque(p, bld->u.simple.bld_sig, ++ bld->u.simple.bld_sig_len); ++ ++ return 0; ++} ++ ++static int ++bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_start); ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_len); ++ exp_xdr_encode_u32(p, bld->u.slice.bld_index); ++ ++ return 0; ++} ++ ++static int ++bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ return -ENOTSUPP; ++} ++ ++static int ++bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ int i; ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 2 + 1 + bld->u.stripe.bld_stripes); ++ ++ p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size); ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes); ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]); ++ ++ return 0; ++} ++ ++int ++blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes) ++{ ++ u32 num_vols = 0, ++ *layoutlen_p = xdr->p; ++ pnfs_blocklayout_devinfo_t *bld; ++ int status = 0; ++ __be32 *p; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -ETOOSMALL; ++ p += 2; ++ ++ /* ++ * All simple volumes with their signature are required to be listed ++ * first. 
++ */ ++ list_for_each_entry(bld, volumes, bld_list) { ++ num_vols++; ++ p = exp_xdr_reserve_qwords(xdr, 1); ++ if (!p) ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u32(p, bld->bld_type); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ status = bl_encode_simple(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_SLICE: ++ status = bl_encode_slice(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ status = bl_encode_concat(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ status = bl_encode_stripe(xdr, bld); ++ break; ++ default: ++ BUG(); ++ } ++ if (status) ++ goto error; ++ } ++ ++ /* ---- Fill in the overall length and number of volumes ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, num_vols); ++ ++error: ++ return status; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo); ++ ++enum nfsstat4 ++blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *bl_head) ++{ ++ struct pnfs_blocklayout_layout *b; ++ u32 *layoutlen_p = xdr->p, ++ extents = 0; ++ __be32 *p; ++ ++ /* ++ * Save spot for opaque block layout length and number of extents, ++ * fill-in later. 
++ */ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p += 2; ++ ++ list_for_each_entry(b, bl_head, bll_list) { ++ extents++; ++ p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid); ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.devid); ++ p = exp_xdr_encode_u64(p, b->bll_foff); ++ p = exp_xdr_encode_u64(p, b->bll_len); ++ p = exp_xdr_encode_u64(p, b->bll_soff); ++ p = exp_xdr_encode_u32(p, b->bll_es); ++ } ++ ++ /* ---- Fill in the overall length and number of extents ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, extents); ++ ++ return NFS4_OK; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_layout); +diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c +--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-09-30 10:17:08.496992000 -0400 ++++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-09-30 10:17:08.498993000 -0400 +@@ -0,0 +1,218 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++/* We do our-own dprintk so filesystems are not dependent on sunrpc */ ++#ifdef dprintk ++#undef dprintk ++#endif ++#define dprintk(fmt, args, ...) do { } while (0) ++ ++/* Calculate the XDR length of the GETDEVICEINFO4resok structure ++ * excluding the gdir_notification and the gdir_device_addr da_layout_type. ++ */ ++static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev) ++{ ++ struct pnfs_filelayout_devaddr *fl_addr; ++ struct pnfs_filelayout_multipath *mp; ++ int i, j, nwords; ++ ++ /* da_addr_body length, indice length, indices, ++ * multipath_list4 length */ ++ nwords = 1 + 1 + fdev->fl_stripeindices_length + 1; ++ for (i = 0; i < fdev->fl_device_length; i++) { ++ mp = &fdev->fl_device_list[i]; ++ nwords++; /* multipath list length */ ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ fl_addr = mp->fl_multipath_list; ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len); ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len); ++ } ++ } ++ dprintk("<-- %s nwords %d\n", __func__, nwords); ++ return nwords; ++} ++ ++/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13 ++ * on the response stream. 
++ * Use linux error codes (not nfs) since these values are being
++ * returned to the file system.
++ *
++ * The whole body is reserved in one go, sized by fl_devinfo_xdr_words().
++ * The leading length word is skipped and backfilled once everything else
++ * has been encoded.
++ *
++ * Returns 0 on success or -ETOOSMALL when @xdr cannot hold the encoding.
++ */
++int
++filelayout_encode_devinfo(struct exp_xdr_stream *xdr,
++			  const struct pnfs_filelayout_device *fdev)
++{
++	unsigned int i, j, len = 0, opaque_words;
++	/* Start of the reservation; typed __be32 like every other XDR
++	 * cursor here since it is handed back to exp_xdr_encode_u32().
++	 */
++	__be32 *p_in;
++	u32 index_count = fdev->fl_stripeindices_length;
++	u32 dev_count = fdev->fl_device_length;
++	int error = 0;
++	__be32 *p;
++
++	opaque_words = fl_devinfo_xdr_words(fdev);
++	dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n",
++		__func__,
++		index_count,
++		dev_count,
++		opaque_words*4);
++
++	/* check space for opaque length */
++	p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words);
++	if (!p) {
++		error = -ETOOSMALL;
++		goto out;
++	}
++
++	/* Fill in length later */
++	p++;
++
++	/* encode device list indices */
++	p = exp_xdr_encode_u32(p, index_count);
++	for (i = 0; i < index_count; i++)
++		p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]);
++
++	/* encode device list */
++	p = exp_xdr_encode_u32(p, dev_count);
++	for (i = 0; i < dev_count; i++) {
++		struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i];
++
++		p = exp_xdr_encode_u32(p, mp->fl_multipath_length);
++		for (j = 0; j < mp->fl_multipath_length; j++) {
++			struct pnfs_filelayout_devaddr *da =
++						&mp->fl_multipath_list[j];
++
++			/* Encode device info */
++			p = exp_xdr_encode_opaque(p, da->r_netid.data,
++						     da->r_netid.len);
++			p = exp_xdr_encode_opaque(p, da->r_addr.data,
++						     da->r_addr.len);
++		}
++	}
++
++	/* backfill in length. Subtract 4 for da_addr_body size */
++	len = (char *)p - (char *)p_in;
++	exp_xdr_encode_u32(p_in, len - 4);
++
++	error = 0;
++out:
++	dprintk("%s: End err %d xdrlen %d\n",
++		__func__, error, len);
++	return error;
++}
++EXPORT_SYMBOL(filelayout_encode_devinfo);
++
++/* Encodes the loc_body structure from draft 13
++ * on the response stream.
++ * Unlike filelayout_encode_devinfo(), the result is reported as an
++ * NFSv4 status code (enum nfsstat4), not a linux error code.
++ *
++ * Returns NFS4_OK on success, NFS4ERR_LAYOUTUNAVAILABLE when @flp carries
++ * no file handles, and NFS4ERR_TOOSMALL when @xdr runs out of space.
++ */
++enum nfsstat4
++filelayout_encode_layout(struct exp_xdr_stream *xdr,
++			 const struct pnfs_filelayout_layout *flp)
++{
++	u32 len = 0, nfl_util, fhlen, i;
++	/* Position of the opaque length word, backfilled at the end */
++	__be32 *layoutlen_p;
++	enum nfsstat4 nfserr;
++	__be32 *p;
++
++	/* Print the same device_id fields that are encoded below, so this
++	 * still builds if dprintk is ever turned into a real printk.
++	 */
++	dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n",
++		__func__,
++		(unsigned long long)flp->device_id.sbid,
++		(unsigned long long)flp->device_id.devid,
++		flp->lg_first_stripe_index,
++		flp->lg_fh_length);
++
++	/* Ensure file system added at least one file handle */
++	if (flp->lg_fh_length <= 0) {
++		dprintk("%s: File Layout has no file handles!!\n", __func__);
++		nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++		goto out;
++	}
++
++	/* Ensure room for len, devid, util, first_stripe_index,
++	 * pattern_offset, number of filehandles */
++	p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1);
++	if (!p) {
++		nfserr = NFS4ERR_TOOSMALL;
++		goto out;
++	}
++
++	/* save spot for opaque file layout length, fill-in later*/
++	p++;
++
++	/* encode device id */
++	p = exp_xdr_encode_u64(p, flp->device_id.sbid);
++	p = exp_xdr_encode_u64(p, flp->device_id.devid);
++
++	/* set and encode flags */
++	nfl_util = flp->lg_stripe_unit;
++	if (flp->lg_commit_through_mds)
++		nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS;
++	if (flp->lg_stripe_type == STRIPE_DENSE)
++		nfl_util |= NFL4_UFLG_DENSE;
++	p = exp_xdr_encode_u32(p, nfl_util);
++
++	/* encode first stripe index */
++	p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index);
++
++	/* encode striping pattern start */
++	p = exp_xdr_encode_u64(p, flp->lg_pattern_offset);
++
++	/* encode number of file handles */
++	p = exp_xdr_encode_u32(p, flp->lg_fh_length);
++
++	/* encode file handles; each one is reserved separately, as a
++	 * length word plus fh_size bytes of handle */
++	for (i = 0; i < flp->lg_fh_length; i++) {
++		fhlen = flp->lg_fh_list[i].fh_size;
++		p = exp_xdr_reserve_space(xdr, 4 + fhlen);
++		if (!p) {
++			nfserr = NFS4ERR_TOOSMALL;
++			goto out;
++		}
++		p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen);
++	}
++
++	/* Set number of bytes encoded = total_bytes_encoded - length var */
++	len = (char *)p - (char *)layoutlen_p;
++	exp_xdr_encode_u32(layoutlen_p, len - 4);
++
++	nfserr = NFS4_OK;
++out:
++	dprintk("%s: End err %u xdrlen %d\n",
++		__func__, nfserr, len);
++	return nfserr;
++}
++EXPORT_SYMBOL(filelayout_encode_layout);
+diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c
+--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-09-30 10:17:08.501989000 -0400
++++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-09-30 10:17:08.503988000 -0400
+@@ -0,0 +1,289 @@
++/*
++ * pnfs_osd_xdr_enc.c
++ *
++ * Object-Based pNFS Layout XDR layer
++ *
++ * Copyright (C) 2007-2009 Panasas Inc.
++ * All rights reserved.
++ *
++ * Benny Halevy
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2
++ * See the file COPYING included with this distribution for more details.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the Panasas company nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_data_map( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_data_map *data_map) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1); ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, data_map->odm_num_comps); ++ p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_width); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_depth); ++ p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt); ++ p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline int pnfs_osd_xdr_encode_objid( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2); ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id; ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u64(p, dev_id->sbid); ++ p = exp_xdr_encode_u64(p, dev_id->devid); ++ p = exp_xdr_encode_u64(p, object_id->oid_partition_id); ++ p = exp_xdr_encode_u64(p, 
object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * enum pnfs_osd_cap_key_sec4 { ++ * PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ * PNFS_OSD_CAP_KEY_SEC_SSV = 1 ++ * }; ++ * ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_object_cred( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_object_cred *olo_comp) ++{ ++ __be32 *p; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version); ++ ++ /* No sec for now */ ++ p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE); ++ p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */ ++ ++ exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred, ++ olo_comp->oc_cap.cred_len); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_layout { ++ * struct pnfs_osd_data_map olo_map; ++ * u32 olo_comps_index; ++ * u32 olo_num_comps; ++ * struct pnfs_osd_object_cred *olo_comps; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *pol) ++{ ++ __be32 *p; ++ u32 i; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, pol->olo_comps_index); ++ p = exp_xdr_encode_u32(p, pol->olo_num_comps); ++ ++ for (i = 0; i < pol->olo_num_comps; i++) { ++ err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout); ++ ++static int _encode_string(struct exp_xdr_stream *xdr, ++ const struct nfs4_string *str) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, 4 
+ str->len); ++ ++ if (!p) ++ return -E2BIG; ++ exp_xdr_encode_opaque(p, str->data, str->len); ++ return 0; ++} ++ ++/* struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr) ++{ ++ __be32 *p; ++ int err; ++ ++ p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun)); ++ if (!p) ++ return -E2BIG; ++ ++ /* Empty oda_targetid */ ++ p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON); ++ ++ /* Empty oda_targetaddr for now */ ++ p = exp_xdr_encode_u32(p, 0); ++ ++ /* oda_lun */ ++ exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun)); ++ ++ err = _encode_string(xdr, &devaddr->oda_systemid); ++ if (err) ++ return err; ++ ++ err = pnfs_osd_xdr_encode_object_cred(xdr, ++ &devaddr->oda_root_obj_cred); ++ if (err) ++ return err; ++ ++ err = _encode_string(xdr, &devaddr->oda_osdname); ++ if (err) ++ return err; ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr); ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p) ++{ ++ lou->dsu_valid = be32_to_cpu(*p++); ++ if (lou->dsu_valid) ++ p = xdr_decode_hyper(p, &lou->dsu_delta); ++ lou->olu_ioerr_flag = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate); ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline __be32 * ++pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) ++{ ++ /* FIXME: p = xdr_decode_fixed(...) 
*/ ++ memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data)); ++ p += XDR_QUADLEN(sizeof(objid->oid_device_id.data)); ++ ++ p = xdr_decode_hyper(p, &objid->oid_partition_id); ++ p = xdr_decode_hyper(p, &objid->oid_object_id); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p) ++{ ++ p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_offset); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_length); ++ ioerr->oer_iswrite = be32_to_cpu(*p++); ++ ioerr->oer_errno = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); +diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c +--- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c 2010-09-30 10:17:08.509988000 -0400 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include "gfs2.h" + #include "incore.h" +@@ -1146,6 +1147,9 @@ static int fill_super(struct super_block + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_export_op = &gfs2_export_ops; ++#if defined(CONFIG_PNFSD) ++ sb->s_pnfs_op = &pnfs_dlm_export_ops; ++#endif /* CONFIG_PNFSD */ + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; +diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig +--- linux-2.6.34.noarch/fs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/Kconfig 2010-09-30 10:17:08.428989000 -0400 +@@ -224,6 +224,31 @@ config LOCKD_V4 + config EXPORTFS + tristate + ++config EXPORTFS_FILE_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 
files layout type. ++ Must be automatically selected by supporting filesystems. ++ ++config EXPORTFS_OSD_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 objects layout type. ++ Must be automatically selected by supporting osd ++ filesystems. ++ ++ If unsure, say N. ++ ++config EXPORTFS_BLOCK_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 blocks layout type. ++ Must be automatically selected by supporting filesystems. ++ ++ + config NFS_ACL_SUPPORT + tristate + select FS_POSIX_ACL +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-09-30 10:17:08.528988000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-09-30 10:17:08.529994000 -0400 +@@ -0,0 +1,66 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pipefs_list bl_device_list; ++struct dentry *bl_device_pipe; ++ ++ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len) ++{ ++ int err; ++ struct pipefs_hdr *msg; ++ ++ dprintk("Entering %s...\n", __func__); ++ ++ msg = pipefs_readmsg(filp, src, len); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: unable to read pipefs message.\n"); ++ return PTR_ERR(msg); ++ } ++ ++ /* now assign the result, which wakes the blocked thread */ ++ err = pipefs_assign_upcall_reply(msg, &bl_device_list); ++ if (err) { ++ dprintk("ERROR: failed to assign upcall with id %u\n", ++ msg->msgid); ++ kfree(msg); ++ } ++ return len; ++} ++ ++static const struct rpc_pipe_ops bl_pipe_ops = { ++ .upcall = pipefs_generic_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = pipefs_generic_destroy_msg, ++}; ++ ++int bl_pipe_init(void) ++{ ++ dprintk("%s: block_device 
pipefs registering...\n", __func__); ++ bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1); ++ if (IS_ERR(bl_device_pipe)) ++ dprintk("ERROR, unable to make block_device pipe\n"); ++ ++ if (!bl_device_pipe) ++ dprintk("bl_device_pipe is NULL!\n"); ++ else ++ dprintk("bl_device_pipe created!\n"); ++ pipefs_init_list(&bl_device_list); ++ return 0; ++} ++ ++void bl_pipe_exit(void) ++{ ++ dprintk("%s: block_device pipefs unregistering...\n", __func__); ++ if (IS_ERR(bl_device_pipe)) ++ return ; ++ pipefs_closepipe(bl_device_pipe); ++ return; ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-09-30 10:17:08.533988000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c 2010-09-30 10:17:08.535989000 -0400 +@@ -0,0 +1,1160 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. 
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#include ++#include ++ ++#include /* various write calls */ ++#include /* struct bio */ ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Andy Adamson "); ++MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); ++ ++/* Callback operations to the pNFS client */ ++static struct pnfs_client_operations *pnfs_block_callback_ops; ++ ++static void print_page(struct page *page) ++{ ++ dprintk("PRINTPAGE page %p\n", page); ++ dprintk(" PagePrivate %d\n", PagePrivate(page)); ++ dprintk(" PageUptodate %d\n", PageUptodate(page)); ++ dprintk(" PageError %d\n", PageError(page)); ++ dprintk(" PageDirty %d\n", PageDirty(page)); ++ dprintk(" PageReferenced %d\n", PageReferenced(page)); ++ dprintk(" PageLocked %d\n", PageLocked(page)); ++ dprintk(" PageWriteback %d\n", PageWriteback(page)); ++ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); ++ dprintk("\n"); ++} ++ ++/* Given the be associated with isect, determine if page data needs to be ++ * initialized. 
++ */
++static int is_hole(struct pnfs_block_extent *be, sector_t isect)
++{
++	switch (be->be_state) {
++	case PNFS_BLOCK_NONE_DATA:
++		/* Always treated as a hole */
++		return 1;
++	case PNFS_BLOCK_INVALID_DATA:
++		/* A hole only while the sector is still uninitialized */
++		return !is_sector_initialized(be->be_inval, isect);
++	default:
++		return 0;
++	}
++}
++
++/* Tell whether the page data at isect, which lies in extent be, may be
++ * written to disk: always for READWRITE_DATA, for INVALID_DATA only
++ * once the sector is initialized, never for any other state.
++ */
++static int is_writable(struct pnfs_block_extent *be, sector_t isect)
++{
++	switch (be->be_state) {
++	case PNFS_BLOCK_READWRITE_DATA:
++		return 1;
++	case PNFS_BLOCK_INVALID_DATA:
++		return is_sector_initialized(be->be_inval, isect);
++	default:
++		return 0;
++	}
++}
++
++/* Returns 1 when req->wb_complete is non-zero (called by _multi) and
++ * 0 otherwise (called by _one).
++ */
++static int
++dont_like_caller(struct nfs_page *req)
++{
++	return atomic_read(&req->wb_complete) ? 1 : 0;
++}
++
++/* Commit is never attempted by this driver; it only logs entry and
++ * reports PNFS_NOT_ATTEMPTED.
++ */
++static enum pnfs_try_status
++bl_commit(struct nfs_write_data *nfs_data, int sync)
++{
++	dprintk("%s enter\n", __func__);
++	return PNFS_NOT_ATTEMPTED;
++}
++
++/* The data we are handed might be spread across several bios. We need
++ * to track when the last one is finished.
++ *
++ * One parallel_io is shared by every bio issued for a request.  The
++ * creator holds the reference set up by kref_init(), bl_submit_bio()
++ * takes one more per submitted bio, and each bio completion drops one.
++ * Dropping the last reference runs pnfs_callback(data) and frees the
++ * structure (see destroy_parallel()).
++ */
++struct parallel_io {
++	struct kref refcnt;		/* creator + one per in-flight bio */
++	struct rpc_call_ops call_ops;	/* filled in by the caller of alloc_parallel() */
++	void (*pnfs_callback) (void *data); /* run once, on the final put */
++	void *data;			/* argument handed to pnfs_callback */
++};
++
++/* Allocate a parallel_io holding one reference and remembering @data.
++ * Only data and refcnt are initialized here: call_ops and pnfs_callback
++ * are left unset and must be assigned by the caller before the final
++ * put_parallel().  Returns NULL if the allocation fails.
++ */
++static inline struct parallel_io *alloc_parallel(void *data)
++{
++	struct parallel_io *rv;
++
++	rv = kmalloc(sizeof(*rv), GFP_KERNEL);
++	if (rv) {
++		rv->data = data;
++		kref_init(&rv->refcnt);
++	}
++	return rv;
++}
++
++/* Take an additional reference on @p. */
++static inline void get_parallel(struct parallel_io *p)
++{
++	kref_get(&p->refcnt);
++}
++
++/* kref release function: runs the completion callback, then frees the
++ * parallel_io.  Reached only through put_parallel().
++ */
++static void destroy_parallel(struct kref *kref)
++{
++	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
++
++	dprintk("%s enter\n", __func__);
++	p->pnfs_callback(p->data);
++	kfree(p);
++}
++
++/* Drop one reference on @p; the last drop ends in destroy_parallel(). */
++static inline void put_parallel(struct parallel_io *p)
++{
++	kref_put(&p->refcnt, destroy_parallel);
++}
++
++/* Submit @bio, if there is one, after taking a parallel_io reference on
++ * its behalf (bio->bi_private is expected to point at the parallel_io).
++ * Always returns NULL so callers can write "bio = bl_submit_bio(rw, bio)"
++ * and start over with a fresh bio.
++ */
++static struct bio *
++bl_submit_bio(int rw, struct bio *bio)
++{
++	if (bio) {
++		get_parallel(bio->bi_private);
++		dprintk("%s submitting %s bio %u@%llu\n", __func__,
++			rw == READ ? "read" : "write",
++			bio->bi_size, (u64)bio->bi_sector);
++		submit_bio(rw, bio);
++	}
++	return NULL;
++}
++
++/* Record the outcome of reading @page: on success clear the pNFS error
++ * flag and mark the page up to date; on failure clear up-to-date and set
++ * both the generic and the pNFS error flags.
++ */
++static inline void
++bl_done_with_rpage(struct page *page, const int ok)
++{
++	if (ok) {
++		ClearPagePnfsErr(page);
++		SetPageUptodate(page);
++	} else {
++		ClearPageUptodate(page);
++		SetPageError(page);
++		SetPagePnfsErr(page);
++	}
++	/* Page is unlocked via rpc_release. Should really be done here. */
++}
++
++/* This is basically copied from mpage_end_io_read */
++/* bio completion handler for reads.  Walks the bio's vectors from last
++ * to first, marking every page with the bio-wide BIO_UPTODATE result,
++ * then releases the bio and drops the reference bl_submit_bio() took.
++ * The err argument is not used.
++ */
++static void bl_end_io_read(struct bio *bio, int err)
++{
++	void *data = bio->bi_private;
++	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
++	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
++
++	do {
++		struct page *page = bvec->bv_page;
++
++		/* Step back first so the next page's flags can be
++		 * prefetched while the current page is processed.
++		 */
++		if (--bvec >= bio->bi_io_vec)
++			prefetchw(&bvec->bv_page->flags);
++		bl_done_with_rpage(page, uptodate);
++	} while (bvec >= bio->bi_io_vec);
++	bio_put(bio);
++	put_parallel(data);
++}
++
++/* Work handler queued by bl_end_par_io_read().  Recovers the
++ * nfs_read_data that embeds the work item and passes it to the pNFS
++ * client's nfs_readlist_complete callback.
++ */
++static void bl_read_cleanup(struct work_struct *work)
++{
++	struct rpc_task *task;
++	struct nfs_read_data *rdata;
++	dprintk("%s enter\n", __func__);
++	task = container_of(work, struct rpc_task, u.tk_work);
++	rdata = container_of(task, struct nfs_read_data, task);
++	pnfs_block_callback_ops->nfs_readlist_complete(rdata);
++}
++
++/* parallel_io callback for reads; @data is the nfs_read_data.  The
++ * completion is deferred to a work item instead of being run directly.
++ * NOTE(review): presumably because the last put can happen from bio
++ * completion context - confirm.
++ */
++static void
++bl_end_par_io_read(void *data)
++{
++	struct nfs_read_data *rdata = data;
++
++	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
++	schedule_work(&rdata->task.u.tk_work);
++}
++
++/* We don't want normal .rpc_call_done callback used, so we replace it
++ * with this stub.
++ */ ++static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) ++{ ++ return; ++} ++ ++static enum pnfs_try_status ++bl_read_pagelist(struct nfs_read_data *rdata, ++ unsigned nr_pages) ++{ ++ int i, hole; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t f_offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct page **pages = rdata->args.pages; ++ int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, ++ nr_pages, f_offset, count); ++ ++ if (dont_like_caller(rdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ goto use_mds; ++ } ++ if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) { ++ /* We want to fall back to mds in case of read_page ++ * after error on read_pages. ++ */ ++ dprintk("%s PG_pnfserr set\n", __func__); ++ goto use_mds; ++ } ++ par = alloc_parallel(rdata); ++ if (!par) ++ goto use_mds; ++ par->call_ops = *rdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_read; ++ /* At this point, we can no longer jump to use_mds */ ++ ++ isect = (sector_t) (f_offset >> 9); ++ /* Code assumes extents are page-aligned */ ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ put_extent(cow_read); ++ bio = bl_submit_bio(READ, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg), ++ isect, &cow_read); ++ if (!be) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ if (cow_read) { ++ sector_t cow_length = cow_read->be_length - ++ (isect - cow_read->be_f_offset); ++ extent_length = min(extent_length, cow_length); ++ } ++ } ++ hole = is_hole(be, isect); ++ if (hole 
&& !cow_read) { ++ bio = bl_submit_bio(READ, bio); ++ /* Fill hole w/ zeroes w/o accessing device */ ++ dprintk("%s Zeroing page for hole\n", __func__); ++ zero_user(pages[i], 0, ++ min_t(int, PAGE_CACHE_SIZE, count)); ++ print_page(pages[i]); ++ bl_done_with_rpage(pages[i], 1); ++ } else { ++ struct pnfs_block_extent *be_read; ++ ++ be_read = (hole && cow_read) ? cow_read : be; ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - ++ be_read->be_f_offset + ++ be_read->be_v_offset; ++ bio->bi_bdev = be_read->be_mdev; ++ bio->bi_end_io = bl_end_io_read; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(READ, bio); ++ } ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ if ((isect << 9) >= rdata->inode->i_size) { ++ rdata->res.eof = 1; ++ rdata->res.count = rdata->inode->i_size - f_offset; ++ } else { ++ rdata->res.count = (isect << 9) - f_offset; ++ } ++ put_extent(be); ++ put_extent(cow_read); ++ bl_submit_bio(READ, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++ ++ use_mds: ++ dprintk("Giving up and using normal NFS\n"); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static void mark_extents_written(struct pnfs_block_layout *bl, ++ __u64 offset, __u32 count) ++{ ++ sector_t isect, end; ++ struct pnfs_block_extent *be; ++ ++ dprintk("%s(%llu, %u)\n", __func__, offset, count); ++ if (count == 0) ++ return; ++ isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9; ++ end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); ++ end >>= 9; ++ while (isect < end) { ++ sector_t len; ++ be = find_get_extent(bl, isect, NULL); ++ BUG_ON(!be); /* FIXME */ ++ len = min(end, be->be_f_offset + be->be_length) - isect; ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ mark_for_commit(be, isect, len); /* What if fails? 
*/ ++ isect += len; ++ put_extent(be); ++ } ++} ++ ++/* STUB - this needs thought */ ++static inline void ++bl_done_with_wpage(struct page *page, const int ok) ++{ ++ if (!ok) { ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ /* This is an inline copy of nfs_zap_mapping */ ++ /* This is oh so fishy, and needs deep thought */ ++ if (page->mapping->nrpages != 0) { ++ struct inode *inode = page->mapping->host; ++ spin_lock(&inode->i_lock); ++ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; ++ spin_unlock(&inode->i_lock); ++ } ++ } ++ /* end_page_writeback called in rpc_release. Should be done here. */ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_write(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_wpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++/* Function scheduled for call during bl_end_par_io_write, ++ * it marks sectors as written and extends the commitlist. 
++ */ ++static void bl_write_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ if (!wdata->task.tk_status) { ++ /* Marks for LAYOUTCOMMIT */ ++ /* BUG - this should be called after each bio, not after ++ * all finish, unless have some way of storing success/failure ++ */ ++ mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg), ++ wdata->args.offset, wdata->args.count); ++ } ++ pnfs_block_callback_ops->nfs_writelist_complete(wdata); ++} ++ ++/* Called when last of bios associated with a bl_write_pagelist call finishes */ ++static void ++bl_end_par_io_write(void *data) ++{ ++ struct nfs_write_data *wdata = data; ++ ++ /* STUB - ignoring error handling */ ++ wdata->task.tk_status = 0; ++ wdata->verf.committed = NFS_FILE_SYNC; ++ INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); ++ schedule_work(&wdata->task.u.tk_work); ++} ++ ++static enum pnfs_try_status ++bl_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int sync) ++{ ++ int i; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t offset = wdata->args.offset; ++ size_t count = wdata->args.count; ++ struct page **pages = wdata->args.pages; ++ int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); ++ if (!wdata->req->wb_lseg) { ++ dprintk("%s no lseg, falling back to MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ if (dont_like_caller(wdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ /* At this point, wdata->pages is a (sequential) list of nfs_pages. 
++ * We want to write each, and if there is an error remove it from ++ * list and call ++ * nfs_retry_request(req) to have it redone using nfs. ++ * QUEST? Do as block or per req? Think have to do per block ++ * as part of end_bio ++ */ ++ par = alloc_parallel(wdata); ++ if (!par) ++ return PNFS_NOT_ATTEMPTED; ++ par->call_ops = *wdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_write; ++ /* At this point, have to be more careful with error handling */ ++ ++ isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9); ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ bio = bl_submit_bio(WRITE, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg), ++ isect, NULL); ++ if (!be || !is_writable(be, isect)) { ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ } ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - be->be_f_offset + ++ be->be_v_offset; ++ bio->bi_bdev = be->be_mdev; ++ bio->bi_end_io = bl_end_io_write; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(WRITE, bio); ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK); ++ put_extent(be); ++ bl_submit_bio(WRITE, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++} ++ ++/* FIXME - range ignored */ ++static void ++release_extents(struct pnfs_block_layout *bl, ++ struct pnfs_layout_range *range) ++{ ++ int i; ++ struct pnfs_block_extent *be; ++ ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ while 
(!list_empty(&bl->bl_extents[i])) { ++ be = list_first_entry(&bl->bl_extents[i], ++ struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ put_extent(be); ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++} ++ ++static void ++release_inval_marks(struct pnfs_inval_markings *marks) ++{ ++ struct pnfs_inval_tracking *pos, *temp; ++ ++ list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { ++ list_del(&pos->it_link); ++ kfree(pos); ++ } ++ return; ++} ++ ++/* Note we are relying on caller locking to prevent nasty races. */ ++static void ++bl_free_layout(struct pnfs_layout_hdr *lo) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ ++ dprintk("%s enter\n", __func__); ++ release_extents(bl, NULL); ++ release_inval_marks(&bl->bl_inval); ++ kfree(bl); ++} ++ ++static struct pnfs_layout_hdr * ++bl_alloc_layout(struct inode *inode) ++{ ++ struct pnfs_block_layout *bl; ++ ++ dprintk("%s enter\n", __func__); ++ bl = kzalloc(sizeof(*bl), GFP_KERNEL); ++ if (!bl) ++ return NULL; ++ spin_lock_init(&bl->bl_ext_lock); ++ INIT_LIST_HEAD(&bl->bl_extents[0]); ++ INIT_LIST_HEAD(&bl->bl_extents[1]); ++ INIT_LIST_HEAD(&bl->bl_commit); ++ bl->bl_count = 0; ++ bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9; ++ INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); ++ return &bl->bl_layout; ++} ++ ++static void ++bl_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter\n", __func__); ++ kfree(lseg); ++} ++ ++/* Because the generic infrastructure does not correctly merge layouts, ++ * we pretty much ignore lseg, and store all data layout wide, so we ++ * can correctly merge. Eventually we should push some correct merge ++ * behavior up to the generic code, as the current behavior tends to ++ * cause lots of unnecessary overlapping LAYOUTGET requests. 
++ */ ++static struct pnfs_layout_segment * ++bl_alloc_lseg(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct pnfs_layout_segment *lseg; ++ int status; ++ ++ dprintk("%s enter\n", __func__); ++ lseg = kzalloc(sizeof(*lseg) + 0, GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ status = nfs4_blk_process_layoutget(lo, lgr); ++ if (status) { ++ /* We don't want to call the full-blown bl_free_lseg, ++ * since on error extents were not touched. ++ */ ++ /* STUB - we really want to distinguish between 2 error ++ * conditions here. This lseg failed, but lo data structures ++ * are OK, or we hosed the lo data structures. The calling ++ * code probably needs to distinguish this too. ++ */ ++ kfree(lseg); ++ return ERR_PTR(status); ++ } ++ return lseg; ++} ++ ++static int ++bl_setup_layoutcommit(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutcommit_args *arg) ++{ ++ struct nfs_server *nfss = PNFS_NFS_SERVER(lo); ++ struct bl_layoutupdate_data *layoutupdate_data; ++ ++ dprintk("%s enter\n", __func__); ++ /* Need to ensure commit is block-size aligned */ ++ if (nfss->pnfs_blksize) { ++ u64 mask = nfss->pnfs_blksize - 1; ++ u64 offset = arg->range.offset & mask; ++ ++ arg->range.offset -= offset; ++ arg->range.length += offset + mask; ++ arg->range.length &= ~mask; ++ } ++ ++ layoutupdate_data = kmalloc(sizeof(struct bl_layoutupdate_data), ++ GFP_KERNEL); ++ if (unlikely(!layoutupdate_data)) ++ return -ENOMEM; ++ INIT_LIST_HEAD(&layoutupdate_data->ranges); ++ arg->layoutdriver_data = layoutupdate_data; ++ ++ return 0; ++} ++ ++static void ++bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg) ++{ ++ dprintk("%s enter\n", __func__); ++ encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); ++} ++ ++static void ++bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutcommit_args *arg, int status) ++{ ++ dprintk("%s enter\n", __func__); ++ 
clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), arg, status); ++ kfree(arg->layoutdriver_data); ++} ++ ++static void free_blk_mountid(struct block_mount_id *mid) ++{ ++ if (mid) { ++ struct pnfs_block_dev *dev; ++ while (!list_empty(&mid->bm_devlist)) { ++ spin_lock(&mid->bm_lock); ++ dev = list_first_entry(&mid->bm_devlist, ++ struct pnfs_block_dev, ++ bm_node); ++ list_del(&dev->bm_node); ++ spin_unlock(&mid->bm_lock); ++ free_block_dev(dev); /* may sleep; bm_lock must be dropped */ ++ } ++ kfree(mid); ++ } ++} ++ ++/* This is mostly copied from the filelayout's get_device_info function. ++ * It seems much of this should be at the generic pnfs level. ++ */ ++static struct pnfs_block_dev * ++nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, ++ struct pnfs_deviceid *d_id, ++ struct list_head *sdlist) ++{ ++ struct pnfs_device *dev; ++ struct pnfs_block_dev *rv = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ int i, rc; ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s max_resp_sz %u max_pages %d\n", ++ __func__, max_resp_sz, max_pages); ++ ++ dev = kzalloc(sizeof(*dev), GFP_KERNEL); /* so area starts NULL */ ++ if (!dev) { ++ dprintk("%s kmalloc failed\n", __func__); ++ return NULL; ++ } ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(dev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set dev->area */ ++ dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!dev->area) ++ goto out_free; ++ ++ memcpy(&dev->dev_id, d_id, sizeof(*d_id)); ++ dev->layout_type = LAYOUT_BLOCK_VOLUME; ++ dev->dev_notify_types = 0; ++ dev->pages = pages; ++ dev->pgbase = 0; ++ dev->pglen = PAGE_SIZE * max_pages; ++ dev->mincount = 0; ++ ++ dprintk("%s:
dev_id: %s\n", __func__, dev->dev_id.data); ++ rc = pnfs_block_callback_ops->nfs_getdeviceinfo(server, dev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ rv = nfs4_blk_decode_device(server, dev, sdlist); ++ out_free: ++ if (dev->area != NULL) ++ vunmap(dev->area); ++ for (i = 0; i < max_pages && pages[i]; i++) /* filled in order */ ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(dev); ++ return rv; ++} ++ ++ ++/* ++ * Retrieve the list of available devices for the mountpoint. ++ */ ++static int ++bl_initialize_mountpoint(struct nfs_server *server, const struct nfs_fh *fh) ++{ ++ struct block_mount_id *b_mt_id = NULL; ++ struct pnfs_devicelist *dlist = NULL; ++ struct pnfs_block_dev *bdev; ++ LIST_HEAD(block_disklist); ++ int status = -ENOMEM, i; /* covers both allocations below */ ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (server->pnfs_blksize == 0) { ++ dprintk("%s Server did not return blksize\n", __func__); ++ return -EINVAL; ++ } ++ b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL); ++ if (!b_mt_id) { ++ status = -ENOMEM; ++ goto out_error; ++ } ++ /* Initialize nfs4 block layout mount id */ ++ spin_lock_init(&b_mt_id->bm_lock); ++ INIT_LIST_HEAD(&b_mt_id->bm_devlist); ++ ++ dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL); ++ if (!dlist) ++ goto out_error; ++ dlist->eof = 0; ++ while (!dlist->eof) { ++ status = pnfs_block_callback_ops->nfs_getdevicelist( ++ server, fh, dlist); ++ if (status) ++ goto out_error; ++ dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", ++ __func__, dlist->num_devs, dlist->eof); ++ /* For each device returned in dlist, call GETDEVICEINFO, and ++ * decode the opaque topology encoding to create a flat ++ * volume topology, matching VOLUME_SIMPLE disk signatures ++ * to disks in the visible block disk list. ++ * Construct an LVM meta device from the flat volume topology.
++ */ ++ for (i = 0; i < dlist->num_devs; i++) { ++ bdev = nfs4_blk_get_deviceinfo(server, fh, ++ &dlist->dev_id[i], ++ &block_disklist); ++ if (!bdev) { ++ status = -ENODEV; ++ goto out_error; ++ } ++ spin_lock(&b_mt_id->bm_lock); ++ list_add(&bdev->bm_node, &b_mt_id->bm_devlist); ++ spin_unlock(&b_mt_id->bm_lock); ++ } ++ } ++ dprintk("%s SUCCESS\n", __func__); ++ server->pnfs_ld_data = b_mt_id; ++ ++ out_return: ++ kfree(dlist); ++ return status; ++ ++ out_error: ++ free_blk_mountid(b_mt_id); ++ goto out_return; ++} ++ ++static int ++bl_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ struct block_mount_id *b_mt_id = server->pnfs_ld_data; ++ ++ dprintk("%s enter\n", __func__); ++ free_blk_mountid(b_mt_id); ++ dprintk("%s RETURNS\n", __func__); ++ return 0; ++} ++ ++/* STUB - mark intersection of layout and page as bad, so is not ++ * used again. ++ */ ++static void mark_bad_read(void) ++{ ++ return; ++} ++ ++/* Copied from buffer.c */ ++static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) ++{ ++ if (uptodate) { ++ set_buffer_uptodate(bh); ++ } else { ++ /* This happens, due to failed READA attempts.
*/ ++ clear_buffer_uptodate(bh); ++ } ++ unlock_buffer(bh); ++} ++ ++/* Copied from buffer.c */ ++static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) ++{ ++ __end_buffer_read_notouch(bh, uptodate); ++} ++ ++/* ++ * map_block: map a requested I/O block (isect) into an offset in the LVM ++ * meta block_device ++ */ ++static void ++map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh) ++{ ++ dprintk("%s enter be=%p\n", __func__, be); ++ ++ set_buffer_mapped(bh); ++ bh->b_bdev = be->be_mdev; ++ bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> ++ (be->be_mdev->bd_inode->i_blkbits - 9); ++ ++ dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n", ++ __func__, (long)isect, ++ (long)bh->b_blocknr, ++ bh->b_size); ++ return; ++} ++ ++/* Given an unmapped page, zero it (or read in page for COW), ++ * and set appropriate flags/markings, but it is safe to not initialize ++ * the range given in [from, to). ++ */ ++/* This is loosely based on nobh_write_begin */ ++static int ++init_page_for_write(struct pnfs_block_layout *bl, struct page *page, ++ unsigned from, unsigned to, sector_t **pages_to_mark) ++{ ++ struct buffer_head *bh; ++ int inval, ret = -EIO; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect; ++ ++ dprintk("%s enter, %p\n", __func__, page); ++ bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); ++ if (!bh) { ++ mark_bad_read(); /* no bh to free: skip cleanup */ ++ return -ENOMEM; ++ } ++ ++ isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9); ++ be = find_get_extent(bl, isect, &cow_read); ++ if (!be) ++ goto cleanup; ++ inval = is_hole(be, isect); ++ dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to); ++ if (inval) { ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) { ++ dprintk("%s PANIC - got NONE_DATA extent %p\n", ++ __func__, be); ++ goto cleanup; ++ } ++ map_block(isect, be, bh); ++ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); ++ } ++ if (PageUptodate(page)) { ++ /* Do nothing */ ++
} else if (inval && !cow_read) { ++ zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE); ++ } else if (0 < from || PAGE_CACHE_SIZE > to) { ++ struct pnfs_block_extent *read_extent; ++ ++ read_extent = (inval && cow_read) ? cow_read : be; ++ map_block(isect, read_extent, bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_read_nobh; ++ submit_bh(READ, bh); ++ dprintk("%s: Waiting for buffer read\n", __func__); ++ /* XXX Don't really want to hold layout lock here */ ++ wait_on_buffer(bh); ++ if (!buffer_uptodate(bh)) ++ goto cleanup; ++ } ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ /* There is a BUG here if is a short copy after write_begin, ++ * but I think this is a generic fs bug. The problem is that ++ * we have marked the page as initialized, but it is possible ++ * that the section not copied may never get copied. ++ */ ++ ret = mark_initialized_sectors(be->be_inval, isect, ++ PAGE_CACHE_SECTORS, ++ pages_to_mark); ++ /* Want to preallocate mem so above can't fail */ ++ if (ret) ++ goto cleanup; ++ } ++ SetPageMappedToDisk(page); ++ ret = 0; ++ ++cleanup: ++ free_buffer_head(bh); ++ put_extent(be); ++ put_extent(cow_read); ++ if (ret) { ++ /* Need to mark layout with bad read...should now ++ * just use nfs4 for reads and writes.
++ */ ++ mark_bad_read(); ++ } ++ return ret; ++} ++ ++static int ++bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos, ++ unsigned count, struct pnfs_fsdata *fsdata) ++{ ++ unsigned from, to; ++ int ret; ++ sector_t *pages_to_mark = NULL; ++ struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg); ++ ++ dprintk("%s enter, %u@%lld\n", __func__, count, pos); ++ print_page(page); ++ /* The following code assumes blocksize >= PAGE_CACHE_SIZE */ ++ if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) { ++ dprintk("%s Can't handle blocksize %llu\n", __func__, ++ (u64)bl->bl_blocksize); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ return 0; ++ } ++ if (PageMappedToDisk(page)) { ++ /* Basically, this is a flag that says we have ++ * successfully called write_begin already on this page. ++ */ ++ /* NOTE - there are cache consistency issues here. ++ * For example, what if the layout is recalled, then regained? ++ * If the file is closed and reopened, will the page flags ++ * be reset? If not, we'll have to use layout info instead of ++ * the page flag. ++ */ ++ return 0; ++ } ++ from = pos & (PAGE_CACHE_SIZE - 1); ++ to = from + count; ++ ret = init_page_for_write(bl, page, from, to, &pages_to_mark); ++ if (ret) { ++ dprintk("%s init page failed with %i", __func__, ret); ++ /* Revert back to plain NFS and just continue on with ++ * write. This assumes there is no request attached, which ++ * should be true if we get here. ++ */ ++ BUG_ON(PagePrivate(page)); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ kfree(pages_to_mark); ++ ret = 0; ++ } else { ++ fsdata->private = pages_to_mark; ++ } ++ return ret; ++} ++ ++/* CAREFUL - what happens if copied < count??? 
*/ ++static int ++bl_write_end(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg); ++ print_page(page); ++ if (lseg) ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* Return any memory allocated to fsdata->private, and take advantage ++ * of no page locks to mark pages noted in write_begin as needing ++ * initialization. ++ */ ++static void ++bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata) ++{ ++ struct page *page; ++ pgoff_t index; ++ sector_t *pos; ++ struct address_space *mapping = filp->f_mapping; ++ struct pnfs_fsdata *fake_data; ++ struct pnfs_layout_segment *lseg; ++ ++ if (!fsdata) ++ return; ++ lseg = fsdata->lseg; ++ if (!lseg) ++ return; ++ pos = fsdata->private; ++ if (!pos) ++ return; ++ dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos)); ++ for (; *pos != ~0; pos++) { ++ index = *pos >> (PAGE_CACHE_SHIFT - 9); ++ /* XXX How do we properly deal with failures here??? */ ++ page = grab_cache_page_write_begin(mapping, index, 0); ++ if (!page) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__); ++ continue; ++ } ++ dprintk("%s: Examining block page\n", __func__); ++ print_page(page); ++ if (!PageMappedToDisk(page)) { ++ /* XXX How do we properly deal with failures here??? 
*/ ++ dprintk("%s Marking block page\n", __func__); ++ init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page, ++ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, ++ NULL); ++ print_page(page); ++ fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL); ++ if (!fake_data) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", ++ __func__); ++ unlock_page(page); ++ continue; ++ } ++ get_lseg(lseg); ++ fake_data->lseg = lseg; ++ fake_data->bypass_eof = 1; ++ mapping->a_ops->write_end(filp, mapping, ++ index << PAGE_CACHE_SHIFT, ++ PAGE_CACHE_SIZE, ++ PAGE_CACHE_SIZE, ++ page, fake_data); ++ /* Note fake_data is freed by nfs_write_end */ ++ } else ++ unlock_page(page); ++ } ++ kfree(fsdata->private); ++ fsdata->private = NULL; ++} ++ ++static ssize_t ++bl_get_stripesize(struct pnfs_layout_hdr *lo) ++{ ++ dprintk("%s enter\n", __func__); ++ return 0; ++} ++ ++/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request. ++ * Should return False if there is a reason requests can not be coalesced, ++ * otherwise, should default to returning True. 
++ */ ++static int ++bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ dprintk("%s enter\n", __func__); ++ if (pgio->pg_iswrite) ++ return prev->wb_lseg == req->wb_lseg; ++ else ++ return 1; ++} ++ ++static struct layoutdriver_io_operations blocklayout_io_operations = { ++ .commit = bl_commit, ++ .read_pagelist = bl_read_pagelist, ++ .write_pagelist = bl_write_pagelist, ++ .write_begin = bl_write_begin, ++ .write_end = bl_write_end, ++ .write_end_cleanup = bl_write_end_cleanup, ++ .alloc_layout = bl_alloc_layout, ++ .free_layout = bl_free_layout, ++ .alloc_lseg = bl_alloc_lseg, ++ .free_lseg = bl_free_lseg, ++ .setup_layoutcommit = bl_setup_layoutcommit, ++ .encode_layoutcommit = bl_encode_layoutcommit, ++ .cleanup_layoutcommit = bl_cleanup_layoutcommit, ++ .initialize_mountpoint = bl_initialize_mountpoint, ++ .uninitialize_mountpoint = bl_uninitialize_mountpoint, ++}; ++ ++static struct layoutdriver_policy_operations blocklayout_policy_operations = { ++ .get_stripesize = bl_get_stripesize, ++ .pg_test = bl_pg_test, ++}; ++ ++static struct pnfs_layoutdriver_type blocklayout_type = { ++ .id = LAYOUT_BLOCK_VOLUME, ++ .name = "LAYOUT_BLOCK_VOLUME", ++ .ld_io_ops = &blocklayout_io_operations, ++ .ld_policy_ops = &blocklayout_policy_operations, ++}; ++ ++static int __init nfs4blocklayout_init(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); ++ ++ pnfs_block_callback_ops = pnfs_register_layoutdriver(&blocklayout_type); ++ bl_pipe_init(); ++ return 0; ++} ++ ++static void __exit nfs4blocklayout_exit(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", ++ __func__); ++ ++ pnfs_unregister_layoutdriver(&blocklayout_type); ++ bl_pipe_exit(); ++} ++ ++module_init(nfs4blocklayout_init); ++module_exit(nfs4blocklayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c +--- 
linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-09-30 10:17:08.542991000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-09-30 10:17:08.544989000 -0400 +@@ -0,0 +1,335 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 file layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. 
++ */ ++#include ++#include /* __bread */ ++ ++#include ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes) ++{ ++ uint32_t *q = p + XDR_QUADLEN(nbytes); ++ if (unlikely(q > end || q < p)) ++ return NULL; ++ return p; ++} ++EXPORT_SYMBOL(blk_overflow); ++ ++/* Open a block_device by device number. */ ++struct block_device *nfs4_blkdev_get(dev_t dev) ++{ ++ struct block_device *bd; ++ ++ dprintk("%s enter\n", __func__); ++ bd = open_by_devnum(dev, FMODE_READ); ++ if (IS_ERR(bd)) ++ goto fail; ++ return bd; ++fail: ++ dprintk("%s failed to open device : %ld\n", ++ __func__, PTR_ERR(bd)); ++ return NULL; ++} ++ ++/* ++ * Release the block device ++ */ ++int nfs4_blkdev_put(struct block_device *bdev) ++{ ++ dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), ++ MINOR(bdev->bd_dev)); ++ bd_release(bdev); ++ return blkdev_put(bdev, FMODE_READ); ++} ++ ++/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded ++ * in dev->dev_addr_buf. 
++ */ ++struct pnfs_block_dev * ++nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist) ++{ ++ struct pnfs_block_dev *rv = NULL; ++ struct block_device *bd = NULL; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint32_t major, minor; ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return NULL; ++ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); ++ dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, ++ dev->mincount); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area, ++ dev->mincount); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out_err; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ dprintk("%s CALLING USERSPACE DAEMON\n", __func__); ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out_err; ++ } ++ if (reply->status != BL_DEVICE_REQUEST_PROC) { ++ dprintk("%s failed to open device: %ld\n", ++ __func__, PTR_ERR(bd)); ++ goto out_err; ++ } ++ memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t)); ++ memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)), ++ sizeof(uint32_t)); ++ bd = nfs4_blkdev_get(MKDEV(major, minor)); ++ if (!bd) { /* nfs4_blkdev_get() returns NULL, never ERR_PTR */ ++ dprintk("%s failed to open device %u:%u\n", ++ __func__, major, minor); ++ goto out_err; ++ } ++ ++ rv = kzalloc(sizeof(*rv), GFP_KERNEL); ++ if (!rv) ++ goto out_err; ++ ++ rv->bm_mdev = bd; ++ memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct pnfs_deviceid)); ++ dprintk("%s Created device %s with bd_block_size %u\n", ++ __func__, ++ bd->bd_disk->disk_name, ++ bd->bd_block_size); ++ kfree(reply); ++ kfree(msg); ++ return rv; ++ ++out_err: ++ if (bd) /* opened but rv allocation failed */ ++ nfs4_blkdev_put(bd); ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return NULL; ++} ++ ++/* Map
deviceid returned by the server to constructed block_device */ ++static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, ++ struct pnfs_deviceid *id) ++{ ++ struct block_device *rv = NULL; ++ struct block_mount_id *mid; ++ struct pnfs_block_dev *dev; ++ ++ dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); ++ mid = BLK_ID(lo); ++ spin_lock(&mid->bm_lock); ++ list_for_each_entry(dev, &mid->bm_devlist, bm_node) { ++ if (memcmp(id->data, dev->bm_mdevid.data, ++ NFS4_PNFS_DEVICEID4_SIZE) == 0) { ++ rv = dev->bm_mdev; ++ goto out; ++ } ++ } ++ out: ++ spin_unlock(&mid->bm_lock); ++ dprintk("%s returning %p\n", __func__, rv); ++ return rv; ++} ++ ++/* Tracks info needed to ensure extents in layout obey constraints of spec */ ++struct layout_verification { ++ u32 mode; /* R or RW */ ++ u64 start; /* Expected start of next non-COW extent */ ++ u64 inval; /* Start of INVAL coverage */ ++ u64 cowread; /* End of COW read coverage */ ++}; ++ ++/* Verify the extent meets the layout requirements of the pnfs-block draft, ++ * section 2.3.1. 
++ */ ++static int verify_extent(struct pnfs_block_extent *be, ++ struct layout_verification *lv) ++{ ++ if (lv->mode == IOMODE_READ) { ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA || ++ be->be_state == PNFS_BLOCK_INVALID_DATA) ++ return -EIO; ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } ++ /* lv->mode == IOMODE_RW */ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ if (lv->cowread > lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ lv->inval = lv->start; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_READ_DATA) { ++ if (be->be_f_offset > lv->start) ++ return -EIO; ++ if (be->be_f_offset < lv->inval) ++ return -EIO; ++ if (be->be_f_offset < lv->cowread) ++ return -EIO; ++ /* It looks like you might want to min this with lv->start, ++ * but you really don't. 
++ */ ++ lv->inval = lv->inval + be->be_length; ++ lv->cowread = be->be_f_offset + be->be_length; ++ return 0; ++ } else ++ return -EIO; ++} ++ ++/* XDR decode pnfs_block_layout4 structure */ ++int ++nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len); ++ int i, status = -EIO; ++ uint32_t count; ++ struct pnfs_block_extent *be = NULL, *save; ++ uint64_t tmp; /* Used by READSECTOR */ ++ struct layout_verification lv = { ++ .mode = lgr->range.iomode, ++ .start = lgr->range.offset >> 9, ++ .inval = lgr->range.offset >> 9, ++ .cowread = lgr->range.offset >> 9, ++ }; ++ ++ LIST_HEAD(extents); ++ ++ BLK_READBUF(p, end, 4); ++ READ32(count); ++ ++ dprintk("%s enter, number of extents %i\n", __func__, count); ++ BLK_READBUF(p, end, (28 + NFS4_PNFS_DEVICEID4_SIZE) * count); ++ ++ /* Decode individual extents, putting them in temporary ++ * staging area until whole layout is decoded to make error ++ * recovery easier. 
++ */ ++ for (i = 0; i < count; i++) { ++ be = alloc_extent(); ++ if (!be) { ++ status = -ENOMEM; ++ goto out_err; ++ } ++ READ_DEVID(&be->be_devid); ++ be->be_mdev = translate_devid(lo, &be->be_devid); ++ if (!be->be_mdev) ++ goto out_err; ++ /* The next three values are read in as bytes, ++ * but stored as 512-byte sector lengths ++ */ ++ READ_SECTOR(be->be_f_offset); ++ READ_SECTOR(be->be_length); ++ READ_SECTOR(be->be_v_offset); ++ READ32(be->be_state); ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ be->be_inval = &bl->bl_inval; ++ if (verify_extent(be, &lv)) { ++ dprintk("%s verify failed\n", __func__); ++ goto out_err; ++ } ++ list_add_tail(&be->be_node, &extents); ++ } ++ if (p != end) { ++ dprintk("%s Undecoded cruft at end of opaque\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lgr->range.offset + lgr->range.length != lv.start << 9) { ++ dprintk("%s Final length mismatch\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lv.start < lv.cowread) { ++ dprintk("%s Final uncovered COW extent\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ /* Extents decoded properly, now try to merge them in to ++ * existing layout extents. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ list_for_each_entry_safe(be, save, &extents, be_node) { ++ list_del(&be->be_node); ++ status = add_and_merge_extent(bl, be); ++ if (status) { ++ spin_unlock(&bl->bl_ext_lock); ++ /* This is a fairly catastrophic error, as the ++ * entire layout extent lists are now corrupted. ++ * We should have some way to distinguish this. 
++ */ ++ be = NULL; ++ goto out_err; ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ status = 0; ++ out: ++ dprintk("%s returns %i\n", __func__, status); ++ return status; ++ ++ out_err: ++ put_extent(be); ++ while (!list_empty(&extents)) { ++ be = list_first_entry(&extents, struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ put_extent(be); ++ } ++ goto out; ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-09-30 10:17:08.546994000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-09-30 10:17:08.548993000 -0400 +@@ -0,0 +1,120 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdm.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2007 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Fred Isaman ++ * Andy Adamson ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. 
the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include /* gendisk - used in a dprintk*/ ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Defines used for calculating memory usage in nfs4_blk_flatten() */ ++#define ARGSIZE 24 /* Max bytes needed for linear target arg string */ ++#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE) ++#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE) ++#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \ ++ (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE) ++#define roundup8(x) (((x)+7) & ~7) ++#define sizeof8(x) roundup8(sizeof(x)) ++ ++static int dev_remove(dev_t dev) ++{ ++ int ret = 1; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint64_t bl_dev; ++ uint32_t major = MAJOR(dev), minor = MINOR(dev); ++ ++ dprintk("Entering %s\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return ret; ++ ++ memcpy((void *)&bl_dev, &major, sizeof(uint32_t)); ++ memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t)); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev, ++ sizeof(uint64_t)); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out; ++ } ++ ++ if (reply->status == BL_DEVICE_REQUEST_PROC) ++ ret = 0; /*TODO: what to return*/ ++out: ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return ret; ++} ++ ++/* ++ * 
Release meta device ++ */ ++static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) ++{ ++ dev_t devnum = bdev->bm_mdev->bd_dev; /* read before the put */ ++ int rv; ++ ++ dprintk("%s Releasing\n", __func__); ++ rv = nfs4_blkdev_put(bdev->bm_mdev); /* XXX Check return? */ ++ dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv); ++ ++ rv = dev_remove(devnum); ++ dprintk("%s Returns %d\n", __func__, rv); ++ return rv; ++} ++ ++void free_block_dev(struct pnfs_block_dev *bdev) ++{ ++ if (bdev) { ++ if (bdev->bm_mdev) { ++ dprintk("%s Removing DM device: %d:%d\n", ++ __func__, ++ MAJOR(bdev->bm_mdev->bd_dev), ++ MINOR(bdev->bm_mdev->bd_dev)); ++ /* XXX Check status ?? */ ++ nfs4_blk_metadev_release(bdev); ++ } ++ kfree(bdev); ++ } ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h +--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-09-30 10:17:08.538988000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h 2010-09-30 10:17:08.539994000 -0400 +@@ -0,0 +1,302 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.h ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included.
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#ifndef FS_NFS_NFS4BLOCKLAYOUT_H ++#define FS_NFS_NFS4BLOCKLAYOUT_H ++ ++#include ++#include ++#include /* Needed for struct dm_ioctl*/ ++ ++#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9) ++ ++#define PG_pnfserr PG_owner_priv_1 ++#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags) ++#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags) ++#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags) ++ ++extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_do_resume(struct dm_ioctl *param); ++extern int dm_table_load(struct dm_ioctl *param, size_t param_size); ++ ++struct block_mount_id { ++ spinlock_t bm_lock; /* protects list */ ++ struct list_head bm_devlist; /* holds pnfs_block_dev */ ++}; ++ ++struct pnfs_block_dev { ++ struct list_head bm_node; ++ struct pnfs_deviceid bm_mdevid; /* associated devid */ ++ struct block_device *bm_mdev; /* meta device itself */ ++}; ++ ++/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */ ++struct visible_block_device { ++ struct list_head vi_node; ++ struct block_device *vi_bdev; ++ int vi_mapped; ++ int vi_put_done; ++}; ++ ++enum blk_vol_type { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, 
/* maps to a single LU */ ++ PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */ ++ PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */ ++ PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */ ++}; ++ ++/* All disk offset/lengths are stored in 512-byte sectors */ ++struct pnfs_blk_volume { ++ uint32_t bv_type; ++ sector_t bv_size; ++ struct pnfs_blk_volume **bv_vols; ++ int bv_vol_n; ++ union { ++ dev_t bv_dev; ++ sector_t bv_stripe_unit; ++ sector_t bv_offset; ++ }; ++}; ++ ++/* Since components need not be aligned, cannot use sector_t */ ++struct pnfs_blk_sig_comp { ++ int64_t bs_offset; /* In bytes */ ++ uint32_t bs_length; /* In bytes */ ++ char *bs_string; ++}; ++ ++/* Maximum number of signature components in a simple volume */ ++# define PNFS_BLOCK_MAX_SIG_COMP 16 ++ ++struct pnfs_blk_sig { ++ int si_num_comps; ++ struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP]; ++}; ++ ++enum exstate4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ ++ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ ++}; ++ ++#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ ++ ++struct my_tree_t { ++ sector_t mtt_step_size; /* Internal sector alignment */ ++ struct list_head mtt_stub; /* Should be a radix tree */ ++}; ++ ++struct pnfs_inval_markings { ++ spinlock_t im_lock; ++ struct my_tree_t im_tree; /* Sectors that need LAYOUTCOMMIT */ ++ sector_t im_block_size; /* Server blocksize in sectors */ ++}; ++ ++struct pnfs_inval_tracking { ++ struct list_head it_link; ++ int it_sector; /* NOTE(review): set from and compared with u64 sectors in extents.c - may truncate, confirm */ ++ int it_tags; /* bitmask of (1 << tag) values */ ++}; ++ ++/* sector_t fields are all in 512-byte sectors */ ++struct pnfs_block_extent { ++ struct kref be_refcnt; ++ struct list_head be_node; /* link into lseg list */ ++ struct pnfs_deviceid be_devid; /* STUB - removable??? 
*/ ++ struct block_device *be_mdev; ++ sector_t be_f_offset; /* the starting offset in the file */ ++ sector_t be_length; /* the size of the extent */ ++ sector_t be_v_offset; /* the starting offset in the volume */ ++ enum exstate4 be_state; /* the state of this extent */ ++ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ ++}; ++ ++/* Shortened extent used by LAYOUTCOMMIT */ ++struct pnfs_block_short_extent { ++ struct list_head bse_node; ++ struct pnfs_deviceid bse_devid; /* STUB - removable??? */ ++ struct block_device *bse_mdev; ++ sector_t bse_f_offset; /* the starting offset in the file */ ++ sector_t bse_length; /* the size of the extent */ ++}; ++ ++static inline void ++INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) ++{ ++ spin_lock_init(&marks->im_lock); ++ INIT_LIST_HEAD(&marks->im_tree.mtt_stub); ++ marks->im_block_size = blocksize; ++ marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, ++ blocksize); ++} ++ ++enum extentclass4 { ++ RW_EXTENT = 0, /* READWRITE and INVAL */ ++ RO_EXTENT = 1, /* READ and NONE */ ++ EXTENT_LISTS = 2, /* number of lists, sizes bl_extents[] */ ++}; ++ ++static inline int choose_list(enum exstate4 state) ++{ ++ if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) ++ return RO_EXTENT; ++ else ++ return RW_EXTENT; ++} ++ ++struct pnfs_block_layout { ++ struct pnfs_layout_hdr bl_layout; ++ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ ++ spinlock_t bl_ext_lock; /* Protects list manipulation */ ++ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ ++ struct list_head bl_commit; /* Needs layout commit */ ++ unsigned int bl_count; /* entries in bl_commit */ ++ sector_t bl_blocksize; /* Server blocksize in sectors */ ++}; ++ ++/* this struct is communicated between: ++ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit ++ */ ++struct bl_layoutupdate_data { ++ struct list_head ranges; /* short extents moved here from bl_commit when encoded */ ++}; ++ ++#define BLK_ID(lo) ((struct block_mount_id 
*)(PNFS_NFS_SERVER(lo)->pnfs_ld_data)) ++ ++static inline struct pnfs_block_layout * ++BLK_LO2EXT(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct pnfs_block_layout, bl_layout); ++} ++ ++static inline struct pnfs_block_layout * ++BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) ++{ ++ return BLK_LO2EXT(lseg->layout); ++} ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes); ++ ++#define BLK_READBUF(p, e, nbytes) do { \ ++ p = blk_overflow(p, e, nbytes); \ ++ if (!p) { \ ++ printk(KERN_WARNING \ ++ "%s: reply buffer overflowed in line %d.\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++} while (0) ++ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (uint64_t)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++#define READ_DEVID(x) COPYMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++#define READ_SECTOR(x) do { \ ++ READ64(tmp); \ ++ if (tmp & 0x1ff) { \ ++ printk(KERN_WARNING \ ++ "%s Value not 512-byte aligned at line %d\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++ (x) = tmp >> 9; \ ++} while (0) ++ ++#define WRITE32(n) do { \ ++ *p++ = htonl(n); \ ++ } while (0) ++#define WRITE64(n) do { \ ++ *p++ = htonl((uint32_t)((n) >> 32)); \ ++ *p++ = htonl((uint32_t)(n)); \ ++} while (0) ++#define WRITEMEM(ptr, nbytes) do { \ ++ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ ++} while (0) ++#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++ ++/* blocklayoutdev.c */ ++struct block_device *nfs4_blkdev_get(dev_t dev); ++int nfs4_blkdev_put(struct block_device *bdev); ++struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist); ++int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr); ++int nfs4_blk_create_block_disk_list(struct list_head *); ++void 
nfs4_blk_destroy_disk_list(struct list_head *); ++/* blocklayoutdm.c */ ++int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *); ++void free_block_dev(struct pnfs_block_dev *bdev); ++/* extents.c */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read); ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages); ++void put_extent(struct pnfs_block_extent *be); ++struct pnfs_block_extent *alloc_extent(void); ++struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be); ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect); ++int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg); ++void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status); ++int add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new); ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length); ++ ++#include ++ ++extern struct pipefs_list bl_device_list; ++extern struct dentry *bl_device_pipe; ++ ++int bl_pipe_init(void); ++void bl_pipe_exit(void); ++ ++#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ ++#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ ++#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ ++#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ ++#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ ++ ++#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c +--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig 2010-09-30 10:17:08.565989000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c 2010-09-30 10:17:08.567989000 -0400 +@@ -0,0 
+1,948 @@ ++/* ++ * linux/fs/nfs/blocklayout/extents.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include "blocklayout.h" ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Bit numbers used in pnfs_inval_tracking.it_tags */ ++#define EXTENT_INITIALIZED 0 ++#define EXTENT_WRITTEN 1 ++#define EXTENT_IN_COMMIT 2 ++#define INTERNAL_EXISTS MY_MAX_TAGS /* set on entries created by _preload_range */ ++#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) /* every tag bit below INTERNAL_EXISTS */ ++ ++/* Returns largest t<=s s.t. 
t%base==0 */ ++static inline sector_t normalize(sector_t s, int base) ++{ ++ sector_t tmp = s; /* Since do_div modifies its argument */ ++ return s - do_div(tmp, base); ++} ++ ++static inline sector_t normalize_up(sector_t s, int base) ++{ ++ return normalize(s + base - 1, base); ++} ++ ++/* Complete stub using list while determine API wanted */ ++ ++/* Returns tags, or negative */ ++static int32_t _find_entry(struct my_tree_t *tree, u64 s) ++{ ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu) enter\n", __func__, s); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) ++ return pos->it_tags & INTERNAL_MASK; ++ else ++ break; ++ } ++ return -ENOENT; ++} ++ ++static inline ++int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag) ++{ ++ int32_t tags; ++ ++ dprintk("%s(%llu, %i) enter\n", __func__, s, tag); ++ s = normalize(s, tree->mtt_step_size); ++ tags = _find_entry(tree, s); ++ if ((tags < 0) || !(tags & (1 << tag))) ++ return 0; ++ else ++ return 1; ++} ++ ++/* Creates entry with tag, or if entry already exists, unions tag to it. ++ * If storage is not NULL, newly created entry will use it. ++ * Returns number of entries added, or negative on error. 
++ */ ++static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag, ++ struct pnfs_inval_tracking *storage) ++{ ++ int found = 0; ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) { ++ found = 1; ++ break; ++ } else ++ break; ++ } ++ if (found) { ++ pos->it_tags |= (1 << tag); ++ return 0; ++ } else { ++ struct pnfs_inval_tracking *new; ++ if (storage) ++ new = storage; ++ else { ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ } ++ new->it_sector = s; ++ new->it_tags = (1 << tag); ++ list_add(&new->it_link, &pos->it_link); ++ return 1; ++ } ++} ++ ++/* XXXX Really want option to not create */ ++/* Over range, unions tag with existing entries, else creates entry with tag */ ++static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length) ++{ ++ u64 i; ++ ++ dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); ++ for (i = normalize(s, tree->mtt_step_size); i < s + length; ++ i += tree->mtt_step_size) ++ if (_add_entry(tree, i, tag, NULL)) ++ return -ENOMEM; ++ return 0; ++} ++ ++/* Ensure that future operations on given range of tree will not malloc */ ++static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length) ++{ ++ u64 start, end, s; ++ int count, i, used = 0, status = -ENOMEM; ++ struct pnfs_inval_tracking **storage; ++ ++ dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); ++ start = normalize(offset, tree->mtt_step_size); ++ end = normalize_up(offset + length, tree->mtt_step_size); ++ count = (int)(end - start) / (int)tree->mtt_step_size; ++ ++ /* Pre-malloc what memory we might need */ ++ storage = kmalloc(sizeof(*storage) * count, GFP_KERNEL); ++ if (!storage) ++ return -ENOMEM; ++ for (i = 0; i < count; i++) { ++ storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), ++ 
GFP_KERNEL); ++ if (!storage[i]) ++ goto out_cleanup; ++ } ++ ++ /* Now need lock - HOW??? */ ++ ++ for (s = start; s < end; s += tree->mtt_step_size) ++ used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); ++ ++ /* Unlock - HOW??? */ ++ status = 0; ++ ++ out_cleanup: ++ for (i = used; i < count; i++) { ++ if (!storage[i]) ++ break; ++ kfree(storage[i]); ++ } ++ kfree(storage); ++ return status; ++} ++ ++static void set_needs_init(sector_t *array, sector_t offset) ++{ ++ sector_t *p = array; ++ ++ dprintk("%s enter\n", __func__); ++ if (!p) ++ return; ++ while (*p < offset) ++ p++; ++ if (*p == offset) ++ return; ++ else if (*p == ~0) { ++ *p++ = offset; ++ *p = ~0; ++ return; ++ } else { ++ sector_t *save = p; ++ dprintk("%s Adding %llu\n", __func__, (u64)offset); ++ while (*p != ~0) ++ p++; ++ p++; ++ memmove(save + 1, save, (char *)p - (char *)save); ++ *save = offset; ++ return; ++ } ++} ++ ++/* We are relying on page lock to serialize this */ ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Assume start, end already sector aligned */ ++static int ++_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag) ++{ ++ struct pnfs_inval_tracking *pos; ++ u64 expect = 0; ++ ++ dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector >= end) ++ continue; ++ if (!expect) { ++ if ((pos->it_sector == end - tree->mtt_step_size) && ++ (pos->it_tags & (1 << tag))) { ++ expect = pos->it_sector - tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ continue; ++ } else { ++ return 0; ++ } ++ } ++ if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) ++ return 0; ++ expect -= tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ } ++ return 0; ++} 
++ ++static int is_range_written(struct pnfs_inval_markings *marks, ++ sector_t start, sector_t end) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Marks sectors in [offset, offset+length) as having been initialized. ++ * All lengths are step-aligned, where step is min(pagesize, blocksize). ++ * Notes where partial block is initialized, and helps prepare it for ++ * complete initialization later. ++ */ ++/* Currently assumes offset is page-aligned */ ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages) ++{ ++ sector_t s, start, end; ++ sector_t *array = NULL; /* Pages to mark, list terminated by ~0 */ ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", ++ __func__, (u64)offset, (u64)length); ++ s = max((sector_t) 3, ++ 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); ++ dprintk("%s set max=%llu\n", __func__, (u64)s); ++ if (pages) { ++ array = kmalloc(s * sizeof(sector_t), GFP_KERNEL); ++ if (!array) ++ goto outerr; ++ array[0] = ~0; ++ } ++ ++ start = normalize(offset, marks->im_block_size); ++ end = normalize_up(offset + length, marks->im_block_size); ++ if (_preload_range(&marks->im_tree, start, end - start)) ++ goto outerr; ++ ++ spin_lock(&marks->im_lock); ++ ++ for (s = normalize_up(start, PAGE_CACHE_SECTORS); ++ s < offset; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s pre-area pages\n", __func__); ++ /* Portion of used block is not initialized */ ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) ++ goto out_unlock; ++ for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); ++ s < end; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s post-area pages\n", __func__); ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ ++ 
spin_unlock(&marks->im_lock); ++ ++ if (pages) { ++ if (array[0] == ~0) { ++ kfree(array); ++ *pages = NULL; ++ } else ++ *pages = array; ++ } ++ return 0; ++ ++ out_unlock: ++ spin_unlock(&marks->im_lock); ++ outerr: ++ if (pages) { ++ kfree(array); ++ *pages = NULL; ++ } ++ return -ENOMEM; ++} ++ ++/* Marks sectors in [offset, offset+length) as having been written to disk. ++ * All lengths should be block aligned. ++ */ ++int mark_written_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length) ++{ ++ int status; ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, ++ (u64)offset, (u64)length); ++ spin_lock(&marks->im_lock); ++ status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); ++ spin_unlock(&marks->im_lock); ++ return status; ++} ++ ++static void print_short_extent(struct pnfs_block_short_extent *be) ++{ ++ dprintk("PRINT SHORT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->bse_length); ++ } ++} ++ ++void print_clist(struct list_head *list, unsigned int count) ++{ ++ struct pnfs_block_short_extent *be; ++ unsigned int i = 0; ++ ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, bse_node) { ++ i++; ++ print_short_extent(be); ++ } ++ if (i != count) ++ dprintk("\n\nExpected %u entries\n\n\n", count); ++ dprintk("****************\n"); ++} ++ ++/* Note: In theory, we should do more checking that devid's match between ++ * old and new, but if they don't, the lists are too corrupt to salvage anyway. 
++ */ ++/* Note this is very similar to add_and_merge_extent */ ++static void add_to_commitlist(struct pnfs_block_layout *bl, ++ struct pnfs_block_short_extent *new) ++{ ++ struct list_head *clist = &bl->bl_commit; ++ struct pnfs_block_short_extent *old, *save; ++ sector_t end = new->bse_f_offset + new->bse_length; ++ ++ dprintk("%s enter\n", __func__); ++ print_short_extent(new); ++ print_clist(clist, bl->bl_count); ++ bl->bl_count++; ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. ++ */ ++ list_for_each_entry_safe(old, save, clist, bse_node) { ++ if (new->bse_f_offset < old->bse_f_offset) ++ break; ++ if (end <= old->bse_f_offset + old->bse_length) { ++ /* Range is already in list */ ++ bl->bl_count--; ++ kfree(new); ++ return; ++ } else if (new->bse_f_offset <= ++ old->bse_f_offset + old->bse_length) { ++ /* new overlaps or abuts existing be */ ++ if (new->bse_mdev == old->bse_mdev) { ++ /* extend new to fully replace old */ ++ new->bse_length += new->bse_f_offset - ++ old->bse_f_offset; ++ new->bse_f_offset = old->bse_f_offset; ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ } ++ /* Note that if we never hit the above break, old will not point to a ++ * valid extent. However, in that case &old->bse_node==list. ++ */ ++ list_add_tail(&new->bse_node, &old->bse_node); ++ /* Scan forward for overlaps. If we find any, extend new and ++ * remove the overlapped extent. 
++ */ ++ old = list_prepare_entry(new, clist, bse_node); ++ list_for_each_entry_safe_continue(old, save, clist, bse_node) { ++ if (end < old->bse_f_offset) ++ break; ++ /* new overlaps or abuts old */ ++ if (new->bse_mdev == old->bse_mdev) { ++ if (end < old->bse_f_offset + old->bse_length) { ++ /* extend new to fully cover old */ ++ end = old->bse_f_offset + old->bse_length; ++ new->bse_length = end - new->bse_f_offset; ++ } ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ dprintk("%s: after merging\n", __func__); ++ print_clist(clist, bl->bl_count); ++} ++ ++/* Note the range described by offset, length is guaranteed to be contained ++ * within be. ++ */ ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length) ++{ ++ sector_t new_end, end = offset + length; ++ struct pnfs_block_short_extent *new; ++ struct pnfs_block_layout *bl = container_of(be->be_inval, ++ struct pnfs_block_layout, ++ bl_inval); ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ mark_written_sectors(be->be_inval, offset, length); ++ /* We want to add the range to commit list, but it must be ++ * block-normalized, and verified that the normalized range has ++ * been entirely written to disk. 
++ */ ++ new->bse_f_offset = offset; ++ offset = normalize(offset, bl->bl_blocksize); ++ if (offset < new->bse_f_offset) { ++ if (is_range_written(be->be_inval, offset, new->bse_f_offset)) ++ new->bse_f_offset = offset; ++ else ++ new->bse_f_offset = offset + bl->bl_blocksize; ++ } ++ new_end = normalize_up(end, bl->bl_blocksize); ++ if (end < new_end) { ++ if (is_range_written(be->be_inval, end, new_end)) ++ end = new_end; ++ else ++ end = new_end - bl->bl_blocksize; ++ } ++ if (end <= new->bse_f_offset) { ++ kfree(new); ++ return 0; ++ } ++ new->bse_length = end - new->bse_f_offset; ++ new->bse_devid = be->be_devid; ++ new->bse_mdev = be->be_mdev; ++ ++ spin_lock(&bl->bl_ext_lock); ++ /* new will be freed, either by add_to_commitlist if it decides not ++ * to use it, or after LAYOUTCOMMIT uses it in the commitlist. ++ */ ++ add_to_commitlist(bl, new); ++ spin_unlock(&bl->bl_ext_lock); ++ return 0; ++} ++ ++static void print_bl_extent(struct pnfs_block_extent *be) ++{ ++ dprintk("PRINT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->be_length); ++ dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); ++ dprintk(" be_state %d\n", be->be_state); ++ } ++} ++ ++static void ++destroy_extent(struct kref *kref) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = container_of(kref, struct pnfs_block_extent, be_refcnt); ++ dprintk("%s be=%p\n", __func__, be); ++ kfree(be); ++} ++ ++void ++put_extent(struct pnfs_block_extent *be) ++{ ++ if (be) { ++ dprintk("%s enter %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_put(&be->be_refcnt, destroy_extent); ++ } ++} ++ ++struct pnfs_block_extent *alloc_extent(void) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL); ++ if (!be) ++ return NULL; ++ INIT_LIST_HEAD(&be->be_node); ++ kref_init(&be->be_refcnt); ++ be->be_inval = NULL; ++ return be; ++} ++ ++struct 
pnfs_block_extent * ++get_extent(struct pnfs_block_extent *be) ++{ ++ if (be) ++ kref_get(&be->be_refcnt); ++ return be; ++} ++ ++void print_elist(struct list_head *list) ++{ ++ struct pnfs_block_extent *be; ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, be_node) { ++ print_bl_extent(be); ++ } ++ dprintk("****************\n"); ++} ++ ++static inline int ++extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) ++{ ++ /* Note this assumes new->be_f_offset >= old->be_f_offset */ ++ return (new->be_state == old->be_state) && ++ ((new->be_state == PNFS_BLOCK_NONE_DATA) || ++ ((new->be_v_offset - old->be_v_offset == ++ new->be_f_offset - old->be_f_offset) && ++ new->be_mdev == old->be_mdev)); ++} ++ ++/* Adds new to appropriate list in bl, modifying new and removing existing ++ * extents as appropriate to deal with overlaps. ++ * ++ * See find_get_extent for list constraints. ++ * ++ * Refcount on new is already set. If end up not using it, or error out, ++ * need to put the reference. ++ * ++ * Lock is held by caller. ++ */ ++int ++add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new) ++{ ++ struct pnfs_block_extent *be, *tmp; ++ sector_t end = new->be_f_offset + new->be_length; ++ struct list_head *list; ++ ++ dprintk("%s enter with be=%p\n", __func__, new); ++ print_bl_extent(new); ++ list = &bl->bl_extents[choose_list(new->be_state)]; ++ print_elist(list); ++ ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. 
++ */ ++ list_for_each_entry_safe_reverse(be, tmp, list, be_node) { ++ if (new->be_f_offset >= be->be_f_offset + be->be_length) ++ break; ++ if (new->be_f_offset >= be->be_f_offset) { ++ if (end <= be->be_f_offset + be->be_length) { ++ /* new is a subset of existing be*/ ++ if (extents_consistent(be, new)) { ++ dprintk("%s: new is subset, ignoring\n", ++ __func__); ++ put_extent(new); ++ return 0; ++ } else { ++ goto out_err; ++ } ++ } else { ++ /* |<-- be -->| ++ * |<-- new -->| */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ new->be_length += new->be_f_offset - ++ be->be_f_offset; ++ new->be_f_offset = be->be_f_offset; ++ new->be_v_offset = be->be_v_offset; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } else if (end >= be->be_f_offset + be->be_length) { ++ /* new extent overlap existing be */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } else if (end > be->be_f_offset) { ++ /* |<-- be -->| ++ *|<-- new -->| */ ++ if (extents_consistent(new, be)) { ++ /* extend new to fully replace be */ ++ new->be_length += be->be_f_offset + be->be_length - ++ new->be_f_offset - new->be_length; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } ++ /* Note that if we never hit the above break, be will not point to a ++ * valid extent. However, in that case &be->be_node==list. ++ */ ++ list_add(&new->be_node, &be->be_node); ++ dprintk("%s: inserting new\n", __func__); ++ print_elist(list); ++ /* STUB - The per-list consistency checks have all been done, ++ * should now check cross-list consistency. ++ */ ++ return 0; ++ ++ out_err: ++ put_extent(new); ++ return -EIO; ++} ++ ++/* Returns extent, or NULL. 
If a second READ extent exists, it is returned ++ * in cow_read, if given. ++ * ++ * The extents are kept in two separate ordered lists, one for READ and NONE, ++ * one for READWRITE and INVALID. Within each list, we assume: ++ * 1. Extents are ordered by file offset. ++ * 2. For any given isect, there is at most one extent that matches. ++ */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read) ++{ ++ struct pnfs_block_extent *be, *cow, *ret; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ cow = ret = NULL; ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret && ++ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent; take a reference on it */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ if (!ret) ++ ret = be; ++ else if (be->be_state != PNFS_BLOCK_READ_DATA) ++ put_extent(be); ++ else ++ cow = be; ++ break; ++ } ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ if (cow_read) ++ *cow_read = cow; ++ print_bl_extent(ret); ++ return ret; ++} ++ ++/* Similar to find_get_extent, but called with lock held, and ignores cow */ ++static struct pnfs_block_extent * ++find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) ++{ ++ struct pnfs_block_extent *be, *ret = NULL; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent; take a reference on it */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ 
atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ ret = be; ++ break; ++ } ++ } ++ } ++ print_bl_extent(ret); ++ return ret; ++} ++ ++int ++encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg) ++{ ++ sector_t start, end; ++ struct pnfs_block_short_extent *lce, *save; ++ unsigned int count = 0; ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct list_head *ranges = &bld->ranges; ++ __be32 *p, *xdr_start; ++ ++ dprintk("%s enter\n", __func__); ++ start = arg->range.offset >> 9; ++ end = start + (arg->range.length >> 9); ++ dprintk("%s set start=%llu, end=%llu\n", ++ __func__, (u64)start, (u64)end); ++ ++ /* BUG - creation of bl_commit is buggy - need to wait for ++ * entire block to be marked WRITTEN before it can be added. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ /* Want to adjust for possible truncate */ ++ /* We now want to adjust argument range */ ++ ++ /* XDR encode the ranges found */ ++ xdr_start = xdr_reserve_space(xdr, 8); ++ if (!xdr_start) ++ goto out; ++ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { ++ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); ++ if (!p) ++ break; ++ WRITE_DEVID(&lce->bse_devid); ++ WRITE64(lce->bse_f_offset << 9); ++ WRITE64(lce->bse_length << 9); ++ WRITE64(0LL); ++ WRITE32(PNFS_BLOCK_READWRITE_DATA); ++ list_del(&lce->bse_node); ++ list_add_tail(&lce->bse_node, ranges); ++ bl->bl_count--; ++ count++; ++ } ++ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); ++ xdr_start[1] = cpu_to_be32(count); ++out: ++ spin_unlock(&bl->bl_ext_lock); ++ dprintk("%s found %i ranges\n", __func__, count); ++ return 0; ++} ++ ++/* Helper function to set_to_rw that initialize a new extent */ ++static void ++_prep_new_extent(struct pnfs_block_extent *new, ++ struct pnfs_block_extent *orig, ++ sector_t offset, sector_t length, int state) ++{ ++ kref_init(&new->be_refcnt); ++ /* don't need to 
INIT_LIST_HEAD(&new->be_node) */ ++ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct pnfs_deviceid)); ++ new->be_mdev = orig->be_mdev; ++ new->be_f_offset = offset; ++ new->be_length = length; ++ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; ++ new->be_state = state; ++ new->be_inval = orig->be_inval; ++} ++ ++/* Tries to merge be with extent in front of it in list. ++ * Frees storage if not used. ++ */ ++static struct pnfs_block_extent * ++_front_merge(struct pnfs_block_extent *be, struct list_head *head, ++ struct pnfs_block_extent *storage) ++{ ++ struct pnfs_block_extent *prev; ++ ++ if (!storage) ++ goto no_merge; ++ if (&be->be_node == head || be->be_node.prev == head) ++ goto no_merge; ++ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); ++ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || ++ !extents_consistent(prev, be)) ++ goto no_merge; ++ _prep_new_extent(storage, prev, prev->be_f_offset, ++ prev->be_length + be->be_length, prev->be_state); ++ list_replace(&prev->be_node, &storage->be_node); ++ put_extent(prev); ++ list_del(&be->be_node); ++ put_extent(be); ++ return storage; ++ ++ no_merge: ++ kfree(storage); ++ return be; ++} ++ ++static u64 ++set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) ++{ ++ u64 rv = offset + length; ++ struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; ++ struct pnfs_block_extent *children[3]; ++ struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; ++ int i = 0, j; ++ ++ dprintk("%s(%llu, %llu)\n", __func__, offset, length); ++ /* Create storage for up to three new extents e1, e2, e3 */ ++ e1 = kmalloc(sizeof(*e1), GFP_KERNEL); ++ e2 = kmalloc(sizeof(*e2), GFP_KERNEL); ++ e3 = kmalloc(sizeof(*e3), GFP_KERNEL); ++ /* BUG - we are ignoring any failure */ ++ if (!e1 || !e2 || !e3) ++ goto out_nosplit; ++ ++ spin_lock(&bl->bl_ext_lock); ++ be = find_get_extent_locked(bl, offset); ++ rv = be->be_f_offset + be->be_length; ++ if (be->be_state != 
PNFS_BLOCK_INVALID_DATA) { ++ spin_unlock(&bl->bl_ext_lock); ++ goto out_nosplit; ++ } ++ /* Add e* to children, bumping e*'s krefs */ ++ if (be->be_f_offset != offset) { ++ _prep_new_extent(e1, be, be->be_f_offset, ++ offset - be->be_f_offset, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e1; ++ print_bl_extent(e1); ++ } else ++ merge1 = e1; ++ _prep_new_extent(e2, be, offset, ++ min(length, be->be_f_offset + be->be_length - offset), ++ PNFS_BLOCK_READWRITE_DATA); ++ children[i++] = e2; ++ print_bl_extent(e2); ++ if (offset + length < be->be_f_offset + be->be_length) { ++ _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, ++ be->be_f_offset + be->be_length - ++ offset - length, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e3; ++ print_bl_extent(e3); ++ } else ++ merge2 = e3; ++ ++ /* Remove be from list, and insert the e* */ ++ /* We don't get refs on e*, since this list is the base reference ++ * set when init'ed. ++ */ ++ if (i < 3) ++ children[i] = NULL; ++ new = children[0]; ++ list_replace(&be->be_node, &new->be_node); ++ put_extent(be); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); ++ for (j = 1; j < i; j++) { ++ old = new; ++ new = children[j]; ++ list_add(&new->be_node, &old->be_node); ++ } ++ if (merge2) { ++ /* This is a HACK, should just create a _back_merge function */ ++ new = list_entry(new->be_node.next, ++ struct pnfs_block_extent, be_node); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ ++ /* Since we removed the base reference above, be is now scheduled for ++ * destruction. 
++ */ ++ put_extent(be); ++ dprintk("%s returns %llu after split\n", __func__, rv); ++ return rv; ++ ++ out_nosplit: ++ kfree(e1); ++ kfree(e2); ++ kfree(e3); ++ dprintk("%s returns %llu without splitting\n", __func__, rv); ++ return rv; ++} ++ ++void ++clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status) ++{ ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct pnfs_block_short_extent *lce, *save; ++ ++ dprintk("%s status %d\n", __func__, status); ++ list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) { ++ if (likely(!status)) { ++ u64 offset = lce->bse_f_offset; ++ u64 end = offset + lce->bse_length; ++ ++ do { ++ offset = set_to_rw(bl, offset, end - offset); ++ } while (offset < end); ++ ++ kfree(lce); ++ } else { ++ spin_lock(&bl->bl_ext_lock); ++ add_to_commitlist(bl, lce); ++ spin_unlock(&bl->bl_ext_lock); ++ } ++ } ++} +diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile +--- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-09-30 10:17:08.524988000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-09-30 10:17:08.525996000 -0400 +@@ -0,0 +1,6 @@ ++# ++# Makefile for the pNFS block layout driver kernel module ++# ++obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o ++blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \ ++ extents.o block-device-discovery-pipe.o +diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h +--- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-09-30 10:17:08.585990000 -0400 +@@ -111,6 +111,13 @@ extern int nfs41_validate_delegation_sta + + #define RCA4_TYPE_MASK_RDATA_DLG 0 + #define RCA4_TYPE_MASK_WDATA_DLG 1 ++#define RCA4_TYPE_MASK_DIR_DLG 2 ++#define RCA4_TYPE_MASK_FILE_LAYOUT 3 ++#define RCA4_TYPE_MASK_BLK_LAYOUT 4 
++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 ++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 + + struct cb_recallanyargs { + struct sockaddr *craa_addr; +@@ -127,6 +134,39 @@ struct cb_recallslotargs { + extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy); + ++struct cb_layoutrecallargs { ++ struct sockaddr *cbl_addr; ++ struct nfs_fh cbl_fh; ++ struct pnfs_layout_range cbl_seg; ++ struct nfs_fsid cbl_fsid; ++ uint32_t cbl_recall_type; ++ uint32_t cbl_layout_type; ++ uint32_t cbl_layoutchanged; ++ nfs4_stateid cbl_stateid; ++}; ++ ++extern unsigned nfs4_callback_layoutrecall( ++ struct cb_layoutrecallargs *args, ++ void *dummy); ++ ++struct cb_devicenotifyitem { ++ uint32_t cbd_notify_type; ++ uint32_t cbd_layout_type; ++ struct pnfs_deviceid cbd_dev_id; ++ uint32_t cbd_immediate; ++}; ++ ++/* XXX: Should be dynamic up to max compound size */ ++#define NFS4_DEV_NOTIFY_MAXENTRIES 10 ++struct cb_devicenotifyargs { ++ struct sockaddr *addr; ++ int ndevs; ++ struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES]; ++}; ++ ++extern unsigned nfs4_callback_devicenotify( ++ struct cb_devicenotifyargs *args, ++ void *dummy); + #endif /* CONFIG_NFS_V4_1 */ + + extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); +diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c +--- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-09-30 10:17:08.591990000 -0400 +@@ -8,10 +8,15 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + #include "nfs4_fs.h" + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include "pnfs.h" + + #ifdef NFS_DEBUG + #define NFSDBG_FACILITY NFSDBG_CALLBACK +@@ -62,16 +67,6 @@ out: + return res->status; + } + +-static int 
(*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) +-{ +-#if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion > 0) +- return nfs41_validate_delegation_stateid; +-#endif +- return nfs4_validate_delegation_stateid; +-} +- +- + __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) + { + struct nfs_client *clp; +@@ -92,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_re + inode = nfs_delegation_find_inode(clp, &args->fh); + if (inode != NULL) { + /* Set up a helper thread to actually return the delegation */ +- switch (nfs_async_inode_return_delegation(inode, &args->stateid, +- nfs_validate_delegation_stateid(clp))) { ++ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; +@@ -116,24 +110,364 @@ out: + + int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { +- if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (delegation == NULL || memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data))) + return 0; + return 1; + } + + #if defined(CONFIG_NFS_V4_1) + ++static bool ++pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo, ++ const nfs4_stateid stateid) ++{ ++ int seqlock; ++ bool res; ++ u32 oldseqid, newseqid; ++ ++ do { ++ seqlock = read_seqbegin(&lo->seqlock); ++ oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid); ++ newseqid = be32_to_cpu(stateid.u.stateid.seqid); ++ res = !memcmp(lo->stateid.u.stateid.other, ++ stateid.u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE); ++ if (res) { /* comparing layout stateids */ ++ if (oldseqid == ~0) ++ res = (newseqid == 1); ++ else ++ res = (newseqid == oldseqid + 1); ++ } else { /* open stateid */ ++ res = !memcmp(lo->stateid.u.data, ++ &zero_stateid, ++ NFS4_STATEID_SIZE); ++ if (res) ++ res = (newseqid == 1); ++ } ++ } while 
(read_seqretry(&lo->seqlock, seqlock)); ++ ++ return res; ++} ++ ++/* ++ * Retrieve an inode based on layout recall parameters ++ * ++ * Note: caller must iput(inode) to dereference the inode. ++ */ ++static struct inode * ++nfs_layoutrecall_find_inode(struct nfs_client *clp, ++ const struct cb_layoutrecallargs *args) ++{ ++ struct nfs_inode *nfsi; ++ struct pnfs_layout_hdr *lo; ++ struct nfs_server *server; ++ struct inode *ino = NULL; ++ ++ dprintk("%s: Begin recall_type=%d clp %p\n", ++ __func__, args->cbl_recall_type, clp); ++ ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry(lo, &clp->cl_layouts, layouts) { ++ nfsi = PNFS_NFS_INODE(lo); ++ if (!nfsi) ++ continue; ++ ++ dprintk("%s: Searching inode=%lu\n", ++ __func__, nfsi->vfs_inode.i_ino); ++ ++ if (args->cbl_recall_type == RETURN_FILE) { ++ if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh)) ++ continue; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ server = NFS_SERVER(&nfsi->vfs_inode); ++ if (server->fsid.major != args->cbl_fsid.major || ++ server->fsid.minor != args->cbl_fsid.minor) ++ continue; ++ } ++ ++ /* Make sure client didn't clean up layout without ++ * telling the server */ ++ if (!has_layout(nfsi)) ++ continue; ++ ++ ino = igrab(&nfsi->vfs_inode); ++ dprintk("%s: Found inode=%p\n", __func__, ino); ++ break; ++ } ++ spin_unlock(&clp->cl_lock); ++ return ino; ++} ++ ++struct recall_layout_threadargs { ++ struct inode *inode; ++ struct nfs_client *clp; ++ struct completion started; ++ struct cb_layoutrecallargs *rl; ++ int result; ++}; ++ ++static int pnfs_recall_layout(void *data) ++{ ++ struct inode *inode, *ino; ++ struct nfs_client *clp; ++ struct cb_layoutrecallargs rl; ++ struct nfs4_layoutreturn *lrp; ++ struct recall_layout_threadargs *args = ++ (struct recall_layout_threadargs *)data; ++ int status = 0; ++ ++ daemonize("nfsv4-layoutreturn"); ++ ++ dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n", ++ __func__, args->rl->cbl_recall_type, ++ args->rl->cbl_fsid.major, 
args->rl->cbl_fsid.minor); ++ ++ clp = args->clp; ++ inode = args->inode; ++ rl = *args->rl; ++ ++ /* support whole file layouts only */ ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ ++ if (rl.cbl_recall_type == RETURN_FILE) { ++ if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout, ++ rl.cbl_stateid)) ++ status = pnfs_return_layout(inode, &rl.cbl_seg, ++ &rl.cbl_stateid, RETURN_FILE, ++ false); ++ else ++ status = cpu_to_be32(NFS4ERR_DELAY); ++ if (status) ++ dprintk("%s RETURN_FILE error: %d\n", __func__, status); ++ else ++ status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ args->result = status; ++ complete(&args->started); ++ goto out; ++ } ++ ++ status = cpu_to_be32(NFS4_OK); ++ args->result = status; ++ complete(&args->started); ++ args = NULL; ++ ++ /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */ ++ while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) { ++ /* FIXME: need to check status on pnfs_return_layout */ ++ pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false); ++ iput(ino); ++ } ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (!lrp) { ++ dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n", ++ __func__); ++ goto out; ++ } ++ ++ /* send final layoutreturn */ ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = rl.cbl_layout_type; ++ lrp->args.return_type = rl.cbl_recall_type; ++ lrp->args.range = rl.cbl_seg; ++ lrp->args.inode = inode; ++ nfs4_proc_layoutreturn(lrp, true); ++ ++out: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ nfs_put_client(clp); ++ module_put_and_exit(0); ++ dprintk("%s: exit status %d\n", __func__, 0); ++ return 0; ++} ++ ++/* ++ * Asynchronous layout recall! 
++ */ ++static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, ++ struct cb_layoutrecallargs *rl) ++{ ++ struct recall_layout_threadargs data = { ++ .clp = clp, ++ .inode = inode, ++ .rl = rl, ++ }; ++ struct task_struct *t; ++ int status = -EAGAIN; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* FIXME: do not allow two concurrent layout recalls */ ++ if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) ++ return status; ++ ++ init_completion(&data.started); ++ __module_get(THIS_MODULE); ++ if (!atomic_inc_not_zero(&clp->cl_count)) ++ goto out_put_no_client; ++ ++ t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout"); ++ if (IS_ERR(t)) { ++ printk(KERN_INFO "NFS: Layout recall callback thread failed " ++ "for client (clientid %08x/%08x)\n", ++ (unsigned)(clp->cl_clientid >> 32), ++ (unsigned)(clp->cl_clientid)); ++ status = PTR_ERR(t); ++ goto out_module_put; ++ } ++ wait_for_completion(&data.started); ++ return data.result; ++out_module_put: ++ nfs_put_client(clp); ++out_put_no_client: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ module_put(THIS_MODULE); ++ return status; ++} ++ ++static int pnfs_recall_all_layouts(struct nfs_client *clp) ++{ ++ struct cb_layoutrecallargs rl; ++ struct inode *inode; ++ int status = 0; ++ ++ rl.cbl_recall_type = RETURN_ALL; ++ rl.cbl_seg.iomode = IOMODE_ANY; ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ ++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, &rl); ++ if (!inode) ++ return status; ++ status = pnfs_async_return_layout(clp, inode, &rl); ++ iput(inode); ++ ++ return status; ++} ++ ++__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ struct inode *inode = NULL; ++ __be32 res; ++ int status; ++ unsigned int num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); ++ clp = 
nfs_find_client(args->cbl_addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->cbl_addr)); ++ goto out; ++ } ++ ++ res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ /* the callback must come from the MDS personality */ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) ++ goto loop; ++ if (args->cbl_recall_type == RETURN_FILE) { ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (inode != NULL) { ++ status = pnfs_async_return_layout(clp, inode, ++ args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++ } else { /* _ALL or _FSID */ ++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (!inode) ++ goto loop; ++ status = pnfs_async_return_layout(clp, inode, args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++loop: ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ ++/* Remove the deviceid(s) from the nfs_client deviceid cache */ ++static __be32 pnfs_devicenotify_client(struct nfs_client *clp, ++ struct cb_devicenotifyargs *args) ++{ ++ uint32_t type; ++ int i; ++ ++ dprintk("%s: --> clp %p\n", __func__, clp); ++ ++ for (i = 0; i < args->ndevs; i++) { ++ struct cb_devicenotifyitem *dev = &args->devs[i]; ++ type = dev->cbd_notify_type; ++ if (type == NOTIFY_DEVICEID4_DELETE && clp->cl_devid_cache) ++ nfs4_delete_device(clp->cl_devid_cache, ++ &dev->cbd_dev_id); ++ else if (type == NOTIFY_DEVICEID4_CHANGE) ++ printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE " ++ "not supported\n", __func__); ++ } ++ return 0; ++} ++ ++__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ __be32 res = 0; ++ unsigned int 
num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = __constant_htonl(NFS4ERR_INVAL); ++ clp = nfs_find_client(args->addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->addr)); ++ goto out; ++ } ++ ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ res = pnfs_devicenotify_client(clp, args); ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { + if (delegation == NULL) + return 0; + +- /* seqid is 4-bytes long */ +- if (((u32 *) &stateid->data)[0] != 0) ++ if (stateid->u.stateid.seqid != 0) + return 0; +- if (memcmp(&delegation->stateid.data[4], &stateid->data[4], +- sizeof(stateid->data)-4)) ++ if (memcmp(&delegation->stateid.u.stateid.other, ++ &stateid->u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE)) + return 0; + + return 1; +@@ -335,13 +669,37 @@ out: + return status; + } + ++static inline bool ++validate_bitmap_values(const unsigned long *mask) ++{ ++ int i; ++ ++ if (*mask == 0) ++ return true; ++ if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_WDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_DIR_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, mask) || ++ test_bit(RCA4_TYPE_MASK_BLK_LAYOUT, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OBJ_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OBJ_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OTHER_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OTHER_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ return false; ++} ++ + __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) + { + struct nfs_client *clp; + __be32 status; + fmode_t flags = 0; + +- status = htonl(NFS4ERR_OP_NOT_IN_SESSION); ++ 
status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->craa_addr, 4); + if (clp == NULL) + goto out; +@@ -349,16 +707,25 @@ __be32 nfs4_callback_recallany(struct cb + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + ++ status = cpu_to_be32(NFS4ERR_INVAL); ++ if (!validate_bitmap_values((const unsigned long *) ++ &args->craa_type_mask)) ++ return status; ++ ++ status = cpu_to_be32(NFS4_OK); + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; ++ if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) ++ &args->craa_type_mask)) ++ if (pnfs_recall_all_layouts(clp) == -EAGAIN) ++ status = cpu_to_be32(NFS4ERR_DELAY); + + if (flags) + nfs_expire_all_delegation_types(clp, flags); +- status = htonl(NFS4_OK); + out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c +--- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-09-30 10:17:08.597991000 -0400 +@@ -22,6 +22,8 @@ + #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + + #if defined(CONFIG_NFS_V4_1) ++#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) ++#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) + #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +@@ -136,7 +138,7 @@ static __be32 decode_stateid(struct xdr_ + p = read_buf(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); +- memcpy(stateid->data, p, 16); ++ memcpy(stateid->u.data, p, 16); + return 0; + } + +@@ -220,6 +222,148 @@ out: + + #if defined(CONFIG_NFS_V4_1) 
+ ++static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_layoutrecallargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ ++ args->cbl_addr = svc_addr(rqstp); ++ p = read_buf(xdr, 4 * sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ ++ args->cbl_layout_type = ntohl(*p++); ++ args->cbl_seg.iomode = ntohl(*p++); ++ args->cbl_layoutchanged = ntohl(*p++); ++ args->cbl_recall_type = ntohl(*p++); ++ ++ if (likely(args->cbl_recall_type == RETURN_FILE)) { ++ status = decode_fh(xdr, &args->cbl_fh); ++ if (unlikely(status != 0)) ++ goto out; ++ ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_seg.offset); ++ p = xdr_decode_hyper(p, &args->cbl_seg.length); ++ status = decode_stateid(xdr, &args->cbl_stateid); ++ if (unlikely(status != 0)) ++ goto out; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_fsid.major); ++ p = xdr_decode_hyper(p, &args->cbl_fsid.minor); ++ } ++ dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d " ++ "fsid %llx-%llx fhsize %d\n", __func__, ++ args->cbl_layout_type, args->cbl_seg.iomode, ++ args->cbl_layoutchanged, args->cbl_recall_type, ++ args->cbl_fsid.major, args->cbl_fsid.minor, ++ args->cbl_fh.size); ++out: ++ dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); ++ return status; ++} ++ ++static ++__be32 decode_devicenotify_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_devicenotifyargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ u32 tmp; ++ int n, i; ++ args->ndevs = 0; ++ ++ args->addr = svc_addr(rqstp); ++ ++ /* Num of device notifications */ ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) 
{ ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ n = ntohl(*p++); ++ if (n <= 0) ++ goto out; ++ ++ /* XXX: need to possibly return error in this case */ ++ if (n > NFS4_DEV_NOTIFY_MAXENTRIES) { ++ dprintk("%s: Processing (%d) notifications out of (%d)\n", ++ __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n); ++ n = NFS4_DEV_NOTIFY_MAXENTRIES; ++ } ++ ++ /* Decode each dev notification */ ++ for (i = 0; i < n; i++) { ++ struct cb_devicenotifyitem *dev = &args->devs[i]; ++ ++ p = read_buf(xdr, (4 * sizeof(uint32_t)) ++ + NFS4_PNFS_DEVICEID4_SIZE); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* bitmap size */ ++ if (tmp != 1) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_notify_type = ntohl(*p++); ++ if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* opaque size */ ++ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 8)) || ++ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 4))) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_layout_type = ntohl(*p++); ++ memcpy(dev->cbd_dev_id.data, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ ++ if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_DELAY); ++ goto out; ++ } ++ dev->cbd_immediate = ntohl(*p++); ++ } else { ++ dev->cbd_immediate = 0; ++ } ++ ++ args->ndevs++; ++ ++ dprintk("%s: type %d layout 0x%x immediate %d\n", ++ __func__, dev->cbd_notify_type, dev->cbd_layout_type, ++ dev->cbd_immediate); ++ } ++out: ++ dprintk("%s: status %d ndevs %d\n", ++ __func__, ntohl(status), args->ndevs); ++ return status; ++} ++ + static __be32 decode_sessionid(struct xdr_stream 
*xdr, + struct nfs4_sessionid *sid) + { +@@ -574,11 +718,11 @@ preprocess_nfs41_op(int nop, unsigned in + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: ++ case OP_CB_LAYOUTRECALL: ++ case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + +- case OP_CB_LAYOUTRECALL: +- case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: +@@ -739,6 +883,18 @@ static struct callback_op callback_ops[] + .res_maxsize = CB_OP_RECALL_RES_MAXSZ, + }, + #if defined(CONFIG_NFS_V4_1) ++ [OP_CB_LAYOUTRECALL] = { ++ .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, ++ .decode_args = ++ (callback_decode_arg_t)decode_layoutrecall_args, ++ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, ++ }, ++ [OP_CB_NOTIFY_DEVICEID] = { ++ .process_op = (callback_process_op_t)nfs4_callback_devicenotify, ++ .decode_args = ++ (callback_decode_arg_t)decode_devicenotify_args, ++ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, ++ }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, +diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c +--- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-09-30 10:15:17.723710000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/client.c 2010-09-30 10:17:08.603991000 -0400 +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + + #include + +@@ -48,6 +49,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_CLIENT + +@@ -150,11 +152,14 @@ static struct nfs_client *nfs_alloc_clie + clp->cl_boot_time = CURRENT_TIME; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; ++ clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + #endif + cred = rpc_lookup_machine_cred(); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; +- ++#if 
defined(CONFIG_NFS_V4_1) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++#endif + nfs_fscache_get_client_cookie(clp); + + return clp; +@@ -178,7 +183,7 @@ static void nfs4_clear_client_minor_vers + clp->cl_session = NULL; + } + +- clp->cl_call_sync = _nfs4_call_sync; ++ clp->cl_mvops = nfs_v4_minor_ops[0]; + #endif /* CONFIG_NFS_V4_1 */ + } + +@@ -188,7 +193,7 @@ static void nfs4_clear_client_minor_vers + static void nfs4_destroy_callback(struct nfs_client *clp) + { + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) +- nfs_callback_down(clp->cl_minorversion); ++ nfs_callback_down(clp->cl_mvops->minor_version); + } + + static void nfs4_shutdown_client(struct nfs_client *clp) +@@ -251,6 +256,7 @@ void nfs_put_client(struct nfs_client *c + nfs_free_client(clp); + } + } ++EXPORT_SYMBOL(nfs_put_client); + + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + /* +@@ -343,7 +349,7 @@ static int nfs_sockaddr_match_ipaddr(con + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. 
/*
 * Initialize the pNFS layout driver and setup pNFS related parameters
 *
 * Does nothing unless the client has a session and its exchange flags
 * include EXCHGID4_FLAG_USE_PNFS_MDS.  Compiles to an empty function when
 * CONFIG_NFS_V4_1 is not set.
 */
static void nfs4_init_pnfs(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo)
{
#if defined(CONFIG_NFS_V4_1)
	struct nfs_client *client = server->nfs_client;

	if (!nfs4_has_session(client))
		return;
	if (!(client->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
		return;

	server->pnfs_blksize = fsinfo->blksize;
	set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
	pnfs_set_ds_iosize(server);
#endif /* CONFIG_NFS_V4_1 */
}
nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); +@@ -938,7 +973,7 @@ static int nfs_probe_fsinfo(struct nfs_s + if (error < 0) + goto out_error; + +- nfs_server_set_fsinfo(server, &fsinfo); ++ nfs_server_set_fsinfo(server, mntfh, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { +@@ -1016,6 +1051,7 @@ void nfs_free_server(struct nfs_server * + { + dprintk("--> nfs_free_server()\n"); + ++ nfs4_uninit_pnfs(server); + spin_lock(&nfs_client_lock); + list_del(&server->client_link); + list_del(&server->master_link); +@@ -1126,7 +1162,7 @@ static int nfs4_init_callback(struct nfs + return error; + } + +- error = nfs_callback_up(clp->cl_minorversion, ++ error = nfs_callback_up(clp->cl_mvops->minor_version, + clp->cl_rpcclient->cl_xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", +@@ -1143,10 +1179,8 @@ static int nfs4_init_callback(struct nfs + */ + static int nfs4_init_client_minor_version(struct nfs_client *clp) + { +- clp->cl_call_sync = _nfs4_call_sync; +- + #if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion) { ++ if (clp->cl_mvops->minor_version) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. +@@ -1158,7 +1192,13 @@ static int nfs4_init_client_minor_versio + return -ENOMEM; + + clp->cl_session = session; +- clp->cl_call_sync = _nfs4_call_sync_session; ++ /* ++ * The create session reply races with the server back ++ * channel probe. 
Mark the client NFS_CS_SESSION_INITING ++ * so that the client back channel can find the ++ * nfs_client struct ++ */ ++ clp->cl_cons_state = NFS_CS_SESSION_INITING; + } + #endif /* CONFIG_NFS_V4_1 */ + +@@ -1216,7 +1256,7 @@ error: + /* + * Set up an NFS4 client + */ +-static int nfs4_set_client(struct nfs_server *server, ++int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, +@@ -1259,6 +1299,7 @@ error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; + } ++EXPORT_SYMBOL(nfs4_set_client); + + + /* +@@ -1448,7 +1489,7 @@ struct nfs_server *nfs4_create_referral_ + data->authflavor, + parent_server->client->cl_xprt->prot, + parent_server->client->cl_timeout, +- parent_client->cl_minorversion); ++ parent_client->cl_mvops->minor_version); + if (error < 0) + goto error; + +diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c +--- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-09-30 10:17:08.822996000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-09-30 10:17:08.824003000 -0400 +@@ -0,0 +1,292 @@ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); ++static void bl_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops bl_upcall_ops = { ++ .upcall = bl_pipe_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = bl_pipe_destroy_msg, ++}; ++ ++bl_comm_t *bl_comm_global; ++ ++int ++nfsd_bl_start(void) ++{ ++ bl_comm_t *bl_comm = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ 
dprintk("%s: starting pipe\n", __func__); ++ if (bl_comm_global) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL); ++ if (!bl_comm) { ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ /* FIXME: rename to "spnfs_block" */ ++ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm, ++ &bl_upcall_ops, 0); ++ if (IS_ERR(bl_comm->pipe_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ mutex_init(&bl_comm->lock); ++ mutex_init(&bl_comm->pipe_lock); ++ init_waitqueue_head(&bl_comm->pipe_wq); ++ ++ bl_comm_global = bl_comm; ++ return 0; ++err: ++ rpc_put_mount(); ++ kfree(bl_comm); ++ return rc; ++} ++ ++void ++nfsd_bl_stop(void) ++{ ++ bl_comm_t *c = bl_comm_global; ++ ++ dprintk("%s: stopping pipe\n", __func__); ++ if (!c) ++ return; ++ rpc_unlink(c->pipe_dentry); ++ rpc_put_mount(); ++ bl_comm_global = NULL; ++ kfree(c); ++} ++ ++static ssize_t ++bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst, ++ size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied, ++ left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ ++ return mlen; ++} ++ ++static ssize_t ++bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ bl_comm_t *bc = (bl_comm_t *)rpci->private; ++ bl_comm_msg_t *im = &bc->msg; ++ int ret; ++ bl_comm_res_t *res; ++ ++ ++ if (mlen == 0) { ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ im->msg_res = NULL; ++ wake_up(&bc->pipe_wq); ++ return -EFAULT; ++ } ++ ++ if ((res = kmalloc(mlen, 
GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(res, src, mlen)) { ++ kfree(res); ++ return -EFAULT; ++ } ++ ++ mutex_lock(&bc->pipe_lock); ++ ++ ret = mlen; ++ im->msg_status = res->res_status; ++ im->msg_res = res; ++ ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++ return ret; ++} ++ ++static void ++bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ bl_comm_msg_t *im = msg->data; ++ bl_comm_t *bc = container_of(im, struct bl_comm, msg); ++ ++ if (msg->errno >= 0) ++ return; ++ ++ mutex_lock(&bc->pipe_lock); ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++} ++ ++int ++bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res) ++{ ++ struct rpc_pipe_msg msg; ++ DECLARE_WAITQUEUE(wq, current); ++ int rval = 1; ++ bl_comm_msg_t *m = &bc->msg; ++ ++ if (bc == NULL) { ++ dprintk("%s: No pNFS block daemon available\n", __func__); ++ return 1; ++ } ++ ++ mutex_lock(&bc->lock); ++ mutex_lock(&bc->pipe_lock); ++ ++ memcpy(m, upmsg, sizeof (*m)); ++ ++ memset(&msg, 0, sizeof (msg)); ++ msg.data = m; ++ msg.len = sizeof (*m); ++ ++ add_wait_queue(&bc->pipe_wq, &wq); ++ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&bc->pipe_lock); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ mutex_lock(&bc->pipe_lock); ++ ++ if (m->msg_status == PNFS_BLOCK_SUCCESS) { ++ *res = m->msg_res; ++ rval = 0; ++ } else ++ rval = 1; ++ ++out: ++ mutex_unlock(&bc->pipe_lock); ++ mutex_unlock(&bc->lock); ++ return rval; ++} ++ ++static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len, ++ loff_t *offset) ++{ ++ int cmd, ++ rc; ++ bl_comm_t *bc = bl_comm_global; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int))) ++ return 
-EFAULT; ++ switch (cmd) { ++ case PNFS_BLOCK_CTL_STOP: ++ msg.msg_type = PNFS_UPCALL_MSG_STOP; ++ (void) bl_upcall(bc, &msg, &res); ++ kfree(res); ++ nfsd_bl_stop(); ++ break; ++ ++ case PNFS_BLOCK_CTL_START: ++ rc = nfsd_bl_start(); ++ if (rc != 0) ++ return rc; ++ break; ++ ++ case PNFS_BLOCK_CTL_VERS: ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bc, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ kfree(res); ++ break; ++ ++ default: ++ dprintk("%s: unknown ctl command %d\n", __func__, cmd); ++ break; ++ } ++ return len; ++} ++ ++static struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++ ++/* ++ * bl_init_proc -- set up proc interfaces ++ * ++ * Creating a pnfs_block directory isn't really required at this point ++ * since we've only got a single node in that directory. If the need for ++ * more nodes doesn't present itself shortly this code should revert ++ * to a single top level node. McNeal 11-Aug-2008. ++ */ ++int ++bl_init_proc(void) ++{ ++ struct proc_dir_entry *e; ++ ++ e = proc_mkdir("fs/pnfs_block", NULL); ++ if (!e) ++ return -ENOMEM; ++ ++ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL); ++ if (!e) ++ return -ENOMEM; ++ e->proc_fops = &ctl_ops; ++ ++ return 0; ++} ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c +--- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-09-30 10:17:08.827998000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-09-30 10:17:08.829998000 -0400 +@@ -0,0 +1,1672 @@ ++/* ++ * bl_ops.c ++ * spNFS ++ * ++ * Created by Rick McNeal on 4/1/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. ++ * ++ */ ++ ++/* ++ * Block layout operations. ++ * ++ * These functions, with the exception of pnfs_block_enabled, are assigned to ++ * the super block s_export_op structure. 
++ */ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) ++ ++#define BL_LAYOUT_HASH_BITS 4 ++#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS) ++#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1) ++#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256) ++ ++#define bl_layout_hashval(id) \ ++ ((id) & BL_LAYOUT_HASH_MASK) ++ ++#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len) ++#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len) ++#define _2SECTS(v) ((v) >> 9) ++ ++#ifndef READ32 ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++(x) = (u64)ntohl(*p++) << 32; \ ++(x) |= ntohl(*p++); \ ++} while (0) ++#endif ++ ++ ++typedef enum {True, False} boolean_t; ++/* ---- block layoutget and commit structure ---- */ ++typedef struct bl_layout_rec { ++ struct list_head blr_hash, ++ blr_layouts; ++ dev_t blr_rdev; ++ struct inode *blr_inode; ++ int blr_recalled; // debug ++ u64 blr_orig_size, ++ blr_commit_size, ++ blr_ext_size; ++ spinlock_t blr_lock; // Protects blr_layouts ++} bl_layout_rec_t; ++ ++static struct list_head layout_hash; ++static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE]; ++static spinlock_t layout_hashtbl_lock; ++ ++/* ---- prototypes ---- */ ++static boolean_t device_slice(dev_t devid); ++static boolean_t device_dm(dev_t devid); ++static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **); ++static bl_layout_rec_t *layout_inode_find(struct inode *i); ++static void layout_inode_del(struct inode *i); ++static char *map_state2name(enum pnfs_block_extent_state4 s); ++static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type); ++static void bld_free(pnfs_blocklayout_devinfo_t *bld); ++static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head 
*volumes, ++ dev_t devid, int local_index); ++static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes, ++ dev_t devid, int my_loc, int idx); ++static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg); ++struct list_head *layout_cache_iter(bl_layout_rec_t *r, ++ struct list_head *bl_possible, struct nfsd4_layout_seg *seg); ++static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h); ++static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h); ++static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg); ++static void print_bll(pnfs_blocklayout_layout_t *b, char *); ++static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r, ++ struct list_head *h, struct nfsd4_layout_seg *seg); ++static inline void bll_collapse(bl_layout_rec_t *r, ++ pnfs_blocklayout_layout_t *c); ++static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len, ++ enum bl_cache_state state, struct list_head *h); ++static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b, ++ enum bl_cache_state c, struct list_head *h); ++static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s); ++static void extents_setup(struct fiemap_extent_info *fei); ++static void extents_count(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_process(struct fiemap_extent_info *fei, ++ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev, ++ pnfs_blocklayout_layout_t *b); ++static void extents_cleanup(struct fiemap_extent_info *fei); ++ ++void ++nfsd_bl_init(void) ++{ ++ int i; ++ dprintk("%s loaded\n", __func__); ++ ++ spin_lock_init(&layout_hashtbl_lock); ++ INIT_LIST_HEAD(&layout_hash); ++ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++) ++ 
INIT_LIST_HEAD(&layout_hashtbl[i]); ++ bl_init_proc(); ++} ++ ++/* ++ * pnfs_block_enabled -- check to see if this file system should be export as ++ * block pnfs ++ */ ++int ++pnfs_block_enabled(struct inode *inode, int ex_flags) ++{ ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ static int bl_comm_once = 0; ++ ++ dprintk("--> %s\n", __func__); ++ /* ++ * FIXME: Figure out method to determine if this file system should ++ * be exported. The following areas need to be checked. ++ * (1) Validate that this file system was exported as a pNFS ++ * block-layout ++ * (2) Has there been successful communication with the ++ * volume daemon? ++ */ ++ /* Check #1 */ ++#ifdef notyet ++ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) { ++ dprintk("%s: pnfs_block not set in export\n", __func__); ++ return 0; ++ } ++#endif ++ ++ /* Check #1 */ ++ if (!bl_comm_once) { ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ if (msg.u.msg_vers != res->u.vers) { ++ dprintk("%s: vers mismatch, kernel != daemon\n", ++ __func__); ++ kfree(res); ++ return 0; ++ } ++ } ++ bl_comm_once = 1; ++ ++ kfree(res); ++ ++ dprintk("<-- %s okay\n", __func__); ++ return 1; ++} ++ ++int ++bl_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_BLOCK_VOLUME; ++} ++ ++int ++bl_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_devid = sb->s_dev; ++ res->gd_verf = 1; ++ res->gd_cookie = 1; ++ return 0; ++} ++ ++static int ++bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld_slice_p, ++ *bld_simple_p, ++ *bld; ++ int status = -EIO, ++ location = 0; ++ struct list_head volumes; ++ ++ dprintk("--> %s\n", __func__); ++ 
INIT_LIST_HEAD(&volumes); ++ ++ bld_simple_p = bld_simple(&volumes, devid->devid, ++ location++); ++ if (!bld_simple_p) ++ goto out; ++ bld_slice_p = bld_slice(&volumes, devid->devid, location++, ++ bld_simple_p->bld_index_loc); ++ ++ if (!bld_slice_p) ++ goto out; ++ ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE) ++ kfree(bld->u.simple.bld_sig); ++ bld_free(bld); ++ } ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++static int ++bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ int status = -EIO, // default to error ++ i, ++ location = 0; ++ struct list_head volumes; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ INIT_LIST_HEAD(&volumes); ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMGET; ++ msg.u.msg_dev = devid->devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: upcall for DMGET failed\n", __func__); ++ goto out; ++ } ++ ++ /* ++ * Don't use bld_alloc() here. If used this will be the first volume ++ * type added to the list whereas the protocol requires it to be the ++ * last. 
++ */ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ goto out; ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE; ++ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes; ++ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL; ++ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__, ++ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL); ++ ++ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes * ++ sizeof (int), GFP_KERNEL); ++ if (!bld->u.stripe.bld_stripe_indexs) ++ goto out; ++ ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) { ++ dev_t dev; ++ pnfs_blocklayout_devinfo_t *bldp; ++ ++ dev = MKDEV(res->u.stripe.devs[i].major, ++ res->u.stripe.devs[i].minor); ++ if (dev == 0) ++ goto out; ++ ++ bldp = bld_simple(&volumes, dev, location++); ++ if (!bldp) { ++ dprintk("%s: bld_simple failed\n", __func__); ++ goto out; ++ } ++ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc); ++ ++ if (!bldp) { ++ dprintk("%s: bld_slice failed\n", __func__); ++ goto out; ++ } ++ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc; ++ ++ } ++ list_add_tail(&bld->bld_list, &volumes); ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SLICE: ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ // No memory to release for these ++ break; ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ kfree(bld->u.simple.bld_sig); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ kfree(bld->u.stripe.bld_stripe_indexs); ++ break; ++ } ++ bld_free(bld); ++ } ++ kfree(res); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++/* ++ * bl_getdeviceinfo -- determine device tree for requested devid ++ */ ++int ++bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct 
nfsd4_pnfs_deviceid *devid) ++{ ++ if (device_slice(devid->devid) == True) ++ return bl_getdeviceinfo_slice(sb, xdr, devid); ++ else if (device_dm(devid->devid) == True) ++ return bl_getdeviceinfo_dm(sb, xdr, devid); ++ return -EINVAL; ++} ++ ++enum nfsstat4 ++bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ pnfs_blocklayout_layout_t *b; ++ bl_layout_rec_t *r; ++ struct list_head bl_possible, ++ *bl_candidates = NULL; ++ boolean_t del_on_error = False; ++ int adj; ++ enum nfsstat4 nfserr = NFS4_OK; ++ ++ dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n", ++ __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset), ++ _2SECTS(res->lg_seg.length), res->lg_seg.iomode); ++ ++ if (res->lg_seg.length == 0) { ++ printk("%s: request length of 0, error condition\n", __func__); ++ return NFS4ERR_BADLAYOUT; ++ } ++ ++ /* ++ * Adjust the length as required per spec. ++ * - First case is were the length is set to (u64)-1. Cheap means to ++ * define the end of the file. ++ * - Second case is were the I/O mode is read-only, but the request is ++ * past the end of the file so the request needs to be trimed. ++ */ ++ if ((res->lg_seg.length == NFS4_MAX_UINT64) || ++ (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) && ++ (res->lg_seg.iomode == IOMODE_READ))) ++ res->lg_seg.length = i->i_size - res->lg_seg.offset; ++ ++ adj = (res->lg_seg.offset & 511) ? 
res->lg_seg.offset & 511 : 0; ++ res->lg_seg.offset -= adj; ++ res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511; ++ ++ if (res->lg_seg.iomode != IOMODE_READ) ++ if (i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE, ++ res->lg_seg.offset, res->lg_seg.length)) ++ return NFS4ERR_IO; ++ ++ INIT_LIST_HEAD(&bl_possible); ++ ++ if ((r = layout_inode_find(i)) == NULL) { ++ if (layout_inode_add(i, &r) == False) { ++ printk("%s: layout_inode_add failed\n", __func__); ++ return NFS4ERR_IO; ++ } ++ del_on_error = True; ++ } ++ BUG_ON(!r); ++ ++ spin_lock(&r->blr_lock); ++ ++ if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) { ++ /* ++ * This will send LAYOUTTRYAGAIN error to the client. ++ */ ++ dprintk("%s: layout_cache_fill_from() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res->lg_return_on_close = 1; ++ res->lg_seg.length = 0; ++ ++ bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg); ++ if (!bl_candidates) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ layout_cache_merge(r, bl_candidates); ++ if (layout_cache_update(r, bl_candidates)) { ++ /* ---- Failed to allocate memory. 
---- */ ++ dprintk("%s: layout_cache_update() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ nfserr = blocklayout_encode_layout(xdr, bl_candidates); ++ if (nfserr) ++ dprintk("%s: layoutget xdr routine failed\n", __func__); ++ ++layoutget_cleanup: ++ if (bl_candidates) { ++ while (!list_empty(bl_candidates)) { ++ b = list_entry(bl_candidates->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ } ++ ++ spin_unlock(&r->blr_lock); ++ if (unlikely(nfserr)) { ++ if (del_on_error == True) ++ layout_inode_del(i); ++ res->lg_seg.length = 0; ++ res->lg_seg.offset = 0; ++ } ++ ++ dprintk("<-- %s (rval %u)\n", __func__, nfserr); ++ return nfserr; ++} ++ ++/* ++ * bl_layoutcommit -- commit changes, especially size, to file systemj ++ * ++ * Currently this routine isn't called and everything is handled within ++ * nfsd4_layoutcommit(). By not calling this routine the server doesn't ++ * handle a partial return, a set of extents, of the layout. The extents ++ * are decoded here, but nothing is done with them. If this routine is ++ * be called the interface must change to pass the 'dentry' pointer such ++ * that notify_change() can be called. 
++ */ ++int ++bl_layoutcommit(struct inode *i, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ bl_layout_rec_t *r; ++ int status = 0; ++ u64 lw_plus; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ r = layout_inode_find(i); ++ if (r) { ++ lw_plus = args->lc_last_wr + 1; ++ if (args->lc_newoffset) { ++ dprintk(" lc_last_wr %Lu\n", lw_plus); ++ if (r->blr_orig_size < lw_plus) { ++ r->blr_orig_size = lw_plus; ++ res->lc_size_chg = 1; ++ res->lc_newsize = lw_plus; ++ } ++ } ++ ++ if (args->lc_up_len) { ++ int extents, ++ i; ++ struct pnfs_blocklayout_layout *b; ++ __be32 *p = args->lc_up_layout; ++ ++ /* ++ * Client is returning a set of extents which ++ * should/could be used to update the file system. ++ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08 ++ */ ++ READ32(extents); ++ dprintk(" Client returning %d extents: data size %d\n", ++ extents, args->lc_up_len); ++ b = kmalloc(sizeof (struct pnfs_blocklayout_layout) * ++ extents, GFP_KERNEL); ++ if (b) { ++ for (i = 0; i < extents; i++) { ++ READ64(b[i].bll_vol_id.sbid); ++ READ64(b[i].bll_vol_id.devid); ++ READ64(b[i].bll_foff); ++ READ64(b[i].bll_len); ++ READ64(b[i].bll_soff); ++ READ32(b[i].bll_es); ++ dprintk(" %d: foff %Lu, len %Lu, soff %Lu " ++ "state %s\n", ++ i, _2SECTS(b[i].bll_foff), ++ _2SECTS(b[i].bll_len), ++ _2SECTS(b[i].bll_soff), ++ map_state2name(b[i].bll_es)); ++ } ++ kfree(b); ++ } else { ++ status = -ENOMEM; ++ } ++ } ++ } else ++ dprintk("%s: Unexpected commit to inode %p\n", __func__, i); ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutreturn(struct inode *i, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ int status = 0; ++ bl_layout_rec_t *r; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ ++ r = layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); ++ layout_cache_del(r, &args->lr_seg); ++ 
spin_unlock(&r->blr_lock); ++ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n", ++ r->blr_ext_size, i->i_size, r->blr_orig_size); ++ } ++ ++ layout_inode_del(i); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ bl_layout_rec_t *r; ++ pnfs_blocklayout_layout_t *b; ++ u64 adj; ++ ++ dprintk("--> %s\n", __func__); ++ BUG_ON(!len); ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n", ++ inode->i_sb->s_dev, inode->i_ino, ++ _2SECTS(offset), _2SECTS(len)); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* ++ * XXX figure out how to get a sb since there's no ++ * inode ptr ++ */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++restart: ++ r = layout_inode_find(inode); ++ if (r && len && !r->blr_recalled) { ++ spin_lock(&r->blr_lock); ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (!r->blr_recalled && !b->bll_recalled && ++ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) { ++ b->bll_recalled = 1; ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = 0; ++ lr.cbl_seg.length = NFS4_MAX_UINT64; ++ r->blr_recalled = 1; ++ dprintk(" FULL LAYOUTRECALL\n"); ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ ++ /* ++ * Currently there are only two cases where the ++ * layout is being returned. ++ * (1) Someone is issuing a NFS_WRITE operation ++ * to this layout. ++ * (2) The file has been truncated which means ++ * the layout is immediately made invalid. ++ * In both cases the client must write any ++ * uncommitted modifications to the server via ++ * NFS_WRITE. 
++ */ ++ lr.cbl_layoutchanged = 1; ++ ++ /* ++ * Need to drop the lock because we'll get a ++ * layoutreturn which will block waiting for ++ * the lock. The request will come in on the ++ * same thread which will cause a deadlock. ++ */ ++ spin_unlock(&r->blr_lock); ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ adj = MIN(b->bll_len - (offset - b->bll_foff), ++ len); ++ offset += adj; ++ len -= adj; ++ if (!len) { ++ spin_lock(&r->blr_lock); ++ break; ++ } ++ /* ++ * Since layoutreturn will have been called we ++ * can't assume blr_layouts is still valid, ++ * so restart. ++ */ ++ goto restart; ++ } ++ } ++ spin_unlock(&r->blr_lock); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++/* ++ * []------------------------------------------------------------------[] ++ * | Support functions from here on down. | ++ * []------------------------------------------------------------------[] ++ */ ++ ++/* ++ * bld_simple -- given a dev_t build a simple volume structure ++ * ++ * Simple volume contains the device signature and offset to that data in ++ * the storage volume. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_simple(struct list_head *volumes, dev_t devid, int local_index) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSIG; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to get signature information\n", __func__); ++ goto error; ++ } ++ ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE); ++ if (!bld) ++ return NULL; ++ ++ bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset; ++ bld->u.simple.bld_sig_len = res->u.sig.len; ++ bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL); ++ if (!bld->u.simple.bld_sig) ++ goto error; ++ ++ memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len); ++ kfree(res); ++ return bld; ++ ++error: ++ if (bld) ++ bld_free(bld); ++ if (res) ++ kfree(res); ++ dprintk("%s: error in bld_simple\n", __func__); ++ return NULL; ++} ++ ++/* ++ * bld_slice -- given a dev_t build a slice volume structure ++ * ++ * A slice volume contains the length of the slice/partition and its offset ++ * from the beginning of the storage volume. There's also a reference to ++ * the "simple" volume which contains this slice. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE); ++ if (!bld) ++ return NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Upcall to get slice info failed\n"); ++ bld_free(bld); ++ return NULL; ++ } ++ ++ bld->bld_devid.devid = devid; ++ bld->bld_index_loc = my_loc; ++ bld->u.slice.bld_start = res->u.slice.start * 512LL; ++ bld->u.slice.bld_len = res->u.slice.length * 512LL; ++ bld->u.slice.bld_index = simple_loc; ++ ++ dprintk("%s: start %Lu, len %Lu\n", __func__, ++ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL); ++ ++ kfree(res); ++ dprintk("<-- %s (rval %p)\n", __func__, bld); ++ return bld; ++} ++ ++static int ++layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!list_empty(&r->blr_layouts)) ++ if (layout_cache_fill_from_list(r, h, seg) == False) ++ return -EIO; ++ ++ /* ++ * This deals with two conditions. ++ * (1) When blr_layouts is empty we need to create the first entry ++ * (2) When the range requested falls past the end of any current ++ * layout the residual must be taken care of. 
++ */ ++ if (seg->length) { ++ n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h); ++ if (!n) ++ return -ENOMEM; ++ dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len)); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++struct list_head * ++layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n = NULL; ++ struct list_head *bl_candidates = NULL; ++ struct fiemap_extent_info fei; ++ struct inode *i; ++ dev_t dev; ++ ++ dev = r->blr_rdev; ++ i = r->blr_inode; ++ ++ dprintk("--> %s\n", __func__); ++ bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL); ++ if (!bl_candidates) ++ return NULL; ++ INIT_LIST_HEAD(bl_candidates); ++ extents_setup(&fei); ++ ++ list_for_each_entry(b, bl_possible, bll_list) { ++ if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ extents_count(&fei, i, b->bll_foff, b->bll_len); ++ if (fei.fi_extents_mapped) { ++ ++ /* ++ * Common case here. Got a range which has ++ * extents. Now get those extents and process ++ * them into pNFS extents. ++ */ ++ if (extents_get(&fei, i, b->bll_foff, ++ b->bll_len) == False) ++ goto cleanup; ++ if (extents_process(&fei, bl_candidates, ++ seg, dev, b) == False) ++ goto cleanup; ++ extents_cleanup(&fei); ++ ++ } else if (seg->iomode == IOMODE_READ) { ++ ++ /* ++ * Found a hole in a file while reading. No ++ * problem, just create a pNFS extent for the ++ * range and let the client know there's no ++ * backing store. ++ */ ++ n = bll_alloc(b->bll_foff, b->bll_len, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += b->bll_len; ++ } else { ++ ++ /* ++ * There's a problem here. Since the iomode ++ * is read/write fallocate should have allocated ++ * any necessary storage for the given range. 
++ */ ++ dprintk(" Extent count for RW is 0\n"); ++ goto cleanup; ++ } ++ ++ } else { ++ n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates); ++ seg->length += n->bll_len; ++ } ++ ++ if (r->blr_ext_size < (b->bll_foff + b->bll_len)) ++ r->blr_ext_size = b->bll_foff + b->bll_len; ++ } ++ ++ while (!list_empty(bl_possible)) { ++ b = list_entry(bl_possible->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ ++ b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout, ++ bll_list); ++ seg->offset = b->bll_foff; ++ dprintk("<-- %s okay\n", __func__); ++ return bl_candidates; ++ ++cleanup: ++ extents_cleanup(&fei); ++ if (bl_candidates) ++ kfree(bl_candidates); ++ dprintk("<-- %s, error occurred\n", __func__); ++ return NULL; ++} ++ ++/* ++ * layout_cache_merge -- collapse layouts which make up a contiguous range. ++ */ ++static void ++layout_cache_merge(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *p; ++ ++ dprintk("--> %s\n", __func__); ++restart: ++ p = NULL; ++ list_for_each_entry(b, h, bll_list) { ++ if (p && (BLL_S_END(p) == b->bll_soff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ /* ++ * We've got a condidate. 
++ */ ++#ifdef too_verbose ++ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), ++ _2SECTS(p->bll_foff), _2SECTS(p->bll_len), ++ _2SECTS(b->bll_soff)); ++#endif ++ ++ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE) ++ p->bll_cache_state = BLOCK_LAYOUT_UPDATE; ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else if (p && (BLL_F_END(p) == b->bll_foff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es == PNFS_BLOCK_NONE_DATA)) { ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else ++ p = b; ++ } ++ dprintk("<-- %s\n", __func__); ++} ++ ++static int ++layout_cache_update(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *c, ++ *n; ++ boolean_t status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ if (list_empty(&r->blr_layouts)) { ++ /* ---- Just add entries and return ---- */ ++ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev, ++ r->blr_inode->i_ino); ++ list_for_each_entry(b, h, bll_list) { ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, ++ &r->blr_layouts); ++ if (!c) { ++ status = -ENOMEM; ++ break; ++ } ++ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), c->bll_es); ++ } ++ return status; ++ } ++ ++ list_for_each_entry(b, h, bll_list) { ++ BUG_ON(!b->bll_vol_id.devid); ++ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) { ++ boolean_t found = False; ++ list_for_each_entry(c, &r->blr_layouts, bll_list) { ++ if ((b->bll_soff >= c->bll_soff) && ++ (b->bll_soff < BLL_S_END(c)) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ u64 u; ++ ++ if ((b->bll_foff < c->bll_foff) || ++ (b->bll_foff > BLL_F_END(c))) ++ BUG(); ++ ++ u = BLL_S_END(b) - BLL_S_END(c); ++ /* ++ * The updated cache entry has to be ++ * different than the current. ++ * Otherwise the cache state for 'b' ++ * should be BLOCK_LAYOUT_CACHE. 
++ */ ++ BUG_ON(BLL_S_END(b) < BLL_S_END(c)); ++ ++ dprintk(" " ++ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n", ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), ++ _2SECTS(c->bll_len + u)); ++ c->bll_len += u; ++ bll_collapse(r, c); ++ found = True; ++ break; ++ } ++ } ++ ++ if (found == False) { ++ dprintk(" ERROR Expected to find" ++ " %Lu(f):%Lu(l):%Lu(s), but didn't\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_for_each_entry(c, &r->blr_layouts, bll_list) ++ print_bll(c, "Cached"); ++ BUG(); ++ } ++ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ c = list_first_entry(&r->blr_layouts, ++ struct pnfs_blocklayout_layout, bll_list); ++ if (b->bll_foff < c->bll_foff) { ++ /* ++ * Special case where new entry is before ++ * first cached entry. ++ */ ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL); ++ list_add(&c->bll_list, &r->blr_layouts); ++ dprintk(" new entry at head of list at %Lu, " ++ "len %Lu\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len)); ++ } else { ++ list_for_each_entry(c, &r->blr_layouts, ++ bll_list) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, ++ bll_list); ++ /* ++ * This is ugly, but can't think of ++ * another way to examine this case. ++ * Consider the following. Need to ++ * add an entry which starts at 40 ++ * and the cache has the following ++ * entries: ++ * Start Length ++ * 10 5 ++ * 30 5 ++ * 50 5 ++ * So, need to look and see if the new ++ * entry starts after the current ++ * cache, but before the next one. ++ * There's a catch in that the next ++ * entry might not be valid as it's ++ * really just a pointer to the list ++ * head. 
++ */ ++ if (((b->bll_foff >= ++ BLL_F_END(c)) && ++ (c->bll_list.next == &r->blr_layouts)) || ++ ((b->bll_foff >= ++ BLL_F_END(c)) && ++ (b->bll_foff < n->bll_foff))) { ++ ++ n = bll_alloc_dup(b, ++ BLOCK_LAYOUT_CACHE, NULL); ++ dprintk(" adding new %Lu:%Lu" ++ " after %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len), ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len)); ++ list_add(&n->bll_list, ++ &c->bll_list); ++ break; ++ } ++ } ++ } ++ } ++ } ++ dprintk("<-- %s\n", __func__); ++ return status; ++} ++ ++static void ++layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in) ++{ ++ struct pnfs_blocklayout_layout *b, ++ *n; ++ u64 len; ++ struct nfsd4_layout_seg seg = *seg_in; ++ ++ dprintk("--> %s\n", __func__); ++ if (seg.length == NFS4_MAX_UINT64) { ++ r->blr_recalled = 0; ++ dprintk(" Fast return of all layouts\n"); ++ while (!list_empty(&r->blr_layouts)) { ++ b = list_entry(r->blr_layouts.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ dprintk(" foff %Lu, len %Lu, soff %Lu\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ dprintk("<-- %s\n", __func__); ++ return; ++ } ++ ++restart: ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg.offset == b->bll_foff) { ++ /* ++ * This handle the following three cases: ++ * (1) return layout matches entire cache layout ++ * (2) return layout matches beginning portion of cache ++ * (3) return layout matches entire cache layout and ++ * into next entry. Varies from #1 in end case. 
++ */ ++ dprintk(" match on offsets, %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length)); ++ len = MIN(seg.length, b->bll_len); ++ b->bll_foff += len; ++ b->bll_soff += len; ++ b->bll_len -= len; ++ seg.length -= len; ++ seg.offset += len; ++ if (!b->bll_len) { ++ list_del(&b->bll_list); ++ kfree(b); ++ dprintk(" removing cache line\n"); ++ if (!seg.length) { ++ dprintk(" also finished\n"); ++ goto complete; ++ } ++ /* ++ * Since 'b' was freed we can't continue at the ++ * next entry which is referenced as ++ * b->bll_list.next by the list_for_each_entry ++ * macro. Need to restart the loop. ++ * TODO: Think about creating a dummy 'b' which ++ * would keep list_for_each_entry() happy. ++ */ ++ goto restart; ++ } ++ if (!seg.length) { ++ dprintk(" finished, but cache line not" ++ "empty\n"); ++ goto complete; ++ } ++ } else if ((seg.offset >= b->bll_foff) && ++ (seg.offset < BLL_F_END(b))) { ++ /* ++ * layout being returned is within this cache line. ++ */ ++ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ BUG_ON(!seg.length); ++ if ((seg.offset + seg.length) >= BLL_F_END(b)) { ++ /* ++ * Layout returned starts in the middle of ++ * cache entry and just need to trim back ++ * cache to shorter length. ++ */ ++ dprintk(" trim back cache line\n"); ++ len = seg.offset - b->bll_foff; ++ seg.offset += b->bll_len - len; ++ seg.length -= b->bll_len - len; ++ b->bll_len = len; ++ if (!seg.length) ++ return; ++ } else { ++ /* ++ * Need to split current cache layout because ++ * chunk is being removed from the middle. 
++ */ ++ dprintk(" split cache line\n"); ++ len = seg.offset + seg.length; ++ n = bll_alloc(len, ++ (b->bll_foff + b->bll_len) - len, ++ BLOCK_LAYOUT_CACHE, NULL); ++ n->bll_soff = b->bll_soff + (len - b->bll_foff); ++ list_add(&n->bll_list, &b->bll_list); ++ b->bll_len = seg.offset - b->bll_foff; ++ return; ++ } ++ } ++ } ++complete: ++ if (list_empty(&r->blr_layouts)) ++ r->blr_recalled = 0; ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++ * layout_cache_fill_from_list -- fills from cache list ++ * ++ * NOTE: This routine was only separated out from layout_cache_file_from() ++ * to reduce the indentation level which makes the code easier to read. ++ */ ++static inline boolean_t ++layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n; ++ enum pnfs_block_extent_state4 s; ++ ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg->offset < b->bll_foff) { ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, b->bll_foff - seg->offset), ++ BLOCK_LAYOUT_NEW, NULL); ++ if (!n) ++ return False; ++ ++ list_add(&n->bll_list, h->prev); ++ dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ ++ if ((seg->offset >= b->bll_foff) && ++ (seg->offset < BLL_F_END(b))) { ++ if (layout_conflict(b, seg->iomode, &s) == False) { ++ dprintk(" CONFLICT FOUND: " ++ "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), b->bll_es, ++ seg->iomode); ++ return False; ++ } ++ n = bll_alloc(seg->offset, ++ MIN(seg->length, BLL_F_END(b) - seg->offset), ++ BLOCK_LAYOUT_CACHE, h); ++ if (!n) ++ return False; ++ ++ dprintk(" CACHE hit: Found %Lu(f):%Lu(l): " ++ "in %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++
_2SECTS(b->bll_soff), b->bll_es); ++ n->bll_soff = b->bll_soff + seg->offset - b->bll_foff; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ n->bll_es = s; ++ seg->offset += n->bll_len; ++ seg->length -= n->bll_len; ++ if (!seg->length) ++ break; ++ } ++ } ++ return True; ++} ++ ++static u64 ++bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length, ++ dev_t dev) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates); ++ if (!n) ++ return 0; ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ ++ return n->bll_len; ++} ++ ++static void ++extents_setup(struct fiemap_extent_info *fei) ++{ ++ fei->fi_extents_start = NULL; ++} ++ ++/* ++ * extents_count -- Determine the number of extents for a given range. ++ * ++ * No need to call set_fs() here because the function ++ * doesn't use copy_to_user() if it's only counting ++ * the number of extents needed. ++ */ ++static void ++extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len)); ++ fei->fi_flags = FIEMAP_FLAG_SYNC; ++ fei->fi_extents_max = 0; ++ fei->fi_extents_start = NULL; ++ fei->fi_extents_mapped = 0; ++ i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1); ++} ++ ++/* ++ * extents_get -- Get list of extents for range ++ * ++ * extents_count() must have been called before this routine such that ++ * fi_extents_mapped is known. ++ */ ++static boolean_t ++extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) ++{ ++ int m_space, ++ rval; ++ struct fiemap_extent *fe; ++ mm_segment_t old_fs = get_fs(); ++ ++ /* ++ * Now malloc the correct amount of space ++ * needed. It's possible for the file to have changed ++ * between calls which would require more space for ++ * the extents.
If that occurs the last extent will ++ * not have FIEMAP_EXTENT_LAST set and the error will ++ * be caught in extents_process(). ++ */ ++ m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent); ++ fe = kmalloc(m_space, GFP_KERNEL); ++ if (!fe) ++ return False; ++ memset(fe, 0, m_space); ++ ++ fei->fi_extents_max = fei->fi_extents_mapped; ++ fei->fi_extents_mapped = 0; ++ fei->fi_extents_start = fe; ++ ++ set_fs(KERNEL_DS); ++ rval = i->i_op->fiemap(i, fei, foff, len + ++ (1 << i->i_sb->s_blocksize_bits) - 1); ++ set_fs(old_fs); ++ ++ if (rval || !fei->fi_extents_mapped) { ++ dprintk(" No extents. Wanted %d, got %d\n", ++ fei->fi_extents_max, fei->fi_extents_mapped); ++ kfree(fe); ++ fei->fi_extents_start = NULL; ++ return False; ++ } else ++ return True; ++} ++ ++/* ++ * extents_process -- runs through the extent returned from the file system and ++ * creates block layout entries. ++ */ ++static boolean_t ++extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates, ++ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b) ++{ ++ struct fiemap_extent *fep, ++ *fep_last = NULL; ++ int i; ++ pnfs_blocklayout_layout_t *n; ++ u64 last_end, ++ rval; ++ ++ dprintk("--> %s\n", __func__); ++ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped; ++ i++, fep++) { ++ ++ BUG_ON(!fep->fe_physical); ++ /* ++ * Deal with corner cases of hoel-y files. ++ */ ++ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) != ++ fep->fe_logical)) { ++ ++ /* ++ * If the last extent doesn't end logically ++ * at the beginning of the current we've got ++ * hole and need to create a pNFS extent. 
++ */ ++ dprintk(" Got a hole at %Ld:%Ld \n", ++ _2SECTS(fep_last->fe_logical), ++ _2SECTS(fep_last->fe_length)); ++ last_end = fep_last->fe_logical + fep_last->fe_length; ++ rval = bll_alloc_holey(bl_candidates, last_end, ++ fep->fe_logical - last_end, dev); ++ if (!rval) ++ return False; ++ seg->length += rval; ++ } ++ ++ n = bll_alloc(fep->fe_logical, fep->fe_length, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ if (unlikely(n == NULL)) { ++ dprintk("%s: bll_alloc failed\n", __func__); ++ return False; ++ } ++ ++ n->bll_soff = fep->fe_physical; ++ n->bll_es = seg->iomode == IOMODE_READ ? ++ PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += fep->fe_length; ++ print_bll(n, "New extent"); ++ fep_last = fep; ++ } ++ dprintk("<-- %s (i=%d)\n", __func__, i); ++ ++ return True; ++} ++ ++static void ++extents_cleanup(struct fiemap_extent_info *fei) ++{ ++ if (fei->fi_extents_start) { ++ kfree(fei->fi_extents_start); ++ fei->fi_extents_start = NULL; ++ } ++} ++ ++/* ++ * device_slice -- check to see if device is a slice or DM ++ */ ++static boolean_t ++device_slice(dev_t devid) ++{ ++ struct block_device *bd = open_by_devnum(devid, FMODE_READ); ++ boolean_t rval = False; ++ ++ if (bd) { ++ if (bd->bd_disk->minors > 1) ++ rval = True; ++ blkdev_put(bd, FMODE_READ); ++ } ++ return rval; ++} ++ ++/* ++ * device_dm -- check to see if device is a Device Mapper volume. 
++ * ++ * Returns 1 for DM or 0 if not ++ */ ++static boolean_t ++device_dm(dev_t devid) ++{ ++ boolean_t rval = False; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMCHK; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Failed upcall to check on DM status\n"); ++ } else if (res->u.dm_vol) { ++ rval = True; ++ dprintk("Device is DM volume\n"); ++ } else ++ dprintk("Device is not DM volume\n"); ++ kfree(res); ++ ++ return rval; ++} ++ ++static boolean_t ++layout_inode_add(struct inode *i, bl_layout_rec_t **p) ++{ ++ bl_layout_rec_t *r = NULL; ++ ++ if (!i->i_op->fiemap || !i->i_op->fallocate) { ++ printk("pNFS: file system doesn't support required fiemap or " ++ "fallocate methods\n"); ++ return False; ++ } ++ ++ r = kmalloc(sizeof (*r), GFP_KERNEL); ++ if (!r) ++ goto error; ++ ++ r->blr_rdev = i->i_sb->s_dev; ++ r->blr_inode = i; ++ r->blr_orig_size = i->i_size; ++ r->blr_ext_size = 0; ++ r->blr_recalled = 0; ++ INIT_LIST_HEAD(&r->blr_layouts); ++ spin_lock_init(&r->blr_lock); ++ spin_lock(&layout_hashtbl_lock); ++ list_add_tail(&r->blr_hash, &layout_hash); ++ spin_unlock(&layout_hashtbl_lock); ++ *p = r; ++ return True; ++ ++error: ++ if (r) ++ kfree(r); ++ return False; ++} ++ ++static bl_layout_rec_t * ++__layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ if (!list_empty(&layout_hash)) { ++ list_for_each_entry(r, &layout_hash, blr_hash) { ++ if ((r->blr_inode->i_ino == i->i_ino) && ++ (r->blr_rdev == i->i_sb->s_dev)) { ++ return r; ++ } ++ } ++ } ++ return NULL; ++} ++ ++static bl_layout_rec_t * ++layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ spin_unlock(&layout_hashtbl_lock); ++ ++ return r; ++} ++ ++static void ++layout_inode_del(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock);
++ if (list_empty(&r->blr_layouts)) { ++ list_del(&r->blr_hash); ++ spin_unlock(&r->blr_lock); ++ kfree(r); ++ } else { ++ spin_unlock(&r->blr_lock); ++ } ++ } else { ++ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n", ++ __func__, i->i_sb->s_dev, i->i_ino); ++ } ++ spin_unlock(&layout_hashtbl_lock); ++} ++ ++/* ++ * map_state2name -- converts state in ascii string. ++ * ++ * Used for debug messages only. ++ */ ++static char * ++map_state2name(enum pnfs_block_extent_state4 s) ++{ ++ switch (s) { ++ case PNFS_BLOCK_READWRITE_DATA: return " RW"; ++ case PNFS_BLOCK_READ_DATA: return " RO"; ++ case PNFS_BLOCK_INVALID_DATA: return "INVALID"; ++ case PNFS_BLOCK_NONE_DATA: return " NONE"; ++ default: ++ BUG(); ++ } ++} ++ ++static pnfs_blocklayout_devinfo_t * ++bld_alloc(struct list_head *volumes, int type) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ return NULL; ++ ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = type; ++ list_add_tail(&bld->bld_list, volumes); ++ ++ return bld; ++} ++ ++static void ++bld_free(pnfs_blocklayout_devinfo_t *bld) ++{ ++ list_del(&bld->bld_list); ++ kfree(bld); ++} ++ ++static void ++print_bll(pnfs_blocklayout_layout_t *b, char *text) ++{ ++ dprintk(" BLL: %s\n", text); ++ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len), ++ map_state2name(b->bll_es)); ++} ++ ++static inline void ++bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c) ++{ ++ pnfs_blocklayout_layout_t *n; ++ int dbg_count = 0; ++ u64 endpoint; ++ ++ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA); ++ while (c->bll_list.next != &r->blr_layouts) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ endpoint = BLL_S_END(c); ++ if ((n->bll_soff >= c->bll_soff) && ++ (n->bll_soff < endpoint)) { ++ if (endpoint < BLL_S_END(n)) { ++ /* ++ * The following is possible. 
++ * ++ * ++ * Existing: +---+ +---+ ++ * New: +-----------------------+ ++ * The client request merge entries together ++ * but didn't require picking up all of the ++ * last entry. So, we still need to delete ++ * the last entry and add the remaining space ++ * to the new entry. ++ */ ++ c->bll_len += BLL_S_END(n) - endpoint; ++ } ++ dbg_count++; ++ list_del(&n->bll_list); ++ kfree(n); ++ } else { ++ break; ++ } ++ } ++ /* ---- Debug only, remove before integration ---- */ ++ if (dbg_count) ++ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n", ++ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c))); ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = kmalloc(sizeof (*n), GFP_KERNEL); ++ if (n) { ++ memset(n, 0, sizeof (*n)); ++ n->bll_foff = offset; ++ n->bll_len = len; ++ n->bll_cache_state = state; ++ if (h) ++ list_add_tail(&n->bll_list, h); ++ } ++ return n; ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c, ++ struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = bll_alloc(b->bll_foff, b->bll_len, c, h); ++ if (n) { ++ n->bll_es = b->bll_es; ++ n->bll_soff = b->bll_soff; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ } ++ return n; ++} ++ ++static inline boolean_t ++layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s) ++{ ++ /* ---- Normal case ---- */ ++ *s = b->bll_es; ++ ++ switch (b->bll_es) { ++ case PNFS_BLOCK_READWRITE_DATA: ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_READ_DATA; ++ /* ---- Any use is permitted. ---- */ ++ break; ++ case PNFS_BLOCK_READ_DATA: ++ /* ---- Committed as read only data. 
---- */ ++ if (iomode == IOMODE_RW) ++ return False; ++ break; ++ case PNFS_BLOCK_INVALID_DATA: ++ /* ---- Blocks have been allocated, but not initialized ---- */ ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_NONE_DATA; ++ break; ++ case PNFS_BLOCK_NONE_DATA: ++ /* ---- Hole-y file. No backing store avail. ---- */ ++ if (iomode != IOMODE_READ) ++ return False; ++ break; ++ default: ++ BUG(); ++ } ++ return True; ++} ++ ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c +--- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-09-30 10:15:17.729711000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-09-30 10:17:08.609991000 -0400 +@@ -104,7 +104,8 @@ again: + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; +- if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) ++ if (memcmp(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)) != 0) + continue; + get_nfs_open_context(ctx); + spin_unlock(&inode->i_lock); +@@ -133,8 +134,8 @@ void nfs_inode_reclaim_delegation(struct + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; +@@ -187,8 +188,9 @@ static struct nfs_delegation *nfs_detach + if (delegation == NULL) + goto nomatch; + spin_lock(&delegation->lock); +- if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (stateid != NULL && memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data)) != 0) + goto nomatch_unlock; + list_del_rcu(&delegation->super_list); + 
delegation->inode = NULL; +@@ -216,8 +218,8 @@ int nfs_inode_set_delegation(struct inod + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + delegation->change_attr = nfsi->change_attr; +@@ -471,9 +473,7 @@ void nfs_expire_unreferenced_delegations + /* + * Asynchronous delegation recall! + */ +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)) ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) + { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; +@@ -481,7 +481,7 @@ int nfs_async_inode_return_delegation(st + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + +- if (!validate_stateid(delegation, stateid)) { ++ if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { + rcu_read_unlock(); + return -ENOENT; + } +@@ -562,7 +562,8 @@ int nfs4_copy_delegation_stateid(nfs4_st + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) { +- memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); ++ memcpy(dst->u.data, delegation->stateid.u.data, ++ sizeof(dst->u.data)); + ret = 1; + } + rcu_read_unlock(); +diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h +--- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-09-30 10:17:08.615000000 -0400 +@@ -34,9 +34,7 @@ enum { + int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct 
nfs_openres *res); + void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); + int nfs_inode_return_delegation(struct inode *inode); +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)); ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); + void nfs_inode_return_delegation_noreclaim(struct inode *inode); + + struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); +diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c +--- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-09-30 10:15:18.314726000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-09-30 10:17:08.834999000 -0400 +@@ -17,11 +17,19 @@ + #include + #include + ++#include ++#if defined(CONFIG_SPNFS) ++#include ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif ++#endif + #include + #include + + #include "nfsd.h" + #include "nfsfh.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_EXPORT + +@@ -352,6 +360,40 @@ static int svc_export_upcall(struct cach + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); + } + ++#if defined(CONFIG_PNFSD) ++static struct pnfsd_cb_operations pnfsd_cb_op = { ++ .cb_layout_recall = nfsd_layout_recall_cb, ++ .cb_device_notify = nfsd_device_notify_cb, ++ ++ .cb_get_state = nfs4_pnfs_cb_get_state, ++ .cb_change_state = nfs4_pnfs_cb_change_state, ++}; ++ ++#if defined(CONFIG_SPNFS) ++static struct pnfs_export_operations spnfs_export_ops = { ++ .layout_type = spnfs_layout_type, ++ .get_device_info = spnfs_getdeviceinfo, ++ .get_device_iter = spnfs_getdeviceiter, ++ .layout_get = spnfs_layoutget, ++ .layout_return = spnfs_layoutreturn, ++}; ++ ++static struct pnfs_export_operations spnfs_ds_export_ops = { ++ .get_state = spnfs_get_state, ++}; ++ ++#if 
defined(CONFIG_SPNFS_BLOCK) ++static struct pnfs_export_operations bl_export_ops = { ++ .layout_type = bl_layout_type, ++ .get_device_info = bl_getdeviceinfo, ++ .get_device_iter = bl_getdeviceiter, ++ .layout_get = bl_layoutget, ++ .layout_return = bl_layoutreturn, ++}; ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_SPNFS */ ++#endif /* CONFIG_PNFSD */ ++ + static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); + static struct svc_export *svc_export_lookup(struct svc_export *); +@@ -395,6 +437,47 @@ static int check_export(struct inode *in + return -EINVAL; + } + ++#if !defined(CONFIG_SPNFS) ++ if (inode->i_sb->s_pnfs_op && ++ (!inode->i_sb->s_pnfs_op->layout_type || ++ !inode->i_sb->s_pnfs_op->get_device_info || ++ !inode->i_sb->s_pnfs_op->layout_get)) { ++ dprintk("exp_export: export of invalid fs pnfs export ops.\n"); ++ return -EINVAL; ++ } ++#endif /* CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ if (!inode->i_sb->s_pnfs_op) ++ pnfsd_lexp_init(inode); ++ return 0; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, *flags)) { ++ dprintk("set pnfs block export structure... \n"); ++ inode->i_sb->s_pnfs_op = &bl_export_ops; ++ } else ++#endif /* CONFIG_SPNFS_BLOCK */ ++ /* ++ * spnfs_enabled() indicates we're an MDS. ++ * XXX Better to check an export time option as well. ++ */ ++ if (spnfs_enabled()) { ++ dprintk("set spnfs export structure...\n"); ++ inode->i_sb->s_pnfs_op = &spnfs_export_ops; ++ } else { ++ dprintk("%s spnfs not in use\n", __func__); ++ ++ /* ++ * get_state is needed if we're a DS using spnfs. ++ * XXX Better to check an export time option instead. 
++ */ ++ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops; ++ } ++#endif /* CONFIG_SPNFS */ ++ + return 0; + + } +@@ -586,6 +669,8 @@ static int svc_export_parse(struct cache + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } ++ } else if (strcmp(buf, "pnfs") == 0) { ++ exp.ex_pnfs = 1; + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else +@@ -660,6 +745,8 @@ static int svc_export_show(struct seq_fi + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } ++ if (exp->ex_pnfs) ++ seq_puts(m, ",pnfs"); + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); +@@ -687,6 +774,7 @@ static void svc_export_init(struct cache + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; ++ new->ex_pnfs = 0; + } + + static void export_update(struct cache_head *cnew, struct cache_head *citem) +@@ -699,6 +787,7 @@ static void export_update(struct cache_h + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; ++ new->ex_pnfs = item->ex_pnfs; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_pathname = item->ex_pathname; +@@ -1635,8 +1724,17 @@ nfsd_export_init(void) + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); +- if (rv) ++ if (rv) { + cache_unregister(&svc_export_cache); ++ goto out; ++ } ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = THIS_MODULE; ++ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ ++out: + return rv; + + } +@@ -1664,6 +1762,12 @@ nfsd_export_shutdown(void) + + exp_writelock(); + ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = NULL; ++ pnfsd_cb_ctl.cb_op = NULL; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); + svcauth_unix_purge(); +diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig 
linux-2.6.34.noarch/fs/nfs/direct.c +--- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-09-30 10:17:08.620991000 -0400 +@@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea + .rpc_release = nfs_direct_read_release, + }; + ++static long nfs_direct_read_execute(struct nfs_read_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ nfs_fattr_init(&data->fattr); ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ NFS_PROTO(inode)->read_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct read call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. 
If nfs_readdata_alloc() or get_user_pages() fails, +@@ -283,7 +315,6 @@ static ssize_t nfs_direct_read_schedule_ + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -343,26 +374,9 @@ static ssize_t nfs_direct_read_schedule_ + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; +- nfs_fattr_init(&data->fattr); +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct read call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_read_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +@@ -448,12 +462,15 @@ static void nfs_direct_free_writedata(st + } + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg); ++ + static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) + { + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; +@@ -487,25 +504,7 @@ static void nfs_direct_write_reschedule( + * Reuse data->task; data->args should not have changed + * since the original request was sent. 
+ */ +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- /* +- * We're called via an RPC callback, so BKL is already held. +- */ +- task = rpc_run_task(&task_setup_data); +- if (!IS_ERR(task)) +- rpc_put_task(task); +- +- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- data->args.count, +- (unsigned long long)data->args.offset); ++ nfs_direct_write_execute(data, &task_setup_data, &msg); + } + + if (put_dreq(dreq)) +@@ -548,10 +547,31 @@ static const struct rpc_call_ops nfs_com + .rpc_release = nfs_direct_commit_release, + }; + ++static long nfs_direct_commit_execute(struct nfs_direct_req *dreq, ++ struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct rpc_task *task; ++ ++ NFS_PROTO(data->inode)->commit_setup(data, msg); ++ ++ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ ++ dreq->commit_data = NULL; ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ return 0; ++} ++ + static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) + { + struct nfs_write_data *data = dreq->commit_data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +@@ -579,16 +599,7 @@ static void nfs_direct_commit_schedule(s + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- NFS_PROTO(data->inode)->commit_setup(data, &msg); +- +- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ +- dreq->commit_data = NULL; +- +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +- +- task = rpc_run_task(&task_setup_data); +- if 
(!IS_ERR(task)) +- rpc_put_task(task); ++ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg); + } + + static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +@@ -690,6 +701,36 @@ static const struct rpc_call_ops nfs_wri + .rpc_release = nfs_direct_write_release, + }; + ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ NFS_PROTO(inode)->write_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. 
If nfs_writedata_alloc() or get_user_pages() fails, +@@ -705,7 +746,6 @@ static ssize_t nfs_direct_write_schedule + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -771,24 +811,8 @@ static ssize_t nfs_direct_write_schedule + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct write call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_write_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig +--- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-09-30 10:17:08.815000000 -0400 +@@ -79,3 +79,52 @@ config NFSD_V4 + available from http://linux-nfs.org/. + + If unsure, say N. ++ ++config PNFSD ++ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)" ++ depends on NFSD_V4 && EXPERIMENTAL ++ select EXPORTFS_FILE_LAYOUT ++ help ++ This option enables support for the parallel NFS features of the ++ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) ++ in the kernel's NFS server. ++ ++ Unless you're an NFS developer, say N. 
++ ++config PNFSD_LOCAL_EXPORT ++ bool "Enable pNFS support for exporting local filesystems for debugging purposes" ++ depends on PNFSD ++ help ++ Say Y here if you want your pNFS server to export local file systems ++ over the files layout type. With this option the MDS (metadata ++ server) functions also as a single DS (data server). This is mostly ++ useful for development and debugging purposes. ++ ++ If unsure, say N. ++ ++config SPNFS ++ bool "Provide spNFS server support (EXPERIMENTAL)" ++ depends on PNFSD ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS server support. ++ ++ If unsure, say N. ++ ++config SPNFS_LAYOUTSEGMENTS ++ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)" ++ depends on SPNFS ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS to be able to return layout segments. ++ ++ If unsure, say N. ++ ++config SPNFS_BLOCK ++ bool "Provide Block Layout server support (EXPERIMENTAL)" ++ depends on SPNFS ++ select EXPORTFS_BLOCK_LAYOUT ++ help ++ Say Y here if you want spNFS block layout support ++ ++ If unsure, say N. 
+diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile +--- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-09-30 10:17:08.820000000 -0400 +@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs + nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o + nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o ++nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o ++nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o ++nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o ++nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-09-30 10:15:18.320728000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-09-30 10:17:08.841998000 -0400 +@@ -40,7 +40,6 @@ + + #define NFSPROC4_CB_NULL 0 + #define NFSPROC4_CB_COMPOUND 1 +-#define NFS4_STATEID_SIZE 16 + + /* Index of predefined Linux callback client operations */ + +@@ -48,11 +47,17 @@ enum { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, ++#if defined(CONFIG_PNFSD) ++ NFSPROC4_CLNT_CB_LAYOUT, ++ NFSPROC4_CLNT_CB_DEVICE, ++#endif + }; + + enum nfs_cb_opnum4 { + OP_CB_RECALL = 4, ++ OP_CB_LAYOUT = 5, + OP_CB_SEQUENCE = 11, ++ OP_CB_DEVICE = 14, + }; + + #define NFS4_MAXTAGLEN 20 +@@ -78,6 +83,19 @@ enum nfs_cb_opnum4 { + #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) ++#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 3 + \ ++ enc_nfs4_fh_sz + 4) ++#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) ++#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 6) ++#define NFS4_dec_cb_device_sz 
(cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) + + /* + * Generic encode routines from fs/nfs/nfs4xdr.c +@@ -94,6 +112,10 @@ xdr_writemem(__be32 *p, const void *ptr, + } + + #define WRITE32(n) *p++ = htonl(n) ++#define WRITE64(n) do { \ ++ *p++ = htonl((u32)((n) >> 32)); \ ++ *p++ = htonl((u32)(n)); \ ++} while (0) + #define WRITEMEM(ptr,nbytes) do { \ + p = xdr_writemem(p, ptr, nbytes); \ + } while (0) +@@ -204,6 +226,16 @@ nfs_cb_stat_to_errno(int stat) + */ + + static void ++encode_stateid(struct xdr_stream *xdr, stateid_t *sid) ++{ ++ __be32 *p; ++ ++ RESERVE_SPACE(sizeof(stateid_t)); ++ WRITE32(sid->si_generation); ++ WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); ++} ++ ++static void + encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) + { + __be32 * p; +@@ -228,10 +260,10 @@ encode_cb_recall(struct xdr_stream *xdr, + __be32 *p; + int len = dp->dl_fh.fh_size; + +- RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); ++ RESERVE_SPACE(4); + WRITE32(OP_CB_RECALL); +- WRITE32(dp->dl_stateid.si_generation); +- WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); ++ encode_stateid(xdr, &dp->dl_stateid); ++ RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); + WRITE32(0); /* truncate optimization not implemented */ + WRITE32(len); + WRITEMEM(&dp->dl_fh.fh_base, len); +@@ -259,6 +291,111 @@ encode_cb_sequence(struct xdr_stream *xd + hdr->nops++; + } + ++#if defined(CONFIG_PNFSD) ++ ++#include "pnfsd.h" ++ ++static void ++encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(20); ++ WRITE32(OP_CB_LAYOUT); ++ WRITE32(clr->cb.cbl_seg.layout_type); ++ WRITE32(clr->cb.cbl_seg.iomode); ++ WRITE32(clr->cb.cbl_layoutchanged); ++ WRITE32(clr->cb.cbl_recall_type); ++ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) { ++ struct nfs4_fsid fsid = clr->cb.cbl_fsid; ++ ++ RESERVE_SPACE(16); ++ 
WRITE64(fsid.major);
++		WRITE64(fsid.minor);
++		dprintk("%s: type %x iomode %d changed %d recall_type %d "
++			"fsid 0x%llx-0x%llx\n",
++			__func__, clr->cb.cbl_seg.layout_type,
++			clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
++			clr->cb.cbl_recall_type, fsid.major, fsid.minor);
++	} else if (clr->cb.cbl_recall_type == RETURN_FILE) {
++		int len = clr->clr_file->fi_fhlen;
++		stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid;
++
++		RESERVE_SPACE(20 + (XDR_QUADLEN(len) << 2)); /* padded fh */
++		WRITE32(len);
++		WRITEMEM(clr->clr_file->fi_fhval, len);
++		WRITE64(clr->cb.cbl_seg.offset);
++		WRITE64(clr->cb.cbl_seg.length);
++		encode_stateid(xdr, cbl_sid);
++		dprintk("%s: type %x iomode %d changed %d recall_type %d "
++			"offset %lld length %lld stateid " STATEID_FMT "\n",
++			__func__, clr->cb.cbl_seg.layout_type,
++			clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
++			clr->cb.cbl_recall_type,
++			clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length,
++			STATEID_VAL(cbl_sid));
++	} else {
++		dprintk("%s: type %x iomode %d changed %d recall_type %d\n",
++			__func__, clr->cb.cbl_seg.layout_type,
++			clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
++			clr->cb.cbl_recall_type);
++	}
++	hdr->nops++;
++}
++
++static void
++encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd,
++		 struct nfs4_cb_compound_hdr *hdr)
++{
++	__be32 *p;
++	int i;
++	int len = nd->nd_list->cbd_len;
++	struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list;
++
++	dprintk("NFSD %s: --> num %d\n", __func__, len);
++
++	BUG_ON(hdr->minorversion == 0);
++
++	RESERVE_SPACE(8);
++	WRITE32(OP_CB_DEVICE);
++
++	/* notify4 cnda_changes<>; */
++	WRITE32(len);
++	for (i = 0; i < len; i++) {
++		dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n",
++			__func__, cbd[i].cbd_notify_type,
++			cbd[i].cbd_layout_type,
++			cbd[i].cbd_devid.sbid,
++			cbd[i].cbd_devid.devid,
++			cbd[i].cbd_immediate, i);
++
++		BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
++		       cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE);
++
RESERVE_SPACE(32); ++ /* bitmap4 notify_mask; */ ++ WRITE32(1); ++ WRITE32(cbd[i].cbd_notify_type); ++ /* opaque notify_vals<>; */ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) ++ WRITE32(24); ++ else ++ WRITE32(20); ++ WRITE32(cbd[i].cbd_layout_type); ++ WRITE64(cbd[i].cbd_devid.sbid); ++ WRITE64(cbd[i].cbd_devid.devid); ++ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { ++ RESERVE_SPACE(4); ++ WRITE32(cbd[i].cbd_immediate); ++ } ++ } ++ hdr->nops++; ++} ++#endif /* CONFIG_PNFSD */ ++ + static int + nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) + { +@@ -288,6 +425,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst * + return 0; + } + ++#if defined(CONFIG_PNFSD) ++static int ++nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_layoutrecall *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_layout(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++ ++static int ++nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_notify_device *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_device(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++#endif /* CONFIG_PNFSD */ + + static int + decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ +@@ -403,6 +579,48 @@ out: + return status; + } + ++#if defined(CONFIG_PNFSD) ++static int 
++nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT); ++out: ++ return status; ++} ++ ++static int ++nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE); ++out: ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * RPC procedure tables + */ +@@ -420,6 +638,10 @@ out: + static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), + PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), ++#if defined(CONFIG_PNFSD) ++ PROC(CB_LAYOUT, COMPOUND, enc_cb_layout, dec_cb_layout), ++ PROC(CB_DEVICE, COMPOUND, enc_cb_device, dec_cb_device), ++#endif + }; + + static struct rpc_version nfs_cb_version4 = { +@@ -606,10 +828,9 @@ out: + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. 
+ */ +-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_prepare_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) + { +- struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; + struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; + u32 minorversion = clp->cl_cb_conn.cb_minorversion; + int status = 0; +@@ -629,11 +850,15 @@ static void nfsd4_cb_prepare(struct rpc_ + rpc_call_start(task); + } + +-static void nfsd4_cb_done(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata) + { + struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; ++ nfsd4_cb_prepare_sequence(task, dp->dl_client); ++} + ++static void nfsd4_cb_done_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) ++{ + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_cb_conn.cb_minorversion); + +@@ -657,7 +882,7 @@ static void nfsd4_cb_recall_done(struct + struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + +- nfsd4_cb_done(task, calldata); ++ nfsd4_cb_done_sequence(task, clp); + + if (current_rpc_client == NULL) { + /* We're shutting down; give up. 
*/ +@@ -688,7 +913,7 @@ static void nfsd4_cb_recall_done(struct + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); + task->tk_status = 0; +- rpc_restart_call(task); ++ rpc_restart_call_prepare(task); + return; + } else { + atomic_set(&clp->cl_cb_set, 0); +@@ -704,7 +929,7 @@ static void nfsd4_cb_recall_release(void + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +- .rpc_call_prepare = nfsd4_cb_prepare, ++ .rpc_call_prepare = nfsd4_cb_recall_prepare, + .rpc_call_done = nfsd4_cb_recall_done, + .rpc_release = nfsd4_cb_recall_release, + }; +@@ -781,3 +1006,173 @@ void nfsd4_cb_recall(struct nfs4_delegat + { + queue_work(callback_wq, &dp->dl_recall.cb_work); + } ++ ++#if defined(CONFIG_PNFSD) ++static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ nfsd4_cb_prepare_sequence(task, clr->clr_client); ++} ++ ++static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ struct nfs4_client *clp = clr->clr_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ if (!task->tk_status) ++ return; ++ ++ printk("%s: clp %p cb_client %p fp %p failed with status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ clr->clr_file, ++ task->tk_status); ++ ++ switch (task->tk_status) { ++ case -EIO: ++ /* Network partition? 
*/
++		atomic_set(&clp->cl_cb_set, 0);
++		warn_no_callback_path(clp, task->tk_status);
++		/* FIXME:
++		 * The pnfs standard states that we need to only expire
++		 * the client after at least "lease time", e.g. lease-time * 2
++		 * when failing to communicate a recall
++		 */
++		break;
++	case -NFS4ERR_DELAY:
++		/* Poll the client until it's done with the layout */
++		rpc_delay(task, HZ/100);	/* 10 milliseconds */
++		task->tk_status = 0;
++		rpc_restart_call_prepare(task);
++		break;
++	case -NFS4ERR_NOMATCHING_LAYOUT:
++		task->tk_status = 0;
++		nomatching_layout(clr);
++	}
++}
++
++static void nfsd4_cb_layout_release(void *calldata)
++{
++	struct nfs4_layoutrecall *clr = calldata;
++	kfree(clr->clr_args);
++	clr->clr_args = NULL;
++	put_layoutrecall(clr);
++}
++
++static const struct rpc_call_ops nfsd4_cb_layout_ops = {
++	.rpc_call_prepare = nfsd4_cb_layout_prepare,
++	.rpc_call_done = nfsd4_cb_layout_done,
++	.rpc_release = nfsd4_cb_layout_release,
++};
++
++/*
++ * Called with state lock.
++ */
++int
++nfsd4_cb_layout(struct nfs4_layoutrecall *clr)
++{
++	struct nfs4_client *clp = clr->clr_client;
++	struct rpc_clnt *clnt = clp->cl_cb_client;
++	struct nfs4_rpc_args *args;
++	struct rpc_message msg = {
++		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT],
++		.rpc_cred = callback_cred
++	};
++	int status;
++
++	args = kzalloc(sizeof(*args), GFP_KERNEL);
++	if (!args) {
++		status = -ENOMEM;
++		goto out;
++	}
++	clr->clr_args = args;
++	args->args_op = clr;
++	msg.rpc_argp = args;
++	status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
++				&nfsd4_cb_layout_ops, clr);
++out:
++	if (status) {
++		kfree(args);
++		put_layoutrecall(clr);
++	}
++	dprintk("NFSD: nfsd4_cb_layout: status %d\n", status);
++	return status;
++}
++
++static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata)
++{
++	struct nfs4_notify_device *cbnd = calldata;
++	nfsd4_cb_prepare_sequence(task, cbnd->nd_client);
++}
++
++static void nfsd4_cb_device_done(struct rpc_task *task, void
*calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ struct nfs4_client *clp = cbnd->nd_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ dprintk("%s: clp %p cb_client %p: status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ task->tk_status); ++ ++ if (task->tk_status == -EIO) { ++ /* Network partition? */ ++ atomic_set(&clp->cl_cb_set, 0); ++ warn_no_callback_path(clp, task->tk_status); ++ } ++} ++ ++static void nfsd4_cb_device_release(void *calldata) ++{ ++ struct nfs4_notify_device *cbnd = calldata; ++ kfree(cbnd->nd_args); ++ cbnd->nd_args = NULL; ++ kfree(cbnd); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_device_ops = { ++ .rpc_call_prepare = nfsd4_cb_device_prepare, ++ .rpc_call_done = nfsd4_cb_device_done, ++ .rpc_release = nfsd4_cb_device_release, ++}; ++ ++/* ++ * Called with state lock. ++ */ ++int ++nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd) ++{ ++ struct nfs4_client *clp = cbnd->nd_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE], ++ .rpc_cred = callback_cred ++ }; ++ int status = -EIO; ++ ++ dprintk("%s: clp %p\n", __func__, clp); ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ args->args_op = cbnd; ++ msg.rpc_argp = args; ++ ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_device_ops, cbnd); ++out: ++ if (status) ++ kfree(args); ++ dprintk("%s: status %d\n", __func__, status); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-09-30 10:17:08.845997000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-09-30 10:17:08.863998000 -0400 +@@ -0,0 +1,1679 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network 
Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ *****************************************************************************/ ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Globals */ ++static u32 current_layoutid = 1; ++ ++/* ++ * Currently used for manipulating the layout state. 
++ */ ++static DEFINE_SPINLOCK(layout_lock); ++ ++#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP) ++# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock)) ++#else ++# define BUG_ON_UNLOCKED_LAYOUT() ++#endif ++ ++/* ++ * Layout state - NFSv4.1 pNFS ++ */ ++static struct kmem_cache *pnfs_layout_slab; ++static struct kmem_cache *pnfs_layoutrecall_slab; ++ ++/* hash table for nfsd4_pnfs_deviceid.sbid */ ++#define SBID_HASH_BITS 8 ++#define SBID_HASH_SIZE (1 << SBID_HASH_BITS) ++#define SBID_HASH_MASK (SBID_HASH_SIZE - 1) ++ ++struct sbid_tracker { ++ u64 id; ++ struct super_block *sb; ++ struct list_head hash; ++}; ++ ++static u64 current_sbid; ++static struct list_head sbid_hashtbl[SBID_HASH_SIZE]; ++ ++static inline unsigned long ++sbid_hashval(struct super_block *sb) ++{ ++ return hash_ptr(sb, SBID_HASH_BITS); ++} ++ ++static inline struct sbid_tracker * ++alloc_sbid(void) ++{ ++ return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL); ++} ++ ++static void ++destroy_sbid(struct sbid_tracker *sbid) ++{ ++ spin_lock(&layout_lock); ++ list_del(&sbid->hash); ++ spin_unlock(&layout_lock); ++ kfree(sbid); ++} ++ ++void ++nfsd4_free_pnfs_slabs(void) ++{ ++ int i; ++ struct sbid_tracker *sbid; ++ ++ nfsd4_free_slab(&pnfs_layout_slab); ++ nfsd4_free_slab(&pnfs_layoutrecall_slab); ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ while (!list_empty(&sbid_hashtbl[i])) { ++ sbid = list_first_entry(&sbid_hashtbl[i], ++ struct sbid_tracker, ++ hash); ++ destroy_sbid(sbid); ++ } ++ } ++} ++ ++int ++nfsd4_init_pnfs_slabs(void) ++{ ++ int i; ++ ++ pnfs_layout_slab = kmem_cache_create("pnfs_layouts", ++ sizeof(struct nfs4_layout), 0, 0, NULL); ++ if (pnfs_layout_slab == NULL) ++ return -ENOMEM; ++ pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls", ++ sizeof(struct nfs4_layoutrecall), 0, 0, NULL); ++ if (pnfs_layoutrecall_slab == NULL) ++ return -ENOMEM; ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ INIT_LIST_HEAD(&sbid_hashtbl[i]); ++ } ++ ++ 
return 0; ++} ++ ++/* XXX: Need to implement the notify types and track which ++ * clients have which devices. */ ++void pnfs_set_device_notify(clientid_t *clid, unsigned int types) ++{ ++ struct nfs4_client *clp; ++ dprintk("%s: -->\n", __func__); ++ ++ nfs4_lock_state(); ++ /* Indicate that client has a device so we can only notify ++ * the correct clients */ ++ clp = find_confirmed_client(clid); ++ if (clp) { ++ atomic_inc(&clp->cl_deviceref); ++ dprintk("%s: Incr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++ } ++ nfs4_unlock_state(); ++} ++ ++/* Clear notifications for this client ++ * XXX: Do we need to loop through a clean up all ++ * krefs when nfsd cleans up the client? */ ++void pnfs_clear_device_notify(struct nfs4_client *clp) ++{ ++ atomic_dec(&clp->cl_deviceref); ++ dprintk("%s: Decr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++} ++ ++static struct nfs4_layout_state * ++alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid) ++{ ++ struct nfs4_layout_state *new; ++ ++ /* FIXME: use a kmem_cache */ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return new; ++ get_nfs4_file(fp); ++ INIT_LIST_HEAD(&new->ls_perfile); ++ INIT_LIST_HEAD(&new->ls_layouts); ++ kref_init(&new->ls_ref); ++ new->ls_client = clp; ++ new->ls_file = fp; ++ new->ls_stateid.si_boot = stateid->si_boot; ++ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */ ++ new->ls_stateid.si_generation = 1; ++ spin_lock(&layout_lock); ++ new->ls_stateid.si_fileid = current_layoutid++; ++ list_add(&new->ls_perfile, &fp->fi_layout_states); ++ spin_unlock(&layout_lock); ++ return new; ++} ++ ++static inline void ++get_layout_state(struct nfs4_layout_state *ls) ++{ ++ kref_get(&ls->ls_ref); ++} ++ ++static void ++destroy_layout_state_common(struct nfs4_layout_state *ls) ++{ ++ struct nfs4_file *fp = ls->ls_file; ++ ++ dprintk("pNFS %s: ls %p fp %p clp 
%p\n", __func__, ls, fp, ++ ls->ls_client); ++ BUG_ON(!list_empty(&ls->ls_layouts)); ++ kfree(ls); ++ put_nfs4_file(fp); ++} ++ ++static void ++destroy_layout_state(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ spin_lock(&layout_lock); ++ list_del(&ls->ls_perfile); ++ spin_unlock(&layout_lock); ++ destroy_layout_state_common(ls); ++} ++ ++static void ++destroy_layout_state_locked(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ list_del(&ls->ls_perfile); ++ destroy_layout_state_common(ls); ++} ++ ++static inline void ++put_layout_state(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state); ++} ++ ++static inline void ++put_layout_state_locked(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state_locked); ++} ++ ++/* ++ * Search the fp->fi_layout_state list for a layout state with the clientid. ++ * If not found, then this is a 'first open/delegation/lock stateid' from ++ * the client for this file. ++ * Called under the layout_lock. 
++ */ ++static struct nfs4_layout_state * ++find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp) ++{ ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) { ++ if (ls->ls_client == clp) { ++ dprintk("pNFS %s: before GET ls %p ls_ref %d\n", ++ __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ get_layout_state(ls); ++ return ls; ++ } ++ } ++ return NULL; ++} ++ ++static __be32 ++verify_stateid(struct nfs4_file *fp, stateid_t *stateid) ++{ ++ struct nfs4_stateid *local = NULL; ++ struct nfs4_delegation *temp = NULL; ++ ++ /* check if open or lock stateid */ ++ local = find_stateid(stateid, RD_STATE); ++ if (local) ++ return 0; ++ temp = find_delegation_stateid(fp->fi_inode, stateid); ++ if (temp) ++ return 0; ++ return nfserr_bad_stateid; ++} ++ ++/* ++ * nfs4_preocess_layout_stateid () ++ * ++ * We have looked up the nfs4_file corresponding to the current_fh, and ++ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op() ++ * that make sense with a layout stateid. ++ * ++ * Called with the state_lock held ++ * Returns zero and stateid is updated, or error. ++ * ++ * Note: the struct nfs4_layout_state pointer is only set by layoutget. ++ */ ++static __be32 ++nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid, struct nfs4_layout_state **lsp) ++{ ++ struct nfs4_layout_state *ls = NULL; ++ __be32 status = 0; ++ ++ dprintk("--> %s clp %p fp %p \n", __func__, clp, fp); ++ ++ dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ goto out; ++ ++ /* Is this the first use of this layout ? */ ++ spin_lock(&layout_lock); ++ ls = find_get_layout_state(clp, fp); ++ spin_unlock(&layout_lock); ++ if (!ls) { ++ /* Only alloc layout state on layoutget (which sets lsp). 
*/ ++ if (!lsp) { ++ dprintk("%s ERROR: Not layoutget & no layout stateid\n", ++ __func__); ++ status = nfserr_bad_stateid; ++ goto out; ++ } ++ dprintk("%s Initial stateid for layout: file %p client %p\n", ++ __func__, fp, clp); ++ ++ /* verify input stateid */ ++ status = verify_stateid(fp, stateid); ++ if (status) { ++ dprintk("%s ERROR: invalid open/deleg/lock stateid\n", ++ __func__); ++ goto out; ++ } ++ ls = alloc_init_layout_state(clp, fp, stateid); ++ if (!ls) { ++ dprintk("%s pNFS ERROR: no memory for layout state\n", ++ __func__); ++ status = nfserr_resource; ++ goto out; ++ } ++ } else { ++ dprintk("%s Not initial stateid. Layout state %p file %p\n", ++ __func__, ls, fp); ++ ++ /* BAD STATEID */ ++ status = nfserr_bad_stateid; ++ if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque, ++ sizeof(stateid_opaque_t)) != 0) { ++ ++ /* if a LAYOUTGET operation and stateid is a valid ++ * open/deleg/lock stateid, accept it as a parallel ++ * initial layout stateid ++ */ ++ if (lsp && ((verify_stateid(fp, stateid)) == 0)) { ++ dprintk("%s parallel initial layout state\n", ++ __func__); ++ goto update; ++ } ++ ++ dprintk("%s ERROR bad opaque in stateid 1\n", __func__); ++ goto out_put; ++ } ++ ++ /* stateid is a valid layout stateid for this file. 
*/
++		if (stateid->si_generation > ls->ls_stateid.si_generation) {
++			dprintk("%s bad stateid 1\n", __func__);
++			goto out_put;
++		}
++update:
++		update_stateid(&ls->ls_stateid);
++		dprintk("%s Updated ls_stateid to %d on layoutstate %p\n",
++			__func__, ls->ls_stateid.si_generation, ls);
++	}
++	status = 0;
++	/* Set the stateid to be encoded */
++	memcpy(stateid, &ls->ls_stateid, sizeof(stateid_t));
++
++	/* Return the layout state if requested */
++	if (lsp) {
++		get_layout_state(ls);
++		*lsp = ls;
++	}
++	dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__,
++		STATEID_VAL(&ls->ls_stateid));
++out_put:
++	dprintk("%s PUT LO STATE:\n", __func__);
++	put_layout_state(ls);
++out:
++	dprintk("<-- %s status %d\n", __func__, ntohl(status));
++
++	return status;
++}
++
++static inline struct nfs4_layout *
++alloc_layout(void)
++{
++	return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL);
++}
++
++static inline void
++free_layout(struct nfs4_layout *lp)
++{
++	kmem_cache_free(pnfs_layout_slab, lp);
++}
++
++static void
++init_layout(struct nfs4_layout_state *ls,
++	    struct nfs4_layout *lp,
++	    struct nfs4_file *fp,
++	    struct nfs4_client *clp,
++	    struct svc_fh *current_fh,
++	    struct nfsd4_layout_seg *seg)
++{
++	dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__,
++		ls, lp, clp, fp, fp->fi_inode);
++
++	get_nfs4_file(fp);
++	lp->lo_client = clp;
++	lp->lo_file = fp;
++	get_layout_state(ls);
++	lp->lo_state = ls;
++	memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg));
++	spin_lock(&layout_lock);
++	list_add_tail(&lp->lo_perstate, &ls->ls_layouts);
++	list_add_tail(&lp->lo_perclnt, &clp->cl_layouts);
++	list_add_tail(&lp->lo_perfile, &fp->fi_layouts);
++	spin_unlock(&layout_lock);
++	dprintk("pNFS %s end\n", __func__);
++}
++
++static void
++dequeue_layout(struct nfs4_layout *lp)
++{
++	BUG_ON_UNLOCKED_LAYOUT();
++	list_del(&lp->lo_perclnt);
++	list_del(&lp->lo_perfile);
++	list_del(&lp->lo_perstate);
++}
++
++static void
++destroy_layout(struct nfs4_layout *lp)
++{
++ struct nfs4_client *clp; ++ struct nfs4_file *fp; ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ clp = lp->lo_client; ++ fp = lp->lo_file; ++ ls = lp->lo_state; ++ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n", ++ __func__, lp, clp, fp, fp->fi_inode, ++ list_empty(&ls->ls_layouts)); ++ ++ kmem_cache_free(pnfs_layout_slab, lp); ++ /* release references taken by init_layout */ ++ put_layout_state_locked(ls); ++ put_nfs4_file(fp); ++} ++ ++void fs_layout_return(struct super_block *sb, struct inode *ino, ++ struct nfsd4_pnfs_layoutreturn *lrp, int flags, ++ void *recall_cookie) ++{ ++ int ret; ++ ++ if (unlikely(!sb->s_pnfs_op->layout_return)) ++ return; ++ ++ lrp->lr_flags = flags; ++ lrp->args.lr_cookie = recall_cookie; ++ ++ if (!ino) /* FSID or ALL */ ++ ino = sb->s_root->d_inode; ++ ++ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args); ++ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx " ++ "cookie = %p flags 0x%x status=%d\n", ++ __func__, ino->i_ino, lrp->args.lr_seg.iomode, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, ++ recall_cookie, flags, ret); ++} ++ ++static u64 ++alloc_init_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ struct sbid_tracker *new = alloc_sbid(); ++ unsigned long hash_idx = sbid_hashval(sb); ++ u64 id = 0; ++ ++ if (likely(new)) { ++ spin_lock(&layout_lock); ++ id = ++current_sbid; ++ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK); ++ id = new->id; ++ BUG_ON(id == 0); ++ new->sb = sb; ++ ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) ++ if (sbid->sb == sb) { ++ kfree(new); ++ id = sbid->id; ++ spin_unlock(&layout_lock); ++ return id; ++ } ++ list_add(&new->hash, &sbid_hashtbl[hash_idx]); ++ spin_unlock(&layout_lock); ++ } ++ return id; ++} ++ ++struct super_block * ++find_sbid_id(u64 id) ++{ ++ struct sbid_tracker *sbid; ++ struct super_block *sb = NULL; ++ unsigned long hash_idx = id & SBID_HASH_MASK; ++ int pos = 0; ++ ++ 
spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->id != id) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ sb = sbid->sb; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return sb; ++} ++ ++u64 ++find_create_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ unsigned long hash_idx = sbid_hashval(sb); ++ int pos = 0; ++ u64 id = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->sb != sb) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ id = sbid->id; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ if (!id) ++ id = alloc_init_sbid(sb); ++ ++ return id; ++} ++ ++/* ++ * Create a layoutrecall structure ++ * An optional layoutrecall can be cloned (except for the layoutrecall lists) ++ */ ++static struct nfs4_layoutrecall * ++alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_client *clp, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ dprintk("NFSD %s\n", __func__); ++ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL); ++ if (clr == NULL) ++ return clr; ++ ++ dprintk("NFSD %s -->\n", __func__); ++ ++ memset(clr, 0, sizeof(*clr)); ++ if (lrfile) ++ get_nfs4_file(lrfile); ++ clr->clr_client = clp; ++ clr->clr_file = lrfile; ++ clr->cb = *cbl; ++ ++ kref_init(&clr->clr_ref); ++ INIT_LIST_HEAD(&clr->clr_perclnt); ++ ++ dprintk("NFSD %s return %p\n", __func__, clr); ++ return clr; ++} ++ ++static void ++get_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ kref_get(&clr->clr_ref); ++} ++ ++static void ++destroy_layoutrecall(struct kref *kref) ++{ ++ struct nfs4_layoutrecall *clr = ++ container_of(kref, struct nfs4_layoutrecall, clr_ref); ++ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr, ++ 
clr->clr_file, clr->clr_client); ++ BUG_ON(!list_empty(&clr->clr_perclnt)); ++ if (clr->clr_file) ++ put_nfs4_file(clr->clr_file); ++ kmem_cache_free(pnfs_layoutrecall_slab, clr); ++} ++ ++int ++put_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ return kref_put(&clr->clr_ref, destroy_layoutrecall); ++} ++ ++void * ++layoutrecall_done(struct nfs4_layoutrecall *clr) ++{ ++ void *recall_cookie = clr->cb.cbl_cookie; ++ struct nfs4_layoutrecall *parent = clr->parent; ++ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del_init(&clr->clr_perclnt); ++ put_layoutrecall(clr); ++ ++ if (parent && !put_layoutrecall(parent)) ++ recall_cookie = NULL; ++ ++ return recall_cookie; ++} ++ ++/* ++ * get_state() and cb_get_state() are ++ */ ++void ++release_pnfs_ds_dev_list(struct nfs4_stateid *stp) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ while (!list_empty(&stp->st_pnfs_ds_id)) { ++ ddp = list_entry(stp->st_pnfs_ds_id.next, ++ struct pnfs_ds_dev_entry, dd_dev_entry); ++ list_del(&ddp->dd_dev_entry); ++ kfree(ddp); ++ } ++} ++ ++static int ++nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL); ++ if (!ddp) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&ddp->dd_dev_entry); ++ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id); ++ ddp->dd_dsid = dsid; ++ return 0; ++} ++ ++/* ++ * are two octet ranges overlapping? 
++ * start1 last1 ++ * |-----------------| ++ * start2 last2 ++ * |----------------| ++ */ ++static inline int ++lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 last1 = last_byte_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 last2 = last_byte_offset(start2, l2->length); ++ int ret; ++ ++ /* if last1 == start2 there's a single byte overlap */ ++ ret = (last2 >= start1) && (last1 >= start2); ++ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__, ++ l1->offset, l1->length, l2->offset, l2->length, ret); ++ return ret; ++} ++ ++static inline int ++same_fsid_major(struct nfs4_fsid *fsid, u64 major) ++{ ++ return fsid->major == major; ++} ++ ++static inline int ++same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh) ++{ ++ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid); ++} ++ ++/* ++ * find a layout recall conflicting with the specified layoutget ++ */ ++static int ++is_layout_recalled(struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != seg->layout_type) ++ continue; ++ if (clr->cb.cbl_recall_type == RETURN_ALL) ++ goto found; ++ if (clr->cb.cbl_recall_type == RETURN_FSID) { ++ if (same_fsid(&clr->cb.cbl_fsid, current_fh)) ++ goto found; ++ else ++ continue; ++ } ++ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE); ++ if (clr->cb.cbl_seg.clientid == seg->clientid && ++ lo_seg_overlapping(&clr->cb.cbl_seg, seg)) ++ goto found; ++ } ++ spin_unlock(&layout_lock); ++ return 0; ++found: ++ spin_unlock(&layout_lock); ++ return 1; ++} ++ ++/* ++ * are two octet ranges overlapping or adjacent? 
++ */ ++static inline int ++lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ /* if end1 == start2 the ranges are adjacent */ ++ return (end2 >= start1) && (end1 >= start2); ++} ++ ++static void ++extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lg_start = lg->offset; ++ u64 lg_end = end_offset(lg_start, lg->length); ++ ++ /* lo already covers lg? */ ++ if (lo_start <= lg_start && lg_end <= lo_end) ++ return; ++ ++ /* extend start offset */ ++ if (lo_start > lg_start) ++ lo_start = lg_start; ++ ++ /* extend end offset */ ++ if (lo_end < lg_end) ++ lo_end = lg_end; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? ++ lo_end : lo_end - lo_start; ++} ++/* Extend the first layout on fp that seg can be merged into; returns it, or NULL if none matched. */ ++static struct nfs4_layout * ++merge_layout(struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layout *lp = NULL; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (lp, &fp->fi_layouts, lo_perfile) ++ if (lp->lo_seg.layout_type == seg->layout_type && ++ lp->lo_seg.clientid == seg->clientid && ++ lp->lo_seg.iomode == seg->iomode && ++ lo_seg_mergeable(&lp->lo_seg, seg)) { ++ extend_layout(&lp->lo_seg, seg); ++ break; ++ } ++ spin_unlock(&layout_lock); ++ /* the loop cursor is never NULL: reaching the list head means nothing matched */ ++ return (&lp->lo_perfile != &fp->fi_layouts) ? lp : NULL; ++} ++ ++__be32 ++nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp, ++ struct exp_xdr_stream *xdr) ++{ ++ u32 status; ++ __be32 nfserr; ++ struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode; ++ struct super_block *sb = ino->i_sb; ++ int can_merge; ++ struct nfs4_file *fp; ++ struct nfs4_client *clp; ++ struct nfs4_layout *lp = NULL; ++ struct nfs4_layout_state *ls = NULL; ++ struct nfsd4_pnfs_layoutget_arg args = { ++ .lg_minlength = lgp->lg_minlength, ++ .lg_fh = 
&lgp->lg_fhp->fh_handle, ++ }; ++ struct nfsd4_pnfs_layoutget_res res = { ++ .lg_seg = lgp->lg_seg, ++ }; ++ ++ dprintk("NFSD: %s Begin\n", __func__); ++ ++ args.lg_sbid = find_create_sbid(sb); ++ if (!args.lg_sbid) { ++ nfserr = nfserr_layouttrylater; ++ goto out; ++ } ++ ++ can_merge = sb->s_pnfs_op->can_merge_layouts != NULL && ++ sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type); ++ ++ nfs4_lock_state(); ++ fp = find_alloc_file(ino, lgp->lg_fhp); ++ clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid); ++ dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp); ++ if (!fp || !clp) { ++ nfserr = nfserr_inval; ++ goto out_unlock; ++ } ++ ++ /* Check decoded layout stateid */ ++ nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls); ++ if (nfserr) ++ goto out_unlock; ++ ++ if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) { ++ nfserr = nfserr_recallconflict; ++ goto out_unlock; /* must drop the ls/fp references and the state lock */ ++ } ++ ++ /* pre-alloc layout in case we can't merge after we call ++ * the file system ++ */ ++ lp = alloc_layout(); ++ if (!lp) { ++ nfserr = nfserr_layouttrylater; ++ goto out_unlock; ++ } ++ ++ dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd " ++ "iomode %u offset %llu length %llu\n", ++ __func__, lgp->lg_seg.layout_type, ++ exp_xdr_qbytes(xdr->end - xdr->p), ++ lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length); ++ ++ /* FIXME: need to eliminate the use of the state lock */ ++ nfs4_unlock_state(); ++ status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res); ++ nfs4_lock_state(); ++ ++ dprintk("pNFS %s: post-export status %u " ++ "iomode %u offset %llu length %llu\n", ++ __func__, status, res.lg_seg.iomode, ++ res.lg_seg.offset, res.lg_seg.length); ++ ++ /* ++ * The allowable error codes for the layout_get pNFS export ++ * operations vector function (from the file system) can be ++ * expanded as needed to include other errors defined for ++ * the RFC 5661 LAYOUTGET operation. 
++ */ ++ switch (status) { ++ case 0: ++ nfserr = NFS4_OK; ++ break; ++ case NFS4ERR_ACCESS: ++ case NFS4ERR_BADIOMODE: ++ /* No support for LAYOUTIOMODE4_RW layouts */ ++ case NFS4ERR_BADLAYOUT: ++ /* No layout matching loga_minlength rules */ ++ case NFS4ERR_INVAL: ++ case NFS4ERR_IO: ++ case NFS4ERR_LAYOUTTRYLATER: ++ case NFS4ERR_LAYOUTUNAVAILABLE: ++ case NFS4ERR_LOCKED: ++ case NFS4ERR_NOSPC: ++ case NFS4ERR_RECALLCONFLICT: ++ case NFS4ERR_SERVERFAULT: ++ case NFS4ERR_TOOSMALL: ++ /* Requested layout too big for loga_maxcount */ ++ case NFS4ERR_WRONG_TYPE: ++ /* Not a regular file */ ++ nfserr = cpu_to_be32(status); ++ goto out_freelayout; ++ default: ++ BUG(); ++ nfserr = nfserr_serverfault; ++ } ++ ++ lgp->lg_seg = res.lg_seg; ++ lgp->lg_roc = res.lg_return_on_close; ++ ++ /* SUCCESS! ++ * Can the new layout be merged into an existing one? ++ * If so, free unused layout struct ++ */ ++ if (can_merge && merge_layout(fp, clp, &res.lg_seg)) ++ goto out_freelayout; ++ ++ /* Can't merge, so let's initialize this new layout */ ++ init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg); ++out_unlock: ++ if (ls) ++ put_layout_state(ls); ++ if (fp) ++ put_nfs4_file(fp); ++ nfs4_unlock_state(); ++out: ++ dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp, ++ be32_to_cpu(nfserr)); ++ return nfserr; ++out_freelayout: ++ free_layout(lp); ++ goto out_unlock; ++} ++ ++static void ++trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lr_start = lr->offset; ++ u64 lr_end = end_offset(lr_start, lr->length); ++ ++ dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__, ++ lo->offset, lo->length, lr->offset, lr->length); ++ ++ /* lr fully covers lo? */ ++ if (lr_start <= lo_start && lo_end <= lr_end) { ++ lo->length = 0; ++ goto out; ++ } ++ ++ /* ++ * split not supported yet. retain layout segment. 
++ * remains must be returned by the client ++ * on the final layout return. ++ */ ++ if (lo_start < lr_start && lr_end < lo_end) { ++ dprintk("%s: split not supported\n", __func__); ++ goto out; ++ } ++ ++ if (lo_start < lr_start) ++ lo_end = lr_start - 1; ++ else /* lr_end < lo_end */ ++ lo_start = lr_end + 1; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start; ++out: ++ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length); ++} ++ ++static int ++pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ dprintk("%s: clp %p fp %p\n", __func__, clp, fp); ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) { ++ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n", ++ __func__, lp, ++ lp->lo_client, clp, ++ lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type, ++ lp->lo_seg.iomode, lrp->args.lr_seg.iomode); ++ if (lp->lo_client != clp || ++ lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type || ++ (lp->lo_seg.iomode != lrp->args.lr_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) || ++ !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg)) ++ continue; ++ layouts_found++; ++ trim_layout(&lp->lo_seg, &lrp->args.lr_seg); ++ if (!lp->lo_seg.length) { ++ lrp->lrs_present = 0; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++pnfs_return_client_layouts(struct nfs4_client *clp, ++ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) { ++ if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type || ++ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode && ++ lrp->args.lr_seg.iomode != 
IOMODE_ANY)) ++ continue; ++ ++ if (lrp->args.lr_return_type == RETURN_FSID && ++ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid)) ++ continue; ++ ++ layouts_found++; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++recall_return_perfect_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode || ++ clr->cb.cbl_recall_type != lrp->args.lr_return_type) ++ return 0; ++ ++ return (clr->cb.cbl_recall_type == RETURN_FILE && ++ clr->clr_file == fp && ++ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset && ++ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) || ++ ++ (clr->cb.cbl_recall_type == RETURN_FSID && ++ same_fsid(&clr->cb.cbl_fsid, current_fh)) || ++ ++ clr->cb.cbl_recall_type == RETURN_ALL; ++} ++ ++static int ++recall_return_partial_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ /* iomode matching? */ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode && ++ clr->cb.cbl_seg.iomode != IOMODE_ANY && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) ++ return 0; ++ ++ if (clr->cb.cbl_recall_type == RETURN_ALL || ++ lrp->args.lr_return_type == RETURN_ALL) ++ return 1; ++ ++ /* fsid matches? */ ++ if (clr->cb.cbl_recall_type == RETURN_FSID || ++ lrp->args.lr_return_type == RETURN_FSID) ++ return same_fsid(&clr->cb.cbl_fsid, current_fh); ++ ++ /* file matches, range overlapping? 
*/ ++ return clr->clr_file == fp && ++ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg); ++} ++ ++int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status = 0; ++ int layouts_found = 0; ++ struct inode *ino = current_fh->fh_dentry->d_inode; ++ struct nfs4_file *fp = NULL; ++ struct nfs4_client *clp; ++ struct nfs4_layoutrecall *clr, *nextclr; ++ u64 ex_fsid = current_fh->fh_export->ex_fsid; ++ void *recall_cookie = NULL; ++ ++ dprintk("NFSD: %s\n", __func__); ++ ++ nfs4_lock_state(); ++ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid); ++ if (!clp) ++ goto out; ++ ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ fp = find_file(ino); ++ if (!fp) { ++ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for " ++ "ino %p:%lu\n", ++ __func__, ino, ino ? ino->i_ino : 0L); ++ goto out; ++ } ++ ++ /* Check the stateid */ ++ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino); ++ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, ++ NULL); ++ if (status) ++ goto out_put_file; ++ ++ /* update layouts */ ++ layouts_found = pnfs_return_file_layouts(clp, fp, lrp); ++ /* optimize for the all-empty case */ ++ if (list_empty(&fp->fi_layouts)) ++ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ } else { ++ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid); ++ } ++ ++ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d " ++ "return_type %d fsid 0x%llx offset %llu length %llu: " ++ "layouts_found %d\n", ++ __func__, clp, fp, lrp->args.lr_seg.layout_type, ++ lrp->args.lr_seg.iomode, lrp->args.lr_return_type, ++ ex_fsid, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found); ++ ++ /* update layoutrecalls ++ * note: for RETURN_{FSID,ALL}, fp may be NULL ++ */ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls, ++ clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != 
lrp->args.lr_seg.layout_type) ++ continue; ++ ++ if (recall_return_perfect_match(clr, lrp, fp, current_fh)) ++ recall_cookie = layoutrecall_done(clr); ++ else if (layouts_found && ++ recall_return_partial_match(clr, lrp, fp, current_fh)) ++ clr->clr_time = CURRENT_TIME; ++ } ++ spin_unlock(&layout_lock); ++ ++out_put_file: ++ if (fp) ++ put_nfs4_file(fp); ++out: ++ nfs4_unlock_state(); ++ ++ /* call exported filesystem layout_return (ignore return-code) */ ++ fs_layout_return(sb, ino, lrp, 0, recall_cookie); ++ ++ dprintk("pNFS %s: exit status %d \n", __func__, status); ++ return status; ++} ++ ++/* ++ * PNFS Metadata server export operations callback for get_state ++ * ++ * called by the cluster fs when it receives a get_state() from a data ++ * server. ++ * returns status, or pnfs_get_state* with pnfs_get_state->status set. ++ * ++ */ ++int ++nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg) ++{ ++ struct nfs4_stateid *stp; ++ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */ ++ int status = -EINVAL; ++ struct inode *ino; ++ struct nfs4_delegation *dl; ++ stateid_t *stid = (stateid_t *)&arg->stid; ++ ++ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__, ++ STATEID_VAL(stid), arg->ino); ++ ++ nfs4_lock_state(); ++ stp = find_stateid(stid, flags); ++ if (!stp) { ++ ino = iget_locked(sb, arg->ino); ++ if (!ino) ++ goto out; ++ ++ if (ino->i_state & I_NEW) { ++ iget_failed(ino); ++ goto out; ++ } ++ ++ dl = find_delegation_stateid(ino, stid); ++ if (dl) ++ status = 0; ++ ++ iput(ino); ++ } else { ++ /* XXX ANDROS: marc removed nfs4_check_fh - how come? 
*/ ++ ++ /* arg->devid is the Data server id, set by the cluster fs */ ++ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid); ++ if (status) ++ goto out; ++ ++ arg->access = stp->st_access_bmap; ++ *(clientid_t *)&arg->clid = ++ stp->st_stateowner->so_client->cl_clientid; ++ } ++out: ++ nfs4_unlock_state(); ++ return status; ++} ++ ++static int ++cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile, ++ stateid_t *lsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ struct nfs4_layout_state *ls; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) { ++ if (lp->lo_file != lrfile) ++ continue; ++ ++ ls = find_get_layout_state(clp, lrfile); ++ if (!ls) { ++ /* This shouldn't happen as the file should have a ++ * layout stateid if it has a layout. ++ */ ++ printk(KERN_ERR "%s: file %p has no layout stateid\n", ++ __func__, lrfile); ++ WARN_ON(1); ++ break; ++ } ++ update_stateid(&ls->ls_stateid); ++ memcpy(lsid, &ls->ls_stateid, sizeof(stateid_t)); ++ put_layout_state_locked(ls); ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ return found; ++} ++ ++static int ++cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ ++ /* note: minor version unused */ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) ++ if (lp->lo_file->fi_fsid.major == fsid->major) { ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return found; ++} ++ ++static int ++cl_has_any_layout(struct nfs4_client *clp) ++{ ++ return !list_empty(&clp->cl_layouts); ++} ++ ++static int ++cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile, stateid_t *lsid) ++{ ++ switch (cbl->cbl_recall_type) { ++ case RETURN_FILE: ++ return cl_has_file_layout(clp, lrfile, lsid); ++ case RETURN_FSID: ++ return cl_has_fsid_layout(clp, &cbl->cbl_fsid); ++ default: ++ return cl_has_any_layout(clp); ++ } ++} 
++ ++/* ++ * Called without the layout_lock. ++ */ ++void ++nomatching_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfsd4_pnfs_layoutreturn lr = { ++ .args.lr_return_type = clr->cb.cbl_recall_type, ++ .args.lr_seg = clr->cb.cbl_seg, ++ }; ++ struct inode *inode; ++ void *recall_cookie; ++ ++ if (clr->clr_file) { ++ inode = igrab(clr->clr_file->fi_inode); ++ if (WARN_ON(!inode)) ++ return; ++ } else { ++ inode = NULL; ++ } ++ ++ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__, ++ clr->clr_client, clr->clr_file); ++ ++ if (clr->cb.cbl_recall_type == RETURN_FILE) ++ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr); ++ else ++ pnfs_return_client_layouts(clr->clr_client, &lr, ++ clr->cb.cbl_fsid.major); ++ ++ spin_lock(&layout_lock); ++ recall_cookie = layoutrecall_done(clr); ++ spin_unlock(&layout_lock); ++ ++ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN, ++ recall_cookie); ++ iput(inode); ++} ++ ++void pnfs_expire_client(struct nfs4_client *clp) ++{ ++ for (;;) { ++ struct nfs4_layoutrecall *lrp = NULL; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layoutrecalls)) { ++ lrp = list_entry(clp->cl_layoutrecalls.next, ++ struct nfs4_layoutrecall, clr_perclnt); ++ get_layoutrecall(lrp); ++ } ++ spin_unlock(&layout_lock); ++ if (!lrp) ++ break; ++ ++ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file); ++ BUG_ON(lrp->clr_client != clp); ++ nomatching_layout(lrp); ++ put_layoutrecall(lrp); ++ } ++ ++ for (;;) { ++ struct nfs4_layout *lp = NULL; ++ struct inode *inode = NULL; ++ struct nfsd4_pnfs_layoutreturn lr; ++ bool empty = false; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layouts)) { ++ lp = list_entry(clp->cl_layouts.next, ++ struct nfs4_layout, lo_perclnt); ++ inode = igrab(lp->lo_file->fi_inode); ++ memset(&lr, 0, sizeof(lr)); ++ lr.args.lr_return_type = RETURN_FILE; ++ lr.args.lr_seg = lp->lo_seg; ++ BUG_ON(lp->lo_client != clp); ++ 
dequeue_layout(lp); ++ empty = list_empty(&lp->lo_file->fi_layouts); /* sampled once lp is unlinked, else never true */ ++ destroy_layout(lp); /* do not access lp after this */ ++ } ++ spin_unlock(&layout_lock); ++ if (!lp) ++ break; ++ ++ if (WARN_ON(!inode)) ++ break; ++ ++ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino, ++ lp, clp); ++ ++ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE, ++ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL); ++ iput(inode); ++ } ++} ++ ++struct create_recall_list_arg { ++ struct nfsd4_pnfs_cb_layout *cbl; ++ struct nfs4_file *lrfile; ++ struct list_head *todolist; ++ unsigned todo_count; ++}; ++ ++/* ++ * look for matching layout for the given client ++ * and add a pending layout recall to the todo list ++ * if found any. ++ * returns: ++ * 0 if layouts found or negative error. ++ */ ++static int ++lo_recall_per_client(struct nfs4_client *clp, void *p) ++{ ++ stateid_t lsid; ++ struct nfs4_layoutrecall *pending; ++ struct create_recall_list_arg *arg = p; ++ ++ memset(&lsid, 0, sizeof(lsid)); ++ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid)) ++ return 0; ++ ++ /* Matching put done by layoutreturn */ ++ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile); ++ /* out of memory, drain todo queue */ ++ if (!pending) ++ return -ENOMEM; ++ ++ *(stateid_t *)&pending->cb.cbl_sid = lsid; ++ list_add(&pending->clr_perclnt, arg->todolist); ++ arg->todo_count++; ++ return 0; ++} ++ ++/* Create a layoutrecall structure for each client based on the ++ * original structure. 
*/ ++int ++create_layout_recall_list(struct list_head *todolist, unsigned *todo_len, ++ struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_client *clp; ++ struct create_recall_list_arg arg = { ++ .cbl = cbl, ++ .lrfile = lrfile, ++ .todolist = todolist, ++ }; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* If client given by fs, just do single client */ ++ if (cbl->cbl_seg.clientid) { ++ clp = find_confirmed_client( ++ (clientid_t *)&cbl->cbl_seg.clientid); ++ if (!clp) { ++ status = -ENOENT; ++ dprintk("%s: clientid %llx not found\n", __func__, ++ (unsigned long long)cbl->cbl_seg.clientid); ++ goto out; ++ } ++ ++ status = lo_recall_per_client(clp, &arg); ++ } else { ++ /* Check all clients for layout matches */ ++ status = filter_confirmed_clients(lo_recall_per_client, &arg); ++ } ++ ++out: ++ *todo_len = arg.todo_count; ++ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status); ++ return status; ++} ++ ++/* ++ * Recall layouts asynchronously ++ * Called with state lock. ++ */ ++static int ++spawn_layout_recall(struct super_block *sb, struct list_head *todolist, ++ unsigned todo_len) ++{ ++ struct nfs4_layoutrecall *pending; ++ struct nfs4_layoutrecall *parent = NULL; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ if (todo_len > 1) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ ++ parent = alloc_init_layoutrecall(&pending->cb, NULL, ++ pending->clr_file); ++ if (unlikely(!parent)) { ++ /* We want forward progress. If parent cannot be ++ * allocated take the first one as parent but don't ++ * execute it. Caller must check for -EAGAIN, if so ++ * When the partial recalls return, ++ * nfsd_layout_recall_cb should be called again. 
++ */ ++ list_del_init(&pending->clr_perclnt); ++ if (todo_len > 2) { ++ parent = pending; ++ } else { ++ parent = NULL; ++ put_layoutrecall(pending); ++ } ++ --todo_len; ++ status = -ENOMEM; ++ } ++ } ++ ++ while (!list_empty(todolist)) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ list_del_init(&pending->clr_perclnt); ++ dprintk("%s: clp %p cb_client %p fp %p\n", __func__, ++ pending->clr_client, ++ pending->clr_client->cl_cb_client, ++ pending->clr_file); ++ if (unlikely(!pending->clr_client->cl_cb_client)) { ++ printk(KERN_INFO ++ "%s: clientid %08x/%08x has no callback path\n", ++ __func__, ++ pending->clr_client->cl_clientid.cl_boot, ++ pending->clr_client->cl_clientid.cl_id); ++ put_layoutrecall(pending); ++ continue; ++ } ++ ++ pending->clr_time = CURRENT_TIME; ++ pending->clr_sb = sb; ++ if (parent) { ++ /* If we created a parent its initial ref count is 1. ++ * We will need to de-ref it eventually. So we just ++ * don't increment on behalf of the last one. 
++ */ ++ if (todo_len != 1) ++ get_layoutrecall(parent); ++ } ++ pending->parent = parent; ++ get_layoutrecall(pending); ++ /* Add to list so corresponding layoutreturn can find req */ ++ list_add(&pending->clr_perclnt, ++ &pending->clr_client->cl_layoutrecalls); ++ ++ nfsd4_cb_layout(pending); ++ --todo_len; ++ } ++ ++ return status; ++} ++ ++/* ++ * Spawn a thread to perform a recall layout ++ * ++ */ ++int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode, ++ struct nfsd4_pnfs_cb_layout *cbl) ++{ ++ int status; ++ struct nfs4_file *lrfile = NULL; ++ struct list_head todolist; ++ unsigned todo_len = 0; ++ ++ dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl); ++ BUG_ON(!cbl); ++ BUG_ON(cbl->cbl_recall_type != RETURN_FILE && ++ cbl->cbl_recall_type != RETURN_FSID && ++ cbl->cbl_recall_type != RETURN_ALL); ++ BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode); ++ BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ && ++ cbl->cbl_seg.iomode != IOMODE_RW && ++ cbl->cbl_seg.iomode != IOMODE_ANY); ++ ++ if (nfsd_serv == NULL) { ++ dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n"); ++ return -ENOENT; ++ } ++ ++ nfs4_lock_state(); ++ status = -ENOENT; ++ if (inode) { ++ lrfile = find_file(inode); ++ if (!lrfile) { ++ dprintk("NFSD nfsd_layout_recall_cb: " ++ "nfs4_file not found\n"); ++ goto err; ++ } ++ if (cbl->cbl_recall_type == RETURN_FSID) ++ cbl->cbl_fsid = lrfile->fi_fsid; ++ } ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ /* If no cookie provided by FS, return a default one */ ++ if (!cbl->cbl_cookie) ++ cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ ++ status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile); ++ if (list_empty(&todolist)) { ++ status = -ENOENT; ++ } else { ++ /* process todolist even if create_layout_recall_list ++ * returned an error */ ++ int status2 = spawn_layout_recall(sb, &todolist, todo_len); ++ if (status2) ++ status = status2; ++ } ++ ++err: ++ nfs4_unlock_state(); ++ if (lrfile) ++ 
put_nfs4_file(lrfile); ++ return (todo_len && status) ? -EAGAIN : status; ++} ++ ++struct create_device_notify_list_arg { ++ struct list_head *todolist; ++ struct nfsd4_pnfs_cb_dev_list *ndl; ++}; ++ ++static int ++create_device_notify_per_cl(struct nfs4_client *clp, void *p) ++{ ++ struct nfs4_notify_device *cbnd; ++ struct create_device_notify_list_arg *arg = p; ++ ++ if (atomic_read(&clp->cl_deviceref) <= 0) ++ return 0; ++ ++ cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL); ++ if (!cbnd) ++ return -ENOMEM; ++ ++ cbnd->nd_list = arg->ndl; ++ cbnd->nd_client = clp; ++ list_add(&cbnd->nd_perclnt, arg->todolist); ++ return 0; ++} ++ ++/* Create a list of clients to send device notifications. */ ++int ++create_device_notify_list(struct list_head *todolist, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ int status; ++ struct create_device_notify_list_arg arg = { ++ .todolist = todolist, ++ .ndl = ndl, ++ }; ++ ++ nfs4_lock_state(); ++ status = filter_confirmed_clients(create_device_notify_per_cl, &arg); ++ nfs4_unlock_state(); ++ ++ return status; ++} ++ ++/* ++ * For each client that has a device, send a device notification. ++ * XXX: Need to track which clients have which devices. 
++ */ ++int nfsd_device_notify_cb(struct super_block *sb, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ struct nfs4_notify_device *cbnd; ++ unsigned int notify_num = 0; ++ int status2, status = 0; ++ struct list_head todolist; ++ ++ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list); ++ ++ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len); ++ ++ if (nfsd_serv == NULL) ++ return -ENOENT; ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ status = create_device_notify_list(&todolist, ndl); ++ ++ while (!list_empty(&todolist)) { ++ cbnd = list_entry(todolist.next, struct nfs4_notify_device, ++ nd_perclnt); ++ list_del_init(&cbnd->nd_perclnt); ++ status2 = nfsd4_cb_notify_device(cbnd); ++ pnfs_clear_device_notify(cbnd->nd_client); ++ if (status2) { ++ kfree(cbnd); ++ status = status2; ++ } ++ notify_num++; ++ } ++ ++ dprintk("NFSD %s: status %d clients %u\n", ++ __func__, status, notify_num); ++ return status; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-09-30 10:17:08.866999000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-09-30 10:17:08.868998000 -0400 +@@ -0,0 +1,461 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ ******************************************************************************/
++
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "nfsfh.h"
++#include "nfsd.h"
++
++#define NFSDDBG_FACILITY		NFSDDBG_PROC
++
++/* Just use a linked list. Do not expect more than 32 dlm_device_entries
++ * the first implementation will just use one device per cluster file system
++ */
++
++static LIST_HEAD(dlm_device_list);
++static DEFINE_SPINLOCK(dlm_device_list_lock);
++
++/* One registered block device together with its data server list. */
++struct dlm_device_entry {
++	struct list_head	dlm_dev_list;	/* link in dlm_device_list */
++	char			disk_name[DISK_NAME_LEN];
++	int			num_ds;		/* addresses in ds_list */
++	char			ds_list[NFSD_DLM_DS_LIST_MAX]; /* comma separated */
++};
++
++/*
++ * Find the entry whose disk_name equals @disk_name.
++ *
++ * The whole name is compared, bounded by the size of the disk_name array,
++ * so that looking up "sda" cannot match an entry registered as "sda1".
++ *
++ * Returns the entry, or NULL when no entry has that name.
++ * NOTE(review): the list lock is dropped before returning, so the caller
++ * uses the entry unlocked.
++ */
++static struct dlm_device_entry *
++_nfsd4_find_pnfs_dlm_device(char *disk_name)
++{
++	struct dlm_device_entry *dlm_pdev;
++
++	dprintk("--> %s  disk name %s\n", __func__, disk_name);
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) {
++		dprintk("%s Look for dlm_pdev %s\n", __func__,
++			dlm_pdev->disk_name);
++		if (!strncmp(dlm_pdev->disk_name, disk_name,
++			     sizeof(dlm_pdev->disk_name))) {
++			spin_unlock(&dlm_device_list_lock);
++			return dlm_pdev;
++		}
++	}
++	spin_unlock(&dlm_device_list_lock);
++	return NULL;
++}
++
++/* Look up the entry for the block device that backs @sb. */
++static struct dlm_device_entry *
++nfsd4_find_pnfs_dlm_device(struct super_block *sb) {
++	char dname[BDEVNAME_SIZE];
++
++	bdevname(sb->s_bdev, dname);
++	return _nfsd4_find_pnfs_dlm_device(dname);
++}
++
++ssize_t
++nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen)
++{
++	char *pos = buf;
++	ssize_t size = 0;
++	struct dlm_device_entry *dlm_pdev;
++	int ret = -EINVAL;
++
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list)
++	{
++		int advanced;
++		advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list);
++		if (advanced >= buflen - size)
++			goto out;
++		size += advanced;
++		pos += advanced;
++	}
++	ret = size;
++
++out:
++	spin_unlock(&dlm_device_list_lock);
++	return ret;
++}
++
++/*
++ * Check that @ds_list is a comma separated list of addresses that
++ * rpc_pton() accepts, and count them into @num_ds.
++ *
++ * Returns true when every address parsed, false on the first failure.
++ */
++bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds)
++{
++	char *start = ds_list;
++
++	*num_ds = 0;
++
++	while (*start) {
++		struct sockaddr_storage tempAddr;
++		int ipLen = strcspn(start, ",");
++
++		if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr)))
++			return false;
++		(*num_ds)++;
++		start += ipLen;
++		/* step over the separator, never over the terminating NUL */
++		if (*start == ',')
++			start++;
++	}
++	return true;
++}
++
++/*
++ * pnfs_dlm_device string format:
++ *     block-device-path:<ds1 address>,<ds2 address>[,...]
++ *
++ * Examples
++ * /dev/sda:192.168.1.96,192.168.1.97' creates a data server list with
++ * two data servers for the dlm cluster file system mounted on /dev/sda.
++ *
++ * /dev/sda:192.168.1.96,192.168.1.100'
++ * replaces the data server list for /dev/sda
++ *
++ * Only the deviceid == 1 is supported. Can add device id to
++ * pnfs_dlm_device string when needed.
++ *
++ * Only the round robin each data server once stripe index is supported.
++ *
++ * Returns 0 on success, -ENOMEM when the entry cannot be allocated and
++ * -EINVAL when the string is malformed or a field does not fit.
++ */
++int
++nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len)
++
++{
++	struct dlm_device_entry *new, *found;
++	char *bufp = pnfs_dlm_device;
++	char *endp = bufp + strlen(bufp);
++	int err = -ENOMEM;
++
++	dprintk("--> %s len %d\n", __func__, len);
++
++	new = kzalloc(sizeof(*new), GFP_KERNEL);
++	if (!new)
++		return err;
++
++	err = -EINVAL;
++	/* disk_name */
++	/* FIXME: need to check for valid disk_name. search superblocks?
++	 * check for slash dev slash ?
++	 */
++	len = strcspn(bufp, ":");
++	/* leave room for the NUL that kzalloc() already put in place */
++	if (len == 0 || len >= DISK_NAME_LEN)
++		goto out_free;
++	memcpy(new->disk_name, bufp, len);
++
++	err = -EINVAL;
++	bufp += len + 1;
++	if (bufp >= endp)
++		goto out_free;
++
++	/* data server list */
++	/* FIXME: need to check for comma separated valid ip format */
++	len = strcspn(bufp, ":");
++	/* as above: the list must stay NUL terminated */
++	if (len >= NFSD_DLM_DS_LIST_MAX)
++		goto out_free;
++	memcpy(new->ds_list, bufp, len);
++
++
++	/* validate the ips */
++	if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds)))
++		goto out_free;
++
++	dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__,
++		new->disk_name, new->num_ds, new->ds_list);
++
++	found = _nfsd4_find_pnfs_dlm_device(new->disk_name);
++	if (found) {
++		/* FIXME: should compare found->ds_list with new->ds_list
++		 * and if it is different, kick off a CB_NOTIFY change
++		 * deviceid.
++		 */
++		dprintk("%s pnfs_dlm_device %s:%s already in cache "
++			" replace ds_list with new ds_list %s\n", __func__,
++			found->disk_name, found->ds_list, new->ds_list);
++		/* clear the whole old list so no stale tail survives */
++		memset(found->ds_list, 0, sizeof(found->ds_list));
++		memcpy(found->ds_list, new->ds_list, strlen(new->ds_list));
++		found->num_ds = new->num_ds;
++		kfree(new);
++	} else {
++		dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__,
++			new->disk_name, new->ds_list);
++		spin_lock(&dlm_device_list_lock);
++		list_add(&new->dlm_dev_list, &dlm_device_list);
++		spin_unlock(&dlm_device_list_lock);
++	}
++	dprintk("<-- %s Success\n", __func__);
++	return 0;
++
++out_free:
++	kfree(new);
++	dprintk("<-- %s returns %d\n", __func__, err);
++	return err;
++}
++
++/* Unlink and free every entry of dlm_device_list. */
++void nfsd4_pnfs_dlm_shutdown(void)
++{
++	struct dlm_device_entry *dlm_pdev, *next;
++
++	dprintk("--> %s\n", __func__);
++
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list,
++				  dlm_dev_list) {
++		list_del(&dlm_pdev->dlm_dev_list);
++		kfree(dlm_pdev);
++	}
++	spin_unlock(&dlm_device_list_lock);
++}
++
++static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb,
++				     u32
layout_type,
++				     struct nfsd4_pnfs_dev_iter_res *res)
++{
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		printk(KERN_ERR "%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return -ENOTSUPP;
++	}
++
++	res->gd_eof = 1;
++	if (res->gd_cookie)
++		return -ENOENT;
++
++	res->gd_cookie = 1;
++	res->gd_verf = 1;
++	res->gd_devid = 1;
++	return 0;
++}
++
++/*
++ * Encode the device info for deviceid 1 of the file system on @sb.
++ *
++ * Builds one single-path device address per data server registered for
++ * the block device ("<address>.8.1", netid "tcp"), a stripe index list
++ * 0..n-1, and passes the result to filelayout_encode_devinfo().
++ *
++ * Returns the encoder's result, -ENOTSUPP for a layout type other than
++ * LAYOUT_NFSV4_1_FILES, -EINVAL for an unknown deviceid or an
++ * unregistered disk, and -ENOMEM when an allocation fails.
++ * Every buffer allocated here is freed again before returning.
++ */
++static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb,
++				     struct exp_xdr_stream *xdr,
++				     u32 layout_type,
++				     const struct nfsd4_pnfs_deviceid *devid)
++{
++	int err, len, i = 0;
++	struct pnfs_filelayout_device fdev;
++	struct pnfs_filelayout_devaddr *daddr;
++	struct dlm_device_entry *dlm_pdev;
++	char *bufp;
++
++	err = -ENOTSUPP;
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		dprintk("%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return err;
++	}
++
++	/* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO
++	 * with a gdia_device_id != 1 is invalid.
++	 */
++	err = -EINVAL;
++	if (devid->devid != 1) {
++		dprintk("%s: WARNING: didn't receive a deviceid of "
++			"1 (got: 0x%llx)\n", __func__, devid->devid);
++		return err;
++	}
++
++	/*
++	 * If the DS list has not been established, return -EINVAL
++	 */
++	dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
++	if (!dlm_pdev) {
++		dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
++			sb->s_bdev->bd_disk->disk_name);
++		return err;
++	}
++
++	dprintk("%s: Found disk %s with DS list |%s|\n",
++		__func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
++
++	memset(&fdev, '\0', sizeof(fdev));
++	fdev.fl_device_length = dlm_pdev->num_ds;
++
++	err = -ENOMEM;
++	len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
++	fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
++	if (!fdev.fl_device_list) {
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list "
++				"buffer for %d DSes.\n", __func__,
++				dlm_pdev->num_ds);
++		/* nothing for the cleanup loop below to walk */
++		fdev.fl_device_length = 0;
++		goto out;
++	}
++
++	/* Set a simple stripe indicie */
++	fdev.fl_stripeindices_length = fdev.fl_device_length;
++	fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
++				fdev.fl_stripeindices_length, GFP_KERNEL);
++
++	if (!fdev.fl_stripeindices_list) {
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices "
++				"list buffer for %d DSes.\n", __func__,
++				dlm_pdev->num_ds);
++		goto out;
++	}
++	for (i = 0; i < fdev.fl_stripeindices_length; i++)
++		fdev.fl_stripeindices_list[i] = i;
++
++	/* Transfer the data server list with a single multipath entry */
++	bufp = dlm_pdev->ds_list;
++	for (i = 0; i < fdev.fl_device_length; i++) {
++		daddr = kzalloc(sizeof(*daddr), GFP_KERNEL);
++		if (!daddr) {
++			printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
++				"addr buffer.\n", __func__);
++			goto out;
++		}
++
++		/* hook it up first so the out: path frees it on any error */
++		fdev.fl_device_list[i].fl_multipath_length = 1;
++		fdev.fl_device_list[i].fl_multipath_list = daddr;
++
++		daddr->r_netid.data = "tcp";
++		daddr->r_netid.len = 3;
++
++		len = strcspn(bufp, ",");
++		daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL);
++		if (!daddr->r_addr.data)
++			goto out;
++		memcpy(daddr->r_addr.data, bufp, len);
++		/*
++		 * append the port number.  interpreted as two more bytes
++		 * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
++		 */
++		memcpy(daddr->r_addr.data + len, ".8.1", 4);
++		daddr->r_addr.len = len + 4;
++
++		dprintk("%s: encoding DS |%s|\n", __func__, bufp);
++
++		bufp += len + 1;
++	}
++
++	/* have nfsd encode the device info */
++	err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++	for (i = 0; i < fdev.fl_device_length; i++) {
++		daddr = fdev.fl_device_list[i].fl_multipath_list;
++		if (daddr) {
++			/* the address string is a separate allocation */
++			kfree(daddr->r_addr.data);
++			kfree(daddr);
++		}
++	}
++	kfree(fdev.fl_device_list);
++	kfree(fdev.fl_stripeindices_list);
++	dprintk("<-- %s returns %d\n", __func__, err);
++	return err;
++}
++
++/*
++ * Stripe unit for a file system with the given block size: the block size
++ * itself when it is at least NFSSVC_MAXBLKSIZE, otherwise the largest
++ * multiple of the block size that does not exceed NFSSVC_MAXBLKSIZE.
++ */
++static int get_stripe_unit(int blocksize)
++{
++	if (blocksize >= NFSSVC_MAXBLKSIZE)
++		return blocksize;
++	return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++}
++
++/*
++ * Look up inode block device in pnfs_dlm_device list.
++ * Hash on the inode->i_ino and number of data servers.
++ */
++static int dlm_ino_hash(struct inode *ino)
++{
++	struct dlm_device_entry *de;
++
++	/* If can't find the inode block device in the pnfs_dlm_deivce list
++	 * then don't hand out a layout
++	 */
++	de = nfsd4_find_pnfs_dlm_device(ino->i_sb);
++	/* an entry without data servers cannot yield a stripe index either */
++	if (!de || de->num_ds <= 0)
++		return -1;
++	/*
++	 * Modulo rather than a (num_ds - 1) mask: the mask only reaches
++	 * every data server when num_ds is a power of two.
++	 */
++	return ino->i_ino % (unsigned int)de->num_ds;
++}
++
++static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode,
++			   struct exp_xdr_stream *xdr,
++			   const struct nfsd4_pnfs_layoutget_arg *args,
++			   struct nfsd4_pnfs_layoutget_res *res)
++{
++	struct pnfs_filelayout_layout *layout = NULL;
++	struct knfsd_fh *fhp = NULL;
++	int index;
++	enum nfsstat4 rc = NFS4_OK;
++
++	dprintk("%s: LAYOUT_GET\n", __func__);
++
++	/* DLM exported file systems only support layouts for READ */
++	if (res->lg_seg.iomode == IOMODE_RW)
++		return NFS4ERR_BADIOMODE;
++
++	index = dlm_ino_hash(inode);
++	dprintk("%s first stripe index %d i_ino %lu\n", __func__, index,
++		inode->i_ino);
++	if (index < 0)
++		return NFS4ERR_LAYOUTUNAVAILABLE;
++
++	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	/* Always give out whole file layouts */
++	res->lg_seg.offset = 0;
++	res->lg_seg.length = NFS4_MAX_UINT64;
++	/* Always give out READ ONLY layouts */
++	res->lg_seg.iomode = IOMODE_READ;
++
++	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++	if (layout == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++	/* Set file layout response args */
++	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++	layout->lg_stripe_type = STRIPE_SPARSE;
++	layout->lg_commit_through_mds = false;
++	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++	layout->lg_fh_length = 1;
++	layout->device_id.sbid = args->lg_sbid;
++	layout->device_id.devid = 1;                                /*FSFTEMP*/
++	layout->lg_first_stripe_index = index;                      /*FSFTEMP*/
++	layout->lg_pattern_offset = 0;
++
++	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++	if (fhp == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++
memcpy(fhp, args->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc = filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++nfsd4_pnfs_dlm_layouttype(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++const struct pnfs_export_operations pnfs_dlm_export_ops = { ++ .layout_type = nfsd4_pnfs_dlm_layouttype, ++ .get_device_info = nfsd4_pnfs_dlm_getdevinfo, ++ .get_device_iter = nfsd4_pnfs_dlm_getdeviter, ++ .layout_get = nfsd4_pnfs_dlm_layoutget, ++}; ++EXPORT_SYMBOL(pnfs_dlm_export_ops); +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-09-30 10:17:08.871998000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-09-30 10:17:08.873003000 -0400 +@@ -0,0 +1,620 @@ ++/* ++* linux/fs/nfsd/nfs4pnfsds.c ++* ++* Copyright (c) 2005 The Regents of the University of Michigan. ++* All rights reserved. ++* ++* Andy Adamson ++* ++* Redistribution and use in source and binary forms, with or without ++* modification, are permitted provided that the following conditions ++* are met: ++* ++* 1. Redistributions of source code must retain the above copyright ++* notice, this list of conditions and the following disclaimer. ++* 2. Redistributions in binary form must reproduce the above copyright ++* notice, this list of conditions and the following disclaimer in the ++* documentation and/or other materials provided with the distribution. ++* 3. Neither the name of the University nor the names of its ++* contributors may be used to endorse or promote products derived ++* from this software without specific prior written permission. 
++* ++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++* ++*/ ++#if defined(CONFIG_PNFSD) ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsd.h" ++#include "pnfsd.h" ++#include "state.h" ++ ++/* ++ ******************* ++ * PNFS ++ ******************* ++ */ ++/* ++ * Hash tables for pNFS Data Server state ++ * ++ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using ++ * this data server (DS). ++ * ++ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained ++ * from any MDS. ++ * ++ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained ++ * from any MDS. 
++ * ++ */ ++/* Hash tables for clientid state */ ++#define CLIENT_HASH_BITS 4 ++#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) ++#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) ++ ++#define clientid_hashval(id) \ ++ ((id) & CLIENT_HASH_MASK) ++ ++/* hash table for pnfs_ds_stateid */ ++#define STATEID_HASH_BITS 10 ++#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) ++#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) ++ ++#define stateid_hashval(owner_id, file_id) \ ++ (((owner_id) + (file_id)) & STATEID_HASH_MASK) ++ ++static struct list_head mds_id_tbl; ++static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE]; ++static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE]; ++ ++static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp); ++static inline void put_ds_mdsid(struct pnfs_mds_id *mdp); ++ ++/* Mutex for data server state. Needs to be separate from ++ * mds state mutex since a node can be both mds and ds */ ++static DEFINE_MUTEX(ds_mutex); ++static struct thread_info *ds_mutex_owner; ++ ++static void ++ds_lock_state(void) ++{ ++ mutex_lock(&ds_mutex); ++ ds_mutex_owner = current_thread_info(); ++} ++ ++static void ++ds_unlock_state(void) ++{ ++ BUG_ON(ds_mutex_owner != current_thread_info()); ++ ds_mutex_owner = NULL; ++ mutex_unlock(&ds_mutex); ++} ++ ++static int ++cmp_clid(const clientid_t *cl1, const clientid_t *cl2) ++{ ++ return (cl1->cl_boot == cl2->cl_boot) && ++ (cl1->cl_id == cl2->cl_id); ++} ++ ++void ++nfs4_pnfs_state_init(void) ++{ ++ int i; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&mds_clid_hashtbl[i]); ++ ++ for (i = 0; i < STATEID_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ds_stid_hashtbl[i]); ++ ++ INIT_LIST_HEAD(&mds_id_tbl); ++} ++ ++static struct pnfs_mds_id * ++find_pnfs_mds_id(u32 mdsid) ++{ ++ struct pnfs_mds_id *local = NULL; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ list_for_each_entry(local, &mds_id_tbl, di_hash) { ++ if (local->di_mdsid == mdsid) ++ return local; ++ } ++ return NULL; ++} ++ ++static 
struct pnfs_ds_clientid * ++find_pnfs_ds_clientid(const clientid_t *clid) ++{ ++ struct pnfs_ds_clientid *local = NULL; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = clientid_hashval(clid->cl_id); ++ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) { ++ if (cmp_clid(&local->dc_mdsclid, clid)) ++ return local; ++ } ++ return NULL; ++} ++ ++static struct pnfs_ds_stateid * ++find_pnfs_ds_stateid(stateid_t *stid) ++{ ++ struct pnfs_ds_stateid *local = NULL; ++ u32 st_id = stid->si_stateownerid; ++ u32 f_id = stid->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash) ++ if ((local->ds_stid.si_stateownerid == st_id) && ++ (local->ds_stid.si_fileid == f_id) && ++ (local->ds_stid.si_boot == stid->si_boot)) { ++ stateid_t *sid = &local->ds_stid; ++ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, local, local->ds_flags, ++ STATEID_VAL(sid)); ++ return local; ++ } ++ return NULL; ++} ++ ++static void ++release_ds_mdsid(struct kref *kref) ++{ ++ struct pnfs_mds_id *mdp = ++ container_of(kref, struct pnfs_mds_id, di_ref); ++ dprintk("pNFSD: %s\n", __func__); ++ ++ list_del(&mdp->di_hash); ++ list_del(&mdp->di_mdsclid); ++ kfree(mdp); ++} ++ ++static void ++release_ds_clientid(struct kref *kref) ++{ ++ struct pnfs_ds_clientid *dcp = ++ container_of(kref, struct pnfs_ds_clientid, dc_ref); ++ struct pnfs_mds_id *mdp; ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(dcp->dc_mdsid); ++ if (mdp) ++ put_ds_mdsid(mdp); ++ ++ list_del(&dcp->dc_hash); ++ list_del(&dcp->dc_stateid); ++ list_del(&dcp->dc_permdsid); ++ kfree(dcp); ++} ++ ++static void ++release_ds_stateid(struct kref *kref) ++{ ++ struct pnfs_ds_stateid *dsp = ++ container_of(kref, struct pnfs_ds_stateid, ds_ref); ++ struct pnfs_ds_clientid *dcp; ++ dprintk("pNFS %s: dsp %p\n", __func__, dsp); ++ ++ dcp 
= find_pnfs_ds_clientid(&dsp->ds_mdsclid); ++ if (dcp) ++ put_ds_clientid(dcp); ++ ++ list_del(&dsp->ds_hash); ++ list_del(&dsp->ds_perclid); ++ kfree(dsp); ++} ++ ++static inline void ++put_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_put(&dcp->dc_ref, release_ds_clientid); ++} ++ ++static inline void ++get_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_get(&dcp->dc_ref); ++} ++ ++static inline void ++put_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_put(&mdp->di_ref, release_ds_mdsid); ++} ++ ++static inline void ++get_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_get(&mdp->di_ref); ++} ++ ++static inline void ++put_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_put(&dsp->ds_ref, release_ds_stateid); ++} ++ ++static inline void ++get_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_get(&dsp->ds_ref); ++} ++ ++void ++nfs4_pnfs_state_shutdown(void) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int i; ++ ++ dprintk("pNFSD %s: -->\n", __func__); ++ ++ ds_lock_state(); ++ for (i = 0; i < STATEID_HASH_SIZE; i++) { ++ while (!list_empty(&ds_stid_hashtbl[i])) { ++ dsp = list_entry(ds_stid_hashtbl[i].next, ++ struct pnfs_ds_stateid, ds_hash); ++ put_ds_stateid(dsp); ++ } ++ } ++ ds_unlock_state(); ++} ++ ++static struct pnfs_mds_id * ++alloc_init_mds_id(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL); ++ 
if (!mdp)
++		return NULL;
++	INIT_LIST_HEAD(&mdp->di_hash);
++	INIT_LIST_HEAD(&mdp->di_mdsclid);
++	list_add(&mdp->di_hash, &mds_id_tbl);
++	mdp->di_mdsid = gsp->dsid;
++	mdp->di_mdsboot = 0;
++	kref_init(&mdp->di_ref);
++	return mdp;
++}
++
++/*
++ * Allocate a pnfs_ds_clientid for the clientid in @gsp, hash it, and link
++ * it to the pnfs_mds_id for gsp->dsid, creating that mds id when it does
++ * not exist yet.  The new clientid owns one reference on the mds id.
++ *
++ * Returns the new clientid, or NULL when an allocation fails; in that
++ * case the mds id reference obtained here is dropped again.
++ */
++static struct pnfs_ds_clientid *
++alloc_init_ds_clientid(struct pnfs_get_state *gsp)
++{
++	struct pnfs_mds_id *mdp;
++	struct pnfs_ds_clientid *dcp;
++	clientid_t *clid = (clientid_t *)&gsp->clid;
++	unsigned int hashval = clientid_hashval(clid->cl_id);
++
++	dprintk("pNFSD: %s\n", __func__);
++
++	mdp = find_pnfs_mds_id(gsp->dsid);
++	if (!mdp) {
++		mdp = alloc_init_mds_id(gsp);
++		if (!mdp)
++			return NULL;
++	} else {
++		get_ds_mdsid(mdp);
++	}
++
++	dcp = kmalloc(sizeof(*dcp), GFP_KERNEL);
++	if (!dcp) {
++		/* nobody will own the mds id reference taken above */
++		put_ds_mdsid(mdp);
++		return NULL;
++	}
++
++	INIT_LIST_HEAD(&dcp->dc_hash);
++	INIT_LIST_HEAD(&dcp->dc_stateid);
++	INIT_LIST_HEAD(&dcp->dc_permdsid);
++	list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]);
++	list_add(&dcp->dc_permdsid, &mdp->di_mdsclid);
++	dcp->dc_mdsclid = *clid;
++	kref_init(&dcp->dc_ref);
++	dcp->dc_mdsid = gsp->dsid;
++	return dcp;
++}
++
++static struct pnfs_ds_stateid *
++alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp)
++{
++	struct pnfs_ds_stateid *dsp;
++	u32 st_id = stidp->si_stateownerid;
++	u32 f_id = stidp->si_fileid;
++	unsigned int hashval;
++
++	dprintk("pNFSD: %s\n", __func__);
++
++	dsp = kmalloc(sizeof(*dsp), GFP_KERNEL);
++	if (!dsp)
++		return dsp;
++
++	INIT_LIST_HEAD(&dsp->ds_hash);
++	INIT_LIST_HEAD(&dsp->ds_perclid);
++	memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t));
++	fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle);
++	dsp->ds_access = 0;
++	dsp->ds_status = 0;
++	dsp->ds_flags = 0L;
++	kref_init(&dsp->ds_ref);
++	set_bit(DS_STATEID_NEW, &dsp->ds_flags);
++	clear_bit(DS_STATEID_VALID, &dsp->ds_flags);
++	clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++	init_waitqueue_head(&dsp->ds_waitq);
++
++	hashval = stateid_hashval(st_id, f_id);
++	list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]);
++
dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return dsp;
++}
++
++static int
++update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh,
++		  struct pnfs_get_state *gsp)
++{
++	struct pnfs_ds_clientid *dcp;
++	int new = 0;
++
++	dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++
++	dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid);
++	if (!dcp) {
++		dcp = alloc_init_ds_clientid(gsp);
++		if (!dcp)
++			return 1;
++		new = 1;
++	}
++	if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) {
++		list_add(&dsp->ds_perclid, &dcp->dc_stateid);
++		if (!new)
++			get_ds_clientid(dcp);
++	}
++
++	memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t));
++	dsp->ds_access = gsp->access;
++	dsp->ds_status = 0;
++	dsp->ds_verifier[0] = gsp->verifier[0];
++	dsp->ds_verifier[1] = gsp->verifier[1];
++	memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t));
++	set_bit(DS_STATEID_VALID, &dsp->ds_flags);
++	clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++	clear_bit(DS_STATEID_NEW, &dsp->ds_flags);
++	return 0;
++}
++
++int
++nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs)
++{
++	stateid_t *stid = (stateid_t *)&gs->stid;
++	struct pnfs_ds_stateid *dsp;
++
++	dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__,
++		STATEID_VAL(stid));
++
++	ds_lock_state();
++	dsp = find_pnfs_ds_stateid(stid);
++	if (dsp)
++		put_ds_stateid(dsp);
++	ds_unlock_state();
++
++	dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++
++	if (dsp)
++		return 0;
++	return -ENOENT;
++}
++
++/* Retrieves and validates stateid.
++ * If stateid exists and its fields match, return it.
++ * If stateid exists but either the generation or
++ * ownerids don't match, check with mds to see if it is valid.
++ * If the stateid doesn't exist, the first thread creates a
++ * invalid *marker* stateid, then checks to see if the
++ * stateid exists on the mds.  If so, it validates the *marker*
++ * stateid and updates its fields.  Subsequent threads that
++ * find the *marker* stateid wait until it is valid or an error
++ * occurs.
++ * Called with ds_state_lock.
++ *
++ * Returns the stateid without an extra reference, or NULL when it could
++ * not be allocated or validated.  Once the validation has failed the
++ * stateid is not touched again after the final put_ds_stateid(), because
++ * that put may have freed it.
++ */
++static struct pnfs_ds_stateid *
++nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp)
++{
++	struct inode *ino = cfh->fh_dentry->d_inode;
++	struct super_block *sb;
++	struct pnfs_ds_stateid *dsp = NULL;
++	struct pnfs_get_state gs = {
++		.access = 0,
++	};
++	int status = 0, waiter = 0, failed = 0;
++
++	dprintk("pNFSD: %s -->\n", __func__);
++
++	dsp = find_pnfs_ds_stateid(stidp);
++	if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) &&
++	    (stidp->si_generation == dsp->ds_stid.si_generation))
++		goto out_noput;
++
++	sb = ino->i_sb;
++	/* the operations table itself may be missing, not just get_state */
++	if (!sb || !sb->s_pnfs_op || !sb->s_pnfs_op->get_state)
++		goto out_noput;
++
++	/* Uninitialize current state if it exists yet it doesn't match.
++	 * If it is already invalid, another thread is checking state */
++	if (dsp) {
++		if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags))
++			waiter = 1;
++	} else {
++		dsp = alloc_init_ds_stateid(cfh, stidp);
++		if (!dsp)
++			goto out_noput;
++	}
++
++	dprintk("pNFSD: %s Starting loop\n", __func__);
++	get_ds_stateid(dsp);
++	while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
++		ds_unlock_state();
++
++		/* Another thread is checking the state */
++		if (waiter) {
++			dprintk("pNFSD: %s waiting\n", __func__);
++			wait_event_interruptible_timeout(dsp->ds_waitq,
++				(test_bit(DS_STATEID_VALID, &dsp->ds_flags) ||
++				 test_bit(DS_STATEID_ERROR, &dsp->ds_flags)),
++				 msecs_to_jiffies(1024));
++			dprintk("pNFSD: %s awake\n", __func__);
++			ds_lock_state();
++			if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) {
++				/* remember it now; dsp may be gone after out: */
++				failed = 1;
++				goto out;
++			}
++
++			continue;
++		}
++
++		/* Validate stateid on mds */
++		dprintk("pNFSD: %s Checking state on MDS\n", __func__);
++		memcpy(&gs.stid, stidp, sizeof(stateid_t));
++		status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs);
++		dprintk("pNFSD: %s from MDS status %d\n", __func__, status);
++		ds_lock_state();
++		/* if !status and stateid is valid, update id and mark valid */
++		if (status || update_ds_stateid(dsp, cfh, &gs)) {
++			set_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++			/* wake the waiters while our reference pins dsp */
++			wake_up(&dsp->ds_waitq);
++			/* remove invalid stateid from list */
++			put_ds_stateid(dsp);
++			failed = 1;
++			goto out;
++		}
++
++		wake_up(&dsp->ds_waitq);
++	}
++out:
++	/* drop the reference taken before the loop */
++	put_ds_stateid(dsp);
++	if (failed)
++		dsp = NULL;
++out_noput:
++	if (dsp)
++		dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n",
++			__func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid));
++	/* If error, return null */
++	if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
++		dsp = NULL;
++	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return dsp;
++}
++
++int
++nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid)
++{
++	struct pnfs_ds_stateid *dsp;
++	int status = 0;
++
++	dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__,
++		STATEID_VAL(stateid));
++
++	/* Must release state lock while verifying stateid on mds */
++	nfs4_unlock_state();
++	ds_lock_state();
++	dsp = nfsv4_ds_get_state(cfh, stateid);
++	if (dsp) {
++		get_ds_stateid(dsp);
++		dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__,
++			STATEID_VAL(&dsp->ds_stid));
++
++		dprintk("NFSD: %s: dsp %p fh_size %u:%u "
++			"fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] "
++			"gen %x:%x\n",
++			__func__, dsp,
++			cfh->fh_handle.fh_size, dsp->ds_fh.fh_size,
++			((unsigned *)&cfh->fh_handle.fh_base)[0],
++			((unsigned *)&cfh->fh_handle.fh_base)[1],
++			((unsigned *)&cfh->fh_handle.fh_base)[2],
++			((unsigned *)&cfh->fh_handle.fh_base)[3],
++			((unsigned *)&dsp->ds_fh.fh_base)[0],
++			((unsigned *)&dsp->ds_fh.fh_base)[1],
++			((unsigned *)&dsp->ds_fh.fh_base)[2],
++			((unsigned *)&dsp->ds_fh.fh_base)[3],
++			stateid->si_generation, dsp->ds_stid.si_generation);
++	}
++
++	if (!dsp ||
++	    (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) ||
++	    (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base,
++		    dsp->ds_fh.fh_size) != 0) ||
++	    (stateid->si_generation > dsp->ds_stid.si_generation))
++		status = nfserr_bad_stateid;
++	else if (stateid->si_generation < dsp->ds_stid.si_generation)
++		status = nfserr_old_stateid;
++
++	if (dsp)
++
put_ds_stateid(dsp);
++	ds_unlock_state();
++	nfs4_lock_state();
++	dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status));
++	return status;
++}
++
++/*
++ * Store the two-word write verifier at @p.
++ *
++ * When @stateid names a data server stateid that is currently VALID, the
++ * verifier cached with it is used.  Otherwise the file system's
++ * get_verifier() operation is called, with the data server state lock
++ * released around the call.  @stateid may be NULL.
++ */
++void
++nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p)
++{
++	struct pnfs_ds_stateid *dsp = NULL;
++
++	dprintk("pNFSD: %s --> stid %p\n", __func__, stateid);
++
++	ds_lock_state();
++	if (stateid != NULL) {
++		dsp = find_pnfs_ds_stateid(stateid);
++		if (dsp)
++			get_ds_stateid(dsp);
++	}
++
++	/* XXX: Should we fetch the stateid or wait if some other
++	 * thread is currently retrieving the stateid ? */
++	if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
++		*p++ = dsp->ds_verifier[0];
++		*p++ = dsp->ds_verifier[1];
++		put_ds_stateid(dsp);
++	} else {
++		/* found but not valid: give back the lookup reference */
++		if (dsp)
++			put_ds_stateid(dsp);
++		/* must be on MDS */
++		ds_unlock_state();
++		sb->s_pnfs_op->get_verifier(sb, p);
++		ds_lock_state();
++		p += 2;
++	}
++	ds_unlock_state();
++	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return;
++}
++
++#endif /* CONFIG_PNFSD */
+diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c
+--- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig	2010-09-30 10:15:18.334728000 -0400
++++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c	2010-09-30 10:17:08.878998000 -0400
+@@ -34,10 +34,14 @@
+  */
+ #include
+ #include
++#include
++#include
++#include
+ 
+ #include "cache.h"
+ #include "xdr4.h"
+ #include "vfs.h"
++#include "pnfsd.h"
+ 
+ #define NFSDDBG_FACILITY		NFSDDBG_PROC
+ 
+@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc
+ 	 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
+ */ + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); ++#if defined(CONFIG_SPNFS) ++ if (!status && spnfs_enabled()) { ++ struct inode *inode = cstate->current_fh.fh_dentry->d_inode; ++ ++ status = spnfs_open(inode, open); ++ if (status) { ++ dprintk( ++ "nfsd: pNFS could not be enabled for inode: %lu\n", ++ inode->i_ino); ++ /* ++ * XXX When there's a failure then need to indicate to ++ * future ops that no pNFS is available. Should I save ++ * the status in the inode? It's kind of a big hammer. ++ * But there may be no stripes available? ++ */ ++ } ++ } ++#endif /* CONFIG_SPNFS */ + out: + if (open->op_stateowner) { + nfs4_get_stateowner(open->op_stateowner); +@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str + &access->ac_supported); + } + ++static void ++nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf) ++{ ++ u32 *p = (u32 *)verf->data; ++ ++#if defined(CONFIG_PNFSD) ++ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) { ++ nfs4_ds_get_verifier(NULL, sb, p); ++ return; ++ } ++#endif /* CONFIG_PNFSD */ ++ ++ *p++ = nfssvc_boot.tv_sec; ++ *p++ = nfssvc_boot.tv_usec; ++} ++ + static __be32 + nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) + { + __be32 status; + +- u32 *p = (u32 *)commit->co_verf.data; +- *p++ = nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; +- ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &commit->co_verf); + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); + if (status == nfserr_symlink) +@@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru + { + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; +- u32 *p; + __be32 status = nfs_ok; + unsigned long cnt; + +@@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; +- p = (u32 *)write->wr_verifier.data; +- *p++ = 
nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; + ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &write->wr_verifier); ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) { ++ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode, ++ RETURN_FILE, write->wr_offset, write->wr_buflen); ++ if (!status) { ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++ } ++ } else ++#endif ++ ++ if (spnfs_enabled()) { ++ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode, ++ write->wr_offset, write->wr_buflen, write->wr_vlen, ++ rqstp); ++ if (status == nfs_ok) { ++ /* DMXXX: HACK to get filesize set */ ++ /* write one byte at offset+length-1 */ ++ struct kvec k[1]; ++ char zero = 0; ++ unsigned long cnt = 1; ++ ++ k[0].iov_base = (void *)&zero; ++ k[0].iov_len = 1; ++ nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset+write->wr_buflen-1, k, 1, ++ &cnt, &write->wr_how_written); ++ } ++ } else /* we're not an MDS */ ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++#else + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, &write->wr_how_written); ++#endif /* CONFIG_SPNFS */ ++ + if (filp) + fput(filp); + +@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str + return status == nfserr_same ? nfs_ok : status; + } + ++#if defined(CONFIG_PNFSD) ++ ++static __be32 ++nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp, ++ unsigned int layout_type) ++{ ++ int status, type; ++ ++ /* check to see if pNFS is supported. 
*/ ++ status = nfserr_layoutunavailable; ++ if (exp && exp->ex_pnfs == 0) { ++ dprintk("%s: Underlying file system " ++ "is not exported over pNFS\n", __func__); ++ goto out; ++ } ++ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) { ++ dprintk("%s: Underlying file system " ++ "does not support pNFS\n", __func__); ++ goto out; ++ } ++ ++ type = sb->s_pnfs_op->layout_type(sb); ++ ++ /* check to see if requested layout type is supported. */ ++ status = nfserr_unknown_layouttype; ++ if (!type) ++ dprintk("BUG: %s: layout_type 0 is reserved and must not be " ++ "used by filesystem\n", __func__); ++ else if (type != layout_type) ++ dprintk("%s: requested layout type %d " ++ "does not match supported type %d\n", ++ __func__, layout_type, type); ++ else ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevlist(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevlist *gdlp) ++{ ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ int status; ++ ++ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n", ++ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices, ++ gdlp->gd_cookie, gdlp->gd_verf); ++ ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* We must be able to encode at least one device */ ++ if (!gdlp->gd_maxdevices) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ gdlp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Do nothing if underlying file system does not support ++ * getdevicelist */ ++ if (!sb->s_pnfs_op->get_device_iter) { ++ status = nfserr_notsupp; ++ goto out; ++ } ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdlp->gd_fhp = &cstate->current_fh;
++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevinfo(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevinfo *gdp) ++{ ++ struct super_block *sb; ++ int status; ++ clientid_t clid; ++ ++ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n", ++ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid, ++ gdp->gd_devid.devid, gdp->gd_maxcount); ++ ++ status = nfserr_inval; ++ sb = find_sbid_id(gdp->gd_devid.sbid); ++ dprintk("%s: sb %p\n", __func__, sb); ++ if (!sb) { ++ status = nfserr_noent; ++ goto out; ++ } ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdp->gd_sb = sb; ++ ++ /* Update notifications */ ++ copy_clientid(&clid, cstate->session); ++ pnfs_set_device_notify(&clid, gdp->gd_notify_types); ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutget(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lgp->lg_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_badiomode; ++ if (lgp->lg_seg.iomode != IOMODE_READ && ++ lgp->lg_seg.iomode != IOMODE_RW) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lgp->lg_seg.iomode); ++ goto out; ++ } ++ ++ /* Set up arguments so layout can be retrieved at encode time */ ++ lgp->lg_fhp = current_fh; ++ copy_clientid((clientid_t 
*)&lgp->lg_seg.clientid, cstate->session); ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutcommit(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ int status; ++ struct inode *ino = NULL; ++ struct iattr ia; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ dprintk("NFSD: nfsd4_layoutcommit \n"); ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ ino = current_fh->fh_dentry->d_inode; ++ if (!ino) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = ino->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lcp->args.lc_seg.layout_type); ++ if (status) ++ goto out; ++ ++ /* This will only extend the file length. Do a quick ++ * check to see if there is any point in waiting for the update ++ * locks. ++ * TODO: Is this correct for all back ends? 
++ */ ++ dprintk("%s:new offset: %d new size: %llu old size: %lld\n", ++ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1, ++ ino->i_size); ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session); ++ lcp->res.lc_size_chg = 0; ++ if (sb->s_pnfs_op->layout_commit) { ++ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res); ++ dprintk("%s:layout_commit result %d\n", __func__, status); ++ } else { ++ fh_lock(current_fh); ++ if ((lcp->args.lc_newoffset == 0) || ++ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) { ++ status = 0; ++ lcp->res.lc_size_chg = 0; ++ fh_unlock(current_fh); ++ goto out; ++ } ++ ++ /* Try our best to update the file size */ ++ dprintk("%s: Modifying file size\n", __func__); ++ ia.ia_valid = ATTR_SIZE; ++ ia.ia_size = lcp->args.lc_last_wr + 1; ++ status = notify_change(current_fh->fh_dentry, &ia); ++ fh_unlock(current_fh); ++ dprintk("%s:notify_change result %d\n", __func__, status); ++ } ++ ++ if (!status && lcp->res.lc_size_chg && ++ EX_ISSYNC(current_fh->fh_export)) { ++ dprintk("%s: Synchronously writing inode size %llu\n", ++ __func__, ino->i_size); ++ write_inode_now(ino, 1); ++ lcp->res.lc_newsize = i_size_read(ino); ++ } ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutreturn(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lrp->args.lr_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_return_type != 
RETURN_FILE && ++ lrp->args.lr_return_type != RETURN_FSID && ++ lrp->args.lr_return_type != RETURN_ALL) { ++ dprintk("pNFS %s: invalid return_type %d\n", __func__, ++ lrp->args.lr_return_type); ++ goto out; ++ } ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_seg.iomode != IOMODE_READ && ++ lrp->args.lr_seg.iomode != IOMODE_RW && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lrp->args.lr_seg.iomode); ++ goto out; ++ } ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session); ++ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE); ++ status = nfs4_pnfs_return_layout(sb, current_fh, lrp); ++out: ++ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n", ++ __func__, status, lrp->args.lr_return_type, lrp->lrs_present); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * NULL call. + */ +@@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[ + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_RECLAIM_COMPLETE", + }, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICELIST] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevlist, ++ .op_name = "OP_GETDEVICELIST", ++ }, ++ [OP_GETDEVICEINFO] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevinfo, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_GETDEVICEINFO", ++ }, ++ [OP_LAYOUTGET] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutget, ++ .op_name = "OP_LAYOUTGET", ++ }, ++ [OP_LAYOUTCOMMIT] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutcommit, ++ .op_name = "OP_LAYOUTCOMMIT", ++ }, ++ [OP_LAYOUTRETURN] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutreturn, ++ .op_name = "OP_LAYOUTRETURN", ++ }, ++#endif /* CONFIG_PNFSD */ + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-09-30 10:15:18.345729000 -0400 ++++ 
linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-09-30 10:17:08.887003000 -0400 +@@ -42,6 +42,8 @@ + #include "xdr4.h" + #include "vfs.h" + ++#include "pnfsd.h" ++ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +@@ -60,8 +62,6 @@ static u64 current_sessionid = 1; + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + + /* forward declarations */ +-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); + static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; + static void nfs4_set_recdir(char *recdir); + +@@ -69,6 +69,7 @@ static void nfs4_set_recdir(char *recdir + + /* Currently used for almost all code touching nfsv4 state: */ + static DEFINE_MUTEX(client_mutex); ++struct task_struct *client_mutex_owner; + + /* + * Currently used for the del_recall_lru and file hash table. In an +@@ -86,11 +87,21 @@ void + nfs4_lock_state(void) + { + mutex_lock(&client_mutex); ++ client_mutex_owner = current; ++} ++ ++#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current) ++ ++void ++nfs4_bug_on_unlocked_state(void) ++{ ++ BUG_ON(client_mutex_owner != current); + } + + void + nfs4_unlock_state(void) + { ++ client_mutex_owner = NULL; + mutex_unlock(&client_mutex); + } + +@@ -109,7 +120,7 @@ opaque_hashval(const void *ptr, int nbyt + + static struct list_head del_recall_lru; + +-static inline void ++inline void + put_nfs4_file(struct nfs4_file *fi) + { + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { +@@ -120,7 +131,7 @@ put_nfs4_file(struct nfs4_file *fi) + } + } + +-static inline void ++inline void + get_nfs4_file(struct nfs4_file *fi) + { + atomic_inc(&fi->fi_ref); +@@ -230,7 +241,10 @@ nfs4_close_delegation(struct nfs4_delega + * but we want to remove the lease in any case. 
*/ + if (dp->dl_flock) + vfs_setlease(filp, F_UNLCK, &dp->dl_flock); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(filp); ++ nfs4_lock_state(); + } + + /* Called under the state lock. */ +@@ -266,8 +280,8 @@ static DEFINE_SPINLOCK(client_lock); + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * +- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed +- * setclientid_confirmed info. ++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold ++ * confirmed setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * setclientid info. +@@ -292,6 +306,7 @@ static void unhash_generic_stateid(struc + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); ++ release_pnfs_ds_dev_list(stp); + } + + static void free_generic_stateid(struct nfs4_stateid *stp) +@@ -345,7 +360,10 @@ static void release_open_stateid(struct + { + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(stp->st_vfs_file); ++ nfs4_lock_state(); + free_generic_stateid(stp); + } + +@@ -739,6 +757,8 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + ++ BUG_ON_UNLOCKED_STATE(); ++ + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -758,6 +778,7 @@ expire_client(struct nfs4_client *clp) + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } ++ pnfs_expire_client(clp); + nfsd4_set_callback_client(clp, NULL); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); +@@ -770,6 +791,13 @@ expire_client(struct nfs4_client *clp) + spin_unlock(&client_lock); + } + ++void expire_client_lock(struct nfs4_client 
*clp) ++{ ++ nfs4_lock_state(); ++ expire_client(clp); ++ nfs4_unlock_state(); ++} ++ + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) + { + memcpy(target->cl_verifier.data, source->data, +@@ -859,6 +887,11 @@ static struct nfs4_client *create_client + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++ INIT_LIST_HEAD(&clp->cl_layoutrecalls); ++ atomic_set(&clp->cl_deviceref, 0); ++#endif /* CONFIG_PNFSD */ + INIT_LIST_HEAD(&clp->cl_sessions); + INIT_LIST_HEAD(&clp->cl_lru); + clp->cl_time = get_seconds(); +@@ -908,7 +941,7 @@ move_to_confirmed(struct nfs4_client *cl + renew_client(clp); + } + +-static struct nfs4_client * ++struct nfs4_client * + find_confirmed_client(clientid_t *clid) + { + struct nfs4_client *clp; +@@ -978,6 +1011,24 @@ find_unconfirmed_client_by_str(const cha + return NULL; + } + ++int ++filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), ++ void *arg) ++{ ++ struct nfs4_client *clp, *next; ++ int i, status = 0; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i], ++ cl_strhash) { ++ status = func(clp, arg); ++ if (status) ++ break; ++ } ++ ++ return status; ++} ++ + static void + gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) + { +@@ -1110,8 +1161,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co + static void + nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) + { +- /* pNFS is not supported */ ++#if defined(CONFIG_PNFSD) ++ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS | ++ EXCHGID4_FLAG_USE_PNFS_DS; ++#else /* CONFIG_PNFSD */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; ++#endif /* CONFIG_PNFSD */ + + /* Referrals are supported, Migration is not. 
*/ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; +@@ -1301,6 +1356,13 @@ nfsd4_create_session(struct svc_rqst *rq + struct nfsd4_clid_slot *cs_slot = NULL; + int status = 0; + ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ /* XXX hack to get local ip address */ ++ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local, ++ sizeof(pnfsd_lexp_addr)); ++ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); +@@ -1340,25 +1402,26 @@ nfsd4_create_session(struct svc_rqst *rq + cs_slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + +- if (cr_ses->flags & SESSION4_BACK_CHAN) { +- unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; +- svc_xprt_get(rqstp->rq_xprt); +- rpc_copy_addr( +- (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, +- sa); +- unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); +- unconf->cl_cb_conn.cb_minorversion = +- cstate->minorversion; +- unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; +- unconf->cl_cb_seq_nr = 1; +- nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); +- } ++ if (is_ds_only_session(unconf->cl_exchange_flags)) ++ cr_ses->flags &= ~SESSION4_BACK_CHAN; ++ + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + ++ if (cr_ses->flags & SESSION4_BACK_CHAN) { ++ conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; ++ svc_xprt_get(rqstp->rq_xprt); ++ rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa); ++ conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); ++ conf->cl_cb_conn.cb_minorversion = cstate->minorversion; ++ conf->cl_cb_conn.cb_prog = cr_ses->callback_prog; ++ conf->cl_cb_seq_nr = 1; ++ nfsd4_probe_callback(conf, &conf->cl_cb_conn); ++ } ++ + /* + * We do not support RDMA or persistent sessions + */ +@@ -1746,7 +1809,7 @@ out: + + /* OPEN Share state helper functions */ + static inline struct nfs4_file * +-alloc_init_file(struct 
inode *ino) ++alloc_init_file(struct inode *ino, struct svc_fh *current_fh) + { + struct nfs4_file *fp; + unsigned int hashval = file_hashval(ino); +@@ -1760,6 +1823,16 @@ alloc_init_file(struct inode *ino) + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&fp->fi_layouts); ++ INIT_LIST_HEAD(&fp->fi_layout_states); ++ fp->fi_fsid.major = current_fh->fh_export->ex_fsid; ++ fp->fi_fsid.minor = 0; ++ fp->fi_fhlen = current_fh->fh_handle.fh_size; ++ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval)); ++ memcpy(fp->fi_fhval, &current_fh->fh_handle.fh_base, ++ fp->fi_fhlen); ++#endif /* CONFIG_PNFSD */ + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); +@@ -1768,7 +1841,7 @@ alloc_init_file(struct inode *ino) + return NULL; + } + +-static void ++void + nfsd4_free_slab(struct kmem_cache **slab) + { + if (*slab == NULL) +@@ -1784,6 +1857,7 @@ nfsd4_free_slabs(void) + nfsd4_free_slab(&file_slab); + nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); ++ nfsd4_free_pnfs_slabs(); + } + + static int +@@ -1805,6 +1879,8 @@ nfsd4_init_slabs(void) + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_nomem; ++ if (nfsd4_init_pnfs_slabs()) ++ goto out_nomem; + return 0; + out_nomem: + nfsd4_free_slabs(); +@@ -1878,6 +1954,9 @@ init_stateid(struct nfs4_stateid *stp, s + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); + INIT_LIST_HEAD(&stp->st_perfile); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); +@@ -1919,6 +1998,7 @@ find_openstateowner_str(unsigned int has + { + struct nfs4_stateowner *so = NULL; + ++ BUG_ON_UNLOCKED_STATE(); + list_for_each_entry(so,
&ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) + return so; +@@ -1927,7 +2007,7 @@ find_openstateowner_str(unsigned int has + } + + /* search file_hashtbl[] for file */ +-static struct nfs4_file * ++struct nfs4_file * + find_file(struct inode *ino) + { + unsigned int hashval = file_hashval(ino); +@@ -1945,6 +2025,18 @@ find_file(struct inode *ino) + return NULL; + } + ++struct nfs4_file * ++find_alloc_file(struct inode *ino, struct svc_fh *current_fh) ++{ ++ struct nfs4_file *fp; ++ ++ fp = find_file(ino); ++ if (fp) ++ return fp; ++ ++ return alloc_init_file(ino, current_fh); ++} ++ + static inline int access_valid(u32 x, u32 minorversion) + { + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) +@@ -2503,7 +2595,7 @@ nfsd4_process_open2(struct svc_rqst *rqs + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; + status = nfserr_resource; +- fp = alloc_init_file(ino); ++ fp = alloc_init_file(ino, current_fh); + if (fp == NULL) + goto out; + } +@@ -2730,7 +2822,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct + return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; + } + +-static int ++int + STALE_STATEID(stateid_t *stateid) + { + if (stateid->si_boot == boot_time) +@@ -2740,6 +2832,16 @@ STALE_STATEID(stateid_t *stateid) + return 1; + } + ++__be32 ++nfs4_check_stateid(stateid_t *stateid) ++{ ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ return nfserr_bad_stateid; ++ if (STALE_STATEID(stateid)) ++ return nfserr_stale_stateid; ++ return 0; ++} ++ + static inline int + access_permit_read(unsigned long access_bmap) + { +@@ -2848,6 +2950,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + if (grace_disallows_io(ino)) + return nfserr_grace; + ++#if defined(CONFIG_PNFSD) ++ if (pnfs_fh_is_ds(&current_fh->fh_handle)) { ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ status = nfserr_bad_stateid; ++ else ++#ifdef CONFIG_GFS2_FS_LOCKING_DLM ++ { ++ dprintk("%s
Don't check DS stateid\n", __func__); ++ return 0; ++ } ++#else /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ status = nfs4_preprocess_pnfs_ds_stateid(current_fh, ++ stateid); ++#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ goto out; ++ } ++#endif /* CONFIG_PNFSD */ ++ + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + +@@ -2924,13 +3044,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + *stpp = NULL; + *sopp = NULL; + +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { +- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); +- return nfserr_bad_stateid; +- } +- +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ return status; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; +@@ -3205,11 +3321,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + nfs4_lock_state(); +- status = nfserr_bad_stateid; +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) +- goto out; +- status = nfserr_stale_stateid; +- if (STALE_STATEID(stateid)) ++ status = nfs4_check_stateid(stateid); ++ if (status) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) +@@ -3238,26 +3351,6 @@ out: + #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) + #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +-static inline u64 +-end_offset(u64 start, u64 len) +-{ +- u64 end; +- +- end = start + len; +- return end >= start ? end: NFS4_MAX_UINT64; +-} +- +-/* last octet in a range */ +-static inline u64 +-last_byte_offset(u64 start, u64 len) +-{ +- u64 end; +- +- BUG_ON(!len); +- end = start + len; +- return end > start ? 
end - 1: NFS4_MAX_UINT64; +-} +- + #define lockownerid_hashval(id) \ + ((id) & LOCK_HASH_MASK) + +@@ -3274,7 +3367,7 @@ static struct list_head lock_ownerid_has + static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; + static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; + +-static struct nfs4_stateid * ++struct nfs4_stateid * + find_stateid(stateid_t *stid, int flags) + { + struct nfs4_stateid *local; +@@ -3303,7 +3396,7 @@ find_stateid(stateid_t *stid, int flags) + return NULL; + } + +-static struct nfs4_delegation * ++struct nfs4_delegation * + find_delegation_stateid(struct inode *ino, stateid_t *stid) + { + struct nfs4_file *fp; +@@ -3436,6 +3529,9 @@ alloc_init_lock_stateid(struct nfs4_stat + INIT_LIST_HEAD(&stp->st_perfile); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); +@@ -3998,6 +4094,9 @@ nfs4_state_init(void) + INIT_LIST_HEAD(&client_lru); + INIT_LIST_HEAD(&del_recall_lru); + reclaim_str_hashtbl_size = 0; ++#if defined(CONFIG_PNFSD) ++ nfs4_pnfs_state_init(); ++#endif /* CONFIG_PNFSD */ + return 0; + } + +@@ -4110,6 +4209,7 @@ __nfs4_state_shutdown(void) + } + + nfsd4_shutdown_recdir(); ++ nfs4_pnfs_state_shutdown(); + nfs4_init = 0; + } + +diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-09-30 10:15:18.353734000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-09-30 10:17:08.894999000 -0400 +@@ -47,9 +47,14 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +@@ -1234,6 +1239,138 @@ 
nfsd4_decode_sequence(struct nfsd4_compo + DECODE_TAIL; + } + ++#if defined(CONFIG_PNFSD) ++static __be32 ++nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16 + sizeof(nfs4_verifier)); ++ READ32(gdevl->gd_layout_type); ++ READ32(gdevl->gd_maxdevices); ++ READ64(gdevl->gd_cookie); ++ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ u32 num; ++ DECODE_HEAD; ++ ++ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid)); ++ READ64(gdev->gd_devid.sbid); ++ READ64(gdev->gd_devid.devid); ++ READ32(gdev->gd_layout_type); ++ READ32(gdev->gd_maxcount); ++ READ32(num); ++ if (num) { ++ READ_BUF(4); ++ READ32(gdev->gd_notify_types); ++ } else { ++ gdev->gd_notify_types = 0; ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(36); ++ READ32(lgp->lg_signal); ++ READ32(lgp->lg_seg.layout_type); ++ READ32(lgp->lg_seg.iomode); ++ READ64(lgp->lg_seg.offset); ++ READ64(lgp->lg_seg.length); ++ READ64(lgp->lg_minlength); ++ nfsd4_decode_stateid(argp, &lgp->lg_sid); ++ READ_BUF(4); ++ READ32(lgp->lg_maxcount); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ DECODE_HEAD; ++ u32 timechange; ++ ++ READ_BUF(20); ++ READ64(lcp->args.lc_seg.offset); ++ READ64(lcp->args.lc_seg.length); ++ READ32(lcp->args.lc_reclaim); ++ nfsd4_decode_stateid(argp, &lcp->lc_sid); ++ READ_BUF(4); ++ READ32(lcp->args.lc_newoffset); ++ if (lcp->args.lc_newoffset) { ++ READ_BUF(8); ++ READ64(lcp->args.lc_last_wr); ++ } else ++ lcp->args.lc_last_wr = 0; ++ READ_BUF(4); ++ READ32(timechange); ++ if (timechange) { ++ READ_BUF(12); ++ READ64(lcp->args.lc_mtime.seconds); ++ 
READ32(lcp->args.lc_mtime.nseconds); ++ } else { ++ lcp->args.lc_mtime.seconds = 0; ++ lcp->args.lc_mtime.nseconds = 0; ++ } ++ READ_BUF(8); ++ READ32(lcp->args.lc_seg.layout_type); ++ /* XXX: saving XDR'ed layout update. Since we don't have the ++ * current_fh yet, and therefore no export_ops, we can't call ++ * the layout specific decode routines. File and pVFS2 ++ * do not use the layout update.... ++ */ ++ READ32(lcp->args.lc_up_len); ++ if (lcp->args.lc_up_len > 0) { ++ READ_BUF(lcp->args.lc_up_len); ++ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len); ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16); ++ READ32(lrp->args.lr_reclaim); ++ READ32(lrp->args.lr_seg.layout_type); ++ READ32(lrp->args.lr_seg.iomode); ++ READ32(lrp->args.lr_return_type); ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ READ_BUF(16); ++ READ64(lrp->args.lr_seg.offset); ++ READ64(lrp->args.lr_seg.length); ++ nfsd4_decode_stateid(argp, &lrp->lr_sid); ++ READ_BUF(4); ++ READ32(lrp->args.lrf_body_len); ++ if (lrp->args.lrf_body_len > 0) { ++ READ_BUF(lrp->args.lrf_body_len); ++ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len); ++ } ++ } ++ ++ DECODE_TAIL; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) + { +@@ -1335,11 +1472,19 @@ static nfsd4_dec nfsd41_dec_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, ++#else /* 
CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, +@@ -2136,6 +2281,36 @@ out_acl: + } + WRITE64(stat.ino); + } ++#if defined(CONFIG_PNFSD) ++ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { ++ struct super_block *sb = dentry->d_inode->i_sb; ++ int type = 0; ++ ++ /* Query the filesystem for supported pNFS layout types. ++ * Currently, we only support one layout type per file system. ++ * The export_ops->layout_type() returns the pnfs_layouttype4. ++ */ ++ buflen -= 4; ++ if (buflen < 0) /* length */ ++ goto out_resource; ++ ++ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type) ++ type = sb->s_pnfs_op->layout_type(sb); ++ if (type) { ++ if ((buflen -= 4) < 0) /* type */ ++ goto out_resource; ++ WRITE32(1); /* length */ ++ WRITE32(type); /* type */ ++ } else ++ WRITE32(0); /* length */ ++ } ++ ++ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ if ((buflen -= 4) < 0) ++ goto out_resource; ++ WRITE32(stat.blksize); ++ } ++#endif /* CONFIG_PNFSD */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); +@@ -2366,6 +2541,10 @@ nfsd4_encode_commit(struct nfsd4_compoun + if (!nfserr) { + RESERVE_SPACE(8); + WRITEMEM(commit->co_verf.data, 8); ++ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n", ++ ((u32 *)(&commit->co_verf.data))[0], ++ ((u32 *)(&commit->co_verf.data))[1]); ++ + ADJUST_ARGS(); + } + return nfserr; +@@ -2620,9 +2799,20 @@ nfsd4_encode_read(struct nfsd4_compoundr + } + read->rd_vlen = v; + ++#if defined(CONFIG_SPNFS) ++ if (spnfs_enabled()) ++ nfserr = 
spnfs_read(read->rd_fhp->fh_dentry->d_inode, ++ read->rd_offset, &maxcount, read->rd_vlen, ++ resp->rqstp); ++ else /* we're not an MDS */ ++ nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, ++ read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, ++ &maxcount); ++#else + nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); ++#endif /* CONFIG_SPNFS */ + + if (nfserr == nfserr_symlink) + nfserr = nfserr_inval; +@@ -2926,6 +3116,9 @@ nfsd4_encode_write(struct nfsd4_compound + WRITE32(write->wr_bytes_written); + WRITE32(write->wr_how_written); + WRITEMEM(write->wr_verifier.data, 8); ++ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n", ++ ((u32 *)(&write->wr_verifier.data))[0], ++ ((u32 *)(&write->wr_verifier.data))[1]); + ADJUST_ARGS(); + } + return nfserr; +@@ -3069,6 +3262,343 @@ nfsd4_encode_sequence(struct nfsd4_compo + return 0; + } + ++#if defined(CONFIG_PNFSD) ++ ++/* Uses the export interface to iterate through the available devices ++ * and encodes them on the response stream. 
++ */
++static __be32
++nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp,
++			      struct nfsd4_pnfs_getdevlist *gdevl,
++			      unsigned int *dev_count)
++{
++	struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb;
++	__be32 nfserr;
++	int status;
++	__be32 *p;
++	struct nfsd4_pnfs_dev_iter_res res = {	/* iterator state passed to the fs */
++		.gd_cookie = gdevl->gd_cookie,
++		.gd_verf = gdevl->gd_verf,
++		.gd_eof = 0
++	};
++	u64 sbid;
++
++	dprintk("%s: Begin\n", __func__);
++
++	sbid = find_create_sbid(sb);
++	*dev_count = 0;
++	do {
++		status = sb->s_pnfs_op->get_device_iter(sb,
++							gdevl->gd_layout_type,
++							&res);
++		if (status) {
++			if (status == -ENOENT) {
++				res.gd_eof = 1;
++				/* return success */
++				break;
++			}
++			nfserr = nfserrno(status);
++			goto out_err;
++		}
++
++		/* Encode the device id as two 64-bit words: sbid, then devid */
++		RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid));
++		WRITE64(sbid);		/* superblock id, host-endian u64 */
++		WRITE64(res.gd_devid);	/* devid minor */
++		ADJUST_ARGS();
++		(*dev_count)++;
++	} while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof);
++	gdevl->gd_cookie = res.gd_cookie;	/* hand iterator state back to the caller */
++	gdevl->gd_verf = res.gd_verf;
++	gdevl->gd_eof = res.gd_eof;
++	nfserr = nfs_ok;
++out_err:
++	dprintk("%s: Encoded %u devices\n", __func__, *dev_count);
++	return nfserr;
++}
++
++/* Encodes the response of get device list.
++*/
++static __be32
++nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr,
++			struct nfsd4_pnfs_getdevlist *gdevl)
++{
++	unsigned int dev_count = 0, lead_count;
++	__be32 *p_in = resp->p;	/* start of this reply; rewind/backfill target */
++	__be32 *p;
++
++	dprintk("%s: err %d\n", __func__, nfserr);
++	if (nfserr)
++		return nfserr;
++
++	/* Ensure we have room for cookie, verifier, and devlist len,
++	 * which we will backfill in after we encode as many devices as possible
++	 */
++	lead_count = 8 + sizeof(nfs4_verifier) + 4;
++	RESERVE_SPACE(lead_count);
++	/* skip past these values */
++	p += XDR_QUADLEN(lead_count);
++	ADJUST_ARGS();
++
++	/* Iterate over as many device ids as possible on the xdr stream */
++	nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count);
++	if (nfserr)
++		goto out_err;
++
++	/* Backfill in cookie, verf and number of devices encoded */
++	p = p_in;
++	WRITE64(gdevl->gd_cookie);
++	WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier));
++	WRITE32(dev_count);
++
++	/* Skip over devices */
++	p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid));
++	ADJUST_ARGS();
++
++	/* are we at the end of devices? */
++	RESERVE_SPACE(4);
++	WRITE32(gdevl->gd_eof);
++	ADJUST_ARGS();
++
++	dprintk("%s: done.\n", __func__);
++
++	nfserr = nfs_ok;
++out:
++	return nfserr;
++out_err:
++	p = p_in;	/* discard everything encoded for this op */
++	ADJUST_ARGS();
++	goto out;
++}
++
++/* For a given device id, have the file system retrieve and encode the
++ * associated device.  For file layout, the encoding function is
++ * passed down to the file system.  The file system then has the option
++ * of using this encoding function or one of its own.
++ *
++ * Note: the file system must return the XDR size of struct device_addr4
++ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the
++ * gdir_mincount calculation.
++ */ ++static __be32 ++nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ struct super_block *sb; ++ int maxcount = 0, type_notify_len = 12; ++ __be32 *p, *p_save = NULL, *p_in = resp->p; ++ struct exp_xdr_stream xdr; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = gdev->gd_sb; ++ ++ if (gdev->gd_maxcount != 0) { ++ /* FIXME: this will be bound by the session max response */ ++ maxcount = svc_max_payload(resp->rqstp); ++ if (maxcount > gdev->gd_maxcount) ++ maxcount = gdev->gd_maxcount; ++ ++ /* Ensure have room for type and notify field */ ++ maxcount -= type_notify_len; ++ if (maxcount < 0) { ++ nfserr = -ETOOSMALL; ++ goto toosmall; ++ } ++ } ++ ++ RESERVE_SPACE(4); ++ WRITE32(gdev->gd_layout_type); ++ ADJUST_ARGS(); ++ ++ /* If maxcount is 0 then just update notifications */ ++ if (gdev->gd_maxcount == 0) ++ goto handle_notifications; ++ ++ xdr.p = p_save = resp->p; ++ xdr.end = resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type, ++ &gdev->gd_devid); ++ if (nfserr) ++ goto err; ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ BUG_ON(xdr.p > xdr.end); ++ ++ /* Update the xdr stream with the number of bytes encoded ++ * by the file system. 
++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++handle_notifications: ++ /* Encode supported device notifications */ ++ RESERVE_SPACE(4); ++ if (sb->s_pnfs_op->set_device_notify) { ++ struct pnfs_devnotify_arg dn_args; ++ ++ dn_args.dn_layout_type = gdev->gd_layout_type; ++ dn_args.dn_devid = gdev->gd_devid; ++ dn_args.dn_notify_types = gdev->gd_notify_types; ++ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args); ++ if (nfserr) ++ goto err; ++ WRITE32(dn_args.dn_notify_types); ++ } else { ++ WRITE32(0); ++ } ++ ADJUST_ARGS(); ++ ++out: ++ return nfserrno(nfserr); ++toosmall: ++ dprintk("%s: maxcount too small\n", __func__); ++ RESERVE_SPACE(4); ++ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len); ++ ADJUST_ARGS(); ++ goto out; ++err: ++ /* Rewind to the beginning */ ++ p = p_in; ++ ADJUST_ARGS(); ++ if (nfserr == -ETOOSMALL) ++ goto toosmall; ++ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr); ++ goto out; ++} ++ ++static __be32 ++nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, ++ __be32 nfserr, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int maxcount, leadcount; ++ struct super_block *sb; ++ struct exp_xdr_stream xdr; ++ __be32 *p, *p_save, *p_start = resp->p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb; ++ maxcount = PAGE_SIZE; ++ if (maxcount > lgp->lg_maxcount) ++ maxcount = lgp->lg_maxcount; ++ ++ /* Check for space on xdr stream */ ++ leadcount = 36 + sizeof(stateid_opaque_t); ++ RESERVE_SPACE(leadcount); ++ /* encode layout metadata after file system encodes layout */ ++ p += XDR_QUADLEN(leadcount); ++ ADJUST_ARGS(); ++ ++ /* Ensure have room for ret_on_close, off, len, iomode, type */ ++ maxcount -= leadcount; ++ if (maxcount < 0) { ++ printk(KERN_ERR "%s: buffer too small\n", __func__); ++ nfserr = nfserr_toosmall; ++ goto err; ++ } ++ ++ /* Set xdr info so file system can encode layout */ ++ xdr.p = p_save = resp->p; ++ xdr.end = 
resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ /* Retrieve, encode, and merge layout; process stateid */ ++ nfserr = nfs4_pnfs_get_layout(lgp, &xdr); ++ if (nfserr) ++ goto err; ++ ++ /* Ensure file system returned enough bytes for the client ++ * to access. ++ */ ++ if (lgp->lg_seg.length < lgp->lg_minlength) { ++ nfserr = nfserr_badlayout; ++ goto err; ++ } ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ ++ /* Rewind to beginning and encode attrs */ ++ resp->p = p_start; ++ RESERVE_SPACE(4); ++ WRITE32(lgp->lg_roc); /* return on close */ ++ ADJUST_ARGS(); ++ nfsd4_encode_stateid(resp, &lgp->lg_sid); ++ RESERVE_SPACE(28); ++ /* Note: response logr_layout array count, always one for now */ ++ WRITE32(1); ++ WRITE64(lgp->lg_seg.offset); ++ WRITE64(lgp->lg_seg.length); ++ WRITE32(lgp->lg_seg.iomode); ++ WRITE32(lgp->lg_seg.layout_type); ++ ++ /* Update the xdr stream with the number of bytes written ++ * by the file system ++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++ return nfs_ok; ++err: ++ resp->p = p_start; ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lcp->res.lc_size_chg); ++ ADJUST_ARGS(); ++ if (lcp->res.lc_size_chg) { ++ RESERVE_SPACE(8); ++ WRITE64(lcp->res.lc_newsize); ++ ADJUST_ARGS(); ++ } ++out: ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lrp->lrs_present != 0); /* got stateid? 
*/ ++ ADJUST_ARGS(); ++ if (lrp->lrs_present) ++ nfsd4_encode_stateid(resp, &lrp->lr_sid); ++out: ++ return nfserr; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) + { +@@ -3129,11 +3659,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, +diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-09-30 10:15:18.364728000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-09-30 10:17:08.900002000 -0400 +@@ -13,10 +13,15 @@ + #include + #include + #include ++#include + + #include "nfsd.h" + #include "cache.h" + ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++#include ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + /* + * We have a single directory with 9 nodes in it. 
+ */ +@@ -49,6 +54,9 @@ enum { + NFSD_Gracetime, + NFSD_RecoveryDir, + #endif ++#ifdef CONFIG_PNFSD ++ NFSD_pnfs_dlm_device, ++#endif + }; + + /* +@@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct fi + static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif ++#ifdef CONFIG_PNFSD ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size); ++#endif + + static ssize_t (*write_op[])(struct file *, char *, size_t) = { + [NFSD_Svc] = write_svc, +@@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device, ++#endif + }; + + static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +@@ -1349,6 +1363,68 @@ static ssize_t write_recoverydir(struct + + #endif + ++#ifdef CONFIG_PNFSD ++ ++static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf, ++ size_t size) ++{ ++ char *mesg = buf; ++ char *pnfs_dlm_device; ++ int max_size = NFSD_PNFS_DLM_DEVICE_MAX; ++ int len, ret = 0; ++ ++ if (size > 0) { ++ ret = -EINVAL; ++ if (size > max_size || buf[size-1] != '\n') ++ return ret; ++ buf[size-1] = 0; ++ ++ pnfs_dlm_device = mesg; ++ len = qword_get(&mesg, pnfs_dlm_device, size); ++ if (len <= 0) ++ return ret; ++ ++ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len); ++ } else ++ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT); ++ ++ return ret <= 0 ? 
ret : strlen(buf); ++} ++ ++/** ++ * write_pnfs_dlm_device - Set or report the current pNFS data server list ++ * ++ * Input: ++ * buf: ignored ++ * size: zero ++ * ++ * OR ++ * ++ * Input: ++ * buf: C string containing a block device name, ++ * a colon, and then a comma separated ++ * list of pNFS data server IPv4 addresses ++ * size: non-zero length of C string in @buf ++ * Output: ++ * On success: passed-in buffer filled with '\n'-terminated C ++ * string containing a block device name, a colon, and ++ * then a comma separated list of pNFS ++ * data server IPv4 addresses. ++ * return code is the size in bytes of the string ++ * On error: return code is a negative errno value ++ */ ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __write_pnfs_dlm_device(file, buf, size); ++ mutex_unlock(&nfsd_mutex); ++ return rv; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ + /*----------------------------------------------------------------------------*/ + /* + * populating the filesystem. 
+@@ -1383,6 +1459,10 @@ static int nfsd_fill_super(struct super_ + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops, ++ S_IWUSR|S_IRUSR}, ++#endif + /* last one */ {""} + }; + return simple_fill_super(sb, 0x6e667364, nfsd_files); +@@ -1421,6 +1501,9 @@ static int create_proc_exports_entry(voi + } + #endif + ++#if defined(CONFIG_SPNFS_BLOCK) ++int nfsd_bl_init(void); ++#endif + static int __init init_nfsd(void) + { + int retval; +@@ -1443,6 +1526,15 @@ static int __init init_nfsd(void) + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ retval = spnfs_init_proc(); ++ if (retval != 0) ++ goto out_free_idmap; ++#if defined(CONFIG_SPNFS_BLOCK) ++ nfsd_bl_init(); ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_all; +@@ -1465,7 +1557,22 @@ out_free_stat: + + static void __exit exit_nfsd(void) + { ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ remove_proc_entry("fs/nfs/spnfs/recall", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/getfh", NULL); ++ remove_proc_entry("fs/nfs/spnfs/config", NULL); ++ remove_proc_entry("fs/nfs/spnfs/ctl", NULL); ++ remove_proc_entry("fs/nfs/spnfs", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ + nfsd_export_shutdown(); ++ nfsd4_pnfs_dlm_shutdown(); + nfsd_reply_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); + 
remove_proc_entry("fs/nfs", NULL); +diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h +--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig 2010-09-30 10:15:18.370728000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsd.h 2010-09-30 10:17:08.906000000 -0400 +@@ -285,11 +285,17 @@ extern time_t nfsd4_grace; + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + ++#if defined(CONFIG_PNFSD) ++#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ ++ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES) ++#else /* CONFIG_PNFSD */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 ++#endif /* CONFIG_PNFSD */ + + #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ +- (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) ++ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \ ++ FATTR4_WORD2_LAYOUT_BLKSIZE) + + static inline u32 nfsd_suppattrs0(u32 minorversion) + { +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c 2010-09-30 10:17:08.911003000 -0400 +@@ -10,6 +10,7 @@ + #include + + #include ++#include + #include "nfsd.h" + #include "vfs.h" + #include "auth.h" +@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s + static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) + { + struct knfsd_fh *fh = &fhp->fh_handle; ++ int fsid_type; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; +@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct + return error; + if (fh->fh_auth_type != 0) + return error; +- len = key_len(fh->fh_fsid_type) / 4; ++ fsid_type = pnfs_fh_fsid_type(fh); ++ len = key_len(fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { +@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct + data_left -= len; + if (data_left < 0) + 
return error; +- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); ++ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth); + fid = (struct fid *)(fh->fh_auth + len); + } else { + __u32 tfh[2]; +diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h +--- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-09-30 10:17:08.917000000 -0400 +@@ -14,6 +14,7 @@ enum nfsd_fsid { + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, ++ FSID_MAX + }; + + enum fsid_source { +@@ -205,4 +206,42 @@ fh_unlock(struct svc_fh *fhp) + } + } + ++#if defined(CONFIG_PNFSD) ++ ++/* ++ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied ++ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how ++ * to handle a given stateid. ++ */ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return fh->fh_fsid_type >= FSID_MAX; ++} ++ ++static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh) ++{ ++ BUG_ON(fh->fh_version != 1); ++ BUG_ON(pnfs_fh_is_ds(fh)); ++ fh->fh_fsid_type += FSID_MAX; ++} ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ ++/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). 
*/ ++static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh) ++{ ++ int fsid_type = fh->fh_fsid_type; ++ ++ if (pnfs_fh_is_ds(fh)) ++ return fsid_type - FSID_MAX; ++ return fsid_type; ++} ++ + #endif /* _LINUX_NFSD_FH_INT_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c +--- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-09-30 10:15:05.063337000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-09-30 10:17:08.922000000 -0400 +@@ -115,7 +115,7 @@ struct svc_program nfsd_program = { + + }; + +-u32 nfsd_supported_minorversion; ++u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION; + + int nfsd_vers(int vers, enum vers_op change) + { +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h +--- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-09-30 10:17:08.924003000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-09-30 10:17:08.926004000 -0400 +@@ -0,0 +1,143 @@ ++/* ++ * Copyright (c) 2005 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#ifndef LINUX_NFSD_PNFSD_H ++#define LINUX_NFSD_PNFSD_H ++ ++#include ++#include ++ ++#include "state.h" ++#include "xdr4.h" ++ ++/* outstanding layout stateid */ ++struct nfs4_layout_state { ++ struct list_head ls_perfile; ++ struct list_head ls_layouts; /* list of nfs4_layouts */ ++ struct kref ls_ref; ++ struct nfs4_client *ls_client; ++ struct nfs4_file *ls_file; ++ stateid_t ls_stateid; ++}; ++ ++/* outstanding layout */ ++struct nfs4_layout { ++ struct list_head lo_perfile; /* hash by f_id */ ++ struct list_head lo_perclnt; /* hash by clientid */ ++ struct list_head lo_perstate; ++ struct nfs4_file *lo_file; /* backpointer */ ++ struct nfs4_client *lo_client; ++ struct nfs4_layout_state *lo_state; ++ struct nfsd4_layout_seg lo_seg; ++}; ++ ++struct pnfs_inval_state { ++ struct knfsd_fh mdsfh; /* needed only by invalidate all */ ++ stateid_t stid; ++ clientid_t clid; ++ u32 status; ++}; ++ ++/* pNFS Data Server state */ ++#define DS_STATEID_VALID 0 ++#define DS_STATEID_ERROR 1 ++#define DS_STATEID_NEW 2 ++ ++struct pnfs_ds_stateid { ++ struct list_head ds_hash; /* ds_stateid hash entry */ ++ struct list_head ds_perclid; /* per client hash entry */ ++ stateid_t ds_stid; ++ struct knfsd_fh 
ds_fh; ++ unsigned long ds_access; ++ u32 ds_status; /* from MDS */ ++ u32 ds_verifier[2]; /* from MDS */ ++ wait_queue_head_t ds_waitq; ++ unsigned long ds_flags; ++ struct kref ds_ref; ++ clientid_t ds_mdsclid; ++}; ++ ++struct pnfs_ds_clientid { ++ struct list_head dc_hash; /* mds_clid_hashtbl entry */ ++ struct list_head dc_stateid; /* ds_stateid head */ ++ struct list_head dc_permdsid; /* per mdsid hash entry */ ++ clientid_t dc_mdsclid; ++ struct kref dc_ref; ++ uint32_t dc_mdsid; ++}; ++ ++struct pnfs_mds_id { ++ struct list_head di_hash; /* mds_nodeid list entry */ ++ struct list_head di_mdsclid; /* mds_clientid head */ ++ uint32_t di_mdsid; ++ time_t di_mdsboot; /* mds boot time */ ++ struct kref di_ref; ++}; ++ ++/* notify device request (from exported filesystem) */ ++struct nfs4_notify_device { ++ struct nfsd4_pnfs_cb_dev_list *nd_list; ++ struct nfs4_client *nd_client; ++ struct list_head nd_perclnt; ++ ++ void *nd_args; /* nfsd internal */ ++}; ++ ++u64 find_create_sbid(struct super_block *); ++struct super_block *find_sbid_id(u64); ++__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *); ++int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *, ++ struct nfsd4_pnfs_layoutreturn *); ++int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *); ++int nfs4_pnfs_cb_change_state(struct pnfs_get_state *); ++void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++int put_layoutrecall(struct nfs4_layoutrecall *); ++void nomatching_layout(struct nfs4_layoutrecall *); ++void *layoutrecall_done(struct nfs4_layoutrecall *); ++int nfsd4_cb_layout(struct nfs4_layoutrecall *); ++int nfsd_layout_recall_cb(struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++int nfsd_device_notify_cb(struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++int nfsd4_cb_notify_device(struct nfs4_notify_device *); ++void pnfs_set_device_notify(clientid_t *, unsigned int types); ++void 
pnfs_clear_device_notify(struct nfs4_client *); ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++extern struct sockaddr pnfsd_lexp_addr; ++extern size_t pnfs_lexp_addr_len; ++ ++extern void pnfsd_lexp_init(struct inode *); ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#endif /* LINUX_NFSD_PNFSD_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c +--- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-09-30 10:17:08.928999000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-09-30 10:17:08.930006000 -0400 +@@ -0,0 +1,225 @@ ++/* ++ * linux/fs/nfsd/pnfs_lexp.c ++ * ++ * pNFS export of local filesystems. ++ * ++ * Export local file systems over the files layout type. ++ * The MDS (metadata server) functions also as a single DS (data server). ++ * This is mostly useful for development and debugging purposes. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * Copyright (C) 2008 Benny Halevy, ++ * ++ * Initial implementation was based on the pnfs-gfs2 patches done ++ * by David M. 
Richter ++ */ ++ ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++struct sockaddr pnfsd_lexp_addr; ++size_t pnfs_lexp_addr_len; ++ ++static int ++pnfsd_lexp_layout_type(struct super_block *sb) ++{ ++ int ret = LAYOUT_NFSV4_1_FILES; ++ dprintk("<-- %s: return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++pnfsd_lexp_get_device_iter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ ++ dprintk("<-- %s: return 0\n", __func__); ++ return 0; ++} ++ ++static int ++pnfsd_lexp_get_device_info(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_multipath fl_devices[1]; ++ u32 fl_stripe_indices[1] = { 0 }; ++ struct pnfs_filelayout_devaddr daddr; ++ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */ ++ char daddr_buf[8*4 + 2*3 + 10]; ++ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ memset(&fdev, '\0', sizeof(fdev)); ++ ++ if (devid->devid != 1) { ++ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 " ++ "(got: 0x%llx)\n", __func__, devid->devid); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ /* count the number of comma-delimited DS IPs */ ++ fdev.fl_device_length = 1; ++ fdev.fl_device_list = fl_devices; ++ ++ fdev.fl_stripeindices_length = fdev.fl_device_length; ++ fdev.fl_stripeindices_list = fl_stripe_indices; ++ ++ daddr.r_addr.data = daddr_buf; ++ daddr.r_addr.len = sizeof(daddr_buf); ++ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr); ++ if (err < 0) ++ goto out; ++ daddr.r_addr.len = err; ++ switch (pnfsd_lexp_addr.sa_family) 
{
++	case AF_INET:
++		daddr.r_netid.data = "tcp";
++		daddr.r_netid.len = 3;
++		break;
++	case AF_INET6:
++		daddr.r_netid.data = "tcp6";
++		daddr.r_netid.len = 4;
++		break;
++	default:
++		BUG();
++	}
++	fdev.fl_device_list[0].fl_multipath_length = 1;
++	fdev.fl_device_list[0].fl_multipath_list = &daddr;
++
++	/* have nfsd encode the device info */
++	err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++	dprintk("<-- %s: return %d\n", __func__, err);
++	return err;
++}
++
++static int get_stripe_unit(int blocksize)
++{
++	if (blocksize < NFSSVC_MAXBLKSIZE)	/* largest multiple of blocksize <= max */
++		blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++	dprintk("%s: return %d\n", __func__, blocksize);
++	return blocksize;
++}
++
++static enum nfsstat4
++pnfsd_lexp_layout_get(struct inode *inode,
++		      struct exp_xdr_stream *xdr,
++		      const struct nfsd4_pnfs_layoutget_arg *arg,
++		      struct nfsd4_pnfs_layoutget_res *res)
++{
++	enum nfsstat4 rc = NFS4_OK;
++	struct pnfs_filelayout_layout *layout = NULL;
++	struct knfsd_fh *fhp = NULL;
++
++	dprintk("--> %s: inode=%p\n", __func__, inode);
++
++	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	res->lg_seg.offset = 0;
++	res->lg_seg.length = NFS4_MAX_UINT64;
++
++	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++	if (layout == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;	/* nfsstat4, not -errno: transient failure */
++		goto error;
++	}
++
++	/* Set file layout response args */
++	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++	layout->lg_stripe_type = STRIPE_SPARSE;
++	layout->lg_commit_through_mds = true;
++	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++	layout->lg_fh_length = 1;
++	layout->device_id.sbid = arg->lg_sbid;
++	layout->device_id.devid = 1;			/*FSFTEMP*/
++	layout->lg_first_stripe_index = 0;		/*FSFTEMP*/
++	layout->lg_pattern_offset = 0;
++
++	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++	if (fhp == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;	/* nfsstat4, not -errno: transient failure */
++		goto error;
++	}
++
++	memcpy(fhp, arg->lg_fh, sizeof(*fhp));	/* private copy of the MDS filehandle */
++	pnfs_fh_mark_ds(fhp);
++	layout->lg_fh_list = fhp;
++
++	/* Call nfsd to encode layout */
++	rc = filelayout_encode_layout(xdr, layout);
++exit:
++	kfree(layout);
++	kfree(fhp);
++	dprintk("<-- %s: return %d\n", __func__, rc);
++	return rc;
++
++error:
++	res->lg_seg.length = 0;	/* nothing granted */
++	goto exit;
++}
++
++static int
++pnfsd_lexp_layout_commit(struct inode *inode,
++			 const struct nfsd4_pnfs_layoutcommit_arg *args,
++			 struct nfsd4_pnfs_layoutcommit_res *res)
++{
++	dprintk("%s: (unimplemented)\n", __func__);
++
++	return 0;
++}
++
++static int
++pnfsd_lexp_layout_return(struct inode *inode,
++			 const struct nfsd4_pnfs_layoutreturn_arg *args)
++{
++	dprintk("%s: (unimplemented)\n", __func__);
++
++	return 0;
++}
++
++static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh,
++				struct pnfs_get_state *p)
++{
++	return 0;	/* just use the current stateid */
++}
++
++static struct pnfs_export_operations pnfsd_lexp_ops = {
++	.layout_type = pnfsd_lexp_layout_type,
++	.get_device_info = pnfsd_lexp_get_device_info,
++	.get_device_iter = pnfsd_lexp_get_device_iter,
++	.layout_get = pnfsd_lexp_layout_get,
++	.layout_commit = pnfsd_lexp_layout_commit,
++	.layout_return = pnfsd_lexp_layout_return,
++	.get_state = pnfsd_lexp_get_state,
++};
++
++void
++pnfsd_lexp_init(struct inode *inode)
++{
++	dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops);
++	inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops;
++}
+diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c
+--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig	2010-09-30 10:17:08.933003000 -0400
++++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c	2010-09-30 10:17:08.935000000 -0400
+@@ -0,0 +1,535 @@
++/*
++ * fs/nfsd/spnfs_com.c
++ *
++ * Communication layer between spNFS kernel and userspace
++ * Based heavily on idmap.c
++ *
++ */
++
++/*
++ * Copyright (c) 2002 The Regents of the University of Michigan.
++ * All rights reserved.
++ * ++ * Marius Aamodt Eriksen ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++static ssize_t spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t spnfs_pipe_downcall(struct file *, const char __user *, ++ size_t); ++static void spnfs_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops spnfs_upcall_ops = { ++ .upcall = spnfs_pipe_upcall, ++ .downcall = spnfs_pipe_downcall, ++ .destroy_msg = spnfs_pipe_destroy_msg, ++}; ++ ++/* evil global variable */ ++struct spnfs *global_spnfs; ++struct spnfs_config *spnfs_config; ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++int spnfs_use_layoutsegments; ++uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++/* ++ * Used by spnfs_enabled() ++ * Tracks if the subsystem has been initialized at some point. It doesn't ++ * matter if it's not currently initialized. ++ */ ++static int spnfs_enabled_at_some_point; ++ ++/* ++ * Create the "spnfs" pipe under rpc_pipefs "/nfs" and publish the new ++ * struct spnfs through global_spnfs. Returns 0 on success, -EEXIST if ++ * an instance already exists, or a negative errno on failure. ++ */ ++/* call this to start the ball rolling */ ++/* code it like we're going to avoid the global variable in the future */ ++int ++nfsd_spnfs_new(void) ++{ ++ struct spnfs *spnfs = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ if (global_spnfs != NULL) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL); ++ if (spnfs == NULL){ ++ /* release the path reference obtained by vfs_path_lookup() */ ++ path_put(&nd.path); ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs, ++ &spnfs_upcall_ops, 0); ++ /* release the path reference obtained by vfs_path_lookup() */ ++ path_put(&nd.path); ++ if (IS_ERR(spnfs->spnfs_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ ++ mutex_init(&spnfs->spnfs_lock); ++ mutex_init(&spnfs->spnfs_plock); ++ init_waitqueue_head(&spnfs->spnfs_wq); ++ ++ global_spnfs = spnfs; ++ spnfs_enabled_at_some_point = 1; ++ ++ return 0; 
++err: ++ rpc_put_mount(); ++ kfree(spnfs); ++ return rc; ++} ++ ++/* again, code it like we're going to remove the global variable */ ++void ++nfsd_spnfs_delete(void) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ ++ if (!spnfs) ++ return; ++ rpc_unlink(spnfs->spnfs_dentry); ++ rpc_put_mount(); ++ global_spnfs = NULL; ++ kfree(spnfs); ++} ++ ++/* RPC pipefs upcall/downcall routines */ ++/* looks like this code is invoked by the rpc_pipe code */ ++/* to handle upcalls on things we've queued elsewhere */ ++/* See nfs_idmap_id for an exmaple of enqueueing */ ++static ssize_t ++spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, ++ char __user *dst, size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied; ++ ssize_t left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ return mlen; ++} ++ ++static ssize_t ++spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ struct spnfs *spnfs = (struct spnfs *)rpci->private; ++ struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im; ++ int ret; ++ ++ if (mlen != sizeof(struct spnfs_msg)) ++ return -ENOSPC; ++ ++ im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im_in == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(im_in, src, mlen) != 0) ++ return -EFAULT; ++ ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ ret = mlen; ++ im->im_status = im_in->im_status; ++ /* If we got an error, terminate now, and wake up pending upcalls */ ++ if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) { ++ wake_up(&spnfs->spnfs_wq); ++ goto out; ++ } ++ ++ ret = -EINVAL; ++ /* Did we match the current upcall? 
*/ ++ /* DMXXX: do not understand the comment above, from original code */ ++ /* DMXXX: when do we _not_ match the current upcall? */ ++ /* DMXXX: anyway, let's to a simplistic check */ ++ if (im_in->im_type == im->im_type) { ++ /* copy the response into the spnfs struct */ ++ memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res)); ++ ret = mlen; ++ } else ++ dprintk("spnfs: downcall type != upcall type\n"); ++ ++ ++ wake_up(&spnfs->spnfs_wq); ++/* DMXXX handle rval processing */ ++out: ++ mutex_unlock(&spnfs->spnfs_plock); ++ kfree(im_in); ++ return ret; ++} ++ ++static void ++spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ struct spnfs_msg *im = msg->data; ++ struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im); ++ ++ if (msg->errno >= 0) ++ return; ++ mutex_lock(&spnfs->spnfs_plock); ++ im->im_status = SPNFS_STATUS_FAIL; /* DMXXX */ ++ wake_up(&spnfs->spnfs_wq); ++ mutex_unlock(&spnfs->spnfs_plock); ++} ++ ++/* generic upcall. called by functions in spnfs_ops.c */ ++int ++spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg, ++ union spnfs_msg_res *res) ++{ ++ struct rpc_pipe_msg msg; ++ struct spnfs_msg *im; ++ DECLARE_WAITQUEUE(wq, current); ++ int ret = -EIO; ++ int rval; ++ ++ im = &spnfs->spnfs_im; ++ ++ mutex_lock(&spnfs->spnfs_lock); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ memset(im, 0, sizeof(*im)); ++ memcpy(im, upmsg, sizeof(*upmsg)); ++ ++ memset(&msg, 0, sizeof(msg)); ++ msg.data = im; ++ msg.len = sizeof(*im); ++ ++ add_wait_queue(&spnfs->spnfs_wq, &wq); ++ rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&spnfs->spnfs_plock); ++ schedule(); ++ current->state = TASK_RUNNING; ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ if (im->im_status & SPNFS_STATUS_SUCCESS) { ++ /* copy our result from the upcall */ ++ memcpy(res, 
&im->im_res, sizeof(*res)); ++ ret = 0; ++ } ++ ++out: ++ memset(im, 0, sizeof(*im)); ++ mutex_unlock(&spnfs->spnfs_plock); ++ mutex_unlock(&spnfs->spnfs_lock); ++ return(ret); ++} ++ ++/* ++ * This is used to determine if the spnfsd daemon has been started at ++ * least once since the system came up. This is used by the export ++ * mechanism to decide if spnfs is in use. ++ * ++ * Returns non-zero if the spnfsd has initialized the communication pipe ++ * at least once. ++ */ ++int spnfs_enabled(void) ++{ ++ return spnfs_enabled_at_some_point; ++} ++ ++#ifdef CONFIG_PROC_FS ++ ++/* ++ * procfs virtual files for user/kernel space communication: ++ * ++ * ctl - currently just an on/off switch...can be expanded ++ * getfh - fd to fh conversion ++ * recall - recall a layout from the command line, for example: ++ * echo > /proc/fs/spnfs/recall ++ * config - configuration info, e.g., stripe size, num ds, etc. ++ */ ++ ++/*-------------- start ctl -------------------------*/ ++static ssize_t ctl_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int cmd, rc; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (cmd) { ++ rc = nfsd_spnfs_new(); ++ if (rc != 0) ++ return rc; ++ } else ++ nfsd_spnfs_delete(); ++ ++ return count; ++} ++ ++static const struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++/*-------------- end ctl ---------------------------*/ ++ ++/*-------------- start config -------------------------*/ ++/* ++ * Copy a struct spnfs_config image written by userspace into the static ++ * cfg and publish it through spnfs_config. Returns 0 on success, ++ * -EINVAL if more than sizeof(cfg) bytes are written, -EFAULT on a ++ * faulting copy. ++ */ ++static ssize_t config_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ static struct spnfs_config cfg; ++ ++ /* cfg is a fixed-size static object; never copy more than it holds */ ++ if (count > sizeof(cfg)) ++ return -EINVAL; ++ ++ if (copy_from_user(&cfg, buf, count)) ++ return -EFAULT; ++ ++ spnfs_config = &cfg; ++ return 0; ++} ++ ++static const struct file_operations config_ops = { ++ .write = config_write, ++}; ++/*-------------- end config ---------------------------*/ ++ ++/*-------------- start getfh -----------------------*/ ++static int 
getfh_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); ++ if (file->private_data == NULL) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static ssize_t getfh_read(struct file *file, char __user *buf, size_t count, ++ loff_t *offset) ++{ ++ if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh))) ++ return -EFAULT; ++ ++ return count; ++} ++ ++static ssize_t getfh_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int fd; ++ ++ if (copy_from_user((int *)&fd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (spnfs_getfh(fd, file->private_data) != 0) ++ return -EIO; ++ ++ return count; ++} ++ ++static int getfh_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static const struct file_operations getfh_ops = { ++ .open = getfh_open, ++ .read = getfh_read, ++ .write = getfh_write, ++ .release = getfh_release, ++}; ++/*-------------- end getfh ------------------------*/ ++ ++ ++/*-------------- start recall layout --------------*/ ++static ssize_t recall_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char input[128]; ++ char *path, *str, *p; ++ int rc; ++ u64 off = 0, len = 0; ++ ++ if (count > 128) ++ return -EINVAL; ++ ++ if (copy_from_user(input, buf, count)) ++ return -EFAULT; ++ ++ /* assumes newline-terminated path */ ++ p = memchr(input, '\n', count); ++ if (p == NULL) ++ return -EINVAL; ++ *p = '\0'; ++ ++ /* ++ * Scan for path and, optionally, an offset and length ++ * of a layout segment to be recalled; if there are two ++ * fields, they're assumed to be path and offset. 
++ */ ++ p = input; ++ path = strsep(&p, " "); ++ if (path == NULL) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &off); ++ if (rc != 0) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &len); ++ if (rc != 0) ++ return -EINVAL; ++ } ++ } ++ ++ rc = spnfs_test_layoutrecall(path, off, len); ++ if (rc != 0) ++ return rc; ++ ++ return count; ++} ++ ++static const struct file_operations recall_ops = { ++ .write = recall_write, ++}; ++/*-------------- end recall layout --------------*/ ++ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++/*-------------- start layoutseg -------------------------*/ ++static ssize_t layoutseg_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[3]; ++ ++ if (copy_from_user(cmd, buf, 1)) ++ return -EFAULT; ++ if (cmd[0] == '0') ++ spnfs_use_layoutsegments = 0; ++ else ++ spnfs_use_layoutsegments = 1; ++ ++ return count; ++} ++ ++static const struct file_operations layoutseg_ops = { ++ .write = layoutseg_write, ++}; ++/*-------------- end layoutseg ---------------------------*/ ++ ++/*-------------- start layoutsegsize -------------------------*/ ++/* ++ * Parse a decimal number written by userspace into layoutsegment_size. ++ * At most sizeof(cmd) - 1 bytes of the write are examined. Returns ++ * count on success or -EFAULT on a faulting copy. ++ */ ++static ssize_t layoutsegsize_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[50]; ++ size_t len = count < sizeof(cmd) - 1 ? count : sizeof(cmd) - 1; ++ ++ /* copy only what the caller supplied and NUL-terminate for parsing */ ++ if (copy_from_user(cmd, buf, len)) ++ return -EFAULT; ++ cmd[len] = '\0'; ++ layoutsegment_size = simple_strtoull(cmd, NULL, 10); ++ ++ return count; ++} ++ ++static const struct file_operations layoutsegsize_ops = { ++ .write = layoutsegsize_write, ++}; ++/*-------------- end layoutsegsize ---------------------------*/ ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++int ++spnfs_init_proc(void) ++{ ++ struct proc_dir_entry *entry; ++ ++ entry = proc_mkdir("fs/spnfs", NULL); ++ if (!entry) ++ return -ENOMEM; ++ ++ entry = create_proc_entry("fs/spnfs/ctl", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &ctl_ops; ++ ++ 
entry = create_proc_entry("fs/spnfs/config", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &config_ops; ++ ++ entry = create_proc_entry("fs/spnfs/getfh", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &getfh_ops; ++ ++ entry = create_proc_entry("fs/spnfs/recall", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &recall_ops; ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++ entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutseg_ops; ++ ++ entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutsegsize_ops; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ return 0; ++} ++#endif /* CONFIG_PROC_FS */ +diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c +--- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-09-30 10:17:08.938003000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-09-30 10:17:08.940000000 -0400 +@@ -0,0 +1,878 @@ ++/* ++ * fs/nfsd/spnfs_ops.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * ++ */ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++/* comment out CONFIG_SPNFS_TEST for non-test behaviour */ ++/* #define CONFIG_SPNFS_TEST 1 */ ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++/* ++ * The functions that are called from elsewhere in the kernel ++ * to perform tasks in userspace ++ * ++ */ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++extern int spnfs_use_layoutsegments; ++extern uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++extern struct spnfs *global_spnfs; ++ ++int ++spnfs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++enum nfsstat4 ++spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *lg_arg, ++ struct nfsd4_pnfs_layoutget_res *lg_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct pnfs_filelayout_layout *flp = NULL; ++ int status, i; ++ enum nfsstat4 nfserr; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ 
im->im_type = SPNFS_TYPE_LAYOUTGET; ++ im->im_args.layoutget_args.inode = inode->i_ino; ++ im->im_args.layoutget_args.generation = inode->i_generation; ++ ++ /* call function to queue the msg for upcall */ ++ if (spnfs_upcall(spnfs, im, res) != 0) { ++ dprintk("failed spnfs upcall: layoutget\n"); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto layoutget_cleanup; ++ } ++ status = res->layoutget_res.status; ++ if (status != 0) { ++ /* FIXME? until user mode is fixed, translate system error */ ++ switch (status) { ++ case -E2BIG: ++ case -ETOOSMALL: ++ nfserr = NFS4ERR_TOOSMALL; ++ break; ++ case -ENOMEM: ++ case -EAGAIN: ++ case -EINTR: ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ break; ++ case -ENOENT: ++ nfserr = NFS4ERR_BADLAYOUT; ++ break; ++ default: ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ } ++ dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n", ++ status, nfserr); ++ goto layoutget_cleanup; ++ } ++ ++ lg_res->lg_return_on_close = 0; ++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ /* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */ ++ /* the amount requested by the client. 
*/ ++ if (spnfs_use_layoutsegments) { ++ if (layoutsegment_size != 0) ++ lg_res->lg_seg.length = layoutsegment_size; ++ } else ++ lg_res->lg_seg.length = NFS4_MAX_UINT64; ++#else ++ lg_res->lg_seg.length = NFS4_MAX_UINT64; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL); ++ if (flp == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ flp->device_id.sbid = lg_arg->lg_sbid; ++ flp->device_id.devid = res->layoutget_res.devid; ++ flp->lg_layout_type = 1; /* XXX */ ++ flp->lg_stripe_type = res->layoutget_res.stripe_type; ++ flp->lg_commit_through_mds = 0; ++ flp->lg_stripe_unit = res->layoutget_res.stripe_size; ++ flp->lg_first_stripe_index = 0; ++ flp->lg_pattern_offset = 0; ++ flp->lg_fh_length = res->layoutget_res.stripe_count; ++ ++ flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh), ++ GFP_KERNEL); ++ if (flp->lg_fh_list == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ /* ++ * FIX: Doing an extra copy here. Should group res.flist's fh_len ++ * and fh_val into a knfsd_fh structure. 
++ */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ /* fh_len comes back from the upcall; keep the copy inside fh_base */ ++ if (res->layoutget_res.flist[i].fh_len > ++ sizeof(flp->lg_fh_list[i].fh_base)) { ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto layoutget_cleanup; ++ } ++ flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len; ++ memcpy(&flp->lg_fh_list[i].fh_base, ++ res->layoutget_res.flist[i].fh_val, ++ res->layoutget_res.flist[i].fh_len); ++ } ++ ++ /* encode the layoutget body */ ++ nfserr = filelayout_encode_layout(xdr, flp); ++ ++layoutget_cleanup: ++ if (flp) { ++ if (flp->lg_fh_list) ++ kfree(flp->lg_fh_list); ++ kfree(flp); ++ } ++ kfree(im); ++ kfree(res); ++ ++ return nfserr; ++} ++ ++int ++spnfs_layoutcommit(void) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutreturn(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for ino = %lu\n", ++ __func__, inode->i_ino); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* XXX figure out how to get a sb since there's no inode ptr */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = offset; ++ lr.cbl_seg.length = len; ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ lr.cbl_layoutchanged = 0; ++ ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ ++ return 0; ++} ++ ++ ++int ++spnfs_test_layoutrecall(char *path, u64 offset, u64 len) ++{ ++ struct nameidata nd; ++ struct inode *inode; ++ int type, rc; ++ ++ dprintk("%s: path=%s, offset=%llu, len=%llu\n", ++ __func__, path, offset, len); ++ ++ if (strcmp(path, "all") == 0) { ++ inode = NULL; ++ type = RETURN_ALL; ++ } else { ++ rc = path_lookup(path, 0, &nd); ++ if (rc != 0) ++ return -ENOENT; 
++ ++ /* ++ * XXX todo: add a RETURN_FSID scenario here...maybe if ++ * inode is a dir... ++ */ ++ ++ inode = nd.path.dentry->d_inode; ++ type = RETURN_FILE; ++ } ++ ++ if (len == 0) ++ len = NFS4_MAX_UINT64; ++ ++ rc = spnfs_layoutrecall(inode, type, offset, len); ++ ++ if (type != RETURN_ALL) ++ path_put(&nd.path); ++ return rc; ++} ++ ++int ++spnfs_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *gd_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* XXX keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEITER; ++ im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie; ++ im->im_args.getdeviceiter_args.verf = gd_res->gd_verf; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceiter_out; ++ } ++ status = res->getdeviceiter_res.status; ++ ++ if (res->getdeviceiter_res.eof) ++ gd_res->gd_eof = 1; ++ else { ++ gd_res->gd_devid = res->getdeviceiter_res.devid; ++ gd_res->gd_cookie = res->getdeviceiter_res.cookie; ++ gd_res->gd_verf = res->getdeviceiter_res.verf; ++ gd_res->gd_eof = 0; ++ } ++ ++getdeviceiter_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++#ifdef CONFIG_SPNFS_TEST ++/* ++ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the ++ * 1024 encoded stripe indices. ++ * ++ * Skip the devaddr4 length and encode the indicies count (1024) in the ++ * rq_res.head and set the rq_res.head length. 
++ * ++ * Set the rq_res page_len to 4096 (for the 1024 stripe indices). ++ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the ++ * rq_res head to hold the rest of the getdeviceinfo return. ++ * ++ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and ++ * rq_respages[rq_resused] contains the rq_res.pages. ++ */ ++static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ struct nfsd4_compoundres *resp = info->resp; ++ struct svc_rqst *rqstp = resp->rqstp; ++ struct xdr_buf *xb = &resp->rqstp->rq_res; ++ __be32 *p; ++ ++ p = nfsd4_xdr_reserve_space(resp, 8); ++ p++; /* Fill in length later */ ++ *p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */ ++ resp->p = p; ++ ++ xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base; ++ xb->pages = &rqstp->rq_respages[rqstp->rq_resused]; ++ xb->page_base = 0; ++ xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */ ++ xb->tail[0].iov_base = resp->p; ++ resp->end = xb->head[0].iov_base + PAGE_SIZE; ++ xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p; ++ return 0; ++} ++/* ++ * Return a stripeindices of length 1024 to test ++ * the pNFS client multipage getdeviceinfo implementation. ++ * ++ * Encode a page of stripe indices. 
++ */ ++static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev, ++ struct spnfs_device *dev, ++ struct pnfs_devinfo_arg *info) ++{ ++ struct svc_rqst *rqstp = info->xdr.resp->rqstp; ++ __be32 *p; ++ int i, j = 0; ++ ++ p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]); ++ fldev->fl_stripeindices_length = 1024; ++ /* round-robin the data servers device index into the stripe indicie */ ++ for (i = 0; i < 1024; i++) { ++ *p++ = cpu_to_be32(j); ++ if (j < dev->dscount - 1) ++ j++; ++ else ++ j = 0; ++ } ++ fldev->fl_stripeindices_list = NULL; ++} ++#endif /* CONFIG_SPNFS_TEST */ ++ ++int ++spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct spnfs_device *dev; ++ struct pnfs_filelayout_device *fldev = NULL; ++ struct pnfs_filelayout_multipath *mp = NULL; ++ struct pnfs_filelayout_devaddr *fldap = NULL; ++ int status = 0, i, len; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEINFO; ++ /* XXX FIX: figure out what to do about fsid */ ++ im->im_args.getdeviceinfo_args.devid = devid->devid; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceinfo_out; ++ } ++ status = res->getdeviceinfo_res.status; ++ if (status != 0) ++ goto getdeviceinfo_out; ++ ++ dev = &res->getdeviceinfo_res.devinfo; ++ ++ /* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */ ++ fldev = kzalloc(sizeof(struct pnfs_filelayout_device), 
GFP_KERNEL); ++ if (fldev == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ /* ++ * Stripe count is the same as data server count for our purposes ++ */ ++ fldev->fl_stripeindices_length = dev->dscount; ++ fldev->fl_device_length = dev->dscount; ++ ++ /* Set stripe indices */ ++#ifdef CONFIG_SPNFS_TEST ++ spnfs_set_test_indices(fldev, dev, info); ++ fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr; ++#else /* CONFIG_SPNFS_TEST */ ++ fldev->fl_stripeindices_list = ++ kmalloc(fldev->fl_stripeindices_length * sizeof(u32), ++ GFP_KERNEL); ++ if (fldev->fl_stripeindices_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ for (i = 0; i < fldev->fl_stripeindices_length; i++) ++ fldev->fl_stripeindices_list[i] = i; ++#endif /* CONFIG_SPNFS_TEST */ ++ ++ /* ++ * Set the device's data server addresses. No multipath for spnfs, ++ * so mp length is always 1. ++ * ++ * The array and every devaddr are zero-allocated so that the cleanup ++ * loop at getdeviceinfo_out only ever sees NULL or valid pointers ++ * when an allocation fails part-way through. ++ */ ++ fldev->fl_device_list = ++ kcalloc(fldev->fl_device_length, ++ sizeof(struct pnfs_filelayout_multipath), ++ GFP_KERNEL); ++ if (fldev->fl_device_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ for (i = 0; i < fldev->fl_device_length; i++) { ++ mp = &fldev->fl_device_list[i]; ++ mp->fl_multipath_length = 1; ++ mp->fl_multipath_list = ++ kzalloc(sizeof(struct pnfs_filelayout_devaddr), ++ GFP_KERNEL); ++ if (mp->fl_multipath_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ fldap = mp->fl_multipath_list; ++ ++ /* ++ * Copy the netid into the device address, for example: "tcp" ++ */ ++ len = strlen(dev->dslist[i].netid); ++ fldap->r_netid.data = kmalloc(len, GFP_KERNEL); ++ if (fldap->r_netid.data == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ memcpy(fldap->r_netid.data, dev->dslist[i].netid, len); ++ fldap->r_netid.len = len; ++ ++ /* ++ * Copy the network address into the device address, ++ * for example: "10.35.9.16.08.01" ++ */ ++ len = strlen(dev->dslist[i].addr); ++ fldap->r_addr.data = 
kmalloc(len, GFP_KERNEL); ++ if (fldap->r_addr.data == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ memcpy(fldap->r_addr.data, dev->dslist[i].addr, len); ++ fldap->r_addr.len = len; ++ } ++ ++ /* encode the device data */ ++ status = filelayout_encode_devinfo(xdr, fldev); ++ ++getdeviceinfo_out: ++ if (fldev) { ++ kfree(fldev->fl_stripeindices_list); ++ if (fldev->fl_device_list) { ++ for (i = 0; i < fldev->fl_device_length; i++) { ++ fldap = ++ fldev->fl_device_list[i].fl_multipath_list; ++ /* entry not reached before an earlier failure */ ++ if (fldap == NULL) ++ continue; ++ kfree(fldap->r_netid.data); ++ kfree(fldap->r_addr.data); ++ kfree(fldap); ++ } ++ kfree(fldev->fl_device_list); ++ } ++ kfree(fldev); ++ } ++ ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++int ++spnfs_setattr(void) ++{ ++ return 0; ++} ++ ++int ++spnfs_open(struct inode *inode, struct nfsd4_open *open) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto open_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto open_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_OPEN; ++ im->im_args.open_args.inode = inode->i_ino; ++ im->im_args.open_args.generation = inode->i_generation; ++ im->im_args.open_args.create = open->op_create; ++ im->im_args.open_args.createmode = open->op_createmode; ++ im->im_args.open_args.truncate = open->op_truncate; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto open_out; ++ } ++ status = res->open_res.status; ++ ++open_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++int ++spnfs_create(void) ++{ ++ return 0; ++} ++ ++/* ++ * Invokes the spnfsd with the inode number of the object to 
remove. ++ * The file has already been removed on the MDS, so all the spnsfd ++ * daemon does is remove the stripes. ++ * Returns 0 on success otherwise error code ++ */ ++int ++spnfs_remove(unsigned long ino, unsigned long generation) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_REMOVE; ++ im->im_args.remove_args.inode = ino; ++ im->im_args.remove_args.generation = generation; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto remove_out; ++ } ++ status = res->remove_res.status; ++ ++remove_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++static int ++read_one(struct inode *inode, loff_t offset, size_t len, char *buf, ++ struct file **filp) ++{ ++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; ++ size_t iolen; ++ int completed = 0, ds, err; ++ ++ while (len > 0) { ++ tmp = offset; ++ soff = do_div(tmp, spnfs_config->stripe_size); ++ snum = tmp; ++ ds = do_div(tmp, spnfs_config->num_ds); ++ if (spnfs_config->dense_striping == 0) ++ soffset = offset; ++ else { ++ tmp = snum; ++ do_div(tmp, spnfs_config->num_ds); ++ soffset = tmp * spnfs_config->stripe_size + soff; ++ } ++ if (len < spnfs_config->stripe_size - soff) ++ iolen = len; ++ else ++ iolen = spnfs_config->stripe_size - soff; ++ ++ pos = soffset; ++ err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos); ++ if (err < 0) ++ return -EIO; ++ if (err == 0) ++ break; ++ filp[ds]->f_pos = pos; ++ iolen = err; ++ completed += iolen; ++ len -= iolen; ++ 
offset += iolen; ++ bufoffset += iolen; ++ } ++ ++ return completed; ++} ++ ++static __be32 ++read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ int i, vnum, err, bytecount = 0; ++ char path[128]; ++ struct file *filp[SPNFS_MAX_DATA_SERVERS]; ++ size_t iolen; ++ __be32 status = nfs_ok; ++ ++ /* ++ * XXX We should just be doing this at open time, but it gets ++ * kind of messy storing this info in nfsd's state structures ++ * and piggybacking its path through the various state handling ++ * functions. Revisit this. ++ */ ++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], ++ inode->i_ino, inode->i_generation); ++ filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0); ++ if (filp[i] == NULL) { ++ status = nfserr_io; ++ goto read_out; ++ } ++ get_file(filp[i]); ++ } ++ ++ for (vnum = 0 ; vnum < vlen ; vnum++) { ++ iolen = rqstp->rq_vec[vnum].iov_len; ++ err = read_one(inode, offset + bytecount, iolen, ++ (char *)rqstp->rq_vec[vnum].iov_base, filp); ++ if (err < 0) { ++ status = nfserr_io; ++ goto read_out; ++ } ++ if (err < iolen) { ++ bytecount += err; ++ goto read_out; ++ } ++ bytecount += rqstp->rq_vec[vnum].iov_len; ++ } ++ ++read_out: ++ *lenp = bytecount; ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ if (filp[i]) { ++ filp_close(filp[i], current->files); ++ fput(filp[i]); ++ } ++ } ++ return status; ++} ++ ++__be32 ++spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ if (spnfs_config) ++ return read(inode, offset, lenp, vlen, rqstp); ++ else { ++ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); ++ return nfserr_notsupp; ++ } ++} ++ ++static int ++write_one(struct inode *inode, loff_t offset, size_t len, char *buf, ++ struct file **filp) ++{ ++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; ++ size_t iolen; ++ int 
completed = 0, ds, err; ++ ++ while (len > 0) { ++ tmp = offset; ++ soff = do_div(tmp, spnfs_config->stripe_size); ++ snum = tmp; ++ ds = do_div(tmp, spnfs_config->num_ds); ++ if (spnfs_config->dense_striping == 0) ++ soffset = offset; ++ else { ++ tmp = snum; ++ do_div(tmp, spnfs_config->num_ds); ++ soffset = tmp * spnfs_config->stripe_size + soff; ++ } ++ if (len < spnfs_config->stripe_size - soff) ++ iolen = len; ++ else ++ iolen = spnfs_config->stripe_size - soff; ++ ++ pos = soffset; ++ err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos); ++ if (err < 0) ++ return -EIO; ++ filp[ds]->f_pos = pos; ++ iolen = err; ++ completed += iolen; ++ len -= iolen; ++ offset += iolen; ++ bufoffset += iolen; ++ } ++ ++ return completed; ++} ++ ++static __be32 ++write(struct inode *inode, loff_t offset, size_t len, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ int i, vnum, err, bytecount = 0; ++ char path[128]; ++ struct file *filp[SPNFS_MAX_DATA_SERVERS]; ++ size_t iolen; ++ __be32 status = nfs_ok; ++ ++ /* ++ * XXX We should just be doing this at open time, but it gets ++ * kind of messy storing this info in nfsd's state structures ++ * and piggybacking its path through the various state handling ++ * functions. Revisit this. 
++ */ ++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], ++ inode->i_ino, inode->i_generation); ++ filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0); ++ if (filp[i] == NULL) { ++ status = nfserr_io; ++ goto write_out; ++ } ++ get_file(filp[i]); ++ } ++ ++ for (vnum = 0; vnum < vlen; vnum++) { ++ iolen = rqstp->rq_vec[vnum].iov_len; ++ err = write_one(inode, offset + bytecount, iolen, ++ (char *)rqstp->rq_vec[vnum].iov_base, filp); ++ if (err != iolen) { ++ dprintk("spnfs_write: err=%d expected %Zd\n", err, len); ++ status = nfserr_io; ++ goto write_out; ++ } ++ bytecount += rqstp->rq_vec[vnum].iov_len; ++ } ++ ++write_out: ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ if (filp[i]) { ++ filp_close(filp[i], current->files); ++ fput(filp[i]); ++ } ++ } ++ ++ return status; ++} ++ ++__be32 ++spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ if (spnfs_config) ++ return write(inode, offset, len, vlen, rqstp); ++ else { ++ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); ++ return nfserr_notsupp; ++ } ++} ++ ++int ++spnfs_commit(void) ++{ ++ return 0; ++} ++ ++/* ++ * Return the state for this object. 
++ * At this time simply return 0 to indicate success and use the existing state ++ */ ++int ++spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg) ++{ ++ return 0; ++} ++ ++/* ++ * Return the filehandle for the specified file descriptor ++ */ ++int ++spnfs_getfh(int fd, struct nfs_fh *fh) ++{ ++ struct file *file; ++ ++ file = fget(fd); ++ if (file == NULL) ++ return -EIO; ++ ++ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh)); ++ fput(file); ++ return 0; ++} +diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h +--- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-09-30 10:15:18.375737000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-09-30 10:17:08.964002000 -0400 +@@ -242,6 +242,12 @@ struct nfs4_client { + u32 cl_cb_seq_nr; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ ++#if defined(CONFIG_PNFSD) ++ struct list_head cl_layouts; /* outstanding layouts */ ++ struct list_head cl_layoutrecalls; /* outstanding layoutrecall ++ callbacks */ ++ atomic_t cl_deviceref; /* Num outstanding devs */ ++#endif /* CONFIG_PNFSD */ + }; + + static inline void +@@ -342,12 +348,31 @@ struct nfs4_file { + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_stateids; + struct list_head fi_delegations; ++#if defined(CONFIG_PNFSD) ++ struct list_head fi_layouts; ++ struct list_head fi_layout_states; ++#endif /* CONFIG_PNFSD */ + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; ++#if defined(CONFIG_PNFSD) ++ /* used by layoutget / layoutrecall */ ++ struct nfs4_fsid fi_fsid; ++ u32 fi_fhlen; ++ u8 fi_fhval[NFS4_FHSIZE]; ++#endif /* CONFIG_PNFSD */ + }; + ++#if defined(CONFIG_PNFSD) ++/* pNFS Metadata server state */ ++ ++struct pnfs_ds_dev_entry { ++ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */ ++ u32 dd_dsid; ++}; ++#endif /* CONFIG_PNFSD 
*/ ++ + /* + * nfs4_stateid can either be an open stateid or (eventually) a lock stateid + * +@@ -370,6 +395,9 @@ struct nfs4_stateid { + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; ++#if defined(CONFIG_PNFSD) ++ struct list_head st_pnfs_ds_id; ++#endif /* CONFIG_PNFSD */ + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; +@@ -421,6 +449,34 @@ extern void nfsd4_recdir_purge_old(void) + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + extern void release_session_client(struct nfsd4_session *); ++extern void nfsd4_free_slab(struct kmem_cache **); ++extern struct nfs4_file *find_file(struct inode *); ++extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *); ++extern void put_nfs4_file(struct nfs4_file *); ++extern void get_nfs4_file(struct nfs4_file *); ++extern struct nfs4_client *find_confirmed_client(clientid_t *); ++extern struct nfs4_stateid *find_stateid(stateid_t *, int flags); ++extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *); ++extern __be32 nfs4_check_stateid(stateid_t *); ++extern void expire_client_lock(struct nfs4_client *); ++extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *); ++ ++#if defined(CONFIG_PNFSD) ++extern int nfsd4_init_pnfs_slabs(void); ++extern void nfsd4_free_pnfs_slabs(void); ++extern void pnfs_expire_client(struct nfs4_client *); ++extern void release_pnfs_ds_dev_list(struct nfs4_stateid *); ++extern void nfs4_pnfs_state_init(void); ++extern void nfs4_pnfs_state_shutdown(void); ++extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *); ++#else /* CONFIG_PNFSD */ ++static inline void nfsd4_free_pnfs_slabs(void) {} ++static inline int nfsd4_init_pnfs_slabs(void) { return 0; } ++static 
inline void pnfs_expire_client(struct nfs4_client *clp) {} ++static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {} ++static inline void nfs4_pnfs_state_shutdown(void) {} ++#endif /* CONFIG_PNFSD */ + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +@@ -434,4 +490,24 @@ nfs4_get_stateowner(struct nfs4_stateown + kref_get(&so->so_ref); + } + ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1 : NFS4_MAX_UINT64; ++} ++ + #endif /* NFSD4_STATE_H */ +diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c +--- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-09-30 10:15:05.090335000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-09-30 10:17:08.970001000 -0400 +@@ -37,7 +37,12 @@ + #ifdef CONFIG_NFSD_V4 + #include + #include ++#include ++#include + #endif /* CONFIG_NFSD_V4 */ ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif + + #include "nfsd.h" + #include "vfs.h" +@@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, str + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, 0)) { ++ err = bl_layoutrecall(inode, RETURN_FILE, ++ iap->ia_size, inode->i_size - iap->ia_size); ++ } ++#endif /* CONFIG_SPNFS_BLOCK */ + } + + /* +@@ -1703,6 +1714,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru + struct inode *fdir, *tdir; + __be32 err; + int host_err; ++#ifdef CONFIG_SPNFS ++ unsigned long ino = 0; ++ unsigned long generation = 0; ++ unsigned int nlink = 0; ++#endif /* CONFIG_SPNFS */ + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) +@@ -1766,7 +1782,26 @@ nfsd_rename(struct svc_rqst *rqstp, stru + if (host_err) + goto out_dput_new; + 
++#ifdef CONFIG_SPNFS ++ /* ++ * if the target is a preexisting regular file, remember the ++ * inode number and generation so we can delete the stripes; ++ * save the link count as well so that the stripes only get ++ * deleted when the last link is deleted ++ */ ++ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) { ++ ino = ndentry->d_inode->i_ino; ++ generation = ndentry->d_inode->i_generation; ++ nlink = ndentry->d_inode->i_nlink; ++ } ++#endif /* CONFIG_SPNFS */ ++ + host_err = vfs_rename(fdir, odentry, tdir, ndentry); ++#ifdef CONFIG_SPNFS ++ if (spnfs_enabled() && (!host_err && ino && nlink == 1)) ++ spnfs_remove(ino, generation); ++#endif /* CONFIG_SPNFS */ ++ + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) +@@ -1807,6 +1842,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + struct inode *dirp; + __be32 err; + int host_err; ++#if defined(CONFIG_SPNFS) ++ unsigned long ino; ++ unsigned long generation; ++ unsigned int nlink; ++#endif /* defined(CONFIG_SPNFS) */ + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) +@@ -1830,6 +1870,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + goto out; + } + ++#if defined(CONFIG_SPNFS) ++ /* ++ * Remember the inode number to communicate to the spnfsd ++ * for removal of stripes; save the link count as well so that ++ * the stripes only get deleted when the last link is deleted ++ */ ++ ino = rdentry->d_inode->i_ino; ++ generation = rdentry->d_inode->i_generation; ++ nlink = rdentry->d_inode->i_nlink; ++#endif /* defined(CONFIG_SPNFS) */ ++ + if (!type) + type = rdentry->d_inode->i_mode & S_IFMT; + +@@ -1854,6 +1905,29 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + if (!host_err) + host_err = commit_metadata(fhp); + ++#if defined(CONFIG_SPNFS) ++ /* ++ * spnfs: notify spnfsd of removal to destroy stripes ++ */ ++/* ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (sb->s_export_op->spnfs_remove) { ++*/ ++ dprintk("%s check if spnfs_enabled\n", __FUNCTION__); ++ 
if (spnfs_enabled() && nlink == 1) { ++ BUG_ON(ino == 0); ++ dprintk("%s calling spnfs_remove inumber=%ld\n", ++ __FUNCTION__, ino); ++ if (spnfs_remove(ino, generation) == 0) { ++ dprintk("%s spnfs_remove success\n", __FUNCTION__); ++ } else { ++ /* XXX How do we make this atomic? */ ++ printk(KERN_WARNING "nfsd: pNFS could not " ++ "remove stripes for inode: %ld\n", ino); ++ } ++ } ++#endif /* defined(CONFIG_SPNFS) */ ++ + mnt_drop_write(fhp->fh_export->ex_path.mnt); + out_nfserr: + err = nfserrno(host_err); +diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h +--- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-09-30 10:15:18.395731000 -0400 ++++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-09-30 10:17:08.978004000 -0400 +@@ -37,6 +37,8 @@ + #ifndef _LINUX_NFSD_XDR4_H + #define _LINUX_NFSD_XDR4_H + ++#include ++ + #include "state.h" + #include "nfsd.h" + +@@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete { + u32 rca_one_fs; + }; + ++struct nfsd4_pnfs_getdevinfo { ++ struct nfsd4_pnfs_deviceid gd_devid; /* request */ ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxcount; /* request */ ++ u32 gd_notify_types;/* request */ ++ struct super_block *gd_sb; ++}; ++ ++struct nfsd4_pnfs_getdevlist { ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxdevices; /* request */ ++ u64 gd_cookie; /* request - response */ ++ u64 gd_verf; /* request - response */ ++ struct svc_fh *gd_fhp; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutget { ++ u64 lg_minlength; /* request */ ++ u32 lg_signal; /* request */ ++ u32 lg_maxcount; /* request */ ++ struct svc_fh *lg_fhp; /* request */ ++ stateid_t lg_sid; /* request/response */ ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_roc; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit { ++ struct nfsd4_pnfs_layoutcommit_arg args; ++ stateid_t lc_sid; /* request */ ++ struct nfsd4_pnfs_layoutcommit_res res; ++}; ++ ++enum layoutreturn_flags { ++ LR_FLAG_INTERN 
= 1 << 0, /* internal return */ ++ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */ ++}; ++ ++struct nfsd4_pnfs_layoutreturn { ++ struct nfsd4_pnfs_layoutreturn_arg args; ++ u32 lr_flags; ++ stateid_t lr_sid; /* request/response */ ++ u32 lrs_present; /* response */ ++}; ++ + struct nfsd4_op { + int opnum; + __be32 status; +@@ -426,6 +473,13 @@ struct nfsd4_op { + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; ++#if defined(CONFIG_PNFSD) ++ struct nfsd4_pnfs_getdevlist pnfs_getdevlist; ++ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo; ++ struct nfsd4_pnfs_layoutget pnfs_layoutget; ++ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit; ++ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn; ++#endif /* CONFIG_PNFSD */ + } u; + struct nfs4_replay * replay; + }; +diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c +--- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-09-30 10:15:17.741713000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/file.c 2010-09-30 10:17:08.626991000 -0400 +@@ -36,6 +36,7 @@ + #include "internal.h" + #include "iostat.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_FILE + +@@ -388,12 +389,17 @@ static int nfs_write_begin(struct file * + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + ++ pnfs_update_layout(mapping->host, ++ nfs_file_open_context(file), ++ 0, NFS4_MAX_UINT64, IOMODE_RW, ++ &lseg); + start: + /* + * Prevent starvation issues if someone is doing a consistency +@@ -402,17 +408,22 @@ start: + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) +- return ret; ++ goto out; 
+ + page = grab_cache_page_write_begin(mapping, index, flags); +- if (!page) +- return -ENOMEM; ++ if (!page) { ++ ret = -ENOMEM; ++ goto out; ++ } + *pagep = page; + +- ret = nfs_flush_incompatible(file, page); ++ ret = nfs_flush_incompatible(file, page, lseg); + if (ret) { + unlock_page(page); + page_cache_release(page); ++ *pagep = NULL; ++ *fsdata = NULL; ++ goto out; + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; +@@ -421,6 +432,12 @@ start: + if (!ret) + goto start; + } ++ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); ++ out: ++ if (ret) { ++ put_lseg(lseg); ++ *fsdata = NULL; ++ } + return ret; + } + +@@ -430,6 +447,7 @@ static int nfs_write_end(struct file *fi + { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, +@@ -456,10 +474,17 @@ static int nfs_write_end(struct file *fi + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + +- status = nfs_updatepage(file, page, offset, copied); ++ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); ++ status = pnfs_write_end(file, page, pos, len, copied, lseg); ++ if (status) ++ goto out; ++ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); + ++ out: + unlock_page(page); + page_cache_release(page); ++ pnfs_write_end_cleanup(file, fsdata); ++ put_lseg(lseg); + + if (status < 0) + return status; +@@ -570,6 +595,8 @@ static int nfs_vm_page_mkwrite(struct vm + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + ++ /* XXX Do we want to call pnfs_update_layout here? 
*/ ++ + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) +@@ -580,11 +607,11 @@ static int nfs_vm_page_mkwrite(struct vm + if (pagelen == 0) + goto out_unlock; + +- ret = nfs_flush_incompatible(filp, page); ++ ret = nfs_flush_incompatible(filp, page, NULL); + if (ret != 0) + goto out_unlock; + +- ret = nfs_updatepage(filp, page, 0, pagelen); ++ ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL); + out_unlock: + if (!ret) + return VM_FAULT_LOCKED; +diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c +--- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-09-30 10:15:17.769716000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-09-30 10:17:08.632991000 -0400 +@@ -48,6 +48,7 @@ + #include "internal.h" + #include "fscache.h" + #include "dns_resolve.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -278,7 +279,7 @@ nfs_fhget(struct super_block *sb, struct + */ + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { +- inode->i_fop = &nfs_file_operations; ++ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; + inode->i_data.a_ops = &nfs_file_aops; + inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; + } else if (S_ISDIR(inode->i_mode)) { +@@ -530,6 +531,68 @@ out: + return err; + } + ++static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ atomic_set(&l_ctx->count, 1); ++ l_ctx->lockowner = current->files; ++ l_ctx->pid = current->tgid; ++ INIT_LIST_HEAD(&l_ctx->list); ++} ++ ++static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) ++{ ++ struct nfs_lock_context *pos; ++ ++ list_for_each_entry(pos, &ctx->lock_context.list, list) { ++ if (pos->lockowner != current->files) ++ continue; ++ if (pos->pid != current->tgid) ++ continue; ++ atomic_inc(&pos->count); ++ return pos; ++ } ++ return NULL; ++} ++ ++struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context 
*ctx) ++{ ++ struct nfs_lock_context *res, *new = NULL; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ spin_unlock(&inode->i_lock); ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (new == NULL) ++ return NULL; ++ nfs_init_lock_context(new); ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ list_add_tail(&new->list, &ctx->lock_context.list); ++ new->open_context = ctx; ++ res = new; ++ new = NULL; ++ } ++ } ++ spin_unlock(&inode->i_lock); ++ kfree(new); ++ return res; ++} ++ ++void nfs_put_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ struct nfs_open_context *ctx = l_ctx->open_context; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) ++ return; ++ list_del(&l_ctx->list); ++ spin_unlock(&inode->i_lock); ++ kfree(l_ctx); ++} ++ + /** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context +@@ -566,11 +629,11 @@ static struct nfs_open_context *alloc_nf + path_get(&ctx->path); + ctx->cred = get_rpccred(cred); + ctx->state = NULL; +- ctx->lockowner = current->files; + ctx->flags = 0; + ctx->error = 0; + ctx->dir_cookie = 0; +- atomic_set(&ctx->count, 1); ++ nfs_init_lock_context(&ctx->lock_context); ++ ctx->lock_context.open_context = ctx; + } + return ctx; + } +@@ -578,15 +641,16 @@ static struct nfs_open_context *alloc_nf + struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) + { + if (ctx != NULL) +- atomic_inc(&ctx->count); ++ atomic_inc(&ctx->lock_context.count); + return ctx; + } ++EXPORT_SYMBOL(get_nfs_open_context); + + static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) + { + struct inode *inode = ctx->path.dentry->d_inode; + +- if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) ++ if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; 
+ list_del(&ctx->list); + spin_unlock(&inode->i_lock); +@@ -933,6 +997,7 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + } ++EXPORT_SYMBOL(nfs_fattr_init); + + struct nfs_fattr *nfs_alloc_fattr(void) + { +@@ -1142,6 +1207,14 @@ static int nfs_update_inode(struct inode + server->fsid = fattr->fsid; + + /* ++ * file needs layout commit, server attributes may be stale ++ */ ++ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) { ++ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n", ++ __func__, inode->i_sb->s_id, inode->i_ino); ++ return 0; ++ } ++ /* + * Update the read time so we don't revalidate too often. + */ + nfsi->read_cache_jiffies = fattr->time_start; +@@ -1340,9 +1413,10 @@ static int nfs_update_inode(struct inode + */ + void nfs4_clear_inode(struct inode *inode) + { ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + /* If we are holding a delegation, return it! 
*/ + nfs_inode_return_delegation_noreclaim(inode); +- /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); + } + #endif +@@ -1367,7 +1441,10 @@ struct inode *nfs_alloc_inode(struct sup + + void nfs_destroy_inode(struct inode *inode) + { +- kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); ++ struct nfs_inode *nfsi = NFS_I(inode); ++ ++ pnfs_destroy_layout(nfsi); ++ kmem_cache_free(nfs_inode_cachep, nfsi); + } + + static inline void nfs4_init_once(struct nfs_inode *nfsi) +@@ -1377,6 +1454,11 @@ static inline void nfs4_init_once(struct + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); ++#ifdef CONFIG_NFS_V4_1 ++ init_waitqueue_head(&nfsi->lo_waitq); ++ nfsi->pnfs_layout_suspend = 0; ++ nfsi->layout = NULL; ++#endif /* CONFIG_NFS_V4_1 */ + #endif + } + +@@ -1488,6 +1570,12 @@ static int __init init_nfs_fs(void) + if (err) + goto out0; + ++#ifdef CONFIG_NFS_V4_1 ++ err = pnfs_initialize(); ++ if (err) ++ goto out00; ++#endif /* CONFIG_NFS_V4_1 */ ++ + #ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); + #endif +@@ -1498,6 +1586,10 @@ out: + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++out00: ++ pnfs_uninitialize(); ++#endif /* CONFIG_NFS_V4_1 */ + nfs_destroy_directcache(); + out0: + nfs_destroy_writepagecache(); +@@ -1531,6 +1623,9 @@ static void __exit exit_nfs_fs(void) + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_uninitialize(); ++#endif + unregister_nfs_fs(); + nfs_fs_proc_exit(); + nfsiod_stop(); +diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h +--- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-09-30 10:15:17.775713000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-09-30 10:17:08.637996000 -0400 +@@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv + struct nfs_fattr *); + extern void nfs_mark_client_ready(struct nfs_client *clp, int state); + extern 
int nfs4_check_client_ready(struct nfs_client *clp); ++extern int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++ const struct sockaddr *sa2); ++extern int nfs4_set_client(struct nfs_server *server, ++ const char *hostname, ++ const struct sockaddr *addr, ++ const size_t addrlen, ++ const char *ip_addr, ++ rpc_authflavor_t authflavour, ++ int proto, const struct rpc_timeout *timeparms, ++ u32 minorversion); + #ifdef CONFIG_PROC_FS + extern int __init nfs_fs_proc_init(void); + extern void nfs_fs_proc_exit(void); +@@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead + extern struct rpc_procinfo nfs4_procedures[]; + #endif + ++extern int nfs4_recover_expired_lease(struct nfs_client *clp); ++ + /* proc.c */ + void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + +@@ -248,10 +260,31 @@ extern int nfs4_get_rootfh(struct nfs_se + #endif + + /* read.c */ ++extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); ++extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); + extern void nfs_read_prepare(struct rpc_task *task, void *calldata); + + /* write.c */ ++extern int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs); + extern void nfs_write_prepare(struct rpc_task *task, void *calldata); ++extern void nfs_mark_list_commit(struct list_head *head); + #ifdef CONFIG_MIGRATION + extern int nfs_migrate_page(struct 
address_space *, + struct page *, struct page *); +diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig +--- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-09-30 10:17:08.515988000 -0400 +@@ -79,10 +79,48 @@ config NFS_V4_1 + depends on NFS_V4 && EXPERIMENTAL + help + This option enables support for minor version 1 of the NFSv4 protocol +- (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. ++ (RFC5661) including support for the parallel NFS (pNFS) features ++ in the kernel's NFS client. + + Unless you're an NFS developer, say N. + ++config PNFS_FILE_LAYOUT ++ tristate "NFS client support for the pNFS nfs-files layout (DEVELOPER ONLY)" ++ depends on NFS_FS && NFS_V4_1 ++ default y ++ help ++ This option enables support for the pNFS nfs-files layout. ++ ++ Unless you're an NFS developer, say N. ++ ++config PNFS_OBJLAYOUT ++ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD ++ help ++ Say M here if you want your pNFS client to support the Objects Layout Driver. ++ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and ++ upper level driver (SCSI_OSD_ULD). ++ ++ If unsure, say N. ++ ++config PNFS_PANLAYOUT ++ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on PNFS_OBJLAYOUT ++ help ++ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver. ++ ++ If unsure, say N. ++ ++config PNFS_BLOCK ++ tristate "Provide a pNFS block client (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 ++ select MD ++ select BLK_DEV_DM ++ help ++ Say M or y here if you want your pNFS client to support the block protocol. ++ ++ If unsure, say N. 
++ + config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP +diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile +--- linux-2.6.34.noarch/fs/nfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-09-30 10:17:08.520988000 -0400 +@@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x + delegation.o idmap.o \ + callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o ++nfs-$(CONFIG_NFS_V4_1) += pnfs.o + nfs-$(CONFIG_SYSCTL) += sysctl.o + nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o ++ ++obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o ++nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o ++ ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ ++obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-09-30 10:15:17.806716000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-09-30 10:17:08.643994000 -0400 +@@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs3_dir_inode_operations, + .file_inode_ops = &nfs3_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-09-30 10:17:08.652995000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-09-30 10:17:08.654992000 -0400 +@@ -0,0 +1,768 @@ ++/* ++ * linux/fs/nfs/nfs4filelayout.c ++ * ++ * Module for the pnfs nfs4 file layout driver. ++ * Defines all I/O and Policy interface operations, plus code ++ * to register itself with the pNFS client. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. 
++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfs4filelayout.h" ++#include "nfs4_fs.h" ++#include "internal.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Dean Hildebrand "); ++MODULE_DESCRIPTION("The NFSv4 file layout driver"); ++ ++/* Callback operations to the pNFS client */ ++struct pnfs_client_operations *pnfs_callback_ops; ++ ++/* Forward declaration */ ++struct layoutdriver_io_operations filelayout_io_operations; ++ ++int ++filelayout_initialize_mountpoint(struct nfs_server *nfss, ++ const struct nfs_fh *mntfh) ++{ ++ int status = nfs4_alloc_init_deviceid_cache(nfss->nfs_client, ++ nfs4_fl_free_deviceid_callback); ++ if (status) { ++ printk(KERN_WARNING "%s: deviceid cache could not be " ++ "initialized\n", __func__); ++ return status; ++ } ++ dprintk("%s: deviceid cache has been initialized successfully\n", ++ __func__); ++ return 0; ++} ++ ++/* Uninitialize a mountpoint by destroying its device list */ ++int ++filelayout_uninitialize_mountpoint(struct nfs_server *nfss) ++{ ++ dprintk("--> %s\n", __func__); ++ ++ if (nfss->pnfs_curr_ld && nfss->nfs_client->cl_devid_cache) ++ nfs4_put_deviceid_cache(nfss->nfs_client); ++ return 0; ++} ++ ++/* This function is used by the layout driver to calculate the ++ * offset of the file on the dserver based on whether the ++ * layout type is STRIPE_DENSE or STRIPE_SPARSE ++ */ ++static loff_t ++filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ ++ switch (flseg->stripe_type) { ++ case STRIPE_SPARSE: ++ return offset; ++ ++ case STRIPE_DENSE: ++ { ++ u32 stripe_width; ++ u64 tmp, off; ++ u32 unit = flseg->stripe_unit; ++ ++ stripe_width = unit * FILE_DSADDR(lseg)->stripe_count; ++ tmp = off = offset - flseg->pattern_offset; ++ do_div(tmp, 
stripe_width); ++ return tmp * unit + do_div(off, unit); ++ } ++ default: ++ BUG(); ++ } ++ ++ /* We should never get here... just to stop the gcc warning */ ++ return 0; ++} ++ ++/* ++ * Call ops for the async read/write cases ++ * In the case of dense layouts, the offset needs to be reset to its ++ * original value. ++ */ ++static void filelayout_read_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ if (rdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ rdata->args.offset, rdata->fldata.orig_offset); ++ rdata->args.offset = rdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ rdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_read_release(void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ put_lseg(rdata->pdata.lseg); ++ rdata->pdata.lseg = NULL; ++ rdata->pdata.call_ops->rpc_release(data); ++} ++ ++static void filelayout_write_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ if (wdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ wdata->args.offset, wdata->fldata.orig_offset); ++ wdata->args.offset = wdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_write_release(void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ put_lseg(wdata->pdata.lseg); ++ wdata->pdata.lseg = NULL; ++ wdata->pdata.call_ops->rpc_release(data); ++} ++ ++struct rpc_call_ops filelayout_read_call_ops = { ++ .rpc_call_prepare = nfs_read_prepare, ++ .rpc_call_done = filelayout_read_call_done, ++ .rpc_release = filelayout_read_release, ++}; ++ ++struct rpc_call_ops filelayout_write_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ 
.rpc_call_done = filelayout_write_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* Perform sync or async reads. ++ * ++ * An optimization for the NFS file layout driver ++ * allows the original read/write data structs to be passed in the ++ * last argument. ++ * ++ * TODO: join with write_pagelist? ++ */ ++static enum pnfs_try_status ++filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n", ++ __func__, data->inode->i_ino, nr_pages, ++ data->args.pgbase, (size_t)data->args.count, offset); ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s USE DS:ip %x %s\n", __func__, ++ htonl(ds->ds_ip_addr), ds->r_addr); ++ ++ /* just try the first data server for the index..*/ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ ++ /* ++ * Now get the file offset on the dserver ++ * Set the read offset to this offset, and ++ * save the original offset in orig_offset ++ * In the case of aync reads, the offset will be reset in the ++ * call_ops->rpc_call_done() routine. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* Perform an asynchronous read */ ++ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_read_call_ops); ++ ++ data->pdata.pnfs_error = 0; ++ ++ return PNFS_ATTEMPTED; ++} ++ ++/* Perform async writes. 
*/ ++static enum pnfs_try_status ++filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu %s\n", __func__, ++ data->inode->i_ino, sync, (size_t) data->args.count, offset, ++ htonl(ds->ds_ip_addr), ntohs(ds->ds_port), ds->r_addr); ++ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ /* ++ * Get the file offset on the dserver. Set the write offset to ++ * this offset and save the original offset. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* ++ * Perform an asynchronous write The offset will be reset in the ++ * call_ops->rpc_call_done() routine ++ */ ++ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_write_call_ops, sync); ++ ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++} ++ ++/* ++ * Create a filelayout layout structure and return it. The pNFS client ++ * will use the pnfs_layout_hdr type to refer to the layout for this ++ * inode from now on. ++ */ ++static struct pnfs_layout_hdr * ++filelayout_alloc_layout(struct inode *inode) ++{ ++ struct nfs4_filelayout *flp; ++ ++ dprintk("NFS_FILELAYOUT: allocating layout\n"); ++ flp = kzalloc(sizeof(struct nfs4_filelayout), GFP_KERNEL); ++ return flp ? 
&flp->fl_layout : NULL; ++} ++ ++/* Free a filelayout layout structure */ ++static void ++filelayout_free_layout(struct pnfs_layout_hdr *lo) ++{ ++ dprintk("NFS_FILELAYOUT: freeing layout\n"); ++ kfree(FILE_LO(lo)); ++} ++ ++/* ++ * filelayout_check_layout() ++ * ++ * Make sure layout segment parameters are sane WRT the device. ++ * ++ * Notes: ++ * 1) current code insists that # stripe index = # data servers in ds_list ++ * which is wrong. ++ * 2) pattern_offset is ignored and must == 0 which is wrong; ++ * 3) the pattern_offset needs to be a multiple of the stripe unit. ++ * 4) stripe unit is multiple of page size ++ * 5) first_stripe_index must be strictly less than the device's ++ * stripe_count (RFC 5661, nfl_first_stripe_index). ++ * ++ * Returns 0 when the segment is usable, -EINVAL otherwise. ++ */ ++ ++static int ++filelayout_check_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs4_filelayout_segment *fl = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ int status = -EINVAL; ++ struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo)); ++ ++ dprintk("--> %s\n", __func__); ++ /* find in list or get from server and reference the deviceid */ ++ dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, &fl->dev_id); ++ if (dsaddr == NULL) { ++ dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id); ++ if (dsaddr == NULL) { ++ dprintk("%s NO device for dev_id %s\n", ++ __func__, deviceid_fmt(&fl->dev_id)); ++ goto out; ++ } ++ } ++ /* An index equal to stripe_count is already out of range. */ ++ if (fl->first_stripe_index < 0 || ++ fl->first_stripe_index >= dsaddr->stripe_count) { ++ dprintk("%s Bad first_stripe_index %d\n", ++ __func__, fl->first_stripe_index); ++ goto out_put; ++ } ++ ++ if (fl->pattern_offset != 0) { ++ dprintk("%s Unsupported no-zero pattern_offset %Ld\n", ++ __func__, fl->pattern_offset); ++ goto out_put; ++ } ++ ++ if (fl->stripe_unit % PAGE_SIZE) { ++ dprintk("%s Stripe unit (%u) not page aligned\n", ++ __func__, fl->stripe_unit); ++ goto out_put; ++ } ++ ++ /* XXX only support SPARSE packing.
Don't support use MDS open fh */ ++ if (!(fl->num_fh == 1 || fl->num_fh == dsaddr->ds_num)) { ++ dprintk("%s num_fh %u not equal to 1 or ds_num %u\n", ++ __func__, fl->num_fh, dsaddr->ds_num); ++ goto out_put; ++ } ++ ++ if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { ++ dprintk("%s Stripe unit (%u) not aligned with rsize %u " ++ "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, ++ nfss->wsize); ++ } ++ ++ nfs4_set_layout_deviceid(lseg, &dsaddr->deviceid); ++ ++ status = 0; ++out: ++ dprintk("--> %s returns %d\n", __func__, status); ++ return status; ++out_put: ++ nfs4_put_unset_layout_deviceid(lseg, &dsaddr->deviceid, ++ nfs4_fl_free_deviceid_callback); ++ goto out; ++} ++ ++static void _filelayout_free_lseg(struct pnfs_layout_segment *lseg); ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl); ++ ++/* Decode layout and store in layoutid. Overwrite any existing layout ++ * information for this file. ++ */ ++static int ++filelayout_set_layout(struct nfs4_filelayout *flo, ++ struct nfs4_filelayout_segment *fl, ++ struct nfs4_layoutget_res *lgr) ++{ ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t nfl_util; ++ int i; ++ ++ dprintk("%s: set_layout_map Begin\n", __func__); ++ ++ memcpy(&fl->dev_id, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ nfl_util = be32_to_cpup(p++); ++ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) ++ fl->commit_through_mds = 1; ++ if (nfl_util & NFL4_UFLG_DENSE) ++ fl->stripe_type = STRIPE_DENSE; ++ else ++ fl->stripe_type = STRIPE_SPARSE; ++ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; ++ ++ if (!flo->stripe_unit) ++ flo->stripe_unit = fl->stripe_unit; ++ else if (flo->stripe_unit != fl->stripe_unit) { ++ printk(KERN_NOTICE "%s: updating strip_unit from %u to %u\n", ++ __func__, flo->stripe_unit, fl->stripe_unit); ++ flo->stripe_unit = fl->stripe_unit; ++ } ++ ++ fl->first_stripe_index = be32_to_cpup(p++); ++ p = xdr_decode_hyper(p, &fl->pattern_offset); ++ 
fl->num_fh = be32_to_cpup(p++); ++ ++ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu dev_id %s\n", ++ __func__, nfl_util, fl->num_fh, fl->first_stripe_index, ++ fl->pattern_offset, deviceid_fmt(&fl->dev_id)); ++ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) { ++ fl->fh_array = vmalloc(fl->num_fh * sizeof(struct nfs_fh)); ++ if (fl->fh_array) ++ memset(fl->fh_array, 0, ++ fl->num_fh * sizeof(struct nfs_fh)); ++ } else { ++ fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh), ++ GFP_KERNEL); ++ } ++ if (!fl->fh_array) ++ return -ENOMEM; ++ ++ for (i = 0; i < fl->num_fh; i++) { ++ /* fh */ ++ fl->fh_array[i].size = be32_to_cpup(p++); ++ /* ++ * The handle is copied into fh_array[i].data below, so bound ++ * its length by that buffer, not by the whole struct nfs_fh ++ * (which also counts the size field). ++ */ ++ if (fl->fh_array[i].size > sizeof(fl->fh_array[i].data)) { ++ printk(KERN_ERR "Too big fh %d received %d\n", ++ i, fl->fh_array[i].size); ++ /* Layout is now invalid, pretend it doesn't exist */ ++ filelayout_free_fh_array(fl); ++ fl->num_fh = 0; ++ break; ++ } ++ memcpy(fl->fh_array[i].data, p, fl->fh_array[i].size); ++ p += XDR_QUADLEN(fl->fh_array[i].size); ++ dprintk("DEBUG: %s: fh len %d\n", __func__, ++ fl->fh_array[i].size); ++ } ++ ++ return 0; ++} ++ ++static struct pnfs_layout_segment * ++filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(layoutid); ++ struct pnfs_layout_segment *lseg; ++ int rc; ++ ++ dprintk("--> %s\n", __func__); ++ lseg = kzalloc(sizeof(struct pnfs_layout_segment) + ++ sizeof(struct nfs4_filelayout_segment), GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ ++ rc = filelayout_set_layout(flo, LSEG_LD_DATA(lseg), lgr); ++ ++ if (rc != 0 || filelayout_check_layout(layoutid, lseg)) { ++ _filelayout_free_lseg(lseg); ++ lseg = NULL; ++ } ++ return lseg; ++} ++ ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) ++{ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) ++ vfree(fl->fh_array); ++ else ++ kfree(fl->fh_array); ++ ++ fl->fh_array = NULL; ++} ++ ++static void
++_filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ filelayout_free_fh_array(LSEG_LD_DATA(lseg)); ++ kfree(lseg); ++} ++ ++static void ++filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("--> %s\n", __func__); ++ nfs4_put_unset_layout_deviceid(lseg, lseg->deviceid, ++ nfs4_fl_free_deviceid_callback); ++ _filelayout_free_lseg(lseg); ++} ++ ++/* ++ * Allocate a new nfs_write_data struct and initialize it as a clone of ++ * 'old': the clone records 'old' as its parent (taking a reference on it) ++ * and takes its own reference on old's open context. ++ * Returns NULL if the allocation fails. ++ */ ++static struct nfs_write_data * ++filelayout_clone_write_data(struct nfs_write_data *old) ++{ ++ /* Plain automatic: a static pointer would be shared by all callers */ ++ struct nfs_write_data *new; ++ ++ new = nfs_commitdata_alloc(); ++ if (!new) ++ goto out; ++ kref_init(&new->refcount); ++ new->parent = old; ++ kref_get(&old->refcount); ++ new->inode = old->inode; ++ new->cred = old->cred; ++ new->args.offset = 0; ++ new->args.count = 0; ++ new->res.count = 0; ++ new->res.fattr = &new->fattr; ++ nfs_fattr_init(&new->fattr); ++ new->res.verf = &new->verf; ++ new->args.context = get_nfs_open_context(old->args.context); ++ new->pdata.lseg = NULL; ++ new->pdata.call_ops = old->pdata.call_ops; ++ new->pdata.how = old->pdata.how; ++out: ++ return new; ++} ++ ++static void filelayout_commit_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static struct rpc_call_ops filelayout_commit_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ .rpc_call_done = filelayout_commit_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* ++ * Execute a COMMIT op to the MDS or to each data server on which a page ++ * in 'pages' exists. ++ * Invoke the pnfs_commit_complete callback.
++ */ ++enum pnfs_try_status ++filelayout_commit(struct nfs_write_data *data, int sync) ++{ ++ LIST_HEAD(head); ++ struct nfs_page *req; ++ loff_t file_offset = 0; ++ u16 idx, i; ++ struct list_head **ds_page_list = NULL; ++ u16 *indices_used; ++ int num_indices_seen = 0; ++ const struct rpc_call_ops *call_ops; ++ struct rpc_clnt *clnt; ++ struct nfs_write_data **clone_list = NULL; ++ struct nfs_write_data *dsdata; ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("%s data %p sync %d\n", __func__, data, sync); ++ ++ /* Alloc room for both in one go */ ++ ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) * ++ (sizeof(u16) + sizeof(struct list_head *)), ++ GFP_KERNEL); ++ if (!ds_page_list) ++ goto mem_error; ++ indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1); ++ /* ++ * Sort pages based on which ds to send to. ++ * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT. ++ * Note we are assuming there is only a single lseg in play. ++ * When that is not true, we could first sort on lseg, then ++ * sort within each as we do here. 
++ */ ++ while (!list_empty(&data->pages)) { ++ req = nfs_list_entry(data->pages.next); ++ nfs_list_remove_request(req); ++ if (!req->wb_lseg || ++ ((struct nfs4_filelayout_segment *) ++ LSEG_LD_DATA(req->wb_lseg))->commit_through_mds) ++ idx = NFS4_PNFS_MAX_MULTI_CNT; ++ else { ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset); ++ } ++ if (ds_page_list[idx]) { ++ /* Already seen this idx */ ++ list_add(&req->wb_list, ds_page_list[idx]); ++ } else { ++ /* New idx not seen so far */ ++ list_add_tail(&req->wb_list, &head); ++ indices_used[num_indices_seen++] = idx; ++ } ++ ds_page_list[idx] = &req->wb_list; ++ } ++ /* Once created, clone must be released via call_op */ ++ clone_list = kzalloc(num_indices_seen * ++ sizeof(struct nfs_write_data *), GFP_KERNEL); ++ if (!clone_list) ++ goto mem_error; ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ clone_list[i] = filelayout_clone_write_data(data); ++ if (!clone_list[i]) ++ goto mem_error; ++ } ++ clone_list[i] = data; ++ /* ++ * Now send off the RPCs to each ds. Note that it is important ++ * that any RPC to the MDS be sent last (or at least after all ++ * clones have been made.) 
++ */ ++ for (i = 0; i < num_indices_seen; i++) { ++ dsdata = clone_list[i]; ++ idx = indices_used[i]; ++ list_cut_position(&dsdata->pages, &head, ds_page_list[idx]); ++ if (idx == NFS4_PNFS_MAX_MULTI_CNT) { ++ call_ops = data->pdata.call_ops;; ++ clnt = NFS_CLIENT(dsdata->inode); ++ ds = NULL; ++ } else { ++ struct nfs_fh *fh; ++ ++ call_ops = &filelayout_commit_call_ops; ++ req = nfs_list_entry(dsdata->pages.next); ++ ds = nfs4_fl_prepare_ds(req->wb_lseg, idx); ++ if (!ds) { ++ /* Trigger retry of this chunk through MDS */ ++ dsdata->task.tk_status = -EIO; ++ data->pdata.call_ops->rpc_release(dsdata); ++ continue; ++ } ++ clnt = ds->ds_clp->cl_rpcclient; ++ dsdata->fldata.ds_nfs_client = ds->ds_clp; ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset); ++ if (fh) ++ dsdata->args.fh = fh; ++ } ++ dprintk("%s: Initiating commit: %llu USE DS:\n", ++ __func__, file_offset); ++ print_ds(ds); ++ ++ /* Send COMMIT to data server */ ++ nfs_initiate_commit(dsdata, clnt, call_ops, sync); ++ } ++ kfree(clone_list); ++ kfree(ds_page_list); ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++ ++ mem_error: ++ if (clone_list) { ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ if (!clone_list[i]) ++ break; ++ data->pdata.call_ops->rpc_release(clone_list[i]); ++ } ++ kfree(clone_list); ++ } ++ kfree(ds_page_list); ++ /* One of these will be empty, but doesn't hurt to do both */ ++ nfs_mark_list_commit(&head); ++ nfs_mark_list_commit(&data->pages); ++ data->pdata.call_ops->rpc_release(data); ++ return PNFS_ATTEMPTED; ++} ++ ++/* Return the stripesize for the specified file */ ++ssize_t ++filelayout_get_stripesize(struct pnfs_layout_hdr *lo) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(lo); ++ ++ return flo->stripe_unit; ++} ++ ++/* ++ * filelayout_pg_test(). 
Called by nfs_can_coalesce_requests() ++ * ++ * return 1 : coalesce page ++ * return 0 : don't coalesce page ++ */ ++int ++filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ u64 p_stripe, r_stripe; ++ ++ if (pgio->pg_boundary == 0) ++ return 1; ++ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; ++ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; ++ ++ do_div(p_stripe, pgio->pg_boundary); ++ do_div(r_stripe, pgio->pg_boundary); ++ ++ return (p_stripe == r_stripe); ++} ++ ++struct layoutdriver_io_operations filelayout_io_operations = { ++ .commit = filelayout_commit, ++ .read_pagelist = filelayout_read_pagelist, ++ .write_pagelist = filelayout_write_pagelist, ++ .alloc_layout = filelayout_alloc_layout, ++ .free_layout = filelayout_free_layout, ++ .alloc_lseg = filelayout_alloc_lseg, ++ .free_lseg = filelayout_free_lseg, ++ .initialize_mountpoint = filelayout_initialize_mountpoint, ++ .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, ++}; ++ ++struct layoutdriver_policy_operations filelayout_policy_operations = { ++ .flags = PNFS_USE_RPC_CODE, ++ .get_stripesize = filelayout_get_stripesize, ++ .pg_test = filelayout_pg_test, ++}; ++ ++struct pnfs_layoutdriver_type filelayout_type = { ++ .id = LAYOUT_NFSV4_1_FILES, ++ .name = "LAYOUT_NFSV4_1_FILES", ++ .ld_io_ops = &filelayout_io_operations, ++ .ld_policy_ops = &filelayout_policy_operations, ++}; ++ ++static int __init nfs4filelayout_init(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", ++ __func__); ++ ++ /* ++ * Need to register file_operations struct with global list to indicate ++ * that NFS4 file layout is a possible pNFS I/O module ++ */ ++ pnfs_callback_ops = pnfs_register_layoutdriver(&filelayout_type); ++ ++ return 0; ++} ++ ++static void __exit nfs4filelayout_exit(void) ++{ ++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", ++ __func__); ++ ++ /* Unregister NFS4 file layout driver 
with pNFS client*/ ++ pnfs_unregister_layoutdriver(&filelayout_type); ++} ++ ++module_init(nfs4filelayout_init); ++module_exit(nfs4filelayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-09-30 10:17:08.661995000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-09-30 10:17:08.663993000 -0400 +@@ -0,0 +1,635 @@ ++/* ++ * linux/fs/nfs/nfs4filelayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 file layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * Garth Goodson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include "nfs4filelayout.h" ++#include "internal.h" ++#include "nfs4_fs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++DEFINE_SPINLOCK(nfs4_ds_cache_lock); ++static LIST_HEAD(nfs4_data_server_cache); ++ ++void ++print_ds(struct nfs4_pnfs_ds *ds) ++{ ++ if (ds == NULL) { ++ dprintk("%s NULL device \n", __func__); ++ return; ++ } ++ dprintk(" ip_addr %x\n", ntohl(ds->ds_ip_addr)); ++ dprintk(" port %hu\n", ntohs(ds->ds_port)); ++ dprintk(" client %p\n", ds->ds_clp); ++ dprintk(" ref count %d\n", atomic_read(&ds->ds_count)); ++ if (ds->ds_clp) ++ dprintk(" cl_exchange_flags %x\n", ++ ds->ds_clp->cl_exchange_flags); ++ dprintk(" ip:port %s\n", ds->r_addr); ++} ++ ++void ++print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ int i; ++ ++ dprintk("%s dsaddr->ds_num %d\n", __func__, ++ dsaddr->ds_num); ++ for (i = 0; i < dsaddr->ds_num; i++) ++ print_ds(dsaddr->ds_list[i]); ++} ++ ++/* ++ * Debugging function assuming a 64bit major/minor split of the deviceid. ++ * ++ * Returns a pointer to a static buffer: the string is only valid until ++ * the next call and the function is not reentrant. ++ */ ++char * ++deviceid_fmt(const struct pnfs_deviceid *dev_id) ++{ ++ /* two u64 in decimal (up to 20 digits each) + separator + NUL */ ++ static char buf[42]; ++ uint32_t *p = (uint32_t *)dev_id->data; ++ uint64_t major, minor; ++ ++ p = xdr_decode_hyper(p, &major); ++ p = xdr_decode_hyper(p, &minor); ++ ++ snprintf(buf, sizeof(buf), "%08llu %08llu", major, minor); ++ return buf; ++} ++ ++/* nfs4_ds_cache_lock is held */ ++static inline struct
nfs4_pnfs_ds * ++_data_server_lookup(u32 ip_addr, u32 port) ++{ ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", ++ ntohl(ip_addr), ntohs(port)); ++ ++ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { ++ if (ds->ds_ip_addr == ip_addr && ++ ds->ds_port == port) { ++ return ds; ++ } ++ } ++ return NULL; ++} ++ ++/* ++ * Create an rpc to the data server defined in 'dev_list'. ++ * ++ * On success ds->ds_clp is set and 0 is returned; otherwise a negative ++ * errno is returned. ++ */ ++static int ++nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) ++{ ++ struct nfs_server *tmp; ++ struct sockaddr_in sin; ++ struct rpc_clnt *mds_clnt = mds_srv->client; ++ struct nfs_client *clp = mds_srv->nfs_client; ++ struct sockaddr *mds_addr; ++ int err = 0; ++ ++ dprintk("--> %s ip:port %s au_flavor %d\n", __func__, ++ ds->r_addr, mds_clnt->cl_auth->au_flavor); ++ ++ sin.sin_family = AF_INET; ++ sin.sin_addr.s_addr = ds->ds_ip_addr; ++ sin.sin_port = ds->ds_port; ++ ++ /* ++ * If this DS is also the MDS, use the MDS session only if the ++ * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role. ++ */ ++ mds_addr = (struct sockaddr *)&clp->cl_addr; ++ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) { ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data " ++ "Server\n", ds->r_addr); ++ err = -ENODEV; ++ } else { ++ atomic_inc(&clp->cl_count); ++ ds->ds_clp = clp; ++ dprintk("%s Using MDS Session for DS\n", __func__); ++ } ++ goto out; ++ } ++ ++ /* Temporary server for nfs4_set_client */ ++ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); ++ if (!tmp) { ++ /* err is still 0 here; do not report success without a client */ ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ /* ++ * Set a retrans, timeout interval, and authflavor equal to the MDS ++ * values. Use the MDS nfs_client cl_ipaddr field so as to use the ++ * same co_ownerid as the MDS.
++ */ ++ err = nfs4_set_client(tmp, ++ mds_srv->nfs_client->cl_hostname, ++ (struct sockaddr *)&sin, ++ sizeof(struct sockaddr), ++ mds_srv->nfs_client->cl_ipaddr, ++ mds_clnt->cl_auth->au_flavor, ++ IPPROTO_TCP, ++ mds_clnt->cl_xprt->timeout, ++ 1 /* minorversion */); ++ if (err < 0) ++ goto out_free; ++ ++ clp = tmp->nfs_client; ++ ++ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */ ++ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp); ++ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS; ++ ++ err = nfs4_recover_expired_lease(clp); ++ if (!err) ++ err = nfs4_check_client_ready(clp); ++ if (err) ++ goto out_put; ++ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data Server\n", ++ ds->r_addr); ++ err = -ENODEV; ++ goto out_put; ++ } ++ /* ++ * Mask the (possibly) returned EXCHGID4_FLAG_USE_PNFS_MDS pNFS role ++ * The is_ds_only_session depends on this. ++ */ ++ clp->cl_exchange_flags &= ~EXCHGID4_FLAG_USE_PNFS_MDS; ++ /* ++ * Set DS lease equal to the MDS lease, renewal is scheduled in ++ * create_session ++ */ ++ spin_lock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; ++ spin_unlock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_last_renewal = jiffies; ++ ++ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); ++ ds->ds_clp = clp; ++ ++ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), ++ clp->cl_rpcclient); ++out_free: ++ kfree(tmp); ++out: ++ dprintk("%s Returns %d\n", __func__, err); ++ return err; ++out_put: ++ nfs_put_client(clp); ++ goto out_free; ++} ++ ++static void ++destroy_ds(struct nfs4_pnfs_ds *ds) ++{ ++ dprintk("--> %s\n", __func__); ++ print_ds(ds); ++ ++ if (ds->ds_clp) ++ nfs_put_client(ds->ds_clp); ++ kfree(ds); ++} ++ ++static void ++nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ struct nfs4_pnfs_ds *ds; ++ int i; ++ ++ dprintk("%s: device id=%s\n", 
__func__, ++ deviceid_fmt(&dsaddr->deviceid.de_id)); ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ ds = dsaddr->ds_list[i]; ++ if (ds != NULL) { ++ if (atomic_dec_and_lock(&ds->ds_count, ++ &nfs4_ds_cache_lock)) { ++ list_del_init(&ds->ds_node); ++ spin_unlock(&nfs4_ds_cache_lock); ++ destroy_ds(ds); ++ } ++ } ++ } ++ kfree(dsaddr->stripe_indices); ++ kfree(dsaddr); ++} ++ ++void ++nfs4_fl_free_deviceid_callback(struct kref *kref) ++{ ++ struct nfs4_deviceid *device = ++ container_of(kref, struct nfs4_deviceid, de_kref); ++ struct nfs4_file_layout_dsaddr *dsaddr = ++ container_of(device, struct nfs4_file_layout_dsaddr, deviceid); ++ ++ nfs4_fl_free_deviceid(dsaddr); ++} ++ ++static void ++nfs4_pnfs_ds_add(struct inode *inode, struct nfs4_pnfs_ds **dsp, ++ u32 ip_addr, u32 port, char *r_addr, int len) ++{ ++ struct nfs4_pnfs_ds *tmp_ds, *ds; ++ ++ *dsp = NULL; ++ ++ ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); ++ if (!ds) ++ return; ++ ++ spin_lock(&nfs4_ds_cache_lock); ++ tmp_ds = _data_server_lookup(ip_addr, port); ++ if (tmp_ds == NULL) { ++ ds->ds_ip_addr = ip_addr; ++ ds->ds_port = port; ++ strncpy(ds->r_addr, r_addr, len); ++ atomic_set(&ds->ds_count, 1); ++ INIT_LIST_HEAD(&ds->ds_node); ++ ds->ds_clp = NULL; ++ list_add(&ds->ds_node, &nfs4_data_server_cache); ++ *dsp = ds; ++ dprintk("%s add new data server ip 0x%x\n", __func__, ++ ds->ds_ip_addr); ++ spin_unlock(&nfs4_ds_cache_lock); ++ } else { ++ atomic_inc(&tmp_ds->ds_count); ++ *dsp = tmp_ds; ++ dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", ++ __func__, tmp_ds->ds_ip_addr, ++ atomic_read(&tmp_ds->ds_count)); ++ spin_unlock(&nfs4_ds_cache_lock); ++ kfree(ds); ++ } ++} ++ ++static struct nfs4_pnfs_ds * ++decode_and_add_ds(uint32_t **pp, struct inode *inode) ++{ ++ struct nfs4_pnfs_ds *ds = NULL; ++ char r_addr[29]; /* max size of ip/port string */ ++ int len; ++ u32 ip_addr, port; ++ int tmp[6]; ++ uint32_t *p = *pp; ++ ++ dprintk("%s enter\n", __func__); ++ /* check and skip r_netid */ 
++ len = be32_to_cpup(p++); ++ /* "tcp" */ ++ if (len != 3) { ++ printk("%s: ERROR: non TCP r_netid len %d\n", ++ __func__, len); ++ goto out_err; ++ } ++ /* ++ * Read the bytes into a temporary buffer ++ * XXX: should probably sanity check them ++ */ ++ tmp[0] = be32_to_cpup(p++); ++ ++ len = be32_to_cpup(p++); ++ if (len >= sizeof(r_addr)) { ++ printk("%s: ERROR: Device ip/port too long (%d)\n", ++ __func__, len); ++ goto out_err; ++ } ++ memcpy(r_addr, p, len); ++ p += XDR_QUADLEN(len); ++ *pp = p; ++ r_addr[len] = '\0'; ++ sscanf(r_addr, "%d.%d.%d.%d.%d.%d", &tmp[0], &tmp[1], ++ &tmp[2], &tmp[3], &tmp[4], &tmp[5]); ++ ip_addr = htonl((tmp[0]<<24) | (tmp[1]<<16) | (tmp[2]<<8) | (tmp[3])); ++ port = htons((tmp[4] << 8) | (tmp[5])); ++ ++ nfs4_pnfs_ds_add(inode, &ds, ip_addr, port, r_addr, len); ++ ++ dprintk("%s: addr:port string = %s\n", __func__, r_addr); ++ return ds; ++out_err: ++ dprintk("%s returned NULL\n", __func__); ++ return NULL; ++} ++ ++/* Decode opaque device data and return the result */ ++static struct nfs4_file_layout_dsaddr* ++decode_device(struct inode *ino, struct pnfs_device *pdev) ++{ ++ int i, dummy; ++ u32 cnt, num; ++ u8 *indexp; ++ uint32_t *p = (u32 *)pdev->area, *indicesp; ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ /* Get the stripe count (number of stripe index) */ ++ cnt = be32_to_cpup(p++); ++ dprintk("%s stripe count %d\n", __func__, cnt); ++ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { ++ printk(KERN_WARNING "%s: stripe count %d greater than " ++ "supported maximum %d\n", __func__, ++ cnt, NFS4_PNFS_MAX_STRIPE_CNT); ++ goto out_err; ++ } ++ ++ /* Check the multipath list count */ ++ indicesp = p; ++ p += XDR_QUADLEN(cnt << 2); ++ num = be32_to_cpup(p++); ++ dprintk("%s ds_num %u\n", __func__, num); ++ if (num > NFS4_PNFS_MAX_MULTI_CNT) { ++ printk(KERN_WARNING "%s: multipath count %d greater than " ++ "supported maximum %d\n", __func__, ++ num, NFS4_PNFS_MAX_MULTI_CNT); ++ goto out_err; ++ } ++ dsaddr = kzalloc(sizeof(*dsaddr) + ++ 
(sizeof(struct nfs4_pnfs_ds *) * (num - 1)), ++ GFP_KERNEL); ++ if (!dsaddr) ++ goto out_err; ++ ++ dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); ++ if (!dsaddr->stripe_indices) ++ goto out_err_free; ++ ++ dsaddr->stripe_count = cnt; ++ dsaddr->ds_num = num; ++ ++ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ ++ /* Go back and read stripe indices */ ++ p = indicesp; ++ indexp = &dsaddr->stripe_indices[0]; ++ for (i = 0; i < dsaddr->stripe_count; i++) { ++ dummy = be32_to_cpup(p++); ++ *indexp = dummy; /* NOTE(review): not checked against ds_num */ ++ indexp++; ++ } ++ /* Skip already read multipath list count */ ++ p++; ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ int j; ++ ++ dummy = be32_to_cpup(p++); /* multipath count */ ++ if (dummy > 1) { ++ printk(KERN_WARNING ++ "%s: Multipath count %d not supported, " ++ "skipping all greater than 1\n", __func__, ++ dummy); ++ } ++ for (j = 0; j < dummy; j++) { ++ if (j == 0) { ++ dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); ++ if (dsaddr->ds_list[i] == NULL) ++ goto out_err_free; ++ } else { ++ u32 len; ++ /* skip extra multipath */ ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ continue; ++ } ++ } ++ } ++ nfs4_init_deviceid_node(&dsaddr->deviceid); ++ ++ return dsaddr; ++ ++out_err_free: ++ nfs4_fl_free_deviceid(dsaddr); ++out_err: ++ dprintk("%s ERROR: returning NULL\n", __func__); ++ return NULL; ++} ++ ++/* ++ * Decode the opaque device specified in 'dev' ++ * and add it to the list of available devices. ++ * If the deviceid is already cached, nfs4_add_deviceid will return ++ * a pointer to the cached struct and throw away the new.
++ */ ++static struct nfs4_file_layout_dsaddr* ++decode_and_add_device(struct inode *inode, struct pnfs_device *dev) ++{ ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ struct nfs4_deviceid *d; ++ ++ dsaddr = decode_device(inode, dev); ++ if (!dsaddr) { ++ printk(KERN_WARNING "%s: Could not decode or add device\n", ++ __func__); ++ return NULL; ++ } ++ ++ d = nfs4_add_get_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, ++ &dsaddr->deviceid); ++ ++ return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Retrieve the information for dev_id, add it to the list ++ * of available devices, and return it. ++ */ ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id) ++{ ++ struct pnfs_device *pdev = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ struct nfs4_file_layout_dsaddr *dsaddr = NULL; ++ int rc, i; ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s inode %p max_resp_sz %u max_pages %d\n", ++ __func__, inode, max_resp_sz, max_pages); ++ ++ pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); ++ if (pdev == NULL) ++ return NULL; ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(pdev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set pdev->area */ ++ pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!pdev->area) ++ goto out_free; ++ ++ memcpy(&pdev->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE); ++ pdev->layout_type = LAYOUT_NFSV4_1_FILES; ++ pdev->pages = pages; ++ pdev->pgbase = 0; ++ pdev->pglen = PAGE_SIZE * max_pages; ++ pdev->mincount = 0; ++ /* TODO: 
Update types when CB_NOTIFY_DEVICEID is available */ ++ pdev->dev_notify_types = 0; ++ ++ rc = pnfs_callback_ops->nfs_getdeviceinfo(server, pdev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ /* ++ * Found new device, need to decode it and then add it to the ++ * list of known devices for this mountpoint. ++ */ ++ dsaddr = decode_and_add_device(inode, pdev); ++out_free: ++ if (pdev->area != NULL) ++ vunmap(pdev->area); ++ for (i = 0; i < max_pages && pages[i] != NULL; i++) ++ __free_page(pages[i]); /* pages[] is zeroed; stop at first NULL */ ++ kfree(pages); ++ kfree(pdev); ++ dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); ++ return dsaddr; ++} ++ ++struct nfs4_file_layout_dsaddr * ++nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ ++ d = nfs4_find_get_deviceid(clp->cl_devid_cache, id); ++ dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__, ++ deviceid_fmt(id), d); ++ return (d == NULL) ? NULL : ++ container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit ++ * Then: ((res + fsi) % dsaddr->stripe_count) ++ */ ++static inline u32 ++_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u64 tmp; ++ ++ tmp = offset - flseg->pattern_offset; ++ do_div(tmp, flseg->stripe_unit); ++ tmp += flseg->first_stripe_index; ++ return do_div(tmp, FILE_DSADDR(lseg)->stripe_count); ++} ++ ++u32 ++nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ u32 j; ++ ++ j = _nfs4_fl_calc_j_index(lseg, offset); ++ return FILE_DSADDR(lseg)->stripe_indices[j]; ++} ++ ++struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u32 i; ++ ++ if (flseg->stripe_type == STRIPE_SPARSE) { ++ if (flseg->num_fh == 1) ++ i = 0; ++ else if (flseg->num_fh == 0) ++
return NULL; ++ else ++ i = nfs4_fl_calc_ds_index(lseg, offset); ++ } else ++ i = _nfs4_fl_calc_j_index(lseg, offset); ++ return &flseg->fh_array[i]; ++} ++ ++struct nfs4_pnfs_ds * ++nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ dsaddr = FILE_DSADDR(lseg); ++ if (dsaddr->ds_list[ds_idx] == NULL) { ++ printk(KERN_ERR "%s: No data server for device id (%s)!!\n", ++ __func__, deviceid_fmt(&flseg->dev_id)); ++ return NULL; ++ } ++ ++ if (!dsaddr->ds_list[ds_idx]->ds_clp) { ++ int err; ++ ++ err = nfs4_pnfs_ds_create(PNFS_NFS_SERVER(lseg->layout), ++ dsaddr->ds_list[ds_idx]); ++ if (err) { ++ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n", ++ __func__, err); ++ return NULL; ++ } ++ } ++ dprintk("%s: dev_id=%s, ds_idx=%u\n", ++ __func__, deviceid_fmt(&flseg->dev_id), ds_idx); ++ ++ return dsaddr->ds_list[ds_idx]; ++} ++ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h +--- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-09-30 10:17:08.657991000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-09-30 10:17:08.658997000 -0400 +@@ -0,0 +1,96 @@ ++/* ++ * pnfs_nfs4filelayout.h ++ * ++ * NFSv4 file layout driver data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_NFS4FILELAYOUT_H ++#define FS_NFS_NFS4FILELAYOUT_H ++ ++#include ++#include ++ ++#define NFS4_PNFS_DEV_HASH_BITS 5 ++#define NFS4_PNFS_DEV_HASH_SIZE (1 << NFS4_PNFS_DEV_HASH_BITS) ++#define NFS4_PNFS_DEV_HASH_MASK (NFS4_PNFS_DEV_HASH_SIZE - 1) ++ ++#define NFS4_PNFS_MAX_STRIPE_CNT 4096 ++#define NFS4_PNFS_MAX_MULTI_CNT 64 /* 256 fit into a u8 stripe_index */ ++#define NFS4_PNFS_MAX_MULTI_DS 2 ++ ++#define FILE_DSADDR(lseg) (container_of(lseg->deviceid, \ ++ struct nfs4_file_layout_dsaddr, \ ++ deviceid)) ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++/* Individual ip address */ ++struct nfs4_pnfs_ds { ++ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ ++ u32 ds_ip_addr; ++ u32 ds_port; ++ struct nfs_client *ds_clp; ++ atomic_t ds_count; ++ char r_addr[29]; ++}; ++ ++struct nfs4_file_layout_dsaddr { ++ struct nfs4_deviceid deviceid; ++ u32 stripe_count; ++ u8 *stripe_indices; ++ u32 ds_num; ++ struct nfs4_pnfs_ds *ds_list[1]; ++}; ++ ++struct nfs4_pnfs_dev_hlist { ++ rwlock_t dev_lock; ++ struct hlist_head dev_list[NFS4_PNFS_DEV_HASH_SIZE]; ++}; ++ ++struct nfs4_filelayout_segment { ++ u32 stripe_type; ++ u32 commit_through_mds; ++ u32 stripe_unit; ++ u32 first_stripe_index; ++ u64 pattern_offset; ++ struct pnfs_deviceid dev_id; ++ unsigned int num_fh; ++ struct nfs_fh *fh_array; ++}; ++ ++struct nfs4_filelayout { ++ struct pnfs_layout_hdr fl_layout; ++ u32 stripe_unit; ++}; ++ ++extern struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset); ++ ++static inline struct nfs4_filelayout * ++FILE_LO(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct nfs4_filelayout, fl_layout); ++} ++ ++extern struct pnfs_client_operations *pnfs_callback_ops; ++ ++extern void nfs4_fl_free_deviceid_callback(struct kref *); ++extern void print_ds(struct nfs4_pnfs_ds *ds); ++char *deviceid_fmt(const struct pnfs_deviceid *dev_id); ++u32 
nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset); ++struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, ++ u32 ds_idx); ++extern struct nfs4_file_layout_dsaddr * ++nfs4_fl_find_get_deviceid(struct nfs_client *, struct pnfs_deviceid *dev_id); ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id); ++ ++#endif /* FS_NFS_NFS4FILELAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-09-30 10:15:17.839715000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-09-30 10:17:08.649992000 -0400 +@@ -45,8 +45,28 @@ enum nfs4_client_state { + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, + NFS4CLNT_SESSION_RESET, +- NFS4CLNT_SESSION_DRAINING, + NFS4CLNT_RECALL_SLOT, ++ NFS4CLNT_LAYOUT_RECALL, ++}; ++ ++enum nfs4_session_state { ++ NFS4_SESSION_INITING, ++ NFS4_SESSION_DRAINING, ++}; ++ ++struct nfs4_minor_version_ops { ++ u32 minor_version; ++ ++ int (*call_sync)(struct nfs_server *server, ++ struct rpc_message *msg, ++ struct nfs4_sequence_args *args, ++ struct nfs4_sequence_res *res, ++ int cache_reply); ++ int (*validate_stateid)(struct nfs_delegation *, ++ const nfs4_stateid *); ++ const struct nfs4_state_recovery_ops *reboot_recovery_ops; ++ const struct nfs4_state_recovery_ops *nograce_recovery_ops; ++ const struct nfs4_state_maintenance_ops *state_renewal_ops; + }; + + /* +@@ -89,7 +109,6 @@ struct nfs_unique_id { + */ + struct nfs4_state_owner { + struct nfs_unique_id so_owner_id; +- struct nfs_client *so_client; + struct nfs_server *so_server; + struct rb_node so_client_node; + +@@ -99,7 +118,6 @@ struct nfs4_state_owner { + atomic_t so_count; + unsigned long so_flags; + struct list_head so_states; +- struct list_head so_delegations; + struct nfs_seqid_counter so_seqid; + struct rpc_sequence so_sequence; + }; +@@ -125,10 +143,20 @@ enum { + * LOCK: one nfs4_state (LOCK) to 
hold the lock stateid nfs4_state(OPEN) + */ + ++struct nfs4_lock_owner { ++ unsigned int lo_type; ++#define NFS4_ANY_LOCK_TYPE (0U) ++#define NFS4_FLOCK_LOCK_TYPE (1U << 0) ++#define NFS4_POSIX_LOCK_TYPE (1U << 1) ++ union { ++ fl_owner_t posix_owner; ++ pid_t flock_owner; ++ } lo_u; ++}; ++ + struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ +- fl_owner_t ls_owner; /* POSIX lock owner */ + #define NFS_LOCK_INITIALIZED 1 + int ls_flags; + struct nfs_seqid_counter ls_seqid; +@@ -136,6 +164,7 @@ struct nfs4_lock_state { + struct nfs_unique_id ls_id; + nfs4_stateid ls_stateid; + atomic_t ls_count; ++ struct nfs4_lock_owner ls_owner; + }; + + /* bits for nfs4_state->flags */ +@@ -219,22 +248,34 @@ extern int nfs4_open_revalidate(struct i + extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); + extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations *fs_locations, struct page *page); ++extern void nfs4_release_lockowner(const struct nfs4_lock_state *); + +-extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[]; +-extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[]; + #if defined(CONFIG_NFS_V4_1) +-extern int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return server->nfs_client->cl_session; ++} ++ ++extern int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); + extern void nfs4_destroy_session(struct nfs4_session *session); + extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); ++extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_create_session(struct nfs_client *); + 
extern int nfs4_proc_destroy_session(struct nfs4_session *); + extern int nfs4_init_session(struct nfs_server *server); + extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); + #else /* CONFIG_NFS_v4_1 */ +-static inline int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return NULL; ++} ++ ++static inline int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task) + { +@@ -247,12 +288,12 @@ static inline int nfs4_init_session(stru + } + #endif /* CONFIG_NFS_V4_1 */ + +-extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; ++extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; + + extern const u32 nfs4_fattr_bitmap[2]; + extern const u32 nfs4_statfs_bitmap[2]; + extern const u32 nfs4_pathconf_bitmap[2]; +-extern const u32 nfs4_fsinfo_bitmap[2]; ++extern const u32 nfs4_fsinfo_bitmap[3]; + extern const u32 nfs4_fs_locations_bitmap[2]; + + /* nfs4renewd.c */ +@@ -284,7 +325,7 @@ extern void nfs41_handle_sequence_flag_e + extern void nfs41_handle_recall_slot(struct nfs_client *clp); + extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); + extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); +-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); ++extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); + + extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); + extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); +@@ -293,6 +334,7 @@ extern void nfs_increment_lock_seqid(int + extern void nfs_release_seqid(struct nfs_seqid *seqid); + extern void nfs_free_seqid(struct nfs_seqid *seqid); + ++/* write.c */ + extern const 
nfs4_stateid zero_stateid; + + /* nfs4xdr.c */ +diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-09-30 10:15:17.855715000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-09-30 10:17:08.673994000 -0400 +@@ -49,12 +49,14 @@ + #include + #include + #include ++#include + + #include "nfs4_fs.h" + #include "delegation.h" + #include "internal.h" + #include "iostat.h" + #include "callback.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PROC + +@@ -67,7 +69,7 @@ struct nfs4_opendata; + static int _nfs4_proc_open(struct nfs4_opendata *data); + static int _nfs4_recover_proc_open(struct nfs4_opendata *data); + static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); ++static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, +@@ -125,11 +127,16 @@ const u32 nfs4_pathconf_bitmap[2] = { + 0 + }; + +-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE ++const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, ++#ifdef CONFIG_NFS_V4_1 ++ FATTR4_WORD1_FS_LAYOUT_TYPES, ++ FATTR4_WORD2_LAYOUT_BLKSIZE ++#else /* CONFIG_NFS_V4_1 */ + 0 ++#endif /* CONFIG_NFS_V4_1 */ + }; + + const u32 nfs4_fs_locations_bitmap[2] = { +@@ -356,7 +363,7 @@ static void nfs41_check_drain_session_co + { + struct rpc_task *task; + +- if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { ++ if 
(!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); + if (task) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +@@ -370,12 +377,11 @@ static void nfs41_check_drain_session_co + complete(&ses->complete); + } + +-static void nfs41_sequence_free_slot(const struct nfs_client *clp, +- struct nfs4_sequence_res *res) ++static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) + { + struct nfs4_slot_table *tbl; + +- tbl = &clp->cl_session->fc_slot_table; ++ tbl = &res->sr_session->fc_slot_table; + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { + /* just wake up the next guy waiting since + * we may have not consumed a slot after all */ +@@ -385,18 +391,17 @@ static void nfs41_sequence_free_slot(con + + spin_lock(&tbl->slot_tbl_lock); + nfs4_free_slot(tbl, res->sr_slotid); +- nfs41_check_drain_session_complete(clp->cl_session); ++ nfs41_check_drain_session_complete(res->sr_session); + spin_unlock(&tbl->slot_tbl_lock); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + } + +-static void nfs41_sequence_done(struct nfs_client *clp, +- struct nfs4_sequence_res *res, +- int rpc_status) ++static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) + { + unsigned long timestamp; + struct nfs4_slot_table *tbl; + struct nfs4_slot *slot; ++ struct nfs_client *clp; + + /* + * sr_status remains 1 if an RPC level error occurred. 
The server +@@ -411,13 +416,16 @@ static void nfs41_sequence_done(struct n + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) + goto out; + ++ tbl = &res->sr_session->fc_slot_table; ++ slot = tbl->slots + res->sr_slotid; ++ + /* Check the SEQUENCE operation status */ +- if (res->sr_status == 0) { +- tbl = &clp->cl_session->fc_slot_table; +- slot = tbl->slots + res->sr_slotid; ++ switch (res->sr_status) { ++ case 0: + /* Update the slot's sequence and clientid lease timer */ + ++slot->seq_nr; + timestamp = res->sr_renewal_time; ++ clp = res->sr_session->clp; + spin_lock(&clp->cl_lock); + if (time_before(clp->cl_last_renewal, timestamp)) + clp->cl_last_renewal = timestamp; +@@ -425,11 +433,39 @@ static void nfs41_sequence_done(struct n + /* Check sequence flags */ + if (atomic_read(&clp->cl_count) > 1) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); ++ break; ++ case -NFS4ERR_DELAY: ++ /* The server detected a resend of the RPC call and ++ * returned NFS4ERR_DELAY as per Section 2.10.6.2 ++ * of RFC5661. ++ */ ++ dprintk("%s: slot=%d seq=%d: Operation in progress\n", ++ __func__, res->sr_slotid, slot->seq_nr); ++ goto out_retry; ++ default: ++ /* Just update the slot sequence no. */ ++ ++slot->seq_nr; + } + out: + /* The session may be reset by one of the error handlers. 
*/ + dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); +- nfs41_sequence_free_slot(clp, res); ++ nfs41_sequence_free_slot(res); ++ return 1; ++out_retry: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ rpc_restart_call(task); ++ /* FIXME: rpc_restart_call() should be made to return success/fail */ ++ if (RPC_ASSASSINATED(task)) ++ goto out; ++ return 0; ++} ++ ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ if (res->sr_session == NULL) ++ return 1; ++ return nfs41_sequence_done(task, res); + } + + /* +@@ -480,12 +516,11 @@ static int nfs41_setup_sequence(struct n + if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) + return 0; + +- memset(res, 0, sizeof(*res)); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + tbl = &session->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && ++ if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + /* + * The state manager will wait until the slot table is empty. +@@ -525,6 +560,7 @@ static int nfs41_setup_sequence(struct n + res->sr_session = session; + res->sr_slotid = slotid; + res->sr_renewal_time = jiffies; ++ res->sr_status_flags = 0; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. 
+@@ -533,33 +569,36 @@ static int nfs41_setup_sequence(struct n + return 0; + } + +-int nfs4_setup_sequence(struct nfs_client *clp, ++int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + struct rpc_task *task) + { ++ struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + ++ if (ds_session) ++ session = ds_session; ++ if (session == NULL) { ++ args->sa_session = NULL; ++ res->sr_session = NULL; ++ goto out; ++ } ++ + dprintk("--> %s clp %p session %p sr_slotid %d\n", +- __func__, clp, clp->cl_session, res->sr_slotid); ++ __func__, session->clp, session, res->sr_slotid); + +- if (!nfs4_has_session(clp)) +- goto out; +- ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, ++ ret = nfs41_setup_sequence(session, args, res, cache_reply, + task); +- if (ret && ret != -EAGAIN) { +- /* terminate rpc task */ +- task->tk_status = ret; +- task->tk_action = NULL; +- } + out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; + } + + struct nfs41_call_sync_data { +- struct nfs_client *clp; ++ const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; + int cache_reply; +@@ -569,9 +608,9 @@ static void nfs41_call_sync_prepare(stru + { + struct nfs41_call_sync_data *data = calldata; + +- dprintk("--> %s data->clp->cl_session %p\n", __func__, +- data->clp->cl_session); +- if (nfs4_setup_sequence(data->clp, data->seq_args, ++ dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); ++ ++ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args, + data->seq_res, data->cache_reply, task)) + return; + rpc_call_start(task); +@@ -587,7 +626,7 @@ static void nfs41_call_sync_done(struct + { + struct nfs41_call_sync_data *data = calldata; + +- nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); ++ nfs41_sequence_done(task, 
data->seq_res); + } + + struct rpc_call_ops nfs41_call_sync_ops = { +@@ -600,8 +639,7 @@ struct rpc_call_ops nfs41_call_priv_sync + .rpc_call_done = nfs41_call_sync_done, + }; + +-static int nfs4_call_sync_sequence(struct nfs_client *clp, +- struct rpc_clnt *clnt, ++static int nfs4_call_sync_sequence(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, +@@ -611,13 +649,13 @@ static int nfs4_call_sync_sequence(struc + int ret; + struct rpc_task *task; + struct nfs41_call_sync_data data = { +- .clp = clp, ++ .seq_server = server, + .seq_args = args, + .seq_res = res, + .cache_reply = cache_reply, + }; + struct rpc_task_setup task_setup = { +- .rpc_client = clnt, ++ .rpc_client = server->client, + .rpc_message = msg, + .callback_ops = &nfs41_call_sync_ops, + .callback_data = &data +@@ -642,10 +680,15 @@ int _nfs4_call_sync_session(struct nfs_s + struct nfs4_sequence_res *res, + int cache_reply) + { +- return nfs4_call_sync_sequence(server->nfs_client, server->client, +- msg, args, res, cache_reply, 0); ++ return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); + } + ++#else ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ return 1; ++} + #endif /* CONFIG_NFS_V4_1 */ + + int _nfs4_call_sync(struct nfs_server *server, +@@ -659,18 +702,9 @@ int _nfs4_call_sync(struct nfs_server *s + } + + #define nfs4_call_sync(server, msg, args, res, cache_reply) \ +- (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ ++ (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ + &(res)->seq_res, (cache_reply)) + +-static void nfs4_sequence_done(const struct nfs_server *server, +- struct nfs4_sequence_res *res, int rpc_status) +-{ +-#ifdef CONFIG_NFS_V4_1 +- if (nfs4_has_session(server->nfs_client)) +- nfs41_sequence_done(server->nfs_client, res, rpc_status); +-#endif /* CONFIG_NFS_V4_1 */ +-} +- + static void 
update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) + { + struct nfs_inode *nfsi = NFS_I(dir); +@@ -745,19 +779,14 @@ static struct nfs4_opendata *nfs4_openda + p->o_arg.server = server; + p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; +- if (flags & O_EXCL) { +- if (nfs4_has_persistent_session(server->nfs_client)) { +- /* GUARDED */ +- p->o_arg.u.attrs = &p->attrs; +- memcpy(&p->attrs, attrs, sizeof(p->attrs)); +- } else { /* EXCLUSIVE4_1 */ +- u32 *s = (u32 *) p->o_arg.u.verifier.data; +- s[0] = jiffies; +- s[1] = current->pid; +- } +- } else if (flags & O_CREAT) { ++ if (flags & O_CREAT) { ++ u32 *s; ++ + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); ++ s = (u32 *) p->o_arg.u.verifier.data; ++ s[0] = jiffies; ++ s[1] = current->pid; + } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; +@@ -851,8 +880,10 @@ static void update_open_stateflags(struc + static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) + { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); +- memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); ++ memcpy(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)); ++ memcpy(state->open_stateid.u.data, stateid->u.data, ++ sizeof(state->open_stateid.u.data)); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); +@@ -880,7 +911,8 @@ static void __update_open_stateid(struct + */ + write_seqlock(&state->seqlock); + if (deleg_stateid != NULL) { +- memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, deleg_stateid->u.data, ++ sizeof(state->stateid.u.data)); + set_bit(NFS_DELEGATED_STATE, &state->flags); + } + if (open_stateid != NULL) +@@ -911,7 +943,8 @@ static int 
update_open_stateid(struct nf + + if (delegation == NULL) + delegation = &deleg_cur->stateid; +- else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) ++ else if (memcmp(deleg_cur->stateid.u.data, delegation->u.data, ++ NFS4_STATEID_SIZE) != 0) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); +@@ -973,7 +1006,8 @@ static struct nfs4_state *nfs4_try_open_ + break; + } + /* Save the delegation */ +- memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); ++ memcpy(stateid.u.data, delegation->stateid.u.data, ++ sizeof(stateid.u.data)); + rcu_read_unlock(); + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) +@@ -1127,10 +1161,13 @@ static int nfs4_open_recover(struct nfs4 + * Check if we need to update the current stateid. + */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && +- memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { ++ memcmp(state->stateid.u.data, state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)) != 0) { + write_seqlock(&state->seqlock); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, ++ state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)); + write_sequnlock(&state->seqlock); + } + return 0; +@@ -1199,8 +1236,8 @@ static int _nfs4_open_delegation_recall( + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; +- memcpy(opendata->o_arg.u.delegation.data, stateid->data, +- sizeof(opendata->o_arg.u.delegation.data)); ++ memcpy(opendata->o_arg.u.delegation.u.data, stateid->u.data, ++ sizeof(opendata->o_arg.u.delegation.u.data)); + ret = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return ret; +@@ -1258,8 +1295,8 @@ static void nfs4_open_confirm_done(struc + if (RPC_ASSASSINATED(task)) 
+ return; + if (data->rpc_status == 0) { +- memcpy(data->o_res.stateid.data, data->c_res.stateid.data, +- sizeof(data->o_res.stateid.data)); ++ memcpy(data->o_res.stateid.u.data, data->c_res.stateid.u.data, ++ sizeof(data->o_res.stateid.u.data)); + nfs_confirm_seqid(&data->owner->so_seqid, 0); + renew_lease(data->o_res.server, data->timestamp); + data->rpc_done = 1; +@@ -1356,13 +1393,13 @@ static void nfs4_open_prepare(struct rpc + } + /* Update sequence id. */ + data->o_arg.id = sp->so_owner_id.id; +- data->o_arg.clientid = sp->so_client->cl_clientid; ++ data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->o_arg.server->nfs_client, ++ if (nfs4_setup_sequence(data->o_arg.server, NULL, + &data->o_arg.seq_args, + &data->o_res.seq_res, 1, task)) + return; +@@ -1385,8 +1422,8 @@ static void nfs4_open_done(struct rpc_ta + + data->rpc_status = task->tk_status; + +- nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->o_res.seq_res)) ++ return; + + if (RPC_ASSASSINATED(task)) + return; +@@ -1539,9 +1576,8 @@ static int _nfs4_proc_open(struct nfs4_o + return 0; + } + +-static int nfs4_recover_expired_lease(struct nfs_server *server) ++int nfs4_recover_expired_lease(struct nfs_client *clp) + { +- struct nfs_client *clp = server->nfs_client; + unsigned int loop; + int ret; + +@@ -1557,6 +1593,7 @@ static int nfs4_recover_expired_lease(st + } + return ret; + } ++EXPORT_SYMBOL(nfs4_recover_expired_lease); + + /* + * OPEN_EXPIRED: +@@ -1646,7 +1683,7 @@ static int _nfs4_do_open(struct inode *d + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } +- status = nfs4_recover_expired_lease(server); ++ status = 
nfs4_recover_expired_lease(server->nfs_client); + if (status != 0) + goto err_put_state_owner; + if (path->dentry->d_inode != NULL) +@@ -1773,7 +1810,7 @@ static int _nfs4_do_setattr(struct inode + if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { + /* Use that stateid */ + } else if (state != NULL) { +- nfs4_copy_stateid(&arg.stateid, state, current->files); ++ nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); + } else + memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); + +@@ -1838,7 +1875,8 @@ static void nfs4_close_done(struct rpc_t + struct nfs4_state *state = calldata->state; + struct nfs_server *server = NFS_SERVER(calldata->inode); + +- nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + /* hmm. we are done with the inode, and in the process of freeing +@@ -1858,7 +1896,7 @@ static void nfs4_close_done(struct rpc_t + if (calldata->arg.fmode == 0) + break; + default: +- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) ++ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } + nfs_release_seqid(calldata->arg.seqid); +@@ -1903,7 +1941,7 @@ static void nfs4_close_prepare(struct rp + + nfs_fattr_init(calldata->res.fattr); + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, ++ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL, + &calldata->arg.seq_args, &calldata->res.seq_res, + 1, task)) + return; +@@ -2325,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, + struct nfs4_state *state = NULL; + int status; + ++ if (pnfs_ld_layoutret_on_setattr(inode)) ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + nfs_fattr_init(fattr); + + /* Search for an existing open(O_WRITE) file */ +@@ -2650,8 +2691,9 @@ static int nfs4_proc_unlink_done(struct + { + struct nfs_removeres 
*res = task->tk_msg.rpc_resp; + +- nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); +- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) ++ if (!nfs4_sequence_done(task, &res->seq_res)) ++ return 0; ++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); + nfs_post_op_update_inode(dir, res->dir_attr); +@@ -3092,18 +3134,31 @@ static int nfs4_proc_pathconf(struct nfs + static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) + { + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; + + dprintk("--> %s\n", __func__); + +- nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; + +- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, server->nfs_client); ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } + + nfs_invalidate_atime(data->inode); +- if (task->tk_status > 0) ++ if (task->tk_status > 0 && client == server->nfs_client) + renew_lease(server, data->timestamp); + return 0; + } +@@ -3114,20 +3169,56 @@ static void nfs4_proc_read_setup(struct + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + } + ++static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ + static int 
nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs_client *client = server->nfs_client; + +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++#ifdef CONFIG_NFS_V4_1 ++ /* restore original count after retry? */ ++ if (data->pdata.orig_count) { ++ dprintk("%s: restoring original count %u\n", __func__, ++ data->pdata.orig_count); ++ data->args.count = data->pdata.orig_count; ++ } ++ ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } ++ ++ /* ++ * MDS write: renew lease ++ * DS write: update lastbyte written, mark for layout commit ++ */ + if (task->tk_status >= 0) { +- renew_lease(NFS_SERVER(inode), data->timestamp); +- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ if (client == server->nfs_client) { ++ renew_lease(server, data->timestamp); ++ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ } else ++ pnfs4_update_write_done(NFS_I(inode), data); + } + return 0; + } +@@ -3140,20 +3231,42 @@ static void nfs4_proc_write_setup(struct + data->res.server = server; + data->timestamp = jiffies; + ++#ifdef CONFIG_NFS_V4_1 ++ /* writes to DS use pnfs vector */ ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = 
&nfs4_procedures[NFSPROC4_CLNT_WRITE]; + } + + static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- +- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, +- task->tk_status); +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { ++ struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; ++ ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS commit\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return -EAGAIN; ++ ++ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) { + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + return -EAGAIN; + } +- nfs_refresh_inode(inode, data->res.fattr); ++ if (client == server->nfs_client) ++ nfs_refresh_inode(inode, data->res.fattr); + return 0; + } + +@@ -3163,6 +3276,12 @@ static void nfs4_proc_commit_setup(struc + + data->args.bitmask = server->cache_consistency_bitmask; + data->res.server = server; ++#if defined(CONFIG_NFS_V4_1) ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + } + +@@ -3466,9 +3585,12 @@ static int nfs4_proc_set_acl(struct inod + } + + static int +-_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state) ++nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp) + { +- if (!clp || task->tk_status >= 0) ++ if (!clp) ++ clp = server->nfs_client; ++ ++ if (task->tk_status >= 0) + return 0; + switch(task->tk_status) { + 
case -NFS4ERR_ADMIN_REVOKED: +@@ -3493,8 +3615,9 @@ _nfs4_async_handle_error(struct rpc_task + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: +- dprintk("%s ERROR %d, Reset session\n", __func__, +- task->tk_status); ++ dprintk("%s ERROR %d, Reset session. Exchangeid " ++ "flags 0x%x\n", __func__, task->tk_status, ++ clp->cl_exchange_flags); + nfs4_schedule_state_recovery(clp); + task->tk_status = 0; + return -EAGAIN; +@@ -3514,6 +3637,8 @@ _nfs4_async_handle_error(struct rpc_task + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; + do_state_recovery: ++ if (is_ds_only_client(clp)) ++ return 0; + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + nfs4_schedule_state_recovery(clp); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) +@@ -3522,12 +3647,6 @@ do_state_recovery: + return -EAGAIN; + } + +-static int +-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) +-{ +- return _nfs4_async_handle_error(task, server, server->nfs_client, state); +-} +- + int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, + unsigned short port, struct rpc_cred *cred, + struct nfs4_setclientid_res *res) +@@ -3643,8 +3762,8 @@ static void nfs4_delegreturn_done(struct + { + struct nfs4_delegreturndata *data = calldata; + +- nfs4_sequence_done(data->res.server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + switch (task->tk_status) { + case -NFS4ERR_STALE_STATEID: +@@ -3653,8 +3772,8 @@ static void nfs4_delegreturn_done(struct + renew_lease(data->res.server, data->timestamp); + break; + default: +- if (nfs4_async_handle_error(task, data->res.server, NULL) == +- -EAGAIN) { ++ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) ++ == -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } +@@ -3674,7 +3793,7 @@ static void nfs4_delegreturn_prepare(str 
+ + d_data = (struct nfs4_delegreturndata *)data; + +- if (nfs4_setup_sequence(d_data->res.server->nfs_client, ++ if (nfs4_setup_sequence(d_data->res.server, NULL, + &d_data->args.seq_args, + &d_data->res.seq_res, 1, task)) + return; +@@ -3894,15 +4013,16 @@ static void nfs4_locku_done(struct rpc_t + { + struct nfs4_unlockdata *calldata = data; + +- nfs4_sequence_done(calldata->server, &calldata->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &calldata->res.seq_res)) ++ return; + if (RPC_ASSASSINATED(task)) + return; + switch (task->tk_status) { + case 0: +- memcpy(calldata->lsp->ls_stateid.data, +- calldata->res.stateid.data, +- sizeof(calldata->lsp->ls_stateid.data)); ++ memcpy(calldata->lsp->ls_stateid.u.data, ++ calldata->res.stateid.u.data, ++ sizeof(calldata->lsp->ls_stateid.u. ++ data)); + renew_lease(calldata->server, calldata->timestamp); + break; + case -NFS4ERR_BAD_STATEID: +@@ -3911,7 +4031,7 @@ static void nfs4_locku_done(struct rpc_t + case -NFS4ERR_EXPIRED: + break; + default: +- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN) + nfs_restart_rpc(task, + calldata->server->nfs_client); + } +@@ -3929,7 +4049,7 @@ static void nfs4_locku_prepare(struct rp + return; + } + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence(calldata->server->nfs_client, ++ if (nfs4_setup_sequence(calldata->server, NULL, + &calldata->arg.seq_args, + &calldata->res.seq_res, 1, task)) + return; +@@ -4084,7 +4204,8 @@ static void nfs4_lock_prepare(struct rpc + } else + data->arg.new_lock_owner = 0; + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, ++ if (nfs4_setup_sequence(data->server, NULL, ++ &data->arg.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +@@ -4103,8 +4224,8 @@ static void nfs4_lock_done(struct rpc_ta + + dprintk("%s: begin!\n", __func__); + +- 
nfs4_sequence_done(data->server, &data->res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; + + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) +@@ -4116,8 +4237,8 @@ static void nfs4_lock_done(struct rpc_ta + goto out; + } + if (data->rpc_status == 0) { +- memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, +- sizeof(data->lsp->ls_stateid.data)); ++ memcpy(data->lsp->ls_stateid.u.data, data->res.stateid.u.data, ++ sizeof(data->lsp->ls_stateid.u.data)); + data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); + } +@@ -4426,6 +4547,34 @@ out: + return err; + } + ++static void nfs4_release_lockowner_release(void *calldata) ++{ ++ kfree(calldata); ++} ++ ++const struct rpc_call_ops nfs4_release_lockowner_ops = { ++ .rpc_release = nfs4_release_lockowner_release, ++}; ++ ++void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) ++{ ++ struct nfs_server *server = lsp->ls_state->owner->so_server; ++ struct nfs_release_lockowner_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], ++ }; ++ ++ if (server->nfs_client->cl_mvops->minor_version != 0) ++ return; ++ args = kmalloc(sizeof(*args), GFP_NOFS); ++ if (!args) ++ return; ++ args->lock_owner.clientid = server->nfs_client->cl_clientid; ++ args->lock_owner.id = lsp->ls_id.id; ++ msg.rpc_argp = args; ++ rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); ++} ++ + #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" + + int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, +@@ -4528,7 +4677,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + nfs4_verifier verifier; + struct nfs41_exchange_id_args args = { + .client = clp, +- .flags = clp->cl_exchange_flags, ++ .flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R, + }; + struct nfs41_exchange_id_res res = { + .client = clp, +@@ -4576,6 
+4725,7 @@ int nfs4_proc_exchange_id(struct nfs_cli + dprintk("<-- %s status= %d\n", __func__, status); + return status; + } ++EXPORT_SYMBOL(nfs4_proc_exchange_id); + + struct nfs4_get_lease_time_data { + struct nfs4_get_lease_time_args *args; +@@ -4613,7 +4763,8 @@ static void nfs4_get_lease_time_done(str + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); ++ if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) ++ return; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: +@@ -4807,13 +4958,6 @@ struct nfs4_session *nfs4_alloc_session( + if (!session) + return NULL; + +- /* +- * The create session reply races with the server back +- * channel probe. Mark the client NFS_CS_SESSION_INITING +- * so that the client back channel can find the +- * nfs_client struct +- */ +- clp->cl_cons_state = NFS_CS_SESSION_INITING; + init_completion(&session->complete); + + tbl = &session->fc_slot_table; +@@ -4826,6 +4970,8 @@ struct nfs4_session *nfs4_alloc_session( + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + ++ session->session_state = 1<clp = clp; + return session; + } +@@ -5042,6 +5188,10 @@ int nfs4_init_session(struct nfs_server + if (!nfs4_has_session(clp)) + return 0; + ++ session = clp->cl_session; ++ if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) ++ return 0; ++ + rsize = server->rsize; + if (rsize == 0) + rsize = NFS_MAX_FILE_IO_SIZE; +@@ -5049,11 +5199,10 @@ int nfs4_init_session(struct nfs_server + if (wsize == 0) + wsize = NFS_MAX_FILE_IO_SIZE; + +- session = clp->cl_session; + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + +- ret = nfs4_recover_expired_lease(server); ++ ret = nfs4_recover_expired_lease(server->nfs_client); + if (!ret) + ret = 
nfs4_check_client_ready(clp); + return ret; +@@ -5062,69 +5211,70 @@ int nfs4_init_session(struct nfs_server + /* + * Renew the cl_session lease. + */ +-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +-{ ++struct nfs4_sequence_data { ++ struct nfs_client *clp; + struct nfs4_sequence_args args; + struct nfs4_sequence_res res; +- +- struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], +- .rpc_argp = &args, +- .rpc_resp = &res, +- .rpc_cred = cred, +- }; +- +- args.sa_cache_this = 0; +- +- return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, +- &res, args.sa_cache_this, 1); +-} ++}; + + static void nfs41_sequence_release(void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); ++ kfree(calldata); ++} ++ ++static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; + } + + static void nfs41_sequence_call_done(struct rpc_task *task, void *data) + { +- struct nfs_client *clp = (struct nfs_client *)data; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + +- nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status); ++ if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) ++ return; + + if (task->tk_status < 0) { + dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; + +- if (_nfs4_async_handle_error(task, NULL, clp, NULL) +- == -EAGAIN) { +- nfs_restart_rpc(task, clp); ++ if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); + 
return; + } + } + dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); + out: +- kfree(task->tk_msg.rpc_argp); +- kfree(task->tk_msg.rpc_resp); +- + dprintk("<-- %s\n", __func__); + } + + static void nfs41_sequence_prepare(struct rpc_task *task, void *data) + { +- struct nfs_client *clp; ++ struct nfs4_sequence_data *calldata = data; ++ struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_args *args; + struct nfs4_sequence_res *res; + +- clp = (struct nfs_client *)data; + args = task->tk_msg.rpc_argp; + res = task->tk_msg.rpc_resp; + +- if (nfs4_setup_sequence(clp, args, res, 0, task)) ++ if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) + return; + rpc_call_start(task); + } +@@ -5135,32 +5285,67 @@ static const struct rpc_call_ops nfs41_s + .rpc_release = nfs41_sequence_release, + }; + +-static int nfs41_proc_async_sequence(struct nfs_client *clp, +- struct rpc_cred *cred) ++static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) + { +- struct nfs4_sequence_args *args; +- struct nfs4_sequence_res *res; ++ struct nfs4_sequence_data *calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], + .rpc_cred = cred, + }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = clp->cl_rpcclient, ++ .rpc_message = &msg, ++ .callback_ops = &nfs41_sequence_ops, ++ .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, ++ }; + + if (!atomic_inc_not_zero(&clp->cl_count)) +- return -EIO; +- args = kzalloc(sizeof(*args), GFP_NOFS); +- res = kzalloc(sizeof(*res), GFP_NOFS); +- if (!args || !res) { +- kfree(args); +- kfree(res); ++ return ERR_PTR(-EIO); ++ calldata = kmalloc(sizeof(*calldata), GFP_NOFS); ++ if (calldata == NULL) { + nfs_put_client(clp); +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + } +- res->sr_slotid = NFS4_MAX_SLOT_TABLE; +- msg.rpc_argp = args; +- msg.rpc_resp = res; ++ calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ msg.rpc_argp = &calldata->args; ++ 
msg.rpc_resp = &calldata->res; ++ calldata->clp = clp; ++ task_setup_data.callback_data = calldata; + +- return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, +- &nfs41_sequence_ops, (void *)clp); ++ return rpc_run_task(&task_setup_data); ++} ++ ++static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret = 0; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) ++ ret = PTR_ERR(task); ++ else ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; ++} ++ ++static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) ++{ ++ struct rpc_task *task; ++ int ret; ++ ++ task = _nfs41_proc_sequence(clp, cred); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ ret = rpc_wait_for_completion_task(task); ++ if (!ret) ++ ret = task->tk_status; ++ rpc_put_task(task); ++out: ++ dprintk("<-- %s status=%d\n", __func__, ret); ++ return ret; + } + + struct nfs4_reclaim_complete_data { +@@ -5174,13 +5359,31 @@ static void nfs4_reclaim_complete_prepar + struct nfs4_reclaim_complete_data *calldata = data; + + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +- if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, ++ if (nfs41_setup_sequence(calldata->clp->cl_session, ++ &calldata->arg.seq_args, + &calldata->res.seq_res, 0, task)) + return; + + rpc_call_start(task); + } + ++static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) ++{ ++ switch(task->tk_status) { ++ case 0: ++ case -NFS4ERR_COMPLETE_ALREADY: ++ case -NFS4ERR_WRONG_CRED: /* What to do here? 
*/ ++ break; ++ case -NFS4ERR_DELAY: ++ case -EKEYEXPIRED: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ return -EAGAIN; ++ default: ++ nfs4_schedule_state_recovery(clp); ++ } ++ return 0; ++} ++ + static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) + { + struct nfs4_reclaim_complete_data *calldata = data; +@@ -5188,32 +5391,13 @@ static void nfs4_reclaim_complete_done(s + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); +- nfs41_sequence_done(clp, res, task->tk_status); +- switch (task->tk_status) { +- case 0: +- case -NFS4ERR_COMPLETE_ALREADY: +- break; +- case -NFS4ERR_BADSESSION: +- case -NFS4ERR_DEADSESSION: +- /* +- * Handle the session error, but do not retry the operation, as +- * we have no way of telling whether the clientid had to be +- * reset before we got our reply. If reset, a new wave of +- * reclaim operations will follow, containing their own reclaim +- * complete. We don't want our retry to get on the way of +- * recovery by incorrectly indicating to the server that we're +- * done reclaiming state since the process had to be restarted. 
+- */ +- _nfs4_async_handle_error(task, NULL, clp, NULL); +- break; +- default: +- if (_nfs4_async_handle_error( +- task, NULL, clp, NULL) == -EAGAIN) { +- rpc_restart_call_prepare(task); +- return; +- } +- } ++ if (!nfs41_sequence_done(task, res)) ++ return; + ++ if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { ++ rpc_restart_call_prepare(task); ++ return; ++ } + dprintk("<-- %s\n", __func__); + } + +@@ -5270,6 +5454,404 @@ out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; + } ++ ++static void ++nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args, ++ &lgp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutget *lgp = calldata; ++ struct inode *ino = lgp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lgp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ pnfs_get_layout_done(lgp, task->tk_status); ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ lgp->status = task->tk_status; ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_layoutget_release(void *calldata) ++{ ++ struct nfs4_layoutget *lgp = calldata; ++ ++ dprintk("--> %s\n", __func__); ++ pnfs_layout_release(NFS_I(lgp->args.inode)->layout, NULL); ++ if (lgp->res.layout.buf != NULL) ++ free_page((unsigned long) lgp->res.layout.buf); ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_layoutget_call_ops = { ++ .rpc_call_prepare = nfs4_layoutget_prepare, ++ .rpc_call_done = 
nfs4_layoutget_done, ++ .rpc_release = nfs4_layoutget_release, ++}; ++ ++/* FIXME: We need to call nfs4_handle_exception ++ * and deal with retries. ++ * Currently we can't since we release lgp and its contents. ++ */ ++static int _nfs4_proc_layoutget(struct nfs4_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], ++ .rpc_argp = &lgp->args, ++ .rpc_resp = &lgp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutget_call_ops, ++ .callback_data = lgp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); ++ if (lgp->res.layout.buf == NULL) { ++ nfs4_layoutget_release(lgp); ++ return -ENOMEM; ++ } ++ ++ lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = lgp->status; ++ if (status != 0) ++ goto out; ++ status = pnfs_layout_process(lgp); ++out: ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) ++{ ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, _nfs4_proc_layoutget(lgp), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data) ++{ ++ struct nfs4_layoutcommit_data *ldata = ++ (struct nfs4_layoutcommit_data *)data; ++ struct nfs_server *server = NFS_SERVER(ldata->args.inode); ++ ++ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args, ++ &ldata->res.seq_res, 
1, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void ++nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutcommit_data *data = ++ (struct nfs4_layoutcommit_data *)calldata; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ data->status = task->tk_status; ++} ++ ++static void nfs4_layoutcommit_release(void *lcdata) ++{ ++ struct nfs4_layoutcommit_data *data = ++ (struct nfs4_layoutcommit_data *)lcdata; ++ ++ put_rpccred(data->cred); ++ pnfs_cleanup_layoutcommit(lcdata); ++ pnfs_layoutcommit_free(lcdata); ++ /* Matched by get_layout in pnfs_layoutcommit_inode */ ++ put_layout(data->args.inode); ++} ++ ++static const struct rpc_call_ops nfs4_layoutcommit_ops = { ++ .rpc_call_prepare = nfs4_layoutcommit_prepare, ++ .rpc_call_done = nfs4_layoutcommit_done, ++ .rpc_release = nfs4_layoutcommit_release, ++}; ++ ++/* Execute a layoutcommit to the server */ ++static int ++_nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync) ++{ ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], ++ .rpc_argp = &data->args, ++ .rpc_resp = &data->res, ++ .rpc_cred = data->cred, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .task = &data->task, ++ .rpc_client = NFS_CLIENT(data->args.inode), ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutcommit_ops, ++ .callback_data = data, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ struct rpc_task *task; ++ int status = 0; ++ ++ dprintk("NFS: %4d initiating layoutcommit call. 
%llu@%llu lbw: %llu " ++ "type: %d issync %d\n", ++ data->task.tk_pid, ++ data->args.range.length, ++ data->args.range.offset, ++ data->args.lastbytewritten, ++ data->args.layout_type, issync); ++ ++ data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = data->status; ++out: ++ dprintk("%s: status %d\n", __func__, status); ++ rpc_put_task(task); ++ return 0; ++} ++ ++int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync) ++{ ++ struct nfs4_exception exception = { }; ++ struct nfs_server *server = NFS_SERVER(data->args.inode); ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_proc_layoutcommit(data, issync), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static void ++nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args, ++ &lrp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lrp->res.seq_res)) ++ return; ++ ++ if (RPC_ASSASSINATED(task)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void nfs4_layoutreturn_release(void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ struct pnfs_layout_hdr *lo = 
NFS_I(lrp->args.inode)->layout; ++ ++ dprintk("--> %s return_type %d lo %p\n", __func__, ++ lrp->args.return_type, lo); ++ ++ if (lrp->args.return_type == RETURN_FILE) { ++ if (!lrp->res.lrs_present) ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ pnfs_layout_release(lo, &lrp->args.range); ++ } ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { ++ .rpc_call_prepare = nfs4_layoutreturn_prepare, ++ .rpc_call_done = nfs4_layoutreturn_done, ++ .rpc_release = nfs4_layoutreturn_release, ++}; ++ ++int _nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync) ++{ ++ struct inode *ino = lrp->args.inode; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], ++ .rpc_argp = &lrp->args, ++ .rpc_resp = &lrp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = server->client, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutreturn_call_ops, ++ .callback_data = lrp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = task->tk_status; ++out: ++ dprintk("<-- %s\n", __func__); ++ rpc_put_task(task); ++ return status; ++} ++ ++int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync) ++{ ++ struct nfs_server *server = NFS_SERVER(lrp->args.inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_proc_layoutreturn(lrp, issync), ++ &exception); ++ } while (exception.retry); ++ ++ return err; ++} ++ ++/* ++ * Retrieve the list of Data Server devices from the MDS. 
++ */ ++static int _nfs4_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_getdevicelist_args args = { ++ .fh = fh, ++ .layoutclass = server->pnfs_curr_ld->id, ++ }; ++ struct nfs4_getdevicelist_res res = { ++ .devlist = devlist, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], ++ .rpc_argp = &args, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++int nfs4_proc_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_getdevicelist(server, fh, devlist), ++ &exception); ++ } while (exception.retry); ++ ++ dprintk("nfs4_pnfs_getdevlist: err=%d, num_devs=%u\n", ++ err, devlist->num_devs); ++ ++ return err; ++} ++ ++int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) ++{ ++ struct nfs4_getdeviceinfo_args args = { ++ .pdev = pdev, ++ }; ++ struct nfs4_getdeviceinfo_res res = { ++ .pdev = pdev, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], ++ .rpc_argp = &args, ++ .rpc_resp = &res, ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ ++ return status; ++} ++ + #endif /* CONFIG_NFS_V4_1 */ + + struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { +@@ -5327,28 +5909,30 @@ struct nfs4_state_maintenance_ops nfs41_ + }; + #endif + +-/* +- * Per minor version reboot and network partition recovery ops +- */ +- +-struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = { +- &nfs40_reboot_recovery_ops, +-#if 
defined(CONFIG_NFS_V4_1) +- &nfs41_reboot_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { ++ .minor_version = 0, ++ .call_sync = _nfs4_call_sync, ++ .validate_stateid = nfs4_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs40_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs40_nograce_recovery_ops, ++ .state_renewal_ops = &nfs40_state_renewal_ops, + }; + +-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = { +- &nfs40_nograce_recovery_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_nograce_recovery_ops, +-#endif ++static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { ++ .minor_version = 1, ++ .call_sync = _nfs4_call_sync_session, ++ .validate_stateid = nfs41_validate_delegation_stateid, ++ .reboot_recovery_ops = &nfs41_reboot_recovery_ops, ++ .nograce_recovery_ops = &nfs41_nograce_recovery_ops, ++ .state_renewal_ops = &nfs41_state_renewal_ops, + }; ++#endif + +-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { +- &nfs40_state_renewal_ops, ++const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { ++ [0] = &nfs_v4_0_minor_ops, + #if defined(CONFIG_NFS_V4_1) +- &nfs41_state_renewal_ops, ++ [1] = &nfs_v4_1_minor_ops, + #endif + }; + +@@ -5366,6 +5950,7 @@ const struct nfs_rpc_ops nfs_v4_clientop + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, + .file_inode_ops = &nfs4_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs4_proc_get_root, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c +--- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-09-30 10:17:08.679993000 -0400 +@@ -54,17 +54,17 @@ + void + nfs4_renew_state(struct work_struct *work) + { +- struct nfs4_state_maintenance_ops *ops; ++ const struct 
nfs4_state_maintenance_ops *ops; + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_renewd.work); + struct rpc_cred *cred; + long lease; + unsigned long last, now; + +- ops = nfs4_state_renewal_ops[clp->cl_minorversion]; ++ ops = clp->cl_mvops->state_renewal_ops; + dprintk("%s: start\n", __func__); + /* Are there any active superblocks? */ +- if (list_empty(&clp->cl_superblocks)) ++ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) + goto out; + spin_lock(&clp->cl_lock); + lease = clp->cl_lease_time; +diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c +--- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-09-30 10:15:17.863715000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-09-30 10:17:08.685993000 -0400 +@@ -48,11 +48,13 @@ + #include + #include + #include ++#include + + #include "nfs4_fs.h" + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include "pnfs.h" + + #define OPENOWNER_POOL_SIZE 8 + +@@ -126,6 +128,11 @@ static int nfs41_setup_state_renewal(str + int status; + struct nfs_fsinfo fsinfo; + ++ if (is_ds_only_client(clp)) { ++ nfs4_schedule_state_renewal(clp); ++ return 0; ++ } ++ + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ +@@ -145,7 +152,9 @@ static void nfs4_end_drain_session(struc + struct nfs4_session *ses = clp->cl_session; + int max_slots; + +- if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { ++ if (ses == NULL) ++ return; ++ if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + spin_lock(&ses->fc_slot_table.slot_tbl_lock); + max_slots = ses->fc_slot_table.max_slots; + while (max_slots--) { +@@ -167,7 +176,7 @@ static int nfs4_begin_drain_session(stru + struct nfs4_slot_table *tbl = &ses->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); ++ set_bit(NFS4_SESSION_DRAINING, 
&ses->session_state); + if (tbl->highest_used_slotid != -1) { + INIT_COMPLETION(ses->complete); + spin_unlock(&tbl->slot_tbl_lock); +@@ -371,7 +380,6 @@ nfs4_alloc_state_owner(void) + return NULL; + spin_lock_init(&sp->so_lock); + INIT_LIST_HEAD(&sp->so_states); +- INIT_LIST_HEAD(&sp->so_delegations); + rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); + sp->so_seqid.sequence = &sp->so_sequence; + spin_lock_init(&sp->so_sequence.lock); +@@ -384,7 +392,7 @@ static void + nfs4_drop_state_owner(struct nfs4_state_owner *sp) + { + if (!RB_EMPTY_NODE(&sp->so_client_node)) { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + rb_erase(&sp->so_client_node, &clp->cl_state_owners); +@@ -406,7 +414,6 @@ struct nfs4_state_owner *nfs4_get_state_ + new = nfs4_alloc_state_owner(); + if (new == NULL) + return NULL; +- new->so_client = clp; + new->so_server = server; + new->so_cred = cred; + spin_lock(&clp->cl_lock); +@@ -423,7 +430,7 @@ struct nfs4_state_owner *nfs4_get_state_ + + void nfs4_put_state_owner(struct nfs4_state_owner *sp) + { +- struct nfs_client *clp = sp->so_client; ++ struct nfs_client *clp = sp->so_server->nfs_client; + struct rpc_cred *cred = sp->so_cred; + + if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) +@@ -583,8 +590,24 @@ static void __nfs4_close(struct path *pa + if (!call_close) { + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); +- } else ++ } else { ++ u32 roc_iomode; ++ struct nfs_inode *nfsi = NFS_I(state->inode); ++ ++ if (has_layout(nfsi) && ++ (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) { ++ struct pnfs_layout_range range = { ++ .iomode = roc_iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ pnfs_return_layout(state->inode, &range, NULL, ++ RETURN_FILE, wait); ++ } ++ + nfs4_do_close(path, state, gfp_mask, wait); ++ } + } + + void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) +@@ -602,12 
+625,21 @@ void nfs4_close_sync(struct path *path, + * that is compatible with current->files + */ + static struct nfs4_lock_state * +-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *pos; + list_for_each_entry(pos, &state->lock_states, ls_locks) { +- if (pos->ls_owner != fl_owner) ++ if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) + continue; ++ switch (pos->ls_owner.lo_type) { ++ case NFS4_POSIX_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.posix_owner != fl_owner) ++ continue; ++ break; ++ case NFS4_FLOCK_LOCK_TYPE: ++ if (pos->ls_owner.lo_u.flock_owner != fl_pid) ++ continue; ++ } + atomic_inc(&pos->ls_count); + return pos; + } +@@ -619,10 +651,10 @@ __nfs4_find_lock_state(struct nfs4_state + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) + { + struct nfs4_lock_state *lsp; +- struct nfs_client *clp = state->owner->so_client; ++ struct nfs_client *clp = state->owner->so_server->nfs_client; + + lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) +@@ -633,7 +665,18 @@ static struct nfs4_lock_state *nfs4_allo + lsp->ls_seqid.sequence = &lsp->ls_sequence; + atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; +- lsp->ls_owner = fl_owner; ++ lsp->ls_owner.lo_type = type; ++ switch (lsp->ls_owner.lo_type) { ++ case NFS4_FLOCK_LOCK_TYPE: ++ lsp->ls_owner.lo_u.flock_owner = fl_pid; ++ break; ++ case NFS4_POSIX_LOCK_TYPE: ++ lsp->ls_owner.lo_u.posix_owner = fl_owner; ++ break; ++ default: ++ kfree(lsp); ++ return NULL; ++ } + spin_lock(&clp->cl_lock); + nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); + spin_unlock(&clp->cl_lock); +@@ -643,7 +686,7 @@ 
static struct nfs4_lock_state *nfs4_allo + + static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) + { +- struct nfs_client *clp = lsp->ls_state->owner->so_client; ++ struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; + + spin_lock(&clp->cl_lock); + nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); +@@ -657,13 +700,13 @@ static void nfs4_free_lock_state(struct + * exists, return an uninitialized one. + * + */ +-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) ++static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) + { + struct nfs4_lock_state *lsp, *new = NULL; + + for(;;) { + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, owner); ++ lsp = __nfs4_find_lock_state(state, owner, pid, type); + if (lsp != NULL) + break; + if (new != NULL) { +@@ -674,7 +717,7 @@ static struct nfs4_lock_state *nfs4_get_ + break; + } + spin_unlock(&state->state_lock); +- new = nfs4_alloc_lock_state(state, owner); ++ new = nfs4_alloc_lock_state(state, owner, pid, type); + if (new == NULL) + return NULL; + } +@@ -701,6 +744,8 @@ void nfs4_put_lock_state(struct nfs4_loc + if (list_empty(&state->lock_states)) + clear_bit(LK_STATE_IN_USE, &state->flags); + spin_unlock(&state->state_lock); ++ if (lsp->ls_flags & NFS_LOCK_INITIALIZED) ++ nfs4_release_lockowner(lsp); + nfs4_free_lock_state(lsp); + } + +@@ -728,7 +773,12 @@ int nfs4_set_lock_state(struct nfs4_stat + + if (fl->fl_ops != NULL) + return 0; +- lsp = nfs4_get_lock_state(state, fl->fl_owner); ++ if (fl->fl_flags & FL_POSIX) ++ lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); ++ else if (fl->fl_flags & FL_FLOCK) ++ lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); ++ else ++ return -EINVAL; + if (lsp == NULL) + return -ENOMEM; + fl->fl_u.nfs4_fl.owner = lsp; +@@ -740,7 +790,7 @@ int nfs4_set_lock_state(struct nfs4_stat + * 
Byte-range lock aware utility to initialize the stateid of read/write + * requests. + */ +-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) ++void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) + { + struct nfs4_lock_state *lsp; + int seq; +@@ -753,7 +803,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst + return; + + spin_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, fl_owner); ++ lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); + if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + spin_unlock(&state->state_lock); +@@ -1031,8 +1081,8 @@ restart: + * Open state on this file cannot be recovered + * All we can do is revert to using the zero stateid. + */ +- memset(state->stateid.data, 0, +- sizeof(state->stateid.data)); ++ memset(state->stateid.u.data, 0, ++ sizeof(state->stateid.u.data)); + /* Mark the file as being 'closed' */ + state->state = 0; + break; +@@ -1041,11 +1091,11 @@ restart: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: +- nfs4_state_mark_reclaim_nograce(sp->so_client, state); ++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: +@@ -1120,8 +1170,7 @@ static void nfs4_state_end_reclaim_reboo + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return; + +- nfs4_reclaim_complete(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); + + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { + sp = rb_entry(pos, struct 
nfs4_state_owner, so_client_node); +@@ -1211,8 +1260,8 @@ restart: + static int nfs4_check_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_maintenance_ops *ops = +- nfs4_state_renewal_ops[clp->cl_minorversion]; ++ const struct nfs4_state_maintenance_ops *ops = ++ clp->cl_mvops->state_renewal_ops; + int status = -NFS4ERR_EXPIRED; + + /* Is the client already known to have an expired lease? */ +@@ -1235,8 +1284,8 @@ out: + static int nfs4_reclaim_lease(struct nfs_client *clp) + { + struct rpc_cred *cred; +- struct nfs4_state_recovery_ops *ops = +- nfs4_reboot_recovery_ops[clp->cl_minorversion]; ++ const struct nfs4_state_recovery_ops *ops = ++ clp->cl_mvops->reboot_recovery_ops; + int status = -ENOENT; + + cred = ops->get_clid_cred(clp); +@@ -1421,6 +1470,7 @@ static void nfs4_state_manager(struct nf + } + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); ++ pnfs_destroy_all_layouts(clp); + } + + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { +@@ -1444,7 +1494,7 @@ static void nfs4_state_manager(struct nf + /* First recover reboot state... */ + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_reboot_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->reboot_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + continue; +@@ -1458,7 +1508,7 @@ static void nfs4_state_manager(struct nf + /* Now recover expired state... 
*/ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, +- nfs4_nograce_recovery_ops[clp->cl_minorversion]); ++ clp->cl_mvops->nograce_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || + test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) +diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c +--- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-09-30 10:15:17.872720000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-09-30 10:17:08.709998000 -0400 +@@ -50,8 +50,10 @@ + #include + #include + #include ++#include + #include "nfs4_fs.h" + #include "internal.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_XDR + +@@ -89,7 +91,7 @@ static int nfs4_stat_to_errno(int); + #define encode_getfh_maxsz (op_encode_hdr_maxsz) + #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +-#define nfs4_fattr_bitmap_maxsz 3 ++#define nfs4_fattr_bitmap_maxsz 4 + #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) + #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +@@ -111,7 +113,11 @@ static int nfs4_stat_to_errno(int); + #define encode_restorefh_maxsz (op_encode_hdr_maxsz) + #define decode_restorefh_maxsz (op_decode_hdr_maxsz) + #define encode_fsinfo_maxsz (encode_getattr_maxsz) +-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) ++/* The 5 accounts for the PNFS attributes, and assumes that at most three ++ * layout types will be returned. 
++ */ ++#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ ++ nfs4_fattr_bitmap_maxsz + 8 + 5) + #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) + #define decode_renew_maxsz (op_decode_hdr_maxsz) + #define encode_setclientid_maxsz \ +@@ -202,14 +208,17 @@ static int nfs4_stat_to_errno(int); + #define encode_link_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) + #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) ++#define encode_lockowner_maxsz (7) + #define encode_lock_maxsz (op_encode_hdr_maxsz + \ + 7 + \ +- 1 + encode_stateid_maxsz + 8) ++ 1 + encode_stateid_maxsz + 1 + \ ++ encode_lockowner_maxsz) + #define decode_lock_denied_maxsz \ + (8 + decode_lockowner_maxsz) + #define decode_lock_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +-#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) ++#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ ++ encode_lockowner_maxsz) + #define decode_lockt_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) + #define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ +@@ -217,6 +226,11 @@ static int nfs4_stat_to_errno(int); + 4) + #define decode_locku_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) ++#define encode_release_lockowner_maxsz \ ++ (op_encode_hdr_maxsz + \ ++ encode_lockowner_maxsz) ++#define decode_release_lockowner_maxsz \ ++ (op_decode_hdr_maxsz) + #define encode_access_maxsz (op_encode_hdr_maxsz + 1) + #define decode_access_maxsz (op_decode_hdr_maxsz + 2) + #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ +@@ -302,6 +316,35 @@ static int nfs4_stat_to_errno(int); + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) + #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) + #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) ++#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ ++ encode_verifier_maxsz) ++#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + \ ++ decode_verifier_maxsz + \ ++ 
XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ ++ NFS4_PNFS_DEVICEID4_SIZE)) ++#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ ++ XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE)) ++#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ ++ 4 /*layout type */ + \ ++ 4 /* opaque devaddr4 length */ +\ ++ 4 /* notification bitmap length */ + \ ++ 4 /* notification bitmap */) ++#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ ++ encode_stateid_maxsz) ++#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ ++ decode_stateid_maxsz + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) ++#define encode_layoutcommit_maxsz (18 + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ ++ op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz) ++#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz) ++#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz + \ ++ 1 /* FIXME: opaque lrf_body always empty at ++ *the moment */) ++#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ ++ 1 + decode_stateid_maxsz) + #else /* CONFIG_NFS_V4_1 */ + #define encode_sequence_maxsz 0 + #define decode_sequence_maxsz 0 +@@ -471,6 +514,12 @@ static int nfs4_stat_to_errno(int); + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_locku_maxsz) ++#define NFS4_enc_release_lockowner_sz \ ++ (compound_encode_hdr_maxsz + \ ++ encode_lockowner_maxsz) ++#define NFS4_dec_release_lockowner_sz \ ++ (compound_decode_hdr_maxsz + \ ++ decode_lockowner_maxsz) + #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ +@@ -685,6 +734,60 @@ static int nfs4_stat_to_errno(int); + #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) ++#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_getdevicelist_maxsz) ++#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + 
\ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_getdevicelist_maxsz) ++#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_getdeviceinfo_maxsz) ++#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_getdeviceinfo_maxsz) ++#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutget_maxsz) ++#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutget_maxsz) ++#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_layoutcommit_maxsz + \ ++ encode_getattr_maxsz) ++#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutcommit_maxsz + \ ++ decode_getattr_maxsz) ++#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutreturn_maxsz) ++#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutreturn_maxsz) ++#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_write_maxsz) ++#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_write_maxsz) ++#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_commit_maxsz) ++#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_commit_maxsz) + + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + +@@ -915,7 +1018,7 @@ static void encode_close(struct xdr_stre + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); 
+ *p++ = cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); +- xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_close_maxsz; + } +@@ -989,6 +1092,35 @@ static void encode_getattr_two(struct xd + hdr->replen += decode_getattr_maxsz; + } + ++static void ++encode_getattr_three(struct xdr_stream *xdr, ++ uint32_t bm0, uint32_t bm1, uint32_t bm2, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_GETATTR); ++ if (bm2) { ++ p = reserve_space(xdr, 16); ++ *p++ = cpu_to_be32(3); ++ *p++ = cpu_to_be32(bm0); ++ *p++ = cpu_to_be32(bm1); ++ *p = cpu_to_be32(bm2); ++ } else if (bm1) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(2); ++ *p++ = cpu_to_be32(bm0); ++ *p = cpu_to_be32(bm1); ++ } else { ++ p = reserve_space(xdr, 8); ++ *p++ = cpu_to_be32(1); ++ *p = cpu_to_be32(bm0); ++ } ++ hdr->nops++; ++ hdr->replen += decode_getattr_maxsz; ++} ++ + static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { + encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], +@@ -997,8 +1129,11 @@ static void encode_getfattr(struct xdr_s + + static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { +- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], +- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); ++ encode_getattr_three(xdr, ++ bitmask[0] & nfs4_fsinfo_bitmap[0], ++ bitmask[1] & nfs4_fsinfo_bitmap[1], ++ bitmask[2] & nfs4_fsinfo_bitmap[2], ++ hdr); + } + + static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +@@ -1042,6 +1177,17 @@ static inline uint64_t nfs4_lock_length( + return fl->fl_end - fl->fl_start + 1; + } + ++static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) ++{ ++ __be32 *p; ++ ++ p = 
reserve_space(xdr, 28); ++ p = xdr_encode_hyper(p, lowner->clientid); ++ *p++ = cpu_to_be32(16); ++ p = xdr_encode_opaque_fixed(p, "lock id:", 8); ++ xdr_encode_hyper(p, lowner->id); ++} ++ + /* + * opcode,type,reclaim,offset,length,new_lock_owner = 32 + * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 +@@ -1058,18 +1204,16 @@ static void encode_lock(struct xdr_strea + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); + if (args->new_lock_owner){ +- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); ++ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->open_stateid->u.data, ++ NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + } + else { + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); +- p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->lock_stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); + } + hdr->nops++; +@@ -1080,15 +1224,12 @@ static void encode_lockt(struct xdr_stre + { + __be32 *p; + +- p = reserve_space(xdr, 52); ++ p = reserve_space(xdr, 24); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); +- p = xdr_encode_hyper(p, args->lock_owner.clientid); +- *p++ = cpu_to_be32(16); +- p = xdr_encode_opaque_fixed(p, "lock id:", 8); +- xdr_encode_hyper(p, args->lock_owner.id); ++ encode_lockowner(xdr, &args->lock_owner); + hdr->nops++; + 
hdr->replen += decode_lockt_maxsz; + } +@@ -1101,13 +1242,25 @@ static void encode_locku(struct xdr_stre + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); +- p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, args->stateid->u.data, ++ NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + hdr->nops++; + hdr->replen += decode_locku_maxsz; + } + ++static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); ++ encode_lockowner(xdr, lowner); ++ hdr->nops++; ++ hdr->replen += decode_release_lockowner_maxsz; ++} ++ + static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) + { + int len = name->len; +@@ -1172,7 +1325,7 @@ static inline void encode_createmode(str + break; + default: + clp = arg->server->nfs_client; +- if (clp->cl_minorversion > 0) { ++ if (clp->cl_mvops->minor_version > 0) { + if (nfs4_has_persistent_session(clp)) { + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); +@@ -1251,7 +1404,7 @@ static inline void encode_claim_delegate + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + encode_string(xdr, name->len, name->name); + } + +@@ -1282,7 +1435,7 @@ static void encode_open_confirm(struct x + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = 
cpu_to_be32(arg->seqid->sequence->counter); + hdr->nops++; + hdr->replen += decode_open_confirm_maxsz; +@@ -1294,7 +1447,7 @@ static void encode_open_downgrade(struct + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); +- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); ++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_share_access(xdr, arg->fmode); + hdr->nops++; +@@ -1324,17 +1477,17 @@ static void encode_putrootfh(struct xdr_ + hdr->replen += decode_putrootfh_maxsz; + } + +-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) ++static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) + { + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, NFS4_STATEID_SIZE); + if (ctx->state != NULL) { +- nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); +- xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); ++ nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); ++ xdr_encode_opaque_fixed(p, stateid.u.data, NFS4_STATEID_SIZE); + } else +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + } + + static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +@@ -1344,7 +1497,7 @@ static void encode_read(struct xdr_strea + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); +@@ -1448,7 +1601,7 @@ encode_setacl(struct xdr_stream *xdr, st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 
++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); +@@ -1479,7 +1632,7 @@ static void encode_setattr(struct xdr_st + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); +- xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, arg->stateid.u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_setattr_maxsz; + encode_attrs(xdr, arg->iap, server); +@@ -1523,7 +1676,7 @@ static void encode_write(struct xdr_stre + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); + +- encode_stateid(xdr, args->context); ++ encode_stateid(xdr, args->context, args->lock_context); + + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); +@@ -1542,7 +1695,7 @@ static void encode_delegreturn(struct xd + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + + *p++ = cpu_to_be32(OP_DELEGRETURN); +- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); ++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_delegreturn_maxsz; + } +@@ -1696,6 +1849,162 @@ static void encode_sequence(struct xdr_s + #endif /* CONFIG_NFS_V4_1 */ + } + ++#ifdef CONFIG_NFS_V4_1 ++static void ++encode_getdevicelist(struct xdr_stream *xdr, ++ const struct nfs4_getdevicelist_args *args, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ nfs4_verifier dummy = { ++ .data = "dummmmmy", ++ }; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_GETDEVICELIST); ++ *p++ = cpu_to_be32(args->layoutclass); ++ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); ++ xdr_encode_hyper(p, 0ULL); /* cookie */ ++ encode_nfs4_verifier(xdr, &dummy); ++ hdr->nops++; ++} ++ ++static void ++encode_getdeviceinfo(struct xdr_stream *xdr, ++ const struct nfs4_getdeviceinfo_args *args, ++ struct compound_hdr *hdr) ++{ ++ int has_bitmap = (args->pdev->dev_notify_types != 0); ++ 
int len = 16 + NFS4_PNFS_DEVICEID4_SIZE + (has_bitmap * 4); ++ __be32 *p; ++ ++ p = reserve_space(xdr, len); ++ *p++ = cpu_to_be32(OP_GETDEVICEINFO); ++ p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ *p++ = cpu_to_be32(args->pdev->layout_type); ++ *p++ = cpu_to_be32(args->pdev->pglen + len); /* gdia_maxcount */ ++ *p++ = cpu_to_be32(has_bitmap); /* bitmap length [01] */ ++ if (has_bitmap) ++ *p = cpu_to_be32(args->pdev->dev_notify_types); ++ hdr->nops++; ++} ++ ++static void ++encode_layoutget(struct xdr_stream *xdr, ++ const struct nfs4_layoutget_args *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(OP_LAYOUTGET); ++ *p++ = cpu_to_be32(0); /* Signal layout available */ ++ *p++ = cpu_to_be32(args->type); ++ *p++ = cpu_to_be32(args->range.iomode); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ p = xdr_encode_hyper(p, args->minlength); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, NFS4_STATEID_SIZE); ++ *p = cpu_to_be32(args->maxcount); ++ ++ dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", ++ __func__, ++ args->type, ++ args->range.iomode, ++ (unsigned long)args->range.offset, ++ (unsigned long)args->range.length, ++ args->maxcount); ++ hdr->nops++; ++ hdr->replen += decode_layoutget_maxsz; ++} ++ ++static int ++encode_layoutcommit(struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args, ++ struct compound_hdr *hdr) ++{ ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ __be32 *p; ++ ++ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__, ++ args->range.length, args->range.offset, args->lastbytewritten, ++ args->layout_type); ++ ++ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE); ++ *p++ = 
cpu_to_be32(OP_LAYOUTCOMMIT); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ *p++ = cpu_to_be32(0); /* reclaim */ ++ p = xdr_encode_opaque_fixed(p, args->stateid.u.data, NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(1); /* newoffset = TRUE */ ++ p = xdr_encode_hyper(p, args->lastbytewritten); ++ *p = cpu_to_be32(args->time_modify_changed != 0); ++ if (args->time_modify_changed) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(0); ++ *p++ = cpu_to_be32(args->time_modify.tv_sec); ++ *p = cpu_to_be32(args->time_modify.tv_nsec); ++ } ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(args->layout_type); ++ ++ if (ld_io_ops->encode_layoutcommit) { ++ ld_io_ops->encode_layoutcommit(NFS_I(args->inode)->layout, ++ xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ xdr_encode_opaque(p, NULL, 0); ++ } ++ ++ hdr->nops++; ++ hdr->replen += decode_layoutcommit_maxsz; ++ return 0; ++} ++ ++static void ++encode_layoutreturn(struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_LAYOUTRETURN); ++ *p++ = cpu_to_be32(args->reclaim); ++ *p++ = cpu_to_be32(args->layout_type); ++ *p++ = cpu_to_be32(args->range.iomode); ++ *p = cpu_to_be32(args->return_type); ++ if (args->return_type == RETURN_FILE) { ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ ++ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, ++ NFS4_STATEID_SIZE); ++ dprintk("%s: call %pF\n", __func__, ++ ld_io_ops->encode_layoutreturn); ++ if (ld_io_ops->encode_layoutreturn) { ++ ld_io_ops->encode_layoutreturn( ++ NFS_I(args->inode)->layout, xdr, args); ++ } 
else { ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(0); ++ } ++ } ++ hdr->nops++; ++ hdr->replen += decode_layoutreturn_maxsz; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" ENCODE ROUTINES. + */ +@@ -1704,7 +2013,7 @@ static u32 nfs4_xdr_minorversion(const s + { + #if defined(CONFIG_NFS_V4_1) + if (args->sa_session) +- return args->sa_session->clp->cl_minorversion; ++ return args->sa_session->clp->cl_mvops->minor_version; + #endif /* CONFIG_NFS_V4_1 */ + return 0; + } +@@ -2048,6 +2357,20 @@ static int nfs4_xdr_enc_locku(struct rpc + return 0; + } + ++static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = 0, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_release_lockowner(&xdr, &args->lock_owner, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ + /* + * Encode a READLINK request + */ +@@ -2330,7 +2653,7 @@ static int nfs4_xdr_enc_setclientid_conf + struct compound_hdr hdr = { + .nops = 0, + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2395,7 +2718,7 @@ static int nfs4_xdr_enc_exchange_id(stru + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2413,7 +2736,7 @@ static int nfs4_xdr_enc_create_session(s + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2431,7 +2754,7 @@ static int nfs4_xdr_enc_destroy_session( + { + struct 
xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = session->clp->cl_minorversion, ++ .minorversion = session->clp->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2469,7 +2792,7 @@ static int nfs4_xdr_enc_get_lease_time(s + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2499,6 +2822,159 @@ static int nfs4_xdr_enc_reclaim_complete + return 0; + } + ++/* ++ * Encode GETDEVICELIST request ++ */ ++static int ++nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_getdevicelist_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_getdevicelist(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode GETDEVICEINFO request ++ */ ++static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_getdeviceinfo_args *args) ++{ ++ struct xdr_stream xdr; ++ struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ int replen; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_getdeviceinfo(&xdr, args, &hdr); ++ ++ /* set up reply kvec. 
Subtract notification bitmap max size (8) ++ * so that notification bitmap is put in xdr_buf tail */ ++ replen = (RPC_REPHDRSIZE + auth->au_rslack + ++ NFS4_dec_getdeviceinfo_sz - 8) << 2; ++ xdr_inline_pages(&req->rq_rcv_buf, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", ++ __func__, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTGET request ++ */ ++static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutget_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutget(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTCOMMIT request ++ */ ++static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutcommit_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_layoutcommit(&xdr, args, &hdr); ++ encode_getfattr(&xdr, args->bitmask, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTRETURN request ++ */ ++static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutreturn_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ 
encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutreturn(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server WRITE request ++ */ ++static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_write(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server COMMIT request ++ */ ++static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_commit(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} + #endif /* CONFIG_NFS_V4_1 */ + + static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +@@ -2599,14 +3075,17 @@ static int decode_attr_bitmap(struct xdr + goto out_overflow; + bmlen = be32_to_cpup(p); + +- bitmap[0] = bitmap[1] = 0; ++ bitmap[0] = bitmap[1] = bitmap[2] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); +- if (bmlen > 1) +- bitmap[1] = be32_to_cpup(p); ++ if (bmlen > 1) { ++ bitmap[1] = be32_to_cpup(p++); ++ if (bmlen > 2) ++ bitmap[2] = be32_to_cpup(p); ++ } + } + return 0; + out_overflow: +@@ -2635,8 +3114,9 @@ static int decode_attr_supported(struct + decode_attr_bitmap(xdr, 
bitmask); + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else +- bitmask[0] = bitmask[1] = 0; +- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); ++ bitmask[0] = bitmask[1] = bitmask[2] = 0; ++ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, ++ bitmask[0], bitmask[1], bitmask[2]); + return 0; + } + +@@ -3565,7 +4045,7 @@ static int decode_opaque_fixed(struct xd + + static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) + { +- return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); ++ return decode_opaque_fixed(xdr, stateid->u.data, NFS4_STATEID_SIZE); + } + + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +@@ -3621,7 +4101,7 @@ out_overflow: + static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3647,7 +4127,7 @@ xdr_error: + static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3679,7 +4159,7 @@ xdr_error: + static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3705,7 +4185,7 @@ static int decode_getfattr(struct xdr_st + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}, ++ bitmap[3] = {0}, + type; + int status; + umode_t fmode = 0; +@@ -3824,24 +4304,101 @@ xdr_error: + return status; + } + +- +-static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * Decode potentially multiple layout types. 
Currently we only support ++ * one layout driver per file system. ++ */ ++static int decode_pnfs_list(struct xdr_stream *xdr, uint32_t *layoutclass) + { +- __be32 *savep; +- uint32_t attrlen, bitmap[2]; +- int status; ++ uint32_t *p; ++ int num; + +- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +- goto xdr_error; +- if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) +- goto xdr_error; +- if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) +- goto xdr_error; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ num = be32_to_cpup(p); + +- fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ /* pNFS is not supported by the underlying file system */ ++ if (num == 0) { ++ *layoutclass = 0; ++ return 0; ++ } + +- if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) +- goto xdr_error; ++ /* TODO: We will eventually support multiple layout drivers ? */ ++ if (num > 1) ++ printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " ++ "per filesystem not supported\n", __func__); ++ ++ /* Decode and set first layout type */ ++ p = xdr_inline_decode(xdr, num * 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ *layoutclass = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++/* ++ * The type of file system exported ++ */ ++static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *layoutclass) ++{ ++ int status = 0; ++ ++ dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); ++ if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) ++ return -EIO; ++ if (likely(bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES)) { ++ status = decode_pnfs_list(xdr, layoutclass); ++ bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; ++ } ++ return status; ++} ++ ++/* ++ * The prefered block size for layout directed io ++ */ ++static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *res) ++{ ++ __be32 *p; ++ ++ 
dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); ++ *res = 0; ++ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) { ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++ } ++ *res = be32_to_cpup(p); ++ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; ++ } ++ return 0; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ ++static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++{ ++ __be32 *savep; ++ uint32_t attrlen, bitmap[3]; ++ int status; ++ ++ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) ++ goto xdr_error; ++ ++ fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ ++ ++ if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) ++ goto xdr_error; + if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0) + goto xdr_error; + if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0) +@@ -3850,6 +4407,14 @@ static int decode_fsinfo(struct xdr_stre + if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) + goto xdr_error; + fsinfo->wtpref = fsinfo->wtmax; ++#if defined(CONFIG_NFS_V4_1) ++ status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); ++ if (status) ++ goto xdr_error; ++ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); ++ if (status) ++ goto xdr_error; ++#endif /* CONFIG_NFS_V4_1 */ + + status = verify_attr_len(xdr, savep, attrlen); + xdr_error: +@@ -3973,6 +4538,11 @@ static int decode_locku(struct xdr_strea + return status; + } + ++static int decode_release_lockowner(struct xdr_stream *xdr) ++{ ++ return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); ++} ++ + static int decode_lookup(struct xdr_stream *xdr) + { + return decode_op_hdr(xdr, OP_LOOKUP); +@@ -4333,7 +4903,7 @@ static int decode_getacl(struct xdr_stre + { + __be32 *savep; + 
uint32_t attrlen, +- bitmap[2] = {0}; ++ bitmap[3] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + +@@ -4682,6 +5252,226 @@ out_overflow: + #endif /* CONFIG_NFS_V4_1 */ + } + ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * TODO: Need to handle case when EOF != true; ++ */ ++static int decode_getdevicelist(struct xdr_stream *xdr, ++ struct pnfs_devicelist *res) ++{ ++ __be32 *p; ++ int status, i; ++ struct nfs_writeverf verftemp; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICELIST); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 8 + 8 + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ ++ /* TODO: Skip cookie for now */ ++ p += 2; ++ ++ /* Read verifier */ ++ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); ++ ++ res->num_devs = be32_to_cpup(p); ++ ++ dprintk("%s: num_dev %d\n", __func__, res->num_devs); ++ ++ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) ++ return -NFS4ERR_REP_TOO_BIG; ++ ++ p = xdr_inline_decode(xdr, ++ res->num_devs * NFS4_PNFS_DEVICEID4_SIZE + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ for (i = 0; i < res->num_devs; i++) ++ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ res->eof = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_getdeviceinfo(struct xdr_stream *xdr, ++ struct pnfs_device *pdev) ++{ ++ __be32 *p; ++ uint32_t len, type; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICEINFO); ++ if (status) { ++ if (status == -ETOOSMALL) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->mincount = be32_to_cpup(p); ++ dprintk("%s: Min count too small. 
mincnt = %u\n", ++ __func__, pdev->mincount); ++ } ++ return status; ++ } ++ ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ type = be32_to_cpup(p++); ++ if (type != pdev->layout_type) { ++ dprintk("%s: layout mismatch req: %u pdev: %u\n", ++ __func__, pdev->layout_type, type); ++ return -EINVAL; ++ } ++ /* ++ * Get the length of the opaque device_addr4. xdr_read_pages places ++ * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) ++ * and places the remaining xdr data in xdr_buf->tail ++ */ ++ pdev->mincount = be32_to_cpup(p); ++ xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ ++ ++ /* At most one bitmap word */ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ len = be32_to_cpup(p); ++ if (len) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->dev_notify_types = be32_to_cpup(p); ++ } else ++ pdev->dev_notify_types = 0; ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, ++ struct nfs4_layoutget_res *res) ++{ ++ __be32 *p; ++ int status; ++ u32 layout_count, dummy; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTGET); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->return_on_close = be32_to_cpup(p++); ++ p = xdr_decode_opaque_fixed(p, res->stateid.u.data, NFS4_STATEID_SIZE); ++ layout_count = be32_to_cpup(p); ++ if (!layout_count) { ++ dprintk("%s: server responded with empty layout array\n", ++ __func__); ++ return -EINVAL; ++ } ++ ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ p = xdr_decode_hyper(p, &res->range.offset); ++ p = xdr_decode_hyper(p, &res->range.length); ++ res->range.iomode = be32_to_cpup(p++); ++ res->type = be32_to_cpup(p++); ++ ++ status = decode_opaque_inline(xdr, 
&res->layout.len, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ ++ dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", ++ __func__, ++ (unsigned long)res->range.offset, ++ (unsigned long)res->range.length, ++ res->range.iomode, ++ res->type, ++ res->layout.len); ++ ++ /* presuambly, nfs4_proc_layoutget allocated a single page */ ++ if (res->layout.len > PAGE_SIZE) ++ return -ENOMEM; ++ memcpy(res->layout.buf, p, res->layout.len); ++ ++ /* FIXME: the whole layout array should be passed up to the pnfs ++ * client */ ++ if (layout_count > 1) { ++ dprintk("%s: server responded with %d layouts, dropping tail\n", ++ __func__, layout_count); ++ ++ while (--layout_count) { ++ p = xdr_inline_decode(xdr, 24); ++ if (unlikely(!p)) ++ goto out_overflow; ++ status = decode_opaque_inline(xdr, &dummy, (char **)&p); ++ if (unlikely(status)) ++ return status; ++ } ++ } ++ ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutreturn(struct xdr_stream *xdr, ++ struct nfs4_layoutreturn_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTRETURN); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->lrs_present = be32_to_cpup(p); ++ if (res->lrs_present) ++ status = decode_stateid(xdr, &res->stateid); ++ return status; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_layoutcommit(struct xdr_stream *xdr, ++ struct rpc_rqst *req, ++ struct nfs4_layoutcommit_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->sizechanged = be32_to_cpup(p); ++ ++ if (res->sizechanged) { ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ xdr_decode_hyper(p, &res->newsize); ++ } ++ 
return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" DECODE ROUTINES. + */ +@@ -5259,6 +6049,19 @@ out: + return status; + } + ++static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (!status) ++ status = decode_release_lockowner(&xdr); ++ return status; ++} ++ + /* + * Decode READLINK response + */ +@@ -5696,6 +6499,186 @@ static int nfs4_xdr_dec_reclaim_complete + status = decode_reclaim_complete(&xdr, (void *)NULL); + return status; + } ++ ++/* ++ * Decode GETDEVICELIST response ++ */ ++static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_getdevicelist_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ dprintk("encoding getdevicelist!\n"); ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status != 0) ++ goto out; ++ status = decode_getdevicelist(&xdr, res->devlist); ++out: ++ return status; ++} ++ ++/* ++ * Decode GETDEVINFO response ++ */ ++static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_getdeviceinfo_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_getdeviceinfo(&xdr, res->pdev); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTGET response ++ */ ++static int 
nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_layoutget_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutget(&xdr, rqstp, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTRETURN response ++ */ ++static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_layoutreturn_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutreturn(&xdr, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode LAYOUTCOMMIT response ++ */ ++static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs4_layoutcommit_res *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutcommit(&xdr, rqstp, res); ++ if (status) ++ goto out; ++ decode_getfattr(&xdr, res->fattr, res->server, ++ !RPC_IS_ASYNC(rqstp->rq_task)); ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server WRITE response ++ */ ++static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream xdr; ++ 
struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_write(&xdr, res); ++ if (!status) ++ return res->count; ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server COMMIT response ++ */ ++static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_commit(&xdr, res); ++out: ++ return status; ++} + #endif /* CONFIG_NFS_V4_1 */ + + __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +@@ -5866,6 +6849,7 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), ++ PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + #if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), +@@ -5873,6 +6857,13 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), ++ PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), ++ PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), ++ PROC(LAYOUTGET, enc_layoutget, dec_layoutget), ++ PROC(LAYOUTCOMMIT, enc_layoutcommit, 
dec_layoutcommit), ++ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), ++ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite), ++ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit), + #endif /* CONFIG_NFS_V4_1 */ + }; + +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild +--- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-09-30 10:17:08.713997000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-09-30 10:17:08.715994000 -0400 +@@ -0,0 +1,11 @@ ++# ++# Makefile for the pNFS Objects Layout Driver kernel module ++# ++objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o ++ ++# ++# Panasas pNFS Layout Driver kernel module ++# ++panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o ++obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-09-30 10:17:08.717999000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-09-30 10:17:08.719998000 -0400 +@@ -0,0 +1,1087 @@ ++/* ++ * objio_osd.c ++ * ++ * pNFS Objects layout implementation over open-osd initiator library ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++#define _LLU(x) ((unsigned long long)x) ++ ++enum { BIO_MAX_PAGES_KMALLOC = ++ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), ++}; ++ ++/* A per mountpoint struct currently for device cache */ ++struct objio_mount_type { ++ struct list_head dev_list; ++ spinlock_t dev_list_lock; ++}; ++ ++struct _dev_ent { ++ struct list_head list; ++ struct pnfs_deviceid d_id; ++ struct osd_dev *od; ++}; ++ ++static void _dev_list_remove_all(struct objio_mount_type *omt) ++{ ++ spin_lock(&omt->dev_list_lock); ++ ++ while (!list_empty(&omt->dev_list)) { ++ struct _dev_ent *de = list_entry(omt->dev_list.next, ++ struct _dev_ent, list); ++ ++ list_del_init(&de->list); ++ osduld_put_device(de->od); ++ kfree(de); ++ } ++ ++ spin_unlock(&omt->dev_list_lock); ++} ++ ++static struct osd_dev *___dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct list_head *le; ++ ++ list_for_each(le, &omt->dev_list) { ++ struct _dev_ent *de = list_entry(le, struct _dev_ent, list); ++ ++ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id))) ++ return de->od; ++ } ++ ++ return NULL; ++} ++ ++static struct osd_dev *_dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct osd_dev *od; ++ ++ spin_lock(&omt->dev_list_lock); ++ od = ___dev_list_find(omt, d_id); ++ spin_unlock(&omt->dev_list_lock); ++ return od; ++} ++ ++static int _dev_list_add(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id, struct osd_dev *od) ++{ ++ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL); ++ ++ if (!de) ++ return -ENOMEM; ++ ++ spin_lock(&omt->dev_list_lock); ++ ++ if (___dev_list_find(omt, d_id)) { ++ kfree(de); ++ goto out; ++ } ++ ++ de->d_id = *d_id; ++ de->od = od; ++ list_add(&de->list, &omt->dev_list); ++ ++out: ++ spin_unlock(&omt->dev_list_lock); ++ return 0; ++} ++ ++struct 
objio_segment { ++ struct pnfs_osd_layout *layout; ++ ++ unsigned mirrors_p1; ++ unsigned stripe_unit; ++ unsigned group_width; /* Data stripe_units without integrity comps */ ++ u64 group_depth; ++ unsigned group_count; ++ ++ unsigned num_comps; ++ /* variable length */ ++ struct osd_dev *ods[1]; ++}; ++ ++struct objio_state; ++typedef ssize_t (*objio_done_fn)(struct objio_state *ios); ++ ++struct objio_state { ++ /* Generic layer */ ++ struct objlayout_io_state ol_state; ++ ++ struct objio_segment *objio_seg; ++ ++ struct kref kref; ++ objio_done_fn done; ++ void *private; ++ ++ unsigned long length; ++ unsigned numdevs; /* Actually used devs in this IO */ ++ /* A per-device variable array of size numdevs */ ++ struct _objio_per_comp { ++ struct bio *bio; ++ struct osd_request *or; ++ unsigned long length; ++ u64 offset; ++ unsigned dev; ++ } per_dev[]; ++}; ++ ++/* Send and wait for a get_device_info of devices in the layout, ++ then look them up with the osd_initiator library */ ++static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg, unsigned comp) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ struct pnfs_osd_deviceaddr *deviceaddr; ++ struct pnfs_deviceid *d_id; ++ struct osd_dev *od; ++ struct osd_dev_info odi; ++ struct objio_mount_type *omt = PNFS_NFS_SERVER(pnfslay)->pnfs_ld_data; ++ int err; ++ ++ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id; ++ ++ od = _dev_list_find(omt, d_id); ++ if (od) ++ return od; ++ ++ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr); ++ if (unlikely(err)) { ++ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err); ++ return ERR_PTR(err); ++ } ++ ++ odi.systemid_len = deviceaddr->oda_systemid.len; ++ if (odi.systemid_len > sizeof(odi.systemid)) { ++ err = -EINVAL; ++ goto out; ++ } else if (odi.systemid_len) ++ memcpy(odi.systemid, deviceaddr->oda_systemid.data, ++ odi.systemid_len); ++ odi.osdname_len = deviceaddr->oda_osdname.len; 
++ odi.osdname = (u8 *)deviceaddr->oda_osdname.data; ++ ++ if (!odi.osdname_len && !odi.systemid_len) { ++ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", ++ __func__); ++ err = -ENODEV; ++ goto out; ++ } ++ ++ od = osduld_info_lookup(&odi); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ dprintk("%s: osduld_info_lookup => %d\n", __func__, err); ++ goto out; ++ } ++ ++ _dev_list_add(omt, d_id, od); ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ objlayout_put_deviceinfo(deviceaddr); ++ return err ? ERR_PTR(err) : od; ++} ++ ++static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ unsigned i, num_comps = layout->olo_num_comps; ++ int err; ++ ++ /* lookup all devices */ ++ for (i = 0; i < num_comps; i++) { ++ struct osd_dev *od; ++ ++ od = _device_lookup(pnfslay, objio_seg, i); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ goto out; ++ } ++ objio_seg->ods[i] = od; ++ } ++ objio_seg->num_comps = num_comps; ++ err = 0; ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ return err; ++} ++ ++static int _verify_data_map(struct pnfs_osd_layout *layout) ++{ ++ struct pnfs_osd_data_map *data_map = &layout->olo_map; ++ u64 stripe_length; ++ u32 group_width; ++ ++/* FIXME: Only raid0 for now. 
if not go through MDS */ ++ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { ++ printk(KERN_ERR "Only RAID_0 for now\n"); ++ return -ENOTSUPP; ++ } ++ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { ++ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", ++ data_map->odm_num_comps, data_map->odm_mirror_cnt); ++ return -EINVAL; ++ } ++ ++ if (data_map->odm_group_width) ++ group_width = data_map->odm_group_width; ++ else ++ group_width = data_map->odm_num_comps / ++ (data_map->odm_mirror_cnt + 1); ++ ++ stripe_length = (u64)data_map->odm_stripe_unit * group_width; ++ if (stripe_length >= (1ULL << 32)) { ++ printk(KERN_ERR "Total Stripe length(0x%llx)" ++ " >= 32bit is not supported\n", _LLU(stripe_length)); ++ return -ENOTSUPP; ++ } ++ ++ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { ++ printk(KERN_ERR "Stripe Unit(0x%llx)" ++ " must be Multples of PAGE_SIZE(0x%lx)\n", ++ _LLU(data_map->odm_stripe_unit), PAGE_SIZE); ++ return -ENOTSUPP; ++ } ++ ++ return 0; ++} ++ ++int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) ++{ ++ struct objio_segment *objio_seg; ++ int err; ++ ++ err = _verify_data_map(layout); ++ if (unlikely(err)) ++ return err; ++ ++ objio_seg = kzalloc(sizeof(*objio_seg) + ++ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]), ++ GFP_KERNEL); ++ if (!objio_seg) ++ return -ENOMEM; ++ ++ objio_seg->layout = layout; ++ err = objio_devices_lookup(pnfslay, objio_seg); ++ if (err) ++ goto free_seg; ++ ++ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1; ++ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit; ++ if (layout->olo_map.odm_group_width) { ++ objio_seg->group_width = layout->olo_map.odm_group_width; ++ objio_seg->group_depth = layout->olo_map.odm_group_depth; ++ objio_seg->group_count = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1 / ++ objio_seg->group_width; ++ } else { ++ 
objio_seg->group_width = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1; ++ objio_seg->group_depth = -1; ++ objio_seg->group_count = 1; ++ } ++ ++ *outp = objio_seg; ++ return 0; ++ ++free_seg: ++ dprintk("%s: Error: return %d\n", __func__, err); ++ kfree(objio_seg); ++ *outp = NULL; ++ return err; ++} ++ ++void objio_free_lseg(void *p) ++{ ++ struct objio_segment *objio_seg = p; ++ ++ kfree(objio_seg); ++} ++ ++int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct objio_segment *objio_seg = seg; ++ struct objio_state *ios; ++ const unsigned first_size = sizeof(*ios) + ++ objio_seg->num_comps * sizeof(ios->per_dev[0]); ++ const unsigned sec_size = objio_seg->num_comps * ++ sizeof(ios->ol_state.ioerrs[0]); ++ ++ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps); ++ ios = kzalloc(first_size + sec_size, GFP_KERNEL); ++ if (unlikely(!ios)) ++ return -ENOMEM; ++ ++ ios->objio_seg = objio_seg; ++ ios->ol_state.ioerrs = ((void *)ios) + first_size; ++ ios->ol_state.num_comps = objio_seg->num_comps; ++ ++ *outp = &ios->ol_state; ++ return 0; ++} ++ ++void objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ ++ kfree(ios); ++} ++ ++enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) ++{ ++ switch (oep) { ++ case OSD_ERR_PRI_NO_ERROR: ++ return (enum pnfs_osd_errno)0; ++ ++ case OSD_ERR_PRI_CLEAR_PAGES: ++ BUG_ON(1); ++ return 0; ++ ++ case OSD_ERR_PRI_RESOURCE: ++ return PNFS_OSD_ERR_RESOURCE; ++ case OSD_ERR_PRI_BAD_CRED: ++ return PNFS_OSD_ERR_BAD_CRED; ++ case OSD_ERR_PRI_NO_ACCESS: ++ return PNFS_OSD_ERR_NO_ACCESS; ++ case OSD_ERR_PRI_UNREACHABLE: ++ return PNFS_OSD_ERR_UNREACHABLE; ++ case OSD_ERR_PRI_NOT_FOUND: ++ return PNFS_OSD_ERR_NOT_FOUND; ++ case OSD_ERR_PRI_NO_SPACE: ++ return PNFS_OSD_ERR_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case OSD_ERR_PRI_EIO: ++ return PNFS_OSD_ERR_EIO; ++ 
} ++} ++ ++static void _clear_bio(struct bio *bio) ++{ ++ struct bio_vec *bv; ++ unsigned i; ++ ++ __bio_for_each_segment(bv, bio, i, 0) { ++ unsigned this_count = bv->bv_len; ++ ++ if (likely(PAGE_SIZE == this_count)) ++ clear_highpage(bv->bv_page); ++ else ++ zero_user(bv->bv_page, bv->bv_offset, this_count); ++ } ++} ++ ++static int _io_check(struct objio_state *ios, bool is_write) ++{ ++ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; ++ int lin_ret = 0; ++ int i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_sense_info osi; ++ struct osd_request *or = ios->per_dev[i].or; ++ int ret; ++ ++ if (!or) ++ continue; ++ ++ ret = osd_req_decode_sense(or, &osi); ++ if (likely(!ret)) ++ continue; ++ ++ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { ++ /* start read offset passed endof file */ ++ BUG_ON(is_write); ++ _clear_bio(ios->per_dev[i].bio); ++ dprintk("%s: start read offset passed end of file " ++ "offset=0x%llx, length=0x%lx\n", __func__, ++ _LLU(ios->per_dev[i].offset), ++ ios->per_dev[i].length); ++ ++ continue; /* we recovered */ ++ } ++ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev, ++ osd_pri_2_pnfs_err(osi.osd_err_pri), ++ ios->per_dev[i].offset, ++ ios->per_dev[i].length, ++ is_write); ++ ++ if (osi.osd_err_pri >= oep) { ++ oep = osi.osd_err_pri; ++ lin_ret = ret; ++ } ++ } ++ ++ return lin_ret; ++} ++ ++/* ++ * Common IO state helpers. 
++ */ ++static void _io_free(struct objio_state *ios) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[i]; ++ ++ if (per_dev->or) { ++ osd_end_request(per_dev->or); ++ per_dev->or = NULL; ++ } ++ ++ if (per_dev->bio) { ++ bio_put(per_dev->bio); ++ per_dev->bio = NULL; ++ } ++ } ++} ++ ++struct osd_dev * _io_od(struct objio_state *ios, unsigned dev) ++{ ++ unsigned min_dev = ios->objio_seg->layout->olo_comps_index; ++ unsigned max_dev = min_dev + ios->ol_state.num_comps; ++ ++ BUG_ON(dev < min_dev || max_dev <= dev); ++ return ios->objio_seg->ods[dev - min_dev]; ++} ++ ++struct _striping_info { ++ u64 obj_offset; ++ u64 group_length; ++ u64 total_group_length; ++ u64 Major; ++ unsigned dev; ++ unsigned unit_off; ++}; ++ ++static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, ++ struct _striping_info *si) ++{ ++ u32 stripe_unit = ios->objio_seg->stripe_unit; ++ u32 group_width = ios->objio_seg->group_width; ++ u64 group_depth = ios->objio_seg->group_depth; ++ u32 U = stripe_unit * group_width; ++ ++ u64 T = U * group_depth; ++ u64 S = T * ios->objio_seg->group_count; ++ u64 M = div64_u64(file_offset, S); ++ ++ /* ++ G = (L - (M * S)) / T ++ H = (L - (M * S)) % T ++ */ ++ u64 LmodU = file_offset - M * S; ++ u32 G = div64_u64(LmodU, T); ++ u64 H = LmodU - G * T; ++ ++ u32 N = div_u64(H, U); ++ ++ div_u64_rem(file_offset, stripe_unit, &si->unit_off); ++ si->obj_offset = si->unit_off + (N * stripe_unit) + ++ (M * group_depth * stripe_unit); ++ ++ /* "H - (N * U)" is just "H % U" so it's bound to u32 */ ++ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; ++ si->dev *= ios->objio_seg->mirrors_p1; ++ ++ si->group_length = T - H; ++ si->total_group_length = T; ++ si->Major = M; ++} ++ ++static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, ++ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len) ++{ ++ unsigned pg = *cur_pg; ++ struct request_queue *q = ++ 
osd_request_queue(_io_od(ios, per_dev->dev)); ++ ++ per_dev->length += cur_len; ++ ++ if (per_dev->bio == NULL) { ++ unsigned stripes = ios->ol_state.num_comps / ++ ios->objio_seg->mirrors_p1; ++ unsigned pages_in_stripe = stripes * ++ (ios->objio_seg->stripe_unit / PAGE_SIZE); ++ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / ++ stripes; ++ ++ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); ++ if (unlikely(!per_dev->bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", bio_size); ++ return -ENOMEM; ++ } ++ } ++ ++ while (cur_len > 0) { ++ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); ++ unsigned added_len; ++ ++ BUG_ON(ios->ol_state.nr_pages <= pg); ++ cur_len -= pglen; ++ ++ added_len = bio_add_pc_page(q, per_dev->bio, ++ ios->ol_state.pages[pg], pglen, pgbase); ++ if (unlikely(pglen != added_len)) ++ return -ENOMEM; ++ pgbase = 0; ++ ++pg; ++ } ++ BUG_ON(cur_len); ++ ++ *cur_pg = pg; ++ return 0; ++} ++ ++static int _prepare_one_group(struct objio_state *ios, u64 length, ++ struct _striping_info *si, unsigned first_comp, ++ unsigned *last_pg) ++{ ++ unsigned stripe_unit = ios->objio_seg->stripe_unit; ++ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1; ++ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1; ++ unsigned dev = si->dev; ++ unsigned first_dev = dev - (dev % devs_in_group); ++ unsigned comp = first_comp + (dev - first_dev); ++ unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; ++ unsigned cur_pg = *last_pg; ++ int ret = 0; ++ ++ while (length) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[comp]; ++ unsigned cur_len, page_off = 0; ++ ++ if (!per_dev->length) { ++ per_dev->dev = dev; ++ if (dev < si->dev) { ++ per_dev->offset = si->obj_offset + stripe_unit - ++ si->unit_off; ++ cur_len = stripe_unit; ++ } else if (dev == si->dev) { ++ per_dev->offset = si->obj_offset; ++ cur_len = stripe_unit - si->unit_off; ++ page_off = si->unit_off & ~PAGE_MASK; ++ BUG_ON(page_off && ++ (page_off != ios->ol_state.pgbase)); ++ } else { /* dev > si->dev */ ++ per_dev->offset = si->obj_offset - si->unit_off; ++ cur_len = stripe_unit; ++ } ++ ++ if (max_comp < comp) ++ max_comp = comp; ++ ++ dev += mirrors_p1; ++ dev = (dev % devs_in_group) + first_dev; ++ } else { ++ cur_len = stripe_unit; ++ } ++ if (cur_len >= length) ++ cur_len = length; ++ ++ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, ++ cur_len); ++ if (unlikely(ret)) ++ goto out; ++ ++ comp += mirrors_p1; ++ comp = (comp % devs_in_group) + first_comp; ++ ++ length -= cur_len; ++ ios->length += cur_len; ++ } ++out: ++ ios->numdevs = max_comp + mirrors_p1; ++ *last_pg = cur_pg; ++ return ret; ++} ++ ++static int _io_rw_pagelist(struct objio_state *ios) ++{ ++ u64 length = ios->ol_state.count; ++ struct _striping_info si; ++ unsigned devs_in_group = ios->objio_seg->group_width * ++ ios->objio_seg->mirrors_p1; ++ unsigned first_comp = 0; ++ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps; ++ unsigned last_pg = 0; ++ int ret = 0; ++ ++ _calc_stripe_info(ios, ios->ol_state.offset, &si); ++ while (length) { ++ if (length < si.group_length) ++ si.group_length = length; ++ ++ ret = _prepare_one_group(ios, si.group_length, &si, first_comp, ++ &last_pg); ++ if (unlikely(ret)) ++ goto out; ++ ++ length -= si.group_length; ++ ++ si.group_length = si.total_group_length; ++ si.unit_off = 0; ++ ++si.Major; ++ si.obj_offset = si.Major * 
ios->objio_seg->stripe_unit * ++ ios->objio_seg->group_depth; ++ ++ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; ++ si.dev %= num_comps; ++ ++ first_comp += devs_in_group; ++ first_comp %= num_comps; ++ } ++ ++out: ++ if (!ios->length) ++ return ret; ++ ++ return 0; ++} ++ ++static ssize_t _sync_done(struct objio_state *ios) ++{ ++ struct completion *waiting = ios->private; ++ ++ complete(waiting); ++ return 0; ++} ++ ++static void _last_io(struct kref *kref) ++{ ++ struct objio_state *ios = container_of(kref, struct objio_state, kref); ++ ++ ios->done(ios); ++} ++ ++static void _done_io(struct osd_request *or, void *p) ++{ ++ struct objio_state *ios = p; ++ ++ kref_put(&ios->kref, _last_io); ++} ++ ++static ssize_t _io_exec(struct objio_state *ios) ++{ ++ DECLARE_COMPLETION_ONSTACK(wait); ++ ssize_t status = 0; /* sync status */ ++ unsigned i; ++ objio_done_fn saved_done_fn = ios->done; ++ bool sync = ios->ol_state.sync; ++ ++ if (sync) { ++ ios->done = _sync_done; ++ ios->private = &wait; ++ } ++ ++ kref_init(&ios->kref); ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_request *or = ios->per_dev[i].or; ++ ++ if (!or) ++ continue; ++ ++ kref_get(&ios->kref); ++ osd_execute_request_async(or, _done_io, ios); ++ } ++ ++ kref_put(&ios->kref, _last_io); ++ ++ if (sync) { ++ wait_for_completion(&wait); ++ status = saved_done_fn(ios); ++ } ++ ++ return status; ++} ++ ++/* ++ * read ++ */ ++static ssize_t _read_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, false); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) ++ status = ios->length; ++ else ++ status = ret; ++ ++ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct osd_request *or = NULL; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = per_dev->dev; ++ struct pnfs_osd_object_cred *cred = ++ 
&ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ int ret; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ ++err: ++ return ret; ++} ++ ++static ssize_t _read_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = _read_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _read_done; ++ return _io_exec(ios); /* In sync mode exec returns the io status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _read_exec(ios); ++} ++ ++/* ++ * write ++ */ ++static ssize_t _write_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, true); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) { ++ /* FIXME: should be based on the OSD's persistence model ++ * See OSD2r05 Section 4.13 Data persistence model */ ++ ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC; ++ status = ios->length; ++ } else { ++ status = ret; ++ } ++ ++ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int 
_write_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = ios->per_dev[cur_comp].dev; ++ unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1; ++ int ret; ++ ++ for (; cur_comp < last_comp; ++cur_comp, ++dev) { ++ struct osd_request *or = NULL; ++ struct pnfs_osd_object_cred *cred = ++ &ios->objio_seg->layout->olo_comps[dev]; ++ struct osd_obj_id obj = { ++ .partition = cred->oc_object_id.oid_partition_id, ++ .id = cred->oc_object_id.oid_object_id, ++ }; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ struct bio *bio; ++ ++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); ++ if (unlikely(!or)) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ per_dev->or = or; ++ ++ if (per_dev != master_dev) { ++ bio = bio_kmalloc(GFP_KERNEL, ++ master_dev->bio->bi_max_vecs); ++ if (unlikely(!bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", ++ master_dev->bio->bi_max_vecs); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ __bio_clone(bio, master_dev->bio); ++ bio->bi_bdev = NULL; ++ bio->bi_next = NULL; ++ per_dev->bio = bio; ++ per_dev->dev = dev; ++ per_dev->length = master_dev->length; ++ per_dev->offset = master_dev->offset; ++ } else { ++ bio = master_dev->bio; ++ /* FIXME: bio_set_dir() */ ++ bio->bi_rw |= (1 << BIO_RW); ++ } ++ ++ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ } ++ ++err: ++ return ret; ++} ++ ++static ssize_t _write_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = 
_write_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _write_done; ++ return _io_exec(ios); /* In sync mode exec returns the io->status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ /* TODO: ios->stable = stable; */ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _write_exec(ios); ++} ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++objlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zx\n", __func__, maxsz); ++ return maxsz; ++} ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++objlayout_get_blocksize(void) ++{ ++ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE; ++ ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations objlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. 
++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = objlayout_get_stripesize, ++ .get_blocksize = objlayout_get_blocksize, ++}; ++ ++static struct pnfs_layoutdriver_type objlayout_type = { ++ .id = LAYOUT_OSD2_OBJECTS, ++ .name = "LAYOUT_OSD2_OBJECTS", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &objlayout_policy_operations, ++}; ++ ++void *objio_init_mt(void) ++{ ++ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL); ++ ++ if (!omt) ++ return ERR_PTR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&omt->dev_list); ++ spin_lock_init(&omt->dev_list_lock); ++ return omt; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++ _dev_list_remove_all(mountid); ++ kfree(mountid); ++} ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++objlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++objlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", ++ __func__); ++} ++ ++module_init(objlayout_init); ++module_exit(objlayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-09-30 10:17:08.722997000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c 2010-09-30 10:17:08.724995000 -0400 +@@ -0,0 +1,790 @@ ++/* ++ * objlayout.c ++ * ++ * pNFS layout driver for Panasas OSDs ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. 
++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pnfs_client_operations *pnfs_client_ops; ++ ++/* ++ * Create a objlayout layout structure for the given inode and return it. 
++ */ ++static struct pnfs_layout_hdr * ++objlayout_alloc_layout(struct inode *inode) ++{ ++ struct objlayout *objlay; ++ ++ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL); ++ if (objlay) { ++ spin_lock_init(&objlay->lock); ++ INIT_LIST_HEAD(&objlay->err_list); ++ } ++ dprintk("%s: Return %p\n", __func__, objlay); ++ return &objlay->pnfs_layout; ++} ++ ++/* ++ * Free an objlayout layout structure ++ */ ++static void ++objlayout_free_layout(struct pnfs_layout_hdr *lo) ++{ ++ struct objlayout *objlay = OBJLAYOUT(lo); ++ ++ dprintk("%s: objlay %p\n", __func__, objlay); ++ ++ WARN_ON(!list_empty(&objlay->err_list)); ++ kfree(objlay); ++} ++ ++/* ++ * Unmarshall layout and store it in pnfslay. ++ */ ++static struct pnfs_layout_segment * ++objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, ++ struct nfs4_layoutget_res *lgr) ++{ ++ int status; ++ void *layout = lgr->layout.buf; ++ struct pnfs_layout_segment *lseg; ++ struct objlayout_segment *objlseg; ++ struct pnfs_osd_layout *pnfs_osd_layout; ++ ++ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout); ++ ++ BUG_ON(!layout); ++ ++ status = -ENOMEM; ++ lseg = kzalloc(sizeof(*lseg) + sizeof(*objlseg) + ++ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL); ++ if (!lseg) ++ goto err; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout); ++ ++ status = objio_alloc_lseg(&objlseg->internal, pnfslay, lseg, ++ pnfs_osd_layout); ++ if (status) ++ goto err; ++ ++ dprintk("%s: Return %p\n", __func__, lseg); ++ return lseg; ++ ++ err: ++ kfree(lseg); ++ return ERR_PTR(status); ++} ++ ++/* ++ * Free a layout segement ++ */ ++static void ++objlayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ struct objlayout_segment *objlseg; ++ ++ dprintk("%s: freeing layout segment %p\n", __func__, lseg); ++ ++ if (unlikely(!lseg)) ++ return; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++ 
objio_free_lseg(objlseg->internal); ++ kfree(lseg); ++} ++ ++/* ++ * I/O Operations ++ */ ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1 : NFS4_MAX_UINT64; ++} ++ ++static struct objlayout_io_state * ++objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ++ struct page **pages, ++ unsigned pgbase, ++ unsigned nr_pages, ++ loff_t offset, ++ size_t count, ++ struct pnfs_layout_segment *lseg, ++ void *rpcdata) ++{ ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct objlayout_io_state *state; ++ u64 lseg_end_offset; ++ size_t size_nr_pages; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ if (objio_alloc_io_state(objlseg->internal, &state)) ++ return NULL; ++ ++ BUG_ON(offset < lseg->range.offset); ++ lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length); ++ BUG_ON(offset >= lseg_end_offset); ++ if (offset + count > lseg_end_offset) { ++ count = lseg->range.length - (offset - lseg->range.offset); ++ dprintk("%s: truncated count %Zd\n", __func__, count); ++ } ++ ++ if (pgbase > PAGE_SIZE) { ++ unsigned n = pgbase >> PAGE_SHIFT; ++ ++ pgbase &= ~PAGE_MASK; ++ pages += n; ++ nr_pages -= n; ++ } ++ ++ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ BUG_ON(nr_pages < size_nr_pages); ++ if (nr_pages > size_nr_pages) ++ nr_pages = size_nr_pages; ++ ++ INIT_LIST_HEAD(&state->err_list); ++ state->lseg = lseg; ++ state->rpcdata = rpcdata; ++ state->pages = pages; ++ state->pgbase = pgbase; ++ state->nr_pages = nr_pages; ++ state->offset = offset; ++ state->count = count; ++ state->sync = 0; ++ ++ return state; ++} ++ ++static void ++objlayout_free_io_state(struct objlayout_io_state *state) ++{ ++ dprintk("%s: freeing io_state\n", 
__func__); ++ if (unlikely(!state)) ++ return; ++ ++ objio_free_io_state(state); ++} ++ ++/* ++ * I/O done common code ++ */ ++static void ++objlayout_iodone(struct objlayout_io_state *state) ++{ ++ dprintk("%s: state %p status\n", __func__, state); ++ ++ if (likely(state->status >= 0)) { ++ objlayout_free_io_state(state); ++ } else { ++ struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); ++ ++ spin_lock(&objlay->lock); ++ objlay->delta_space_valid = OBJ_DSU_INVALID; ++ list_add(&objlay->err_list, &state->err_list); ++ spin_unlock(&objlay->lock); ++ } ++} ++ ++/* ++ * objlayout_io_set_result - Set an osd_error code on a specific osd comp. ++ * ++ * The @index component IO failed (error returned from target). Register ++ * the error for later reporting at layout-return. ++ */ ++void ++objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, ++ int osd_error, u64 offset, u64 length, bool is_write) ++{ ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; ++ ++ BUG_ON(index >= state->num_comps); ++ if (osd_error) { ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(state->lseg); ++ struct pnfs_osd_layout *layout = ++ (typeof(layout))objlseg->pnfs_osd_layout; ++ ++ ioerr->oer_component = layout->olo_comps[index].oc_object_id; ++ ioerr->oer_comp_offset = offset; ++ ioerr->oer_comp_length = length; ++ ioerr->oer_iswrite = is_write; ++ ioerr->oer_errno = osd_error; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " ++ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", ++ __func__, index, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ } else { ++ /* User need not call if no error is reported */ ++ ioerr->oer_errno = 0; ++ } ++} ++ ++static void _rpc_commit_complete(struct work_struct *work) ++{ 
++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_client_ops->nfs_commit_complete(wdata); ++} ++ ++/* ++ * Commit data remotely on OSDs ++ */ ++enum pnfs_try_status ++objlayout_commit(struct nfs_write_data *wdata, int how) ++{ ++ int status = PNFS_ATTEMPTED; ++ ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ dprintk("%s: Return %d\n", __func__, status); ++ return status; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). ++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_read_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++} ++ ++void ++objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) ++{ ++ int eof = state->eof; ++ struct nfs_read_data *rdata; ++ ++ state->status = status; ++ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); ++ rdata = state->rpcdata; ++ rdata->task.tk_status = status; ++ if (status >= 0) { ++ rdata->res.count = status; ++ rdata->res.eof = eof; ++ } ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++ else { ++ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); ++ schedule_work(&rdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async reads. 
++ */ ++enum pnfs_try_status ++objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages) ++{ ++ loff_t offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct objlayout_io_state *state; ++ ssize_t status = 0; ++ loff_t eof; ++ ++ dprintk("%s: Begin inode %p offset %llu count %d\n", ++ __func__, rdata->inode, offset, (int)count); ++ ++ eof = i_size_read(rdata->inode); ++ if (unlikely(offset + count > eof)) { ++ if (offset >= eof) { ++ status = 0; ++ rdata->res.count = 0; ++ rdata->res.eof = 1; ++ goto out; ++ } ++ count = eof - offset; ++ } ++ ++ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, ++ rdata->args.pages, rdata->args.pgbase, ++ nr_pages, offset, count, ++ rdata->pdata.lseg, rdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->eof = state->offset + state->count >= eof; ++ ++ status = objio_read_pagelist(state); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ rdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). 
++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_write_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_client_ops->nfs_writelist_complete(wdata); ++} ++ ++void ++objlayout_write_done(struct objlayout_io_state *state, ssize_t status, ++ bool sync) ++{ ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s: Begin\n", __func__); ++ wdata = state->rpcdata; ++ state->status = status; ++ wdata->task.tk_status = status; ++ if (status >= 0) { ++ wdata->res.count = status; ++ wdata->verf.committed = state->committed; ++ dprintk("%s: Return status %d committed %d\n", ++ __func__, wdata->task.tk_status, ++ wdata->verf.committed); ++ } else ++ dprintk("%s: Return status %d\n", ++ __func__, wdata->task.tk_status); ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_client_ops->nfs_writelist_complete(wdata); ++ else { ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async writes. 
++ */ ++enum pnfs_try_status ++objlayout_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int how) ++{ ++ struct objlayout_io_state *state; ++ ssize_t status; ++ ++ dprintk("%s: Begin inode %p offset %llu count %u\n", ++ __func__, wdata->inode, wdata->args.offset, wdata->args.count); ++ ++ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, ++ wdata->args.pages, ++ wdata->args.pgbase, ++ nr_pages, ++ wdata->args.offset, ++ wdata->args.count, ++ wdata->pdata.lseg, wdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->sync = how & FLUSH_SYNC; ++ ++ status = objio_write_pagelist(state, how & FLUSH_STABLE); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ wdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++void ++objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct pnfs_osd_layoutupdate lou; ++ __be32 *start; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ spin_lock(&objlay->lock); ++ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); ++ lou.dsu_delta = objlay->delta_space_used; ++ objlay->delta_space_used = 0; ++ objlay->delta_space_valid = OBJ_DSU_INIT; ++ lou.olu_ioerr_flag = !list_empty(&objlay->err_list); ++ spin_unlock(&objlay->lock); ++ ++ start = xdr_reserve_space(xdr, 4); ++ ++ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ ++ dprintk("%s: Return delta_space_used %lld err %d\n", __func__, ++ lou.dsu_delta, lou.olu_ioerr_flag); ++} ++ ++static int ++err_prio(u32 oer_errno) ++{ ++ switch (oer_errno) { ++ case 0: ++ return 0; ++ ++ case PNFS_OSD_ERR_RESOURCE: ++ return OSD_ERR_PRI_RESOURCE; ++ case PNFS_OSD_ERR_BAD_CRED: ++ return OSD_ERR_PRI_BAD_CRED; ++ case PNFS_OSD_ERR_NO_ACCESS: ++ return OSD_ERR_PRI_NO_ACCESS; ++ case PNFS_OSD_ERR_UNREACHABLE: 
++ return OSD_ERR_PRI_UNREACHABLE; ++ case PNFS_OSD_ERR_NOT_FOUND: ++ return OSD_ERR_PRI_NOT_FOUND; ++ case PNFS_OSD_ERR_NO_SPACE: ++ return OSD_ERR_PRI_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case PNFS_OSD_ERR_EIO: ++ return OSD_ERR_PRI_EIO; ++ } ++} ++ ++static void ++merge_ioerr(struct pnfs_osd_ioerr *dest_err, ++ const struct pnfs_osd_ioerr *src_err) ++{ ++ u64 dest_end, src_end; ++ ++ if (!dest_err->oer_errno) { ++ *dest_err = *src_err; ++ /* accumulated device must be blank */ ++ memset(&dest_err->oer_component.oid_device_id, 0, ++ sizeof(dest_err->oer_component.oid_device_id)); ++ ++ return; ++ } ++ ++ if (dest_err->oer_component.oid_partition_id != ++ src_err->oer_component.oid_partition_id) ++ dest_err->oer_component.oid_partition_id = 0; ++ ++ if (dest_err->oer_component.oid_object_id != ++ src_err->oer_component.oid_object_id) ++ dest_err->oer_component.oid_object_id = 0; ++ ++ if (dest_err->oer_comp_offset > src_err->oer_comp_offset) ++ dest_err->oer_comp_offset = src_err->oer_comp_offset; ++ ++ dest_end = end_offset(dest_err->oer_comp_offset, ++ dest_err->oer_comp_length); ++ src_end = end_offset(src_err->oer_comp_offset, ++ src_err->oer_comp_length); ++ if (dest_end < src_end) ++ dest_end = src_end; ++ ++ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; ++ ++ if ((src_err->oer_iswrite == dest_err->oer_iswrite) && ++ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { ++ dest_err->oer_errno = src_err->oer_errno; ++ } else if (src_err->oer_iswrite) { ++ dest_err->oer_iswrite = true; ++ dest_err->oer_errno = src_err->oer_errno; ++ } ++} ++ ++static void ++encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr) ++{ ++ struct objlayout_io_state *state, *tmp; ++ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ ++ for (i = 0; i < state->num_comps; i++) { ++ struct pnfs_osd_ioerr *ioerr = 
&state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ merge_ioerr(&accumulated_err, ioerr); ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++ ++ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err)); ++} ++ ++void ++objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct objlayout_io_state *state, *tmp; ++ __be32 *start, *uninitialized_var(last_xdr); ++ ++ dprintk("%s: Begin\n", __func__); ++ start = xdr_reserve_space(xdr, 4); ++ BUG_ON(!start); ++ ++ spin_lock(&objlay->lock); ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ int res = 0; ++ ++ for (i = 0; i < state->num_comps && !res; i++) { ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ last_xdr = xdr->p; ++ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]); ++ } ++ if (unlikely(res)) { ++ /* no space for even one error descriptor */ ++ BUG_ON(last_xdr == start + 1); ++ ++ /* we've 
encountered a situation with lots and lots of ++ * errors and no space to encode them all. Use the last ++ * available slot to report the union of all the ++ * remaining errors. ++ */ ++ xdr_rewind_stream(xdr, last_xdr - ++ pnfs_osd_ioerr_xdr_sz() / 4); ++ encode_accumulated_error(objlay, xdr); ++ goto loop_done; ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++loop_done: ++ spin_unlock(&objlay->lock); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ dprintk("%s: Return\n", __func__); ++} ++ ++struct objlayout_deviceinfo { ++ struct page *page; ++ struct pnfs_osd_deviceaddr da; /* This must be last */ ++}; ++ ++/* Initialize and call nfs_getdeviceinfo, then decode and return a ++ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() ++ * should be called. ++ */ ++int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi; ++ struct pnfs_device pd; ++ struct super_block *sb; ++ struct page *page; ++ size_t sz; ++ u32 *p; ++ int err; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ pd.area = page_address(page); ++ ++ memcpy(&pd.dev_id, d_id, sizeof(*d_id)); ++ pd.layout_type = LAYOUT_OSD2_OBJECTS; ++ pd.dev_notify_types = 0; ++ pd.pages = &page; ++ pd.pgbase = 0; ++ pd.pglen = PAGE_SIZE; ++ pd.mincount = 0; ++ ++ sb = PNFS_INODE(pnfslay)->i_sb; ++ err = pnfs_client_ops->nfs_getdeviceinfo(PNFS_NFS_SERVER(pnfslay), &pd); ++ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); ++ if (err) ++ goto err_out; ++ ++ p = pd.area; ++ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p); ++ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL); ++ if (!odi) { ++ err = -ENOMEM; ++ goto err_out; ++ } ++ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); ++ odi->page = page; ++ *deviceaddr = &odi->da; ++ return 0; ++ ++err_out: ++ __free_page(page); ++ return err; ++} ++ ++void 
objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi = container_of(deviceaddr, ++ struct objlayout_deviceinfo, ++ da); ++ ++ __free_page(odi->page); ++ kfree(odi); ++} ++ ++/* ++ * Initialize a mountpoint by retrieving the list of ++ * available devices for it. ++ * Return the pnfs_mount_type structure so the ++ * pNFS_client can refer to the mount point later on. ++ */ ++static int ++objlayout_initialize_mountpoint(struct nfs_server *server, ++ const struct nfs_fh *mntfh) ++{ ++ void *data; ++ ++ data = objio_init_mt(); ++ if (IS_ERR(data)) { ++ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n", ++ __func__, PTR_ERR(data)); ++ return PTR_ERR(data); ++ } ++ server->pnfs_ld_data = data; ++ ++ dprintk("%s: Return data=%p\n", __func__, data); ++ return 0; ++} ++ ++/* ++ * Uninitialize a mountpoint ++ */ ++static int ++objlayout_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data); ++ objio_fini_mt(server->pnfs_ld_data); ++ return 0; ++} ++ ++struct layoutdriver_io_operations objlayout_io_operations = { ++ .commit = objlayout_commit, ++ .read_pagelist = objlayout_read_pagelist, ++ .write_pagelist = objlayout_write_pagelist, ++ .alloc_layout = objlayout_alloc_layout, ++ .free_layout = objlayout_free_layout, ++ .alloc_lseg = objlayout_alloc_lseg, ++ .free_lseg = objlayout_free_lseg, ++ .encode_layoutcommit = objlayout_encode_layoutcommit, ++ .encode_layoutreturn = objlayout_encode_layoutreturn, ++ .initialize_mountpoint = objlayout_initialize_mountpoint, ++ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, ++}; +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h +--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-09-30 10:17:08.727996000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h 2010-09-30 10:17:08.729004000 -0400 +@@ -0,0 +1,171 @@ ++/* ++ * 
objlayout.h ++ * ++ * Data types and function declerations for interfacing with the ++ * pNFS standard object layout driver. ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#ifndef _OBJLAYOUT_H ++#define _OBJLAYOUT_H ++ ++#include ++#include ++#include ++ ++/* ++ * in-core layout segment ++ */ ++struct objlayout_segment { ++ void *internal; /* for provider internal use */ ++ u8 pnfs_osd_layout[]; ++}; ++ ++/* ++ * per-inode layout ++ */ ++struct objlayout { ++ struct pnfs_layout_hdr pnfs_layout; ++ ++ /* for layout_commit */ ++ enum osd_delta_space_valid_enum { ++ OBJ_DSU_INIT = 0, ++ OBJ_DSU_VALID, ++ OBJ_DSU_INVALID, ++ } delta_space_valid; ++ s64 delta_space_used; /* consumed by write ops */ ++ ++ /* for layout_return */ ++ spinlock_t lock; ++ struct list_head err_list; ++}; ++ ++static inline struct objlayout * ++OBJLAYOUT(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct objlayout, pnfs_layout); ++} ++ ++/* ++ * per-I/O operation state ++ * embedded in objects provider io_state data structure ++ */ ++struct objlayout_io_state { ++ struct pnfs_layout_segment *lseg; ++ ++ struct page **pages; ++ unsigned pgbase; ++ unsigned nr_pages; ++ unsigned long count; ++ loff_t offset; ++ bool sync; ++ ++ void *rpcdata; ++ int status; /* res */ ++ int eof; /* res */ ++ int committed; /* res */ ++ ++ /* Error reporting (layout_return) */ ++ struct list_head err_list; ++ unsigned num_comps; ++ /* Pointer to array of error descriptors of size num_comps. ++ * It should contain as many entries as devices in the osd_layout ++ * that participate in the I/O. It is up to the io_engine to allocate ++ * needed space and set num_comps. 
++ */ ++ struct pnfs_osd_ioerr *ioerrs; ++}; ++ ++/* ++ * Raid engine I/O API ++ */ ++extern void *objio_init_mt(void); ++extern void objio_fini_mt(void *mt); ++ ++extern int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout); ++extern void objio_free_lseg(void *p); ++ ++extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp); ++extern void objio_free_io_state(struct objlayout_io_state *state); ++ ++extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); ++extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable); ++ ++/* ++ * callback API ++ */ ++extern void objlayout_io_set_result(struct objlayout_io_state *state, ++ unsigned index, int osd_error, ++ u64 offset, u64 length, bool is_write); ++ ++static inline void ++objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) ++{ ++ struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); ++ ++ /* If one of the I/Os errored out and the delta_space_used was ++ * invalid we render the complete report as invalid. Protocol mandate ++ * the DSU be accurate or not reported. 
++ */ ++ spin_lock(&objlay->lock); ++ if (objlay->delta_space_valid != OBJ_DSU_INVALID) { ++ objlay->delta_space_valid = OBJ_DSU_VALID; ++ objlay->delta_space_used += space_used; ++ } ++ spin_unlock(&objlay->lock); ++} ++ ++extern void objlayout_read_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++extern void objlayout_write_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++ ++extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr); ++extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); ++ ++/* ++ * exported generic objects function vectors ++ */ ++extern struct layoutdriver_io_operations objlayout_io_operations; ++extern struct pnfs_client_operations *pnfs_client_ops; ++ ++#endif /* _OBJLAYOUT_H */ +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-09-30 10:17:08.731997000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c 2010-09-30 10:17:08.733995000 -0400 +@@ -0,0 +1,734 @@ ++/* ++ * panfs_shim.c ++ * ++ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "objlayout.h" ++#include "panfs_shim.h" ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct panfs_export_operations *panfs_export_ops; ++ ++void * ++objio_init_mt(void) ++{ ++ return panfs_export_ops == NULL ? 
ERR_PTR(-EAGAIN) : NULL; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++} ++ ++static int ++panfs_shim_conv_raid01(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) { ++ hdr->type = PAN_AGG_RAID1; ++ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1; ++ } else if (layout->olo_num_comps > 1) { ++ hdr->type = PAN_AGG_RAID0; ++ hdr->hdr.raid0.num_comps = layout->olo_num_comps; ++ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit; ++ } else ++ hdr->type = PAN_AGG_SIMPLE; ++ return 0; ++} ++ ++static int ++panfs_shim_conv_raid5(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) ++ goto err; ++ ++ if (lo_map->odm_group_width || lo_map->odm_group_depth) { ++ if (!lo_map->odm_group_width || !lo_map->odm_group_depth) ++ goto err; ++ ++ hdr->type = PAN_AGG_GRP_RAID5_LEFT; ++ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit; ++ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width; ++ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth; ++ /* this is a guess, panasas server is not supposed to ++ hand out layotu otherwise */ ++ hdr->hdr.grp_raid5_left.group_layout_policy = ++ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN; ++ } else { ++ hdr->type = PAN_AGG_RAID5_LEFT; ++ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.raid5_left.stripe_unit2 = ++ hdr->hdr.raid5_left.stripe_unit1 = ++ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit; ++ } ++ ++ return 0; ++err: ++ return -EINVAL; ++} ++ ++/* ++ * Convert a pnfs_osd data map into Panasas aggregation layout header ++ */ ++static int ++panfs_shim_conv_pnfs_osd_data_map( ++ struct pnfs_osd_layout 
*layout, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ int status = -EINVAL; ++ struct pnfs_osd_data_map *lo_map = &layout->olo_map; ++ ++ if (!layout->olo_num_comps) { ++ dprintk("%s: !!layout.n_comps(%u)\n", __func__, ++ layout->olo_num_comps); ++ goto err; ++ } ++ ++ switch (lo_map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_0 " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid01(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_5: ++ if (!lo_map->odm_group_width) { ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width " ++ "layout.n_comps(%u)!=map.n_comps(%u) " ++ "|| comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ } else if ((layout->olo_num_comps != lo_map->odm_num_comps && ++ layout->olo_num_comps > lo_map->odm_group_width) || ++ (layout->olo_comps_index % lo_map->odm_group_width)){ ++ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ lo_map->odm_group_width, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid5(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_PQ: ++ default: ++ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__, ++ lo_map->odm_raid_algorithm); ++ goto err; ++ } ++ ++ return 0; ++ ++err: ++ return status; ++} ++ ++/* ++ * Convert pnfs_osd layout into Panasas map and caps type ++ */ ++int ++objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) 
++{ ++ int i, total_comps; ++ int status; ++ struct pnfs_osd_object_cred *lo_comp; ++ pan_size_t alloc_sz, local_sz; ++ pan_sm_map_cap_t *mcs = NULL; ++ u8 *buf; ++ pan_agg_comp_obj_t *pan_comp; ++ pan_sm_sec_t *pan_sec; ++ ++ status = -EINVAL; ++ if (layout->olo_num_comps < layout->olo_map.odm_group_width) { ++ total_comps = layout->olo_comps_index + layout->olo_num_comps; ++ } else { ++ /* allocate full map, otherwise SAM gets confused */ ++ total_comps = layout->olo_map.odm_num_comps; ++ } ++ alloc_sz = total_comps * ++ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t)); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p = layout->olo_comps[i].oc_cap.cred; ++ if (panfs_export_ops->sm_sec_t_get_size_otw( ++ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL)) ++ goto err; ++ alloc_sz += local_sz; ++ } ++ ++ status = -ENOMEM; ++ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL); ++ if (!mcs) ++ goto err; ++ buf = (u8 *)&mcs[1]; ++ ++ mcs->offset = lseg->range.offset; ++ mcs->length = lseg->range.length; ++#if 0 ++ /* FIXME: for now */ ++ mcs->expiration_time.ts_sec = 0; ++ mcs->expiration_time.ts_nsec = 0; ++#endif ++ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL; ++ status = panfs_shim_conv_pnfs_osd_data_map(layout, ++ &mcs->full_map.layout_hdr); ++ if (status) ++ goto err; ++ ++ mcs->full_map.components.size = total_comps; ++ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf; ++ buf += total_comps * sizeof(pan_agg_comp_obj_t); ++ ++ mcs->secs.size = total_comps; ++ mcs->secs.data = (pan_sm_sec_t *)buf; ++ buf += total_comps * sizeof(pan_sm_sec_t); ++ ++ lo_comp = layout->olo_comps; ++ pan_comp = mcs->full_map.components.data + layout->olo_comps_index; ++ pan_sec = mcs->secs.data + layout->olo_comps_index; ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p; ++ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id; ++ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id; ++ u64 dev_id = __be64_to_cpup( ++ (__be64 
*)oc_obj_id->oid_device_id.data + 1); ++ ++ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n", ++ __func__, i, ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data), ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1), ++ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id); ++ ++ if (i == 0) { ++ /* make up mgr_id to calm sam down */ ++ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0, ++ &obj_id->dev_id); ++ obj_id->grp_id = oc_obj_id->oid_partition_id; ++ obj_id->obj_id = oc_obj_id->oid_object_id; ++ } ++ ++ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) { /* every component must share the partition recorded from component 0 */ ++ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n", ++ __func__, i, (u64)obj_id->grp_id, ++ lo_comp->oc_object_id.oid_partition_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) { /* ... and the object id recorded from component 0 */ ++ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n", ++ __func__, i, obj_id->obj_id, ++ lo_comp->oc_object_id.oid_object_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ pan_comp->dev_id = dev_id; ++ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) { ++ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n", ++ __func__, i, pan_comp->dev_id); /* log the id that failed the check, not the artificial map id */ ++ status = -EINVAL; ++ goto err; ++ } ++ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) { ++ dprintk("%s: degraded maps not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL; ++ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) { ++ dprintk("%s: cap key security not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ ++ p = lo_comp->oc_cap.cred; ++ panfs_export_ops->sm_sec_t_unmarshall( ++ (pan_sm_sec_otw_t *)&p, ++ pan_sec, ++ buf, ++ alloc_sz, ++ NULL, ++ &local_sz); ++ buf += local_sz; ++ alloc_sz -= local_sz; ++ ++ lo_comp++; ++ pan_comp++; ++ pan_sec++; ++ } ++ ++ *outp = mcs; ++ dprintk("%s:Return mcs=%p\n", __func__, mcs); ++ return 0; ++ ++err: ++ 
objio_free_lseg(mcs); ++ dprintk("%s:Error %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Free a Panasas map and caps type ++ */ ++void ++objio_free_lseg(void *p) ++{ ++ kfree(p); ++} ++ ++/* ++ * I/O routines ++ */ ++int ++objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct panfs_shim_io_state *p; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ p = kzalloc(sizeof(*p), GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ *outp = &p->ol_state; ++ return 0; ++} ++ ++/* ++ * Free an I/O state ++ */ ++void ++objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ int i; ++ ++ dprintk("%s: freeing io_state\n", __func__); ++ for (i = 0; i < state->ol_state.nr_pages; i++) ++ kunmap(state->ol_state.pages[i]); ++ ++ if (state->ucreds) ++ panfs_export_ops->ucreds_put(state->ucreds); ++ kfree(state->sg_list); ++ kfree(state); ++} ++ ++static int ++panfs_shim_pages_to_sg( ++ struct panfs_shim_io_state *state, ++ struct page **pages, ++ unsigned int pgbase, ++ unsigned nr_pages, ++ size_t count) ++{ ++ unsigned i, n; ++ pan_sg_entry_t *sg; ++ ++ dprintk("%s pgbase %u nr_pages %u count %d " ++ "pg0 %p flags 0x%x index %llu\n", ++ __func__, pgbase, nr_pages, (int)count, pages[0], ++ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index); ++ ++ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL); ++ if (sg == NULL) ++ return -ENOMEM; ++ ++ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n", ++ __func__, sg, pages, pgbase, nr_pages); ++ ++ for (i = 0; i < nr_pages; i++) { ++ sg[i].buffer = (char *)kmap(pages[i]) + pgbase; ++ n = PAGE_SIZE - pgbase; ++ pgbase = 0; ++ if (n > count) ++ n = count; ++ sg[i].chunk_size = n; ++ count -= n; ++ if (likely(count)) { ++ sg[i].next = &sg[i+1]; ++ } else { ++ /* we're done */ ++ sg[i].next = NULL; ++ break; ++ } ++ } ++ BUG_ON(count); ++ ++ state->sg_list = sg; ++ return 0; 
++} ++ ++/* ++ * Callback function for async reads ++ */ ++static void ++panfs_shim_read_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.read.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++ status = res_p->length; ++ WARN_ON(status < 0); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_read rc %d: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_read_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.read.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP, ++ &state->u.read.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? 
++ NULL : panfs_shim_read_done, ++ state, NULL, ++ &state->u.read.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_read_done(state, NULL, &state->u.read.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Callback function for async writes ++ */ ++static void ++panfs_shim_write_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.write.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++/* state->ol_state.committed = NFS_FILE_SYNC;*/ ++ state->ol_state.committed = NFS_UNSTABLE; ++ status = res_p->length; ++ WARN_ON(status < 0); ++ ++ objlayout_add_delta_space_used(&state->ol_state, ++ res_p->delta_capacity_used); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_write rc %u: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_write_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable /* unused, PanOSD writes are stable */) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ 
state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.write.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE, ++ &state->u.write.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? ++ NULL : panfs_shim_write_done, ++ state, ++ NULL, ++ &state->u.write.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_write_done(state, NULL, &state->u.write.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++int ++panfs_shim_register(struct panfs_export_operations *ops) ++{ ++ if (panfs_export_ops) { ++ printk(KERN_INFO ++ "%s: panfs already registered (panfs ops %p)\n", ++ __func__, panfs_export_ops); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: registering panfs ops %p\n", ++ __func__, ops); ++ ++ panfs_export_ops = ops; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_register); ++ ++int ++panfs_shim_unregister(void) ++{ ++ if (!panfs_export_ops) { ++ printk(KERN_INFO "%s: panfs is not registered\n", __func__); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: unregistering panfs ops %p\n", ++ __func__, panfs_export_ops); ++ ++ panfs_export_ops = NULL; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_unregister); ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++panlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *panlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)panlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: 
++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ n *= 8; /* FIXME: until we have 2-D coalescing */ ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zd\n", __func__, maxsz); ++ return maxsz; ++} ++ ++#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024) ++#define PANLAYOUT_DEF_STRIPE_WIDTH 9 ++#define PANLAYOUT_MAX_STRIPE_WIDTH 11 ++#define PANLAYOUT_MAX_GATHER_STRIPES 8 ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++panlayout_get_blocksize(void) ++{ ++ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) * ++ PANLAYOUT_DEF_STRIPE_UNIT * ++ PANLAYOUT_MAX_GATHER_STRIPES; ++ dprintk("%s: Return %Zd\n", __func__, sz); ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations panlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. ++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = panlayout_get_stripesize, ++ .get_blocksize = panlayout_get_blocksize, ++}; ++ ++#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS) ++ ++static struct pnfs_layoutdriver_type panlayout_type = { ++ .id = PNFS_LAYOUT_PANOSD, ++ .name = "PNFS_LAYOUT_PANOSD", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &panlayout_policy_operations, ++}; ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++panlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++panlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++} ++ 
++module_init(panlayout_init); ++module_exit(panlayout_exit); +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h +--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-09-30 10:17:08.736995000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h 2010-09-30 10:17:08.738995000 -0400 +@@ -0,0 +1,482 @@ ++/* ++ * panfs_shim.h ++ * ++ * Data types and external function declarations for interfacing with ++ * panfs (Panasas DirectFlow) I/O stack ++ * ++ * Copyright (C) 2007 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#ifndef _PANLAYOUT_PANFS_SHIM_H ++#define _PANLAYOUT_PANFS_SHIM_H ++ ++typedef s8 pan_int8_t; ++typedef u8 pan_uint8_t; ++typedef s16 pan_int16_t; ++typedef u16 pan_uint16_t; ++typedef s32 pan_int32_t; ++typedef u32 pan_uint32_t; ++typedef s64 pan_int64_t; ++typedef u64 pan_uint64_t; ++ ++/* ++ * from pan_base_types.h ++ */ ++typedef pan_uint64_t pan_rpc_none_t; ++typedef pan_uint32_t pan_rpc_arrdim_t; ++typedef pan_uint32_t pan_status_t; ++typedef pan_uint8_t pan_otw_t; ++typedef pan_uint8_t pan_pad_t; ++ ++typedef pan_uint32_t pan_timespec_sec_t; ++typedef pan_uint32_t pan_timespec_nsec_t; ++ ++typedef struct pan_timespec_s pan_timespec_t; ++struct pan_timespec_s { ++ pan_timespec_sec_t ts_sec; ++ pan_timespec_nsec_t ts_nsec; ++}; ++ ++/* ++ * from pan_std_types.h ++ */ ++typedef pan_uint32_t pan_size_t; ++typedef int pan_bool_t; ++ ++/* ++ * from pan_common_error.h ++ */ ++#define PAN_SUCCESS ((pan_status_t)0) ++#define PAN_ERR_IN_PROGRESS ((pan_status_t)55) ++ ++/* ++ * from pan_sg.h ++ */ ++typedef struct pan_sg_entry_s pan_sg_entry_t; ++struct pan_sg_entry_s { ++ void *buffer; /* pointer to memory */ ++ pan_uint32_t chunk_size; /* size of each chunk (bytes) */ ++ pan_sg_entry_t *next; ++}; ++ ++/* ++ * from pan_storage.h ++ */ ++typedef pan_uint64_t pan_stor_dev_id_t; ++typedef pan_uint32_t pan_stor_obj_grp_id_t; ++typedef 
pan_uint64_t pan_stor_obj_uniq_t; ++typedef pan_uint32_t pan_stor_action_t; ++typedef pan_uint8_t pan_stor_cap_key_t[20]; ++ ++typedef pan_uint8_t pan_stor_key_type_t; ++typedef pan_uint64_t pan_stor_len_t; ++typedef pan_int64_t pan_stor_delta_len_t; ++typedef pan_uint64_t pan_stor_offset_t; ++typedef pan_uint16_t pan_stor_op_t; ++ ++typedef pan_uint16_t pan_stor_sec_level_t; ++ ++struct pan_stor_obj_id_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_obj_uniq_t obj_id; ++ pan_stor_obj_grp_id_t grp_id; ++}; ++ ++typedef struct pan_stor_obj_id_s pan_stor_obj_id_t; ++ ++#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U) ++#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U) ++#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U) ++#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U) ++#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U) ++#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U) ++#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U) ++#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U) ++ ++/* ++ * from pan_aggregation_map.h ++ */ ++typedef pan_uint8_t pan_agg_type_t; ++typedef pan_uint64_t pan_agg_map_version_t; ++typedef pan_uint8_t pan_agg_obj_state_t; ++typedef pan_uint8_t pan_agg_comp_state_t; ++typedef pan_uint8_t pan_agg_comp_flag_t; ++ ++#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00) ++#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01) ++#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02) ++#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03) ++#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04) ++#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05) ++#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06) ++#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07) ++#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00) ++#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01) ++#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02) ++#define 
PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03) ++#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00) ++#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01) ++#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02) ++#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04) ++ ++struct pan_aggregation_map_s { ++ pan_agg_map_version_t version; ++ pan_agg_obj_state_t avail_state; ++ pan_stor_obj_id_t obj_id; ++}; ++ ++typedef struct pan_aggregation_map_s pan_aggregation_map_t; ++ ++struct pan_agg_comp_obj_s { ++ pan_stor_dev_id_t dev_id; ++ pan_agg_comp_state_t avail_state; ++ pan_agg_comp_flag_t comp_flags; ++}; ++ ++typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t; ++ ++struct pan_agg_simple_header_s { ++ pan_uint8_t unused; ++}; ++ ++typedef struct pan_agg_simple_header_s pan_agg_simple_header_t; ++ ++struct pan_agg_raid1_header_s { ++ pan_uint16_t num_comps; ++}; ++ ++typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t; ++ ++struct pan_agg_raid0_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++}; ++ ++typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t; ++ ++struct pan_agg_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit0; ++ pan_uint32_t stripe_unit1; ++ pan_uint32_t stripe_unit2; ++}; ++ ++typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t; ++ ++typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t; ++ ++struct pan_agg_grp_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++ pan_uint16_t rg_width; ++ pan_uint16_t rg_depth; ++ pan_uint8_t group_layout_policy; ++}; ++ ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00) ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01) ++ ++#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00) ++#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02) ++#define PAN_AGG_RAID0 
((pan_agg_type_t) 0x03) ++#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04) ++#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06) ++#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06) ++ ++struct pan_agg_layout_hdr_s { ++ pan_agg_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_uint64_t null; ++ pan_agg_simple_header_t simple; ++ pan_agg_raid1_header_t raid1; ++ pan_agg_raid0_header_t raid0; ++ pan_agg_raid5_left_header_t raid5_left; ++ pan_agg_grp_raid5_left_header_t grp_raid5_left; ++ } hdr; ++}; ++ ++typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t; ++ ++struct pan_agg_comp_obj_a_s { ++ pan_rpc_arrdim_t size; ++ pan_agg_comp_obj_t *data; ++}; ++typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a; ++ ++struct pan_agg_full_map_s { ++ pan_aggregation_map_t map_hdr; ++ pan_agg_layout_hdr_t layout_hdr; ++ pan_agg_comp_obj_a components; ++}; ++ ++typedef struct pan_agg_full_map_s pan_agg_full_map_t; ++ ++/* ++ * from pan_obsd_rpc_types.h ++ */ ++typedef pan_uint8_t pan_obsd_security_key_a[16]; ++ ++typedef pan_uint8_t pan_obsd_capability_key_a[20]; ++ ++typedef pan_uint8_t pan_obsd_key_holder_id_t; ++ ++#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01) ++#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02) ++ ++struct pan_obsd_key_holder_s { ++ pan_obsd_key_holder_id_t select; ++ pan_pad_t pad[3]; ++ union { ++ pan_obsd_security_key_a basis_key; ++ pan_obsd_capability_key_a cap_key; ++ } key; ++}; ++ ++typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t; ++ ++/* ++ * from pan_sm_sec.h ++ */ ++typedef pan_uint8_t pan_sm_sec_type_t; ++typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t; ++ ++struct pan_obsd_capability_generic_otw_t_s { ++ pan_rpc_arrdim_t size; ++ pan_uint8_t *data; ++}; ++typedef struct pan_obsd_capability_generic_otw_t_s ++ pan_obsd_capability_generic_otw_t; ++ ++struct pan_sm_sec_obsd_s { ++ pan_obsd_key_holder_t key; ++ 
pan_obsd_capability_generic_otw_t cap_otw; ++ pan_sm_sec_otw_allo_mode_t allo_mode; ++}; ++ ++typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t; ++ ++struct pan_sm_sec_s { ++ pan_sm_sec_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_rpc_none_t none; ++ pan_sm_sec_obsd_t obsd; ++ } variant; ++}; ++ ++typedef struct pan_sm_sec_s pan_sm_sec_t; ++ ++struct pan_sm_sec_a_s { ++ pan_rpc_arrdim_t size; ++ pan_sm_sec_t *data; ++}; ++typedef struct pan_sm_sec_a_s pan_sm_sec_a; ++typedef pan_otw_t *pan_sm_sec_otw_t; ++ ++/* ++ * from pan_sm_types.h ++ */ ++typedef pan_uint64_t pan_sm_cap_handle_t; ++ ++struct pan_sm_map_cap_s { ++ pan_agg_full_map_t full_map; ++ pan_stor_offset_t offset; ++ pan_stor_len_t length; ++ pan_sm_sec_a secs; ++ pan_sm_cap_handle_t handle; ++ pan_timespec_t expiration_time; ++ pan_stor_action_t action_mask; ++ pan_uint32_t flags; ++}; ++ ++typedef struct pan_sm_map_cap_s pan_sm_map_cap_t; ++ ++/* ++ * from pan_sm_ops.h ++ */ ++typedef pan_rpc_none_t pan_sm_cache_ptr_t; ++ ++/* ++ * from pan_sam_api.h ++ */ ++typedef pan_uint32_t pan_sam_access_flags_t; ++ ++typedef struct pan_sam_dev_error_s pan_sam_dev_error_t; ++struct pan_sam_dev_error_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_op_t stor_op; ++ pan_status_t error; ++}; ++ ++typedef struct pan_sam_ext_status_s pan_sam_ext_status_t; ++struct pan_sam_ext_status_s { ++ pan_uint32_t available; ++ pan_uint32_t size; ++ pan_sam_dev_error_t *errors; ++}; ++ ++enum pan_sam_rpc_sec_sel_e { ++ PAN_SAM_RPC_SEC_DEFAULT, ++ PAN_SAM_RPC_SEC_ATLEAST, ++ PAN_SAM_RPC_SEC_EXACTLY ++}; ++typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t; ++ ++typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t; ++struct pan_sam_obj_sec_s { ++ pan_stor_sec_level_t min_security; ++ pan_sm_map_cap_t *map_ccaps; ++}; ++ ++typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t; ++struct pan_sam_rpc_sec_s { ++ pan_sam_rpc_sec_sel_t selector; ++}; ++ ++typedef struct pan_sam_read_args_s pan_sam_read_args_t; ++struct 
pan_sam_read_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ void *return_attr; ++ void *checksum; ++ pan_stor_offset_t offset; ++ pan_uint16_t sm_options; ++ void *callout; ++ void *callout_arg; ++}; ++ ++typedef struct pan_sam_read_res_s pan_sam_read_res_t; ++struct pan_sam_read_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ void *attr; ++ void *checksum; ++}; ++ ++typedef void (*pan_sam_read_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t status); ++ ++#define PAN_SAM_ACCESS_NONE 0x0000 ++#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020 ++ ++typedef struct pan_sam_write_args_s pan_sam_write_args_t; ++struct pan_sam_write_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ pan_stor_offset_t offset; ++ void *attr; ++ void *return_attr; ++}; ++ ++typedef struct pan_sam_write_res_s pan_sam_write_res_t; ++struct pan_sam_write_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ pan_stor_delta_len_t delta_capacity_used; ++ pan_bool_t parity_dirty; ++ void *attr; ++}; ++ ++typedef void (*pan_sam_write_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t status); ++ ++/* ++ * from pan_mgr_types.h ++ */ ++#define PAN_MGR_ID_TYPE_SHIFT 56 ++#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL) ++#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL) ++ ++typedef pan_uint16_t pan_mgr_type_t; ++typedef pan_uint64_t pan_mgr_id_t; ++ ++#define PAN_MGR_SM ((pan_mgr_type_t) 2U) ++#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U) ++ ++/* ++ * from pan_mgr_types_c.h ++ */ ++#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \ ++ pan_mgr_id_t _id1, _id2; \ ++\ ++ _id1 = (_mgr_type_); \ ++ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \ ++ _id1 &= PAN_MGR_ID_TYPE_MASK; \ ++ _id2 = (_mgr_uniq_); \ ++ _id2 &= PAN_MGR_ID_UNIQ_MASK; \ 
++ _id1 |= _id2; \ ++ *(_mgr_id_p_) = _id1; \ ++} ++ ++/* ++ * from pan_storage_c.h ++ */ ++#define pan_stor_is_device_id_an_obsd_id(_device_id_) \ ++ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \ ++ == PAN_MGR_OBSD) ++ ++/* ++ * pnfs_shim internal definitions ++ */ ++ ++struct panfs_shim_io_state { ++ struct objlayout_io_state ol_state; ++ ++ pan_sg_entry_t *sg_list; ++ pan_sam_obj_sec_t obj_sec; ++ void *ucreds; ++ union { ++ struct { ++ pan_sam_read_args_t args; ++ pan_sam_read_res_t res; ++ } read; ++ struct { ++ pan_sam_write_args_t args; ++ pan_sam_write_res_t res; ++ } write; ++ } u; ++}; ++ ++#endif /* _PANLAYOUT_PANFS_SHIM_H */ +diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +--- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-09-30 10:17:08.741996000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-09-30 10:17:08.743002000 -0400 +@@ -0,0 +1,435 @@ ++/* ++ * pnfs_osd_xdr.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* ++ * The following implementation is based on these Internet Drafts: ++ * ++ * draft-ietf-nfsv4-minorversion-21 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid) ++{ ++ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data)); ++ READ64(objid->oid_partition_id); ++ READ64(objid->oid_object_id); ++ return p; ++} ++ ++static inline u32 * ++pnfs_osd_xdr_decode_opaque_cred(u32 *p, ++ struct pnfs_osd_opaque_cred *opaque_cred) ++{ ++ READ32(opaque_cred->cred_len); ++ COPYMEM(opaque_cred->cred, opaque_cred->cred_len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct 
pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp, ++ u8 **credp) ++{ ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id); ++ READ32(comp->oc_osd_version); ++ READ32(comp->oc_cap_key_sec); ++ ++ cred = *credp; ++ comp->oc_cap_key.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len)); ++ comp->oc_cap.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len)); ++ *credp = cred; ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map) ++{ ++ READ32(data_map->odm_num_comps); ++ READ64(data_map->odm_stripe_unit); ++ READ32(data_map->odm_group_width); ++ READ32(data_map->odm_group_depth); ++ READ32(data_map->odm_mirror_cnt); ++ READ32(data_map->odm_raid_algorithm); ++ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " ++ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", ++ __func__, ++ data_map->odm_num_comps, ++ (unsigned long long)data_map->odm_stripe_unit, ++ data_map->odm_group_width, ++ data_map->odm_group_depth, ++ data_map->odm_mirror_cnt, ++ data_map->odm_raid_algorithm); ++ return p; ++} ++ ++struct pnfs_osd_layout * ++pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p) ++{ ++ int i; ++ u32 *start = p; ++ struct pnfs_osd_object_cred *comp; ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map); ++ READ32(layout->olo_comps_index); ++ READ32(layout->olo_num_comps); ++ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1); ++ comp = 
layout->olo_comps; ++ cred = (u8 *)(comp + layout->olo_num_comps); ++ dprintk("%s: comps_index=%u num_comps=%u\n", ++ __func__, layout->olo_comps_index, layout->olo_num_comps); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred); ++ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "key_len=%u cap_len=%u\n", ++ __func__, i, ++ _DEVID_LO(&comp->oc_object_id.oid_device_id), ++ _DEVID_HI(&comp->oc_object_id.oid_device_id), ++ comp->oc_object_id.oid_partition_id, ++ comp->oc_object_id.oid_object_id, ++ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); ++ comp++; ++ } ++ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__, ++ (char *)p - (char *)start, cred, (char *)cred - (char *)layout); ++ return layout; ++} ++ ++/* ++ * Get Device Information Decoding ++ * ++ * Note: since Device Information is currently done synchronously, most ++ * of the actual fields are left inside the rpc buffer and are only ++ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer ++ * should not be freed while the returned information is in use. 
++ */ ++ ++u32 *__xdr_read_calc_nfs4_string( ++ u32 *p, struct nfs4_string *str, u8 **freespace) ++{ ++ u32 len; ++ char *data; ++ bool need_copy; ++ ++ READ32(len); ++ data = (char *)p; ++ ++ if (data[len]) { /* Not null terminated we'll need extra space */ ++ data = *freespace; ++ *freespace += len + 1; ++ need_copy = true; ++ } else { ++ need_copy = false; ++ } ++ ++ if (str) { ++ str->len = len; ++ str->data = data; ++ if (need_copy) { ++ memcpy(data, p, len); ++ data[len] = 0; ++ } ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++u32 *__xdr_read_calc_u8_opaque( ++ u32 *p, struct nfs4_string *str) ++{ ++ u32 len; ++ ++ READ32(len); ++ ++ if (str) { ++ str->len = len; ++ str->data = (char *)p; ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetid { ++ * u32 oti_type; ++ * struct nfs4_string oti_scsi_device_id; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetid( ++ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace) ++{ ++ u32 oti_type; ++ ++ READ32(oti_type); ++ if (targetid) ++ targetid->oti_type = oti_type; ++ ++ switch (oti_type) { ++ case OBJ_TARGET_SCSI_NAME: ++ case OBJ_TARGET_SCSI_DEVICE_ID: ++ p = __xdr_read_calc_u8_opaque(p, ++ targetid ? &targetid->oti_scsi_device_id : NULL); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_net_addr { ++ * struct nfs4_string r_netid; ++ * struct nfs4_string r_addr; ++ * }; ++ */ ++u32 *__xdr_read_calc_net_addr( ++ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace) ++{ ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_netid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? 
&netaddr->r_addr : NULL, ++ freespace); ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetaddr { ++ * u32 ota_available; ++ * struct pnfs_osd_net_addr ota_netaddr; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetaddr( ++ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace) ++{ ++ u32 ota_available; ++ ++ READ32(ota_available); ++ if (targetaddr) ++ targetaddr->ota_available = ota_available; ++ ++ if (ota_available) { ++ p = __xdr_read_calc_net_addr(p, ++ targetaddr ? &targetaddr->ota_netaddr : NULL, ++ freespace); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++u32 *__xdr_read_calc_deviceaddr( ++ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace) ++{ ++ p = __xdr_read_calc_targetid(p, ++ deviceaddr ? &deviceaddr->oda_targetid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_targetaddr(p, ++ deviceaddr ? &deviceaddr->oda_targetaddr : NULL, ++ freespace); ++ ++ if (deviceaddr) ++ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun)); ++ else ++ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun)); ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? &deviceaddr->oda_systemid : NULL); ++ ++ if (deviceaddr) { ++ p = pnfs_osd_xdr_decode_object_cred(p, ++ &deviceaddr->oda_root_obj_cred, freespace); ++ } else { ++ *freespace += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? 
&deviceaddr->oda_osdname : NULL); ++ ++ return p; ++} ++ ++size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p) ++{ ++ u8 *null_freespace = NULL; ++ size_t sz; ++ ++ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace); ++ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace; ++ ++ return sz; ++} ++ ++void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p) ++{ ++ u8 *freespace = (u8 *)(deviceaddr + 1); ++ ++ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace); ++} ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou) ++{ ++ __be32 *p = xdr_reserve_space(xdr, 16); ++ ++ if (!p) ++ return -E2BIG; ++ ++ *p++ = cpu_to_be32(lou->dsu_valid); ++ if (lou->dsu_valid) ++ p = xdr_encode_hyper(p, lou->dsu_delta); ++ *p++ = cpu_to_be32(lou->olu_ioerr_flag); ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ */ ++static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p; ++ ++ p = xdr_reserve_space(xdr, 32); ++ if (!p) ++ return -E2BIG; ++ ++ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, ++ sizeof(object_id->oid_device_id.data)); ++ p = xdr_encode_hyper(p, object_id->oid_partition_id); ++ p = xdr_encode_hyper(p, object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, ++ struct pnfs_osd_ioerr *ioerr) ++{ ++ __be32 *p; ++ int ret; ++ ++ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component); ++ if (ret) ++ return ret; ++ ++ p = xdr_reserve_space(xdr, 24); ++ if 
(!p) ++ return -E2BIG; ++ ++ p = xdr_encode_hyper(p, ioerr->oer_comp_offset); ++ p = xdr_encode_hyper(p, ioerr->oer_comp_length); ++ *p++ = cpu_to_be32(ioerr->oer_iswrite); ++ *p = cpu_to_be32(ioerr->oer_errno); ++ ++ return 0; ++} +diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c +--- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-09-30 10:15:17.899715000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-09-30 10:17:08.748995000 -0400 +@@ -20,6 +20,7 @@ + #include + + #include "internal.h" ++#include "pnfs.h" + + static struct kmem_cache *nfs_page_cachep; + +@@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p) + struct nfs_page * + nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + +@@ -79,7 +81,11 @@ nfs_create_request(struct nfs_open_conte + req->wb_pgbase = offset; + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); ++ req->wb_lock_context = nfs_get_lock_context(ctx); + kref_init(&req->wb_kref); ++ req->wb_lseg = lseg; ++ if (lseg) ++ get_lseg(lseg); + return req; + } + +@@ -141,18 +147,26 @@ void nfs_clear_request(struct nfs_page * + { + struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; ++ struct nfs_lock_context *l_ctx = req->wb_lock_context; + + if (page != NULL) { + page_cache_release(page); + req->wb_page = NULL; + } ++ if (l_ctx != NULL) { ++ nfs_put_lock_context(l_ctx); ++ req->wb_lock_context = NULL; ++ } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } ++ if (req->wb_lseg != NULL) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } + } + +- + /** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release +@@ -231,11 +245,12 @@ void nfs_pageio_init(struct nfs_pageio_d + * Return 'true' 
if this is the case, else return 'false'. + */ + static int nfs_can_coalesce_requests(struct nfs_page *prev, +- struct nfs_page *req) ++ struct nfs_page *req, ++ struct nfs_pageio_descriptor *pgio) + { + if (req->wb_context->cred != prev->wb_context->cred) + return 0; +- if (req->wb_context->lockowner != prev->wb_context->lockowner) ++ if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) + return 0; + if (req->wb_context->state != prev->wb_context->state) + return 0; +@@ -245,6 +260,12 @@ static int nfs_can_coalesce_requests(str + return 0; + if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return 0; ++ if (req->wb_lseg != prev->wb_lseg) ++ return 0; ++#ifdef CONFIG_NFS_V4_1 ++ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) ++ return 0; ++#endif /* CONFIG_NFS_V4_1 */ + return 1; + } + +@@ -277,7 +298,7 @@ static int nfs_pageio_do_add_request(str + if (newlen > desc->pg_bsize) + return 0; + prev = nfs_list_entry(desc->pg_list.prev); +- if (!nfs_can_coalesce_requests(prev, req)) ++ if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + } else + desc->pg_base = req->wb_pgbase; +@@ -366,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for ++ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver + * + * Moves elements from one of the inode request lists. 
+ * If the number of requests is set to 0, the entire address_space +@@ -375,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs + */ + int nfs_scan_list(struct nfs_inode *nfsi, + struct list_head *dst, pgoff_t idx_start, +- unsigned int npages, int tag) ++ unsigned int npages, int tag, int *use_pnfs) + { + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; +@@ -406,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); + nfs_list_add_request(req, dst); ++ if (req->wb_lseg) ++ *use_pnfs = 1; + res++; + if (res == INT_MAX) + goto out; +diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c +--- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-09-30 10:17:08.752997000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-09-30 10:17:08.754995000 -0400 +@@ -0,0 +1,2039 @@ ++/* ++ * linux/fs/nfs/pnfs.c ++ * ++ * pNFS functions to call and manage layout drivers. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "internal.h" ++#include "nfs4_fs.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS ++ ++#define MIN_POOL_LC (4) ++ ++static int pnfs_initialized; ++ ++static void pnfs_free_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range); ++static inline void get_layout(struct pnfs_layout_hdr *lo); ++ ++/* Locking: ++ * ++ * pnfs_spinlock: ++ * protects pnfs_modules_tbl. ++ */ ++static spinlock_t pnfs_spinlock = __SPIN_LOCK_UNLOCKED(pnfs_spinlock); ++ ++/* ++ * pnfs_modules_tbl holds all pnfs modules ++ */ ++static struct list_head pnfs_modules_tbl; ++static struct kmem_cache *pnfs_cachep; ++static mempool_t *pnfs_layoutcommit_mempool; ++ ++static inline struct nfs4_layoutcommit_data *pnfs_layoutcommit_alloc(void) ++{ ++ struct nfs4_layoutcommit_data *p = ++ mempool_alloc(pnfs_layoutcommit_mempool, GFP_NOFS); ++ if (p) ++ memset(p, 0, sizeof(*p)); ++ ++ return p; ++} ++ ++void pnfs_layoutcommit_free(struct nfs4_layoutcommit_data *p) ++{ ++ mempool_free(p, pnfs_layoutcommit_mempool); ++} ++ ++/* ++ * struct pnfs_module - One per pNFS device module. 
++ */ ++struct pnfs_module { ++ struct pnfs_layoutdriver_type *pnfs_ld_type; ++ struct list_head pnfs_tblid; ++}; ++ ++int ++pnfs_initialize(void) ++{ ++ INIT_LIST_HEAD(&pnfs_modules_tbl); ++ ++ pnfs_cachep = kmem_cache_create("nfs4_layoutcommit_data", ++ sizeof(struct nfs4_layoutcommit_data), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (pnfs_cachep == NULL) ++ return -ENOMEM; ++ ++ pnfs_layoutcommit_mempool = mempool_create(MIN_POOL_LC, ++ mempool_alloc_slab, ++ mempool_free_slab, ++ pnfs_cachep); ++ if (pnfs_layoutcommit_mempool == NULL) { ++ kmem_cache_destroy(pnfs_cachep); ++ return -ENOMEM; ++ } ++ ++ pnfs_initialized = 1; ++ return 0; ++} ++ ++void pnfs_uninitialize(void) ++{ ++ mempool_destroy(pnfs_layoutcommit_mempool); ++ kmem_cache_destroy(pnfs_cachep); ++} ++ ++/* search pnfs_modules_tbl for right pnfs module */ ++static int ++find_pnfs(u32 id, struct pnfs_module **module) { ++ struct pnfs_module *local = NULL; ++ ++ dprintk("PNFS: %s: Searching for %u\n", __func__, id); ++ list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) { ++ if (local->pnfs_ld_type->id == id) { ++ *module = local; ++ return(1); ++ } ++ } ++ return 0; ++} ++ ++/* Set cred to indicate we require a layoutcommit ++ * If we don't even have a layout, we don't need to commit it. ++ */ ++void ++pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx) ++{ ++ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (has_layout(nfsi) && ++ !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state)) { ++ nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred); ++ __set_bit(NFS_INO_LAYOUTCOMMIT, ++ &nfsi->layout->state); ++ nfsi->change_attr++; ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s: Set layoutcommit\n", __func__); ++ return; ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Update last_write_offset for layoutcommit. 
++ * TODO: We should only use committed extents, but the current nfs ++ * implementation does not calculate the written range in nfs_commit_done. ++ * We therefore update this field in writeback_done. ++ */ ++void ++pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent) ++{ ++ loff_t end_pos; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (offset < nfsi->layout->write_begin_pos) ++ nfsi->layout->write_begin_pos = offset; ++ end_pos = offset + extent - 1; /* I'm being inclusive */ ++ if (end_pos > nfsi->layout->write_end_pos) ++ nfsi->layout->write_end_pos = end_pos; ++ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n", ++ __func__, ++ (unsigned long) extent, ++ (unsigned long) offset , ++ (unsigned long) nfsi->layout->write_begin_pos, ++ (unsigned long) nfsi->layout->write_end_pos); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Uninitialize a mountpoint in a layout driver */ ++void ++unmount_pnfs_layoutdriver(struct nfs_server *nfss) ++{ ++ if (PNFS_EXISTS_LDIO_OP(nfss, uninitialize_mountpoint)) ++ nfss->pnfs_curr_ld->ld_io_ops->uninitialize_mountpoint(nfss); ++} ++ ++/* ++ * Set the server pnfs module to the registered layout driver matching id. ++ * Only one pNFS layout driver is supported. ++ */ ++void ++set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, ++ u32 id) ++{ ++ struct pnfs_module *mod = NULL; ++ ++ if (server->pnfs_curr_ld) ++ return; ++ ++ if (!find_pnfs(id, &mod)) { ++ request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); ++ find_pnfs(id, &mod); ++ } ++ ++ if (!mod) { ++ dprintk("%s: No pNFS module found for %u. ", __func__, id); ++ goto out_err; ++ } ++ ++ server->pnfs_curr_ld = mod->pnfs_ld_type; ++ if (mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint( ++ server, mntfh)) { ++ printk(KERN_ERR "%s: Error initializing mount point " ++ "for layout driver %u. 
", __func__, id); ++ goto out_err; ++ } ++ ++ dprintk("%s: pNFS module for %u set\n", __func__, id); ++ return; ++ ++out_err: ++ dprintk("Using NFSv4 I/O\n"); ++ server->pnfs_curr_ld = NULL; ++} ++ ++/* Allow I/O module to set its functions structure */ ++struct pnfs_client_operations* ++pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) ++{ ++ struct pnfs_module *pnfs_mod; ++ struct layoutdriver_io_operations *io_ops = ld_type->ld_io_ops; ++ ++ if (!pnfs_initialized) { ++ printk(KERN_ERR "%s Registration failure. " ++ "pNFS not initialized.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops || !io_ops->alloc_layout || !io_ops->free_layout) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_layout and free_layout.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->alloc_lseg || !io_ops->free_lseg) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_lseg and free_lseg.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->read_pagelist || !io_ops->write_pagelist || ++ !io_ops->commit) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "read_pagelist, write_pagelist, and commit.\n", ++ __func__); ++ return NULL; ++ } ++ ++ pnfs_mod = kmalloc(sizeof(struct pnfs_module), GFP_KERNEL); ++ if (pnfs_mod != NULL) { ++ dprintk("%s Registering id:%u name:%s\n", ++ __func__, ++ ld_type->id, ++ ld_type->name); ++ pnfs_mod->pnfs_ld_type = ld_type; ++ INIT_LIST_HEAD(&pnfs_mod->pnfs_tblid); ++ ++ spin_lock(&pnfs_spinlock); ++ list_add(&pnfs_mod->pnfs_tblid, &pnfs_modules_tbl); ++ spin_unlock(&pnfs_spinlock); ++ } ++ ++ return &pnfs_ops; ++} ++ ++/* Allow I/O module to set its functions structure */ ++void ++pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) ++{ ++ struct pnfs_module *pnfs_mod; ++ ++ if (find_pnfs(ld_type->id, &pnfs_mod)) { ++ dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); ++ spin_lock(&pnfs_spinlock); ++ list_del(&pnfs_mod->pnfs_tblid); ++ spin_unlock(&pnfs_spinlock); ++ 
kfree(pnfs_mod); ++ } ++} ++ ++/* ++ * pNFS client layout cache ++ */ ++#if defined(CONFIG_SMP) ++#define BUG_ON_UNLOCKED_INO(ino) \ ++ BUG_ON(!spin_is_locked(&ino->i_lock)) ++#define BUG_ON_UNLOCKED_LO(lo) \ ++ BUG_ON_UNLOCKED_INO(PNFS_INODE(lo)) ++#else /* CONFIG_SMP */ ++#define BUG_ON_UNLOCKED_INO(lo) do {} while (0) ++#define BUG_ON_UNLOCKED_LO(lo) do {} while (0) ++#endif /* CONFIG_SMP */ ++ ++static inline void ++get_layout(struct pnfs_layout_hdr *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ lo->refcount++; ++} ++ ++static inline void ++put_layout_locked(struct pnfs_layout_hdr *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ BUG_ON(lo->refcount <= 0); ++ ++ lo->refcount--; ++ if (!lo->refcount) { ++ struct layoutdriver_io_operations *io_ops = PNFS_LD_IO_OPS(lo); ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ dprintk("%s: freeing layout cache %p\n", __func__, lo); ++ WARN_ON(!list_empty(&lo->layouts)); ++ io_ops->free_layout(lo); ++ nfsi->layout = NULL; ++ } ++} ++ ++void ++put_layout(struct inode *inode) ++{ ++ spin_lock(&inode->i_lock); ++ put_layout_locked(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++ ++} ++ ++void ++pnfs_layout_release(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (range) ++ pnfs_free_layout(lo, range); ++ /* ++ * Matched in _pnfs_update_layout for layoutget ++ * and by get_layout in _pnfs_return_layout for layoutreturn ++ */ ++ put_layout_locked(lo); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ wake_up_all(&nfsi->lo_waitq); ++} ++ ++void ++pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_ANY, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ lo = nfsi->layout; ++ if (lo) { ++ pnfs_free_layout(lo, &range); ++ WARN_ON(!list_empty(&nfsi->layout->segs)); ++ 
WARN_ON(!list_empty(&nfsi->layout->layouts)); ++ ++ if (nfsi->layout->refcount != 1) ++ printk(KERN_WARNING "%s: layout refcount not=1 %d\n", ++ __func__, nfsi->layout->refcount); ++ WARN_ON(nfsi->layout->refcount != 1); ++ ++ /* Matched by refcount set to 1 in alloc_init_layout */ ++ put_layout_locked(lo); ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* ++ * Called by the state manager to remove all layouts established under an ++ * expired lease. ++ */ ++void ++pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++ struct pnfs_layout_hdr *lo; ++ ++ while (!list_empty(&clp->cl_layouts)) { ++ lo = list_entry(clp->cl_layouts.next, struct pnfs_layout_hdr, ++ layouts); ++ dprintk("%s freeing layout for inode %lu\n", __func__, ++ lo->inode->i_ino); ++ pnfs_destroy_layout(NFS_I(lo->inode)); ++ } ++} ++ ++static inline void ++init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) ++{ ++ INIT_LIST_HEAD(&lseg->fi_list); ++ kref_init(&lseg->kref); ++ lseg->valid = true; ++ lseg->layout = lo; ++} ++ ++static void ++destroy_lseg(struct kref *kref) ++{ ++ struct pnfs_layout_segment *lseg = ++ container_of(kref, struct pnfs_layout_segment, kref); ++ ++ dprintk("--> %s\n", __func__); ++ /* Matched by get_layout in pnfs_insert_layout */ ++ put_layout_locked(lseg->layout); ++ PNFS_LD_IO_OPS(lseg->layout)->free_lseg(lseg); ++} ++ ++static void ++put_lseg_locked(struct pnfs_layout_segment *lseg) ++{ ++ bool do_wake_up; ++ struct nfs_inode *nfsi; ++ ++ if (!lseg) ++ return; ++ ++ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, ++ atomic_read(&lseg->kref.refcount), lseg->valid); ++ do_wake_up = !lseg->valid; ++ nfsi = PNFS_NFS_INODE(lseg->layout); ++ kref_put(&lseg->kref, destroy_lseg); ++ if (do_wake_up) ++ wake_up(&nfsi->lo_waitq); ++} ++ ++void ++put_lseg(struct pnfs_layout_segment *lseg) ++{ ++ bool do_wake_up; ++ struct nfs_inode *nfsi; ++ ++ if (!lseg) ++ return; ++ ++ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, ++ 
atomic_read(&lseg->kref.refcount), lseg->valid); ++ do_wake_up = !lseg->valid; ++ nfsi = PNFS_NFS_INODE(lseg->layout); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ kref_put(&lseg->kref, destroy_lseg); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ if (do_wake_up) ++ wake_up(&nfsi->lo_waitq); ++} ++EXPORT_SYMBOL(put_lseg); ++ ++void get_lseg(struct pnfs_layout_segment *lseg) ++{ ++ kref_get(&lseg->kref); ++} ++EXPORT_SYMBOL(get_lseg); ++ ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end: NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1: NFS4_MAX_UINT64; ++} ++ ++/* ++ * is l2 fully contained in l1? ++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_contained(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (start1 <= start2) && (end1 >= end2); ++} ++ ++/* ++ * are l1 and l2 intersecting? 
++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_intersecting(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (end1 == NFS4_MAX_UINT64 || end1 > start2) && ++ (end2 == NFS4_MAX_UINT64 || end2 > start1); ++} ++ ++void ++pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, ++ const nfs4_stateid *stateid) ++{ ++ write_seqlock(&lo->seqlock); ++ memcpy(lo->stateid.u.data, stateid->u.data, sizeof(lo->stateid.u.data)); ++ write_sequnlock(&lo->seqlock); ++} ++ ++void ++pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ do { ++ seq = read_seqbegin(&lo->seqlock); ++ memcpy(dst->u.data, lo->stateid.u.data, ++ sizeof(lo->stateid.u.data)); ++ } while (read_seqretry(&lo->seqlock, seq)); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void ++pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, ++ struct nfs4_state *state) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ write_seqlock(&lo->seqlock); ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) ++ do { ++ seq = read_seqbegin(&state->seqlock); ++ memcpy(lo->stateid.u.data, state->stateid.u.data, ++ sizeof(state->stateid.u.data)); ++ } while (read_seqretry(&state->seqlock, seq)); ++ write_sequnlock(&lo->seqlock); ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++* Get layout from server. ++* for now, assume that whole file layouts are requested. 
++* arg->offset: 0 ++* arg->length: all ones ++*/ ++static int ++send_layoutget(struct inode *ino, ++ struct nfs_open_context *ctx, ++ struct pnfs_layout_range *range, ++ struct pnfs_layout_segment **lsegpp, ++ struct pnfs_layout_hdr *lo) ++{ ++ int status; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct nfs4_layoutget *lgp; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); ++ if (lgp == NULL) { ++ pnfs_layout_release(lo, NULL); ++ return -ENOMEM; ++ } ++ lgp->args.minlength = NFS4_MAX_UINT64; ++ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; ++ lgp->args.range.iomode = range->iomode; ++ lgp->args.range.offset = 0; ++ lgp->args.range.length = NFS4_MAX_UINT64; ++ lgp->args.type = server->pnfs_curr_ld->id; ++ lgp->args.inode = ino; ++ lgp->lsegpp = lsegpp; ++ ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) { ++ struct nfs_open_context *oldctx = ctx; ++ ++ if (!oldctx) { ++ ctx = nfs_find_open_context(ino, NULL, ++ (range->iomode == IOMODE_READ) ? 
++ FMODE_READ: FMODE_WRITE); ++ BUG_ON(!ctx); ++ } ++ /* Set the layout stateid from the open stateid */ ++ pnfs_layout_from_open_stateid(NFS_I(ino)->layout, ctx->state); ++ if (!oldctx) ++ put_nfs_open_context(ctx); ++ } ++ ++ /* Retrieve layout information from server */ ++ status = nfs4_proc_layoutget(lgp); ++ ++ dprintk("<-- %s status %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW false ++ */ ++static inline int ++should_free_lseg(struct pnfs_layout_segment *lseg, ++ struct pnfs_layout_range *range) ++{ ++ return (range->iomode == IOMODE_ANY || ++ lseg->range.iomode == range->iomode) && ++ lo_seg_intersecting(&lseg->range, range); ++} ++ ++static struct pnfs_layout_segment * ++has_layout_to_return(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *out = NULL, *lseg; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) ++ if (should_free_lseg(lseg, range)) { ++ out = lseg; ++ break; ++ } ++ ++ dprintk("%s:Return lseg=%p\n", __func__, out); ++ return out; ++} ++ ++static inline bool ++_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg) ++{ ++ return atomic_read(&lseg->kref.refcount) == 1; ++} ++ ++ ++static void ++pnfs_free_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *lseg, *next; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry_safe (lseg, next, &lo->segs, fi_list) { ++ if (!should_free_lseg(lseg, range) || ++ !_pnfs_can_return_lseg(lseg)) ++ continue; ++ dprintk("%s: freeing lseg %p 
iomode %d " ++ "offset %llu length %llu\n", __func__, ++ lseg, lseg->range.iomode, lseg->range.offset, ++ lseg->range.length); ++ list_del(&lseg->fi_list); ++ put_lseg_locked(lseg); ++ } ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp; ++ ++ clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ spin_lock(&clp->cl_lock); ++ list_del_init(&lo->layouts); ++ spin_unlock(&clp->cl_lock); ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ } ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++static bool ++pnfs_return_layout_barrier(struct nfs_inode *nfsi, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *lseg; ++ bool ret = false; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) { ++ if (!should_free_lseg(lseg, range)) ++ continue; ++ lseg->valid = false; ++ if (!_pnfs_can_return_lseg(lseg)) { ++ dprintk("%s: wait on lseg %p refcount %d\n", ++ __func__, lseg, ++ atomic_read(&lseg->kref.refcount)); ++ ret = true; ++ } ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s:Return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++return_layout(struct inode *ino, struct pnfs_layout_range *range, ++ enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo, ++ bool wait) ++{ ++ struct nfs4_layoutreturn *lrp; ++ struct nfs_server *server = NFS_SERVER(ino); ++ int status = -ENOMEM; ++ ++ dprintk("--> %s\n", __func__); ++ ++ BUG_ON(type != RETURN_FILE); ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (lrp == NULL) { ++ if (lo && (type == RETURN_FILE)) ++ pnfs_layout_release(lo, NULL); ++ goto out; ++ } ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = server->pnfs_curr_ld->id; ++ lrp->args.return_type = type; ++ lrp->args.range = *range; ++ lrp->args.inode = ino; ++ ++ status = nfs4_proc_layoutreturn(lrp, wait); ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++} ++ ++int ++_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, ++ const 
nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct pnfs_layout_hdr *lo = NULL; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_range arg; ++ int status = 0; ++ ++ dprintk("--> %s type %d\n", __func__, type); ++ ++ ++ arg.iomode = range ? range->iomode : IOMODE_ANY; ++ arg.offset = 0; ++ arg.length = NFS4_MAX_UINT64; ++ ++ if (type == RETURN_FILE) { ++ spin_lock(&ino->i_lock); ++ lo = nfsi->layout; ++ if (lo && !has_layout_to_return(lo, &arg)) { ++ lo = NULL; ++ } ++ if (!lo) { ++ spin_unlock(&ino->i_lock); ++ dprintk("%s: no layout segments to return\n", __func__); ++ goto out; ++ } ++ ++ /* Reference for layoutreturn matched in pnfs_layout_release */ ++ get_layout(lo); ++ ++ spin_unlock(&ino->i_lock); ++ ++ if (pnfs_return_layout_barrier(nfsi, &arg)) { ++ if (stateid) { /* callback */ ++ status = -EAGAIN; ++ goto out_put; ++ } ++ dprintk("%s: waiting\n", __func__); ++ wait_event(nfsi->lo_waitq, ++ !pnfs_return_layout_barrier(nfsi, &arg)); ++ } ++ ++ if (layoutcommit_needed(nfsi)) { ++ if (stateid && !wait) { /* callback */ ++ dprintk("%s: layoutcommit pending\n", __func__); ++ status = -EAGAIN; ++ goto out_put; ++ } ++ status = pnfs_layoutcommit_inode(ino, wait); ++ if (status) { ++ /* Return layout even if layoutcommit fails */ ++ dprintk("%s: layoutcommit failed, status=%d. 
" ++ "Returning layout anyway\n", ++ __func__, status); ++ } ++ } ++ ++ if (!stateid) ++ status = return_layout(ino, &arg, type, lo, wait); ++ else ++ pnfs_layout_release(lo, &arg); ++ } ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++out_put: ++ put_layout(ino); ++ goto out; ++} ++ ++/* ++ * cmp two layout segments for sorting into layout cache ++ */ ++static inline s64 ++cmp_layout(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ s64 d; ++ ++ /* higher offset > lower offset */ ++ d = l1->offset - l2->offset; ++ if (d) ++ return d; ++ ++ /* longer length > shorter length */ ++ d = l1->length - l2->length; ++ if (d) ++ return d; ++ ++ /* read > read/write */ ++ return (int)(l1->iomode == IOMODE_READ) - ++ (int)(l2->iomode == IOMODE_READ); ++} ++ ++static void ++pnfs_insert_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct pnfs_layout_segment *lp; ++ int found = 0; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ ++ spin_lock(&clp->cl_lock); ++ BUG_ON(!list_empty(&lo->layouts)); ++ list_add_tail(&lo->layouts, &clp->cl_layouts); ++ spin_unlock(&clp->cl_lock); ++ } ++ list_for_each_entry (lp, &lo->segs, fi_list) { ++ if (cmp_layout(&lp->range, &lseg->range) > 0) ++ continue; ++ list_add_tail(&lseg->fi_list, &lp->fi_list); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu before " ++ "lp %p iomode %d offset %llu length %llu\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length, ++ lp, lp->range.iomode, lp->range.offset, ++ lp->range.length); ++ found = 1; ++ break; ++ } ++ if (!found) { ++ list_add_tail(&lseg->fi_list, &lo->segs); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu at tail\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length); ++ } ++ 
get_layout(lo); ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++/* ++ * Each layoutdriver embeds pnfs_layout_hdr as the first field in its ++ * per-layout type layout cache structure and returns it ZEROed ++ * from layoutdriver_io_ops->alloc_layout ++ */ ++static struct pnfs_layout_hdr * ++alloc_init_layout(struct inode *ino) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct layoutdriver_io_operations *io_ops; ++ ++ io_ops = NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops; ++ lo = io_ops->alloc_layout(ino); ++ if (!lo) { ++ printk(KERN_ERR ++ "%s: out of memory: io_ops->alloc_layout failed\n", ++ __func__); ++ return NULL; ++ } ++ lo->refcount = 1; ++ INIT_LIST_HEAD(&lo->layouts); ++ INIT_LIST_HEAD(&lo->segs); ++ seqlock_init(&lo->seqlock); ++ lo->inode = ino; ++ return lo; ++} ++ ++/* ++ * Retrieve and possibly allocate the inode layout ++ * ++ * ino->i_lock must be taken by the caller. ++ */ ++static struct pnfs_layout_hdr * ++pnfs_alloc_layout(struct inode *ino) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_hdr *new = NULL; ++ ++ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); ++ ++ BUG_ON_UNLOCKED_INO(ino); ++ if (likely(nfsi->layout)) ++ return nfsi->layout; ++ ++ spin_unlock(&ino->i_lock); ++ new = alloc_init_layout(ino); ++ spin_lock(&ino->i_lock); ++ ++ if (likely(nfsi->layout == NULL)) { /* Won the race?
*/ ++ nfsi->layout = new; ++ } else if (new) { ++ /* Reference the layout across i_lock release and grab */ ++ get_layout(nfsi->layout); ++ spin_unlock(&ino->i_lock); ++ NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops->free_layout(new); ++ spin_lock(&ino->i_lock); ++ put_layout_locked(nfsi->layout); ++ } ++ return nfsi->layout; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW true ++ */ ++static inline int ++has_matching_lseg(struct pnfs_layout_segment *lseg, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_range range1; ++ ++ if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) || ++ !lo_seg_intersecting(&lseg->range, range)) ++ return 0; ++ ++ /* range1 covers only the first byte in the range */ ++ range1 = *range; ++ range1.length = 1; ++ return lo_seg_contained(&lseg->range, &range1); ++} ++ ++/* ++ * lookup range in layout; returns the first matching lseg (referenced if take_ref) or NULL ++ */ ++static struct pnfs_layout_segment * ++pnfs_has_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range, ++ bool take_ref, ++ bool only_valid) ++{ ++ struct pnfs_layout_segment *lseg, *ret = NULL; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) { ++ if (has_matching_lseg(lseg, range) && ++ (lseg->valid || !only_valid)) { ++ ret = lseg; ++ if (take_ref) ++ get_lseg(ret); ++ break; ++ } ++ if (cmp_layout(range, &lseg->range) > 0) ++ break; ++ } ++ ++ dprintk("%s:Return lseg %p take_ref %d ref %d valid %d\n", ++ __func__, ret, take_ref, ++ ret ? atomic_read(&ret->kref.refcount) : 0, ++ ret ? ret->valid : 0); ++ return ret; ++} ++ ++/* Update the file's layout for the given range and iomode. ++ * Layout is retrieved from the server if needed. ++ * If lsegpp is given, the appropriate layout segment is referenced and ++ * returned to the caller.
++ */ ++void ++_pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, ++ u64 count, ++ enum pnfs_iomode iomode, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct pnfs_layout_range arg = { ++ .iomode = iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layout_segment *lseg = NULL; ++ bool take_ref = (lsegpp != NULL); ++ ++ if (take_ref) ++ *lsegpp = NULL; ++ spin_lock(&ino->i_lock); ++ lo = pnfs_alloc_layout(ino); ++ if (lo == NULL) { ++ dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); ++ goto out_unlock; ++ } ++ ++ /* Check to see if the layout for the given range already exists */ ++ lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref); ++ if (lseg && !lseg->valid) { ++ if (take_ref) ++ put_lseg_locked(lseg); ++ /* someone is cleaning the layout */ ++ lseg = NULL; ++ goto out_unlock; ++ } ++ ++ if (lseg) { ++ dprintk("%s: Using cached lseg %p for %llu@%llu iomode %d)\n", ++ __func__, ++ lseg, ++ arg.length, ++ arg.offset, ++ arg.iomode); ++ ++ goto out_unlock; ++ } ++ ++ /* if get layout already failed once goto out */ ++ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) { ++ if (unlikely(nfsi->pnfs_layout_suspend && ++ get_seconds() >= nfsi->pnfs_layout_suspend)) { ++ dprintk("%s: layout_get resumed\n", __func__); ++ clear_bit(lo_fail_bit(iomode), ++ &nfsi->layout->state); ++ nfsi->pnfs_layout_suspend = 0; ++ } else ++ goto out_unlock; ++ } ++ ++ /* Reference the layout for layoutget matched in pnfs_layout_release */ ++ get_layout(lo); ++ spin_unlock(&ino->i_lock); ++ ++ send_layoutget(ino, ctx, &arg, lsegpp, lo); ++out: ++ dprintk("%s end, state 0x%lx lseg %p\n", __func__, ++ nfsi->layout->state, lseg); ++ return; ++out_unlock: ++ if (lsegpp) ++ *lsegpp = lseg; ++ spin_unlock(&ino->i_lock); ++ goto out; ++} ++ ++void ++pnfs_get_layout_done(struct nfs4_layoutget *lgp, int rpc_status) ++{ ++ struct 
pnfs_layout_segment *lseg = NULL; ++ struct nfs_inode *nfsi = NFS_I(lgp->args.inode); ++ time_t suspend = 0; ++ ++ dprintk("-->%s\n", __func__); ++ ++ lgp->status = rpc_status; ++ if (likely(!rpc_status)) { ++ if (unlikely(lgp->res.layout.len < 0)) { ++ printk(KERN_ERR ++ "%s: ERROR Returned layout size is ZERO\n", __func__); ++ lgp->status = -EIO; ++ } ++ goto out; ++ } ++ ++ dprintk("%s: ERROR retrieving layout %d\n", __func__, rpc_status); ++ switch (rpc_status) { ++ case -NFS4ERR_BADLAYOUT: ++ lgp->status = -ENOENT; ++ /* FALLTHROUGH */ ++ case -EACCES: /* NFS4ERR_ACCESS */ ++ /* transient error, don't mark with NFS_INO_LAYOUT_FAILED */ ++ goto out; ++ ++ case -NFS4ERR_LAYOUTTRYLATER: ++ case -NFS4ERR_RECALLCONFLICT: ++ case -NFS4ERR_OLD_STATEID: ++ case -EAGAIN: /* NFS4ERR_LOCKED */ ++ lgp->status = -NFS4ERR_DELAY; /* for nfs4_handle_exception */ ++ /* FALLTHROUGH */ ++ case -NFS4ERR_GRACE: ++ case -NFS4ERR_DELAY: ++ goto out; ++ ++ case -NFS4ERR_ADMIN_REVOKED: ++ case -NFS4ERR_DELEG_REVOKED: ++ /* The layout is expected to be returned at this point. 
++ * This should clear the layout stateid as well */ ++ suspend = get_seconds() + 1; ++ break; ++ ++ case -NFS4ERR_LAYOUTUNAVAILABLE: ++ lgp->status = -ENOTSUPP; ++ break; ++ ++ case -NFS4ERR_REP_TOO_BIG: ++ case -NFS4ERR_REP_TOO_BIG_TO_CACHE: ++ lgp->status = -E2BIG; ++ break; ++ ++ /* Leave the following errors untranslated */ ++ case -NFS4ERR_DEADSESSION: ++ case -NFS4ERR_DQUOT: ++ case -EINVAL: /* NFS4ERR_INVAL */ ++ case -EIO: /* NFS4ERR_IO */ ++ case -NFS4ERR_FHEXPIRED: ++ case -NFS4ERR_MOVED: ++ case -NFS4ERR_NOSPC: ++ case -ESERVERFAULT: /* NFS4ERR_SERVERFAULT */ ++ case -ESTALE: /* NFS4ERR_STALE */ ++ case -ETOOSMALL: /* NFS4ERR_TOOSMALL */ ++ break; ++ ++ /* The following errors are our fault and should never happen */ ++ case -NFS4ERR_BADIOMODE: ++ case -NFS4ERR_BADXDR: ++ case -NFS4ERR_REQ_TOO_BIG: ++ case -NFS4ERR_UNKNOWN_LAYOUTTYPE: ++ case -NFS4ERR_WRONG_TYPE: ++ lgp->status = -EINVAL; ++ /* FALLTHROUGH */ ++ case -NFS4ERR_BAD_STATEID: ++ case -NFS4ERR_NOFILEHANDLE: ++ case -ENOTSUPP: /* NFS4ERR_NOTSUPP */ ++ case -NFS4ERR_OPENMODE: ++ case -NFS4ERR_OP_NOT_IN_SESSION: ++ case -NFS4ERR_TOO_MANY_OPS: ++ dprintk("%s: error %d: should never happen\n", __func__, ++ rpc_status); ++ break; ++ ++ /* The following errors are the server's fault */ ++ default: ++ dprintk("%s: illegal error %d\n", __func__, rpc_status); ++ lgp->status = -EIO; ++ break; ++ } ++ ++ /* remember that get layout failed and suspend trying */ ++ nfsi->pnfs_layout_suspend = suspend; ++ set_bit(lo_fail_bit(lgp->args.range.iomode), ++ &nfsi->layout->state); ++ dprintk("%s: layout_get suspended until %ld\n", ++ __func__, suspend); ++out: ++ dprintk("%s end (err:%d) state 0x%lx lseg %p\n", ++ __func__, lgp->status, nfsi->layout->state, lseg); ++ return; ++} ++ ++int ++pnfs_layout_process(struct nfs4_layoutget *lgp) ++{ ++ struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; ++ struct nfs4_layoutget_res *res = &lgp->res; ++ struct pnfs_layout_segment *lseg; ++ struct inode *ino = 
PNFS_INODE(lo); ++ int status = 0; ++ ++ /* Inject layout blob into I/O device driver */ ++ lseg = PNFS_LD_IO_OPS(lo)->alloc_lseg(lo, res); ++ if (!lseg || IS_ERR(lseg)) { ++ if (!lseg) ++ status = -ENOMEM; ++ else ++ status = PTR_ERR(lseg); ++ dprintk("%s: Could not allocate layout: error %d\n", ++ __func__, status); ++ goto out; ++ } ++ ++ spin_lock(&ino->i_lock); ++ init_lseg(lo, lseg); ++ lseg->range = res->range; ++ if (lgp->lsegpp) { ++ get_lseg(lseg); ++ *lgp->lsegpp = lseg; ++ } ++ pnfs_insert_layout(lo, lseg); ++ ++ if (res->return_on_close) { ++ lo->roc_iomode |= res->range.iomode; ++ if (!lo->roc_iomode) ++ lo->roc_iomode = IOMODE_ANY; ++ } ++ ++ /* Done processing layoutget. Set the layout stateid */ ++ pnfs_set_layout_stateid(lo, &res->stateid); ++ spin_unlock(&ino->i_lock); ++out: ++ return status; ++} ++ ++void ++readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset, ++ size_t *count) ++{ ++ struct page *first, *last; ++ loff_t foff, i_size = i_size_read(inode); ++ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; ++ size_t range; ++ ++ ++ first = list_entry((pages)->prev, struct page, lru); ++ last = list_entry((pages)->next, struct page, lru); ++ ++ foff = (loff_t)first->index << PAGE_CACHE_SHIFT; ++ ++ range = (last->index - first->index) * PAGE_CACHE_SIZE; ++ if (last->index == end_index) ++ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; ++ else ++ range += PAGE_CACHE_SIZE; ++ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff, ++ range); ++ *offset = foff; ++ *count = range; ++} ++ ++void ++pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layoutdriver_type *ld; ++ ++ pgio->pg_test = NULL; ++ ++ lo = NFS_I(inode)->layout; ++ ld = NFS_SERVER(inode)->pnfs_curr_ld; ++ if (!pnfs_enabled_sb(NFS_SERVER(inode)) || !lo) ++ return; ++ ++ if (ld->ld_policy_ops) ++ pgio->pg_test = ld->ld_policy_ops->pg_test; ++} ++ ++static u32 
++pnfs_getboundary(struct inode *inode) ++{ ++ u32 stripe_size = 0; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct layoutdriver_policy_operations *policy_ops; ++ ++ if (!nfss->pnfs_curr_ld) ++ goto out; ++ ++ policy_ops = nfss->pnfs_curr_ld->ld_policy_ops; ++ if (!policy_ops || !policy_ops->get_stripesize) ++ goto out; ++ ++ /* The default is to not gather across stripes */ ++ if (pnfs_ld_gather_across_stripes(nfss->pnfs_curr_ld)) ++ goto out; ++ ++ spin_lock(&inode->i_lock); ++ if (NFS_I(inode)->layout) ++ stripe_size = policy_ops->get_stripesize(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++out: ++ return stripe_size; ++} ++ ++/* ++ * rsize is already set by caller to MDS rsize. ++ */ ++void ++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, ++ struct inode *inode, ++ struct nfs_open_context *ctx, ++ struct list_head *pages, ++ size_t *rsize) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ size_t count = 0; ++ loff_t loff; ++ ++ pgio->pg_iswrite = 0; ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ pgio->pg_lseg = NULL; ++ ++ if (!pnfs_enabled_sb(nfss)) ++ return; ++ ++ /* Calculate the total read-ahead count */ ++ readahead_range(inode, pages, &loff, &count); ++ ++ if (count > 0) { ++ _pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ, ++ &pgio->pg_lseg); ++ if (!pgio->pg_lseg) ++ return; ++ ++ *rsize = NFS_SERVER(inode)->ds_rsize; ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ if (pgio->pg_boundary) ++ pnfs_set_pg_test(inode, pgio); ++ } ++} ++ ++void ++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, ++ size_t *wsize) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ pgio->pg_iswrite = 1; ++ if (!pnfs_enabled_sb(server)) { ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ return; ++ } ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ pnfs_set_pg_test(inode, pgio); ++ *wsize = server->ds_wsize; ++} ++ ++/* Return I/O buffer size for a layout driver ++ * This value 
will determine what size reads and writes ++ * will be gathered into and sent to the data servers. ++ * blocksize must be a multiple of the page cache size. ++ */ ++unsigned int ++pnfs_getiosize(struct nfs_server *server) ++{ ++ if (!PNFS_EXISTS_LDPOLICY_OP(server, get_blocksize)) ++ return 0; ++ return server->pnfs_curr_ld->ld_policy_ops->get_blocksize(); ++} ++ ++void ++pnfs_set_ds_iosize(struct nfs_server *server) ++{ ++ unsigned dssize = pnfs_getiosize(server); ++ ++ /* Set buffer size for data servers */ ++ if (dssize > 0) { ++ server->ds_rsize = server->ds_wsize = ++ nfs_block_size(dssize, NULL); ++ } else { ++ server->ds_wsize = server->wsize; ++ server->ds_rsize = server->rsize; ++ } ++} ++ ++static int ++pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data) ++{ ++ put_lseg(pdata->lseg); ++ pdata->lseg = NULL; ++ pdata->call_ops->rpc_call_done(task, data); ++ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN) ++ return -EAGAIN; ++ if (pdata->pnfsflags & PNFS_NO_RPC) { ++ pdata->call_ops->rpc_release(data); ++ } else { ++ /* ++ * just restore original rpc call ops ++ * rpc_release will be called later by the rpc scheduling layer. ++ */ ++ task->tk_ops = pdata->call_ops; ++ } ++ return 0; ++} ++ ++/* Post-write completion function ++ * Invoked by all layout drivers when write_pagelist is done. ++ * ++ * NOTE: callers set data->pnfsflags PNFS_NO_RPC ++ * so that the NFS cleanup routines perform only the page cache ++ * cleanup. 
++ */ ++static void ++pnfs_write_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ struct pnfs_layout_range range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = wdata->args.offset; ++ range.length = wdata->args.count; ++ _pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true); ++ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode), ++ wdata->pdata.call_ops, wdata->pdata.how); ++} ++ ++static void ++pnfs_writeback_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ /* update last write offset and need layout commit ++ * for non-files layout types (files layout calls ++ * pnfs4_write_done for this) ++ */ ++ if ((pdata->pnfsflags & PNFS_NO_RPC) && ++ data->task.tk_status >= 0 && data->res.count > 0) { ++ struct nfs_inode *nfsi = NFS_I(data->inode); ++ ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++ } ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++ ++static void _pnfs_clear_lseg_from_pages(struct list_head *head) ++{ ++ struct nfs_page *req; ++ ++ list_for_each_entry(req, head, wb_list) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem write function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. ++ * ++ * TODO: Is wdata->how and wdata->args.stable always the same value? 
++ * TODO: It seems in NFS, the server may not do a stable write even ++ * though it was requested (and vice-versa?). To check, it looks ++ * in data->res.verf->committed. Do we need this ability ++ * for non-file layout drivers? ++ */ ++enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *wdata, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ struct inode *inode = wdata->inode; ++ enum pnfs_try_status trypnfs; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg; ++ ++ wdata->pdata.call_ops = call_ops; ++ wdata->pdata.pnfs_error = 0; ++ wdata->pdata.how = how; ++ ++ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, ++ inode->i_ino, wdata->args.count, wdata->args.offset, how); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ wdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ wdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->write_pagelist(wdata, ++ nfs_page_array_len(wdata->args.pgbase, wdata->args.count), ++ how); ++ ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ wdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&wdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* Post-read completion function. 
Invoked by all layout drivers when ++ * read_pagelist is done ++ */ ++static void ++pnfs_read_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ struct pnfs_layout_range range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = rdata->args.offset; ++ range.length = rdata->args.count; ++ _pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true); ++ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode), ++ rdata->pdata.call_ops); ++} ++ ++static void ++pnfs_read_done(struct nfs_read_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem read function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. 
++ */ ++enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *rdata, ++ const struct rpc_call_ops *call_ops) ++{ ++ struct inode *inode = rdata->inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg; ++ enum pnfs_try_status trypnfs; ++ ++ rdata->pdata.call_ops = call_ops; ++ rdata->pdata.pnfs_error = 0; ++ ++ dprintk("%s: Reading ino:%lu %u@%llu\n", ++ __func__, inode->i_ino, rdata->args.count, rdata->args.offset); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ rdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ rdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->read_pagelist(rdata, ++ nfs_page_array_len(rdata->args.pgbase, rdata->args.count)); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ rdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&rdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_READ); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* ++ * This gives the layout driver an opportunity to read in page "around" ++ * the data to be written. It returns 0 on success, otherwise an error code ++ * which will either be passed up to user, or ignored if ++ * some previous part of write succeeded. ++ * Note the range [pos, pos+len-1] is entirely within the page. 
++ */ ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata) ++{ ++ struct pnfs_fsdata *data; ++ int status = 0; ++ ++ dprintk("--> %s: pos=%llu len=%u\n", ++ __func__, (unsigned long long)pos, len); ++ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL); ++ if (!data) { ++ status = -ENOMEM; ++ goto out; ++ } ++ data->lseg = lseg; /* refcount passed into data to be managed there */ ++ status = NFS_SERVER(inode)->pnfs_curr_ld->ld_io_ops->write_begin( ++ lseg, page, pos, len, data); ++ if (status) { ++ kfree(data); ++ data = NULL; ++ } ++out: ++ *fsdata = data; ++ dprintk("<-- %s: status=%d\n", __func__, status); ++ return status; ++} ++ ++/* Return 0 on success, negative on failure */ ++/* CAREFUL - what happens if copied < len??? */ ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status; ++ ++ status = nfss->pnfs_curr_ld->ld_io_ops->write_end(inode, page, ++ pos, len, copied, lseg); ++ return status; ++} ++ ++/* pNFS Commit callback function for all layout drivers */ ++static void ++pnfs_commit_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_RW, ++ .offset = data->args.offset, ++ .length = data->args.count, ++ }; ++ dprintk("%s: retrying\n", __func__); ++ _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, ++ true); ++ pnfs_initiate_commit(data, NFS_CLIENT(data->inode), ++ pdata->call_ops, pdata->how, 1); ++ } ++} ++ ++enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int sync) ++{ ++ struct inode *inode =
data->inode; ++ struct nfs_server *nfss = NFS_SERVER(data->inode); ++ enum pnfs_try_status trypnfs; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ if (!pnfs_use_rpc(nfss)) ++ data->pdata.pnfsflags |= PNFS_NO_RPC; ++ /* We need to account for possibility that ++ * each nfs_page can point to a different lseg (or be NULL). ++ * For the immediate case of whole-file-only layouts, we at ++ * least know there can be only a single lseg. ++ * We still have to account for the possibility of some being NULL. ++ * This will be done by passing the buck to the layout driver. ++ */ ++ data->pdata.call_ops = call_ops; ++ data->pdata.pnfs_error = 0; ++ data->pdata.how = sync; ++ data->pdata.lseg = NULL; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->commit(data, sync); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ data->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ _pnfs_clear_lseg_from_pages(&data->pages); ++ } else ++ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT); ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) ++{ ++ struct nfs_server *nfss = NFS_SERVER(data->args.inode); ++ ++ /* TODO: Maybe we should avoid this by allowing the layout driver ++ * to directly xdr its layout on the wire. ++ */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit) ++ nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit( ++ NFS_I(data->args.inode)->layout, ++ &data->args, data->status); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int ++pnfs_layoutcommit_setup(struct inode *inode, ++ struct nfs4_layoutcommit_data *data, ++ loff_t write_begin_pos, loff_t write_end_pos) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int result = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ data->args.inode = inode; ++ data->args.fh = NFS_FH(inode); ++ data->args.layout_type = nfss->pnfs_curr_ld->id; ++ data->res.fattr = &data->fattr; ++ nfs_fattr_init(&data->fattr); ++ ++ /* TODO: Need to determine the correct values */ ++ data->args.time_modify_changed = 0; ++ ++ /* Set values from inode so it can be reset ++ */ ++ data->args.range.iomode = IOMODE_RW; ++ data->args.range.offset = write_begin_pos; ++ data->args.range.length = write_end_pos - write_begin_pos + 1; ++ data->args.lastbytewritten = min(write_end_pos, ++ i_size_read(inode) - 1); ++ data->args.bitmask = nfss->attr_bitmask; ++ data->res.server = nfss; ++ ++ /* Call layout driver to set the arguments */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit) ++ result = nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit( ++ NFS_I(inode)->layout, &data->args); ++ ++ dprintk("<-- %s Status %d\n", __func__, result); ++ return result; ++} ++ ++/* Issue a async layoutcommit for an inode. 
++ */ ++int ++pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ struct nfs4_layoutcommit_data *data; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ ++ int status = 0; ++ ++ dprintk("%s Begin (sync:%d)\n", __func__, sync); ++ ++ BUG_ON(!has_layout(nfsi)); ++ ++ data = pnfs_layoutcommit_alloc(); ++ if (!data) ++ return -ENOMEM; ++ ++ spin_lock(&inode->i_lock); ++ if (!layoutcommit_needed(nfsi)) { ++ spin_unlock(&inode->i_lock); ++ goto out_free; ++ } ++ ++ /* Clear layoutcommit properties in the inode so ++ * new lc info can be generated ++ */ ++ write_begin_pos = nfsi->layout->write_begin_pos; ++ write_end_pos = nfsi->layout->write_end_pos; ++ data->cred = nfsi->layout->cred; ++ nfsi->layout->write_begin_pos = 0; ++ nfsi->layout->write_end_pos = 0; ++ nfsi->layout->cred = NULL; ++ __clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state); ++ pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout); ++ ++ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */ ++ get_layout(NFS_I(inode)->layout); ++ ++ spin_unlock(&inode->i_lock); ++ ++ /* Set up layout commit args */ ++ status = pnfs_layoutcommit_setup(inode, data, write_begin_pos, ++ write_end_pos); ++ if (status) { ++ /* The layout driver failed to setup the layoutcommit */ ++ put_rpccred(data->cred); ++ put_layout(inode); ++ goto out_free; ++ } ++ status = nfs4_proc_layoutcommit(data, sync); ++out: ++ dprintk("%s end (err:%d)\n", __func__, status); ++ return status; ++out_free: ++ pnfs_layoutcommit_free(data); ++ goto out; ++} ++ ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata) ++{ ++ if (fsdata) { ++ /* lseg refcounting handled directly in nfs_Write_end */ ++ kfree(fsdata); ++ } ++} ++ ++/* Callback operations for layout drivers. 
++ */ ++struct pnfs_client_operations pnfs_ops = { ++ .nfs_getdevicelist = nfs4_proc_getdevicelist, ++ .nfs_getdeviceinfo = nfs4_proc_getdeviceinfo, ++ .nfs_readlist_complete = pnfs_read_done, ++ .nfs_writelist_complete = pnfs_writeback_done, ++ .nfs_commit_complete = pnfs_commit_done, ++}; ++ ++EXPORT_SYMBOL(pnfs_unregister_layoutdriver); ++EXPORT_SYMBOL(pnfs_register_layoutdriver); ++ ++ ++/* Device ID cache. Supports one layout type per struct nfs_client */ ++int ++nfs4_alloc_init_deviceid_cache(struct nfs_client *clp, ++ void (*free_callback)(struct kref *)) ++{ ++ struct nfs4_deviceid_cache *c; ++ ++ c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL); ++ if (!c) ++ return -ENOMEM; ++ spin_lock(&clp->cl_lock); ++ if (clp->cl_devid_cache != NULL) { ++ kref_get(&clp->cl_devid_cache->dc_kref); ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [kref [%d]]\n", __func__, ++ atomic_read(&clp->cl_devid_cache->dc_kref.refcount)); ++ kfree(c); ++ } else { ++ int i; ++ ++ spin_lock_init(&c->dc_lock); ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++) ++ INIT_HLIST_HEAD(&c->dc_deviceids[i]); ++ kref_init(&c->dc_kref); ++ c->dc_free_callback = free_callback; ++ clp->cl_devid_cache = c; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [new]\n", __func__); ++ } ++ return 0; ++} ++EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache); ++ ++void ++nfs4_init_deviceid_node(struct nfs4_deviceid *d) ++{ ++ INIT_HLIST_NODE(&d->de_node); ++ kref_init(&d->de_kref); ++} ++EXPORT_SYMBOL(nfs4_init_deviceid_node); ++ ++/* Called from layoutdriver_io_operations->alloc_lseg */ ++void ++nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = d; ++} ++EXPORT_SYMBOL(nfs4_set_layout_deviceid); ++ ++/* Called from layoutdriver_io_operations->free_lseg */ ++void ++nfs4_put_unset_layout_deviceid(struct pnfs_layout_segment *l, ++ struct nfs4_deviceid *d, ++ void (*free_callback)(struct kref 
*)) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = NULL; ++ kref_put(&d->de_kref, free_callback); ++} ++EXPORT_SYMBOL(nfs4_put_unset_layout_deviceid); ++ ++/* Find and reference a deviceid */ ++struct nfs4_deviceid * ++nfs4_find_get_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ if (!atomic_inc_not_zero(&d->de_kref.refcount)) { ++ goto fail; ++ } else { ++ rcu_read_unlock(); ++ return d; ++ } ++ } ++ } ++fail: ++ rcu_read_unlock(); ++ return NULL; ++} ++EXPORT_SYMBOL(nfs4_find_get_deviceid); ++ ++/* ++ * Add and kref_get a deviceid. ++ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new ++ */ ++struct nfs4_deviceid * ++nfs4_add_get_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(&new->de_id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ kref_get(&d->de_kref); ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [discard]\n", __func__); ++ c->dc_free_callback(&new->de_kref); ++ return d; ++ } ++ } ++ hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); ++ kref_get(&new->de_kref); ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [new]\n", __func__); ++ return new; ++} ++EXPORT_SYMBOL(nfs4_add_get_deviceid); ++ ++/* ++ * Remove the first deviceid from a hash bucket, or return 0 if bucket list ++ * is empty. 
++ */ ++static int ++nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash, ++ struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (id && memcmp(id, &d->de_id, NFS4_PNFS_DEVICEID4_SIZE)) ++ continue; ++ hlist_del_rcu(&d->de_node); ++ spin_unlock(&c->dc_lock); ++ synchronize_rcu(); ++ dprintk("%s [%d]\n", __func__, ++ atomic_read(&d->de_kref.refcount)); ++ kref_put(&d->de_kref, c->dc_free_callback); ++ return 1; ++ } ++ spin_unlock(&c->dc_lock); ++ return 0; ++} ++ ++void ++nfs4_delete_device(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ long hash = nfs4_deviceid_hash(id); ++ ++ nfs4_remove_deviceid(c, hash, id); ++} ++EXPORT_SYMBOL(nfs4_delete_device); ++ ++static void ++nfs4_free_deviceid_cache(struct kref *kref) ++{ ++ struct nfs4_deviceid_cache *cache = ++ container_of(kref, struct nfs4_deviceid_cache, dc_kref); ++ long i; ++ ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) ++ while (nfs4_remove_deviceid(cache, i, NULL)) ++ ; ++ kfree(cache); ++} ++ ++void ++nfs4_put_deviceid_cache(struct nfs_client *clp) ++{ ++ struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache; ++ int refcount; ++ ++ dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); ++ spin_lock(&clp->cl_lock); ++ refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount); ++ if (refcount == 1) ++ clp->cl_devid_cache = NULL; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [%d]\n", __func__, refcount); ++ kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache); ++} ++EXPORT_SYMBOL(nfs4_put_deviceid_cache); +diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h +--- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-09-30 10:17:08.757998000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-09-30 10:17:08.759996000 -0400 +@@ -0,0 +1,354 @@ ++/* ++ * fs/nfs/pnfs.h ++ 
* ++ * pNFS client data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_PNFS_H ++#define FS_NFS_PNFS_H ++ ++#include ++ ++#ifdef CONFIG_NFS_V4_1 ++ ++#include ++#include ++#include "iostat.h" ++ ++/* nfs4proc.c */ ++extern int nfs4_proc_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, ++ struct pnfs_device *dev); ++extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); ++extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, ++ int issync); ++extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait); ++ ++/* pnfs.c */ ++extern const nfs4_stateid zero_stateid; ++ ++void _pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp); ++ ++int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type, bool wait); ++void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); ++void unmount_pnfs_layoutdriver(struct nfs_server *); ++enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, ++ const struct rpc_call_ops *); ++int pnfs_initialize(void); ++void pnfs_uninitialize(void); ++void pnfs_layoutcommit_free(struct nfs4_layoutcommit_data *data); ++void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); ++int pnfs_layoutcommit_inode(struct inode *inode, int sync); ++void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent); ++void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx); ++unsigned int 
pnfs_getiosize(struct nfs_server *server); ++void pnfs_set_ds_iosize(struct nfs_server *server); ++enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, ++ struct nfs_open_context *, struct list_head *, ++ size_t *); ++void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, ++ size_t *); ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); ++void pnfs_get_layout_done(struct nfs4_layoutget *, int rpc_status); ++int pnfs_layout_process(struct nfs4_layoutget *lgp); ++void pnfs_layout_release(struct pnfs_layout_hdr *, struct pnfs_layout_range *range); ++void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, ++ const nfs4_stateid *stateid); ++void pnfs_destroy_layout(struct nfs_inode *); ++void pnfs_destroy_all_layouts(struct nfs_client *); ++void put_layout(struct inode *inode); ++void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo); ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata); ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ ++#define PNFS_EXISTS_LDIO_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_io_ops && \ ++ (srv)->pnfs_curr_ld->ld_io_ops->opname) ++#define PNFS_EXISTS_LDPOLICY_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops->opname) ++ ++#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" ++ ++static inline int lo_fail_bit(u32 iomode) ++{ ++ return iomode == IOMODE_RW ? 
++ NFS_INO_RW_LAYOUT_FAILED : NFS_INO_RO_LAYOUT_FAILED; ++} ++ ++/* Return true if a layout driver is being used for this mountpoint */ ++static inline int pnfs_enabled_sb(struct nfs_server *nfss) ++{ ++ return nfss->pnfs_curr_ld != NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || ++ !fsdata->bypass_eof; ++} ++ ++/* Should the pNFS client commit and return the layout upon a setattr */ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ if (!pnfs_enabled_sb(NFS_SERVER(inode))) ++ return false; ++ return NFS_SERVER(inode)->pnfs_curr_ld->ld_policy_ops->flags & ++ PNFS_LAYOUTRET_ON_SETATTR; ++} ++ ++/* Should the pNFS client commit and return the layout on close ++ */ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout->roc_iomode; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status = 0; ++ ++ *fsdata = lseg; ++ if (lseg && PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ status = _pnfs_write_begin(inode, page, pos, len, lseg, ++ (struct pnfs_fsdata **) fsdata); ++ return status; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_end)) ++ return _pnfs_write_end(inode, page, pos, len, copied, lseg); ++ else ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ 
if (PNFS_EXISTS_LDIO_OP(nfss, write_end_cleanup)) ++ nfss->pnfs_curr_ld->ld_io_ops->write_end_cleanup(filp, fsdata); ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ pnfs_free_fsdata(fsdata); ++ } ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss) && ++ (type != RETURN_FILE || has_layout(nfsi))) ++ return _pnfs_return_layout(ino, range, stateid, type, wait); ++ ++ return 0; ++} ++ ++static inline void pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss)) ++ _pnfs_update_layout(ino, ctx, pos, count, access_type, lsegpp); ++ else { ++ if (lsegpp) ++ *lsegpp = NULL; ++ } ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ if (pnfs_enabled_sb(nfss)) ++ return pnfs_ld_use_rpc_code(nfss->pnfs_curr_ld); ++ ++ return 1; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ return ((struct pnfs_fsdata *) fsdata)->lseg; ++ } ++ return fsdata; ++} ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++} ++ ++static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++} ++ ++static inline void get_lseg(struct 
pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void put_lseg(struct pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void ++pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ if (lsegpp) ++ *lsegpp = NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return 1; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ *fsdata = NULL; ++ return 0; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ return 1; ++} ++ ++static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ return 0; ++} ++ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ return false; ++} ++ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode 
*nfsi) ++{ ++ return 0; ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ return 0; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ return NULL; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++#endif /* FS_NFS_PNFS_H */ +diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c +--- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-09-30 10:15:17.904725000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-09-30 10:17:08.764996000 -0400 +@@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) +- goto out; ++ goto out_free; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); +@@ -455,7 +455,7 @@ nfs_proc_symlink(struct inode *dir, stru + */ + if (status == 0) + status = nfs_instantiate(dentry, fh, fattr); +- ++out_free: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); + out: +@@ -694,6 +694,7 @@ const struct nfs_rpc_ops nfs_v2_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, + .file_inode_ops = &nfs_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, +diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c +--- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-09-30 10:15:17.910723000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/read.c 2010-09-30 10:17:08.770996000 -0400 +@@ -18,8 +18,12 @@ + #include + #include + #include ++#include ++#include + + #include ++#include ++#include "pnfs.h" + + #include "nfs4_fs.h" + #include "internal.h" +@@ -117,11 +121,14 @@ int nfs_readpage_async(struct nfs_open_c + LIST_HEAD(one_request); + struct nfs_page *new; + 
unsigned int len; ++ struct pnfs_layout_segment *lseg; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); +- new = nfs_create_request(ctx, inode, page, 0, len); ++ pnfs_update_layout(inode, ctx, 0, NFS4_MAX_UINT64, IOMODE_READ, &lseg); ++ new = nfs_create_request(ctx, inode, page, 0, len, lseg); ++ put_lseg(lseg); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); +@@ -155,24 +162,20 @@ static void nfs_readpage_release(struct + nfs_release_request(req); + } + +-/* +- * Set up the NFS read request struct +- */ +-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset) ++int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, +@@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_ + .flags = RPC_TASK_ASYNC | swap_flags, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->read_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_read); ++ ++int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) ++ return pnfs_get_read_status(data); ++ ++ return nfs_initiate_read(data, clnt, call_ops); ++} ++ ++/* ++ * Set up the NFS read request struct ++ */ ++static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + data->req = req; + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -190,27 +230,14 @@ static int nfs_read_rpcsetup(struct nfs_ + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops); + } + + static void +@@ -354,7 +381,14 @@ static void nfs_readpage_retry(struct rp + { + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; ++ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client; + ++#ifdef CONFIG_NFS_V4_1 ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + if (resp->eof || resp->count == argp->count) + return; + +@@ -368,7 +402,10 @@ static void nfs_readpage_retry(struct rp + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; +- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + } + + /* +@@ -409,13 +446,19 @@ static void nfs_readpage_release_partial + void nfs_read_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_read_data *data = calldata; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, + &data->args.seq_args, &data->res.seq_res, + 0, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_read_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_read_partial_ops = { +@@ -568,7 +611,8 
@@ readpage_async_filler(void *data, struct + if (len == 0) + return nfs_return_empty_page(page); + +- new = nfs_create_request(desc->ctx, inode, page, 0, len); ++ new = nfs_create_request(desc->ctx, inode, page, 0, len, ++ desc->pgio->pg_lseg); + if (IS_ERR(new)) + goto out_error; + +@@ -624,6 +668,9 @@ int nfs_readpages(struct file *filp, str + if (ret == 0) + goto read_complete; /* all pages were read */ + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize); ++#endif /* CONFIG_NFS_V4_1 */ + if (rsize < PAGE_CACHE_SIZE) + nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); + else +@@ -632,6 +679,7 @@ int nfs_readpages(struct file *filp, str + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + + nfs_pageio_complete(&pgio); ++ put_lseg(pgio.pg_lseg); + npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); + read_complete: +diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c +--- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-09-30 10:15:17.918722000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/super.c 2010-09-30 10:17:08.777998000 -0400 +@@ -64,6 +64,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -676,6 +677,28 @@ static int nfs_show_options(struct seq_f + + return 0; + } ++#ifdef CONFIG_NFS_V4_1 ++void show_sessions(struct seq_file *m, struct nfs_server *server) ++{ ++ if (nfs4_has_session(server->nfs_client)) ++ seq_printf(m, ",sessions"); ++} ++#else ++void show_sessions(struct seq_file *m, struct nfs_server *server) {} ++#endif ++ ++#ifdef CONFIG_NFS_V4_1 ++void show_pnfs(struct seq_file *m, struct nfs_server *server) ++{ ++ seq_printf(m, ",pnfs="); ++ if (server->pnfs_curr_ld) ++ seq_printf(m, "%s", server->pnfs_curr_ld->name); ++ else ++ seq_printf(m, "not configured"); ++} ++#else /* CONFIG_NFS_V4_1 */ ++void 
show_pnfs(struct seq_file *m, struct nfs_server *server) {} ++#endif /* CONFIG_NFS_V4_1 */ + + /* + * Present statistical information for this VFS mountpoint +@@ -714,6 +737,8 @@ static int nfs_show_stats(struct seq_fil + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); ++ show_sessions(m, nfss); ++ show_pnfs(m, nfss); + } + #endif + +diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c +--- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-09-30 10:15:17.932726000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-09-30 10:17:08.783003000 -0400 +@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task + struct nfs_unlinkdata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->dir); + +- if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c +--- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-09-30 10:15:05.044337000 -0400 ++++ linux-2.6.34.noarch/fs/nfs/write.c 2010-09-30 10:17:08.789996000 -0400 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + +@@ -28,6 +29,7 @@ + #include "iostat.h" + #include "nfs4_fs.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PAGECACHE + +@@ -59,6 +61,7 @@ struct nfs_write_data *nfs_commitdata_al + } + return p; + } ++EXPORT_SYMBOL(nfs_commitdata_alloc); + + void nfs_commit_free(struct nfs_write_data *p) + { +@@ -66,6 +69,7 @@ void nfs_commit_free(struct nfs_write_da + kfree(p->pagevec); + mempool_free(p, nfs_commit_mempool); + } ++EXPORT_SYMBOL(nfs_commit_free); + + struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) + { +@@ -418,6 +422,17 @@ static void nfs_inode_remove_request(str + 
nfs_clear_request(req); + nfs_release_request(req); + } ++static void ++nfs_mark_request_nopnfs(struct nfs_page *req) ++{ ++ struct pnfs_layout_segment *lseg = req->wb_lseg; ++ ++ if (req->wb_lseg == NULL) ++ return; ++ req->wb_lseg = NULL; ++ put_lseg(lseg); ++ dprintk(" retry through MDS\n"); ++} + + static void + nfs_mark_request_dirty(struct nfs_page *req) +@@ -523,7 +538,7 @@ nfs_need_commit(struct nfs_inode *nfsi) + * The requests are *not* checked to ensure that they form a contiguous set. + */ + static int +-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) ++nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs) + { + struct nfs_inode *nfsi = NFS_I(inode); + int ret; +@@ -531,7 +546,8 @@ nfs_scan_commit(struct inode *inode, str + if (!nfs_need_commit(nfsi)) + return 0; + +- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); ++ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT, ++ use_pnfs); + if (ret > 0) + nfsi->ncommit -= ret; + if (nfs_need_commit(NFS_I(inode))) +@@ -560,7 +576,8 @@ static inline int nfs_scan_commit(struct + static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int bytes) ++ unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + unsigned int rqend; +@@ -585,8 +602,8 @@ static struct nfs_page *nfs_try_to_updat + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ +- if (offset > rqend +- || end < req->wb_offset) ++ if (offset > rqend || end < req->wb_offset || ++ req->wb_lseg != lseg) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) +@@ -634,16 +651,17 @@ out_err: + * already called nfs_flush_incompatible() if necessary. 
+ */ + static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, +- struct page *page, unsigned int offset, unsigned int bytes) ++ struct page *page, unsigned int offset, unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + +- req = nfs_try_to_update_request(inode, page, offset, bytes); ++ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg); + if (req != NULL) + goto out; +- req = nfs_create_request(ctx, inode, page, offset, bytes); ++ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); +@@ -656,23 +674,27 @@ out: + } + + static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, ++ void *fsdata) + { + struct nfs_page *req; + +- req = nfs_setup_write_request(ctx, page, offset, count); ++ req = nfs_setup_write_request(ctx, page, offset, count, lseg); + if (IS_ERR(req)) + return PTR_ERR(req); + nfs_mark_request_dirty(req); + /* Update file length */ +- nfs_grow_file(page, offset, count); ++ if (pnfs_grow_ok(lseg, fsdata)) ++ nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + return 0; + } + +-int nfs_flush_incompatible(struct file *file, struct page *page) ++int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_page *req; +@@ -689,7 +711,10 @@ int nfs_flush_incompatible(struct file * + req = nfs_page_find_request(page); + if (req == NULL) + return 0; +- do_flush = req->wb_page != page || req->wb_context != ctx; ++ do_flush = req->wb_page != page || req->wb_context != ctx || ++ 
req->wb_lock_context->lockowner != current->files || ++ req->wb_lock_context->pid != current->tgid || ++ req->wb_lseg != lseg; + nfs_release_request(req); + if (!do_flush) + return 0; +@@ -716,7 +741,8 @@ static int nfs_write_pageuptodate(struct + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ + int nfs_updatepage(struct file *file, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page->mapping->host; +@@ -741,7 +767,7 @@ int nfs_updatepage(struct file *file, st + offset = 0; + } + +- status = nfs_writepage_setup(ctx, page, offset, count); ++ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); + if (status < 0) + nfs_set_pageerror(page); + +@@ -771,25 +797,21 @@ static int flush_task_priority(int how) + return RPC_PRIORITY_NORMAL; + } + +-/* +- * Set up the argument/result storage required for the RPC call. 
+- */ +-static int nfs_write_rpcsetup(struct nfs_page *req, +- struct nfs_write_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset, +- int how) ++int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, +@@ -800,12 +822,62 @@ static int nfs_write_rpcsetup(struct nfs + }; + int ret = 0; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->write_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ if (how & FLUSH_SYNC) { ++ ret = rpc_wait_for_completion_task(task); ++ if (ret == 0) ++ ret = task->tk_status; ++ } ++ rpc_put_task(task); ++out: ++ return ret; ++} ++EXPORT_SYMBOL(nfs_initiate_write); ++ ++int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_write(data, clnt, call_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int nfs_write_rpcsetup(struct nfs_page *req, ++ struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset, ++ int how) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + data->req = req; + data->inode = inode = req->wb_context->path.dentry->d_inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -813,6 +885,7 @@ static int nfs_write_rpcsetup(struct nfs + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + if (how & FLUSH_STABLE) { + data->args.stable = NFS_DATA_SYNC; +@@ -825,30 +898,7 @@ static int nfs_write_rpcsetup(struct nfs + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- dprintk("NFS: %5u initiated write call " +- "(req %s/%lld, %u bytes @ offset %llu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) { +- ret = PTR_ERR(task); +- goto out; +- } +- if (how & FLUSH_SYNC) { +- ret = rpc_wait_for_completion_task(task); +- if (ret == 0) +- ret = task->tk_status; +- } +- rpc_put_task(task); +-out: +- return ret; ++ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); + } + + /* If a nfs_flush_* function fails, it should remove reqs from @head and +@@ -859,6 +909,7 @@ static void nfs_redirty_request(struct n + { + struct page *page = req->wb_page; + ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +@@ -971,6 +1022,10 @@ static void nfs_pageio_init_write(struct + { + size_t wsize = NFS_SERVER(inode)->wsize; + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_write(pgio, inode, &wsize); ++#endif /* CONFIG_NFS_V4_1 */ ++ + if (wsize < PAGE_CACHE_SIZE) + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); + else +@@ -1036,13 +1091,27 @@ out: + void nfs_write_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_write_data *data = calldata; +- struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(clp, &data->args.seq_args, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) { ++ /* retrying via MDS? 
*/ ++ data->pdata.orig_count = data->args.count; ++ data->args.count = NFS_SERVER(data->inode)->wsize; ++ dprintk("%s: trimmed count %u to wsize %u\n", __func__, ++ data->pdata.orig_count, data->args.count); ++ } else ++ data->pdata.orig_count = 0; ++ ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, ++ &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_write_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_write_partial_ops = { +@@ -1126,10 +1195,11 @@ int nfs_writeback_done(struct rpc_task * + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *clp = server->nfs_client; + int status; + +- dprintk("NFS: %5u nfs_writeback_done (status %d)\n", +- task->tk_pid, task->tk_status); ++ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n", ++ task->tk_pid, task->tk_status, resp->count); + + /* + * ->write_done will attempt to use post-op attributes to detect +@@ -1142,6 +1212,13 @@ int nfs_writeback_done(struct rpc_task * + if (status != 0) + return status; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); ++#ifdef CONFIG_NFS_V4_1 ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && task->tk_status >= 0) { +@@ -1158,7 +1235,7 @@ int nfs_writeback_done(struct rpc_task * + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", +- server->nfs_client->cl_hostname, ++ clp->cl_hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } +@@ -1168,6 +1245,9 @@ int nfs_writeback_done(struct rpc_task * + if (task->tk_status >= 
0 && resp->count < argp->count) { + static unsigned long complain; + ++ dprintk("NFS: short write:" ++ " (resp->count %u) < (argp->count = %u)\n", ++ resp->count, argp->count); + nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + + /* Has the server at least made some progress? */ +@@ -1184,7 +1264,10 @@ int nfs_writeback_done(struct rpc_task * + */ + argp->stable = NFS_FILE_SYNC; + } +- nfs_restart_rpc(task, server->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + return -EAGAIN; + } + if (time_before(complain, jiffies)) { +@@ -1228,40 +1311,73 @@ static void nfs_commitdata_release(void + nfs_commit_free(wdata); + } + +-/* +- * Set up the argument/result storage required for the RPC call. +- */ +-static int nfs_commit_rpcsetup(struct list_head *head, +- struct nfs_write_data *data, +- int how) ++int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct nfs_page *first = nfs_list_entry(head->next); +- struct inode *inode = first->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = first->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, +- .callback_ops = &nfs_commit_ops, ++ .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->commit_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_commit); ++ ++ ++int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs) ++{ ++ if (pnfs && ++ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. ++ */ ++static int nfs_commit_rpcsetup(struct list_head *head, ++ struct nfs_write_data *data, ++ int how, int pnfs) ++{ ++ struct nfs_page *first = nfs_list_entry(head->next); ++ struct inode *inode = first->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + list_splice_init(head, &data->pages); + + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = first->wb_context->cred; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ +@@ -1272,45 +1388,47 @@ static int nfs_commit_rpcsetup(struct li + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); ++ kref_init(&data->refcount); ++ data->parent = NULL; ++ data->args.context = first->wb_context; /* used by commit done */ + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->commit_setup(data, &msg); ++ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops, ++ how, pnfs); ++} + +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++/* Handle memory error during commit */ ++void nfs_mark_list_commit(struct list_head *head) ++{ ++ struct nfs_page *req; + +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ while (!list_empty(head)) { ++ req = nfs_list_entry(head->next); ++ nfs_list_remove_request(req); ++ nfs_mark_request_commit(req); ++ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); ++ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, ++ BDI_RECLAIMABLE); ++ nfs_clear_page_tag_locked(req); ++ } + } ++EXPORT_SYMBOL(nfs_mark_list_commit); + + /* + * Commit dirty pages + */ + static int +-nfs_commit_list(struct inode *inode, struct list_head *head, int how) ++nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs) + { + struct nfs_write_data *data; +- struct nfs_page *req; + + data = nfs_commitdata_alloc(); +- + if (!data) + goto out_bad; + + /* Set up the argument struct */ +- return nfs_commit_rpcsetup(head, data, how); ++ return nfs_commit_rpcsetup(head, data, how, pnfs); + out_bad: +- while (!list_empty(head)) { +- req = nfs_list_entry(head->next); +- nfs_list_remove_request(req); +- nfs_mark_request_commit(req); +- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); +- dec_bdi_stat(req->wb_page->mapping->backing_dev_info, +- BDI_RECLAIMABLE); +- nfs_clear_page_tag_locked(req); +- } ++ nfs_mark_list_commit(head); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; + } +@@ -1330,6 +1448,19 @@ static void nfs_commit_done(struct rpc_t + return; + } + ++static inline void nfs_commit_cleanup(struct kref *kref) ++{ ++ struct nfs_write_data *data; ++ ++ data = container_of(kref, struct nfs_write_data, refcount); ++ /* Clear lock only when all cloned commits are finished */ ++ if 
(data->parent) ++ kref_put(&data->parent->refcount, nfs_commit_cleanup); ++ else ++ nfs_commit_clear_lock(NFS_I(data->inode)); ++ nfs_commitdata_release(data); ++} ++ + static void nfs_commit_release(void *calldata) + { + struct nfs_write_data *data = calldata; +@@ -1347,6 +1478,11 @@ static void nfs_commit_release(void *cal + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { ++ if (req->wb_lseg) { ++ nfs_mark_request_nopnfs(req); ++ nfs_mark_request_dirty(req); ++ goto next; ++ } + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); +@@ -1363,12 +1499,12 @@ static void nfs_commit_release(void *cal + } + /* We have a mismatch. Write the page again */ + dprintk(" mismatch\n"); ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + next: + nfs_clear_page_tag_locked(req); + } +- nfs_commit_clear_lock(NFS_I(data->inode)); +- nfs_commitdata_release(calldata); ++ kref_put(&data->refcount, nfs_commit_cleanup); + } + + static const struct rpc_call_ops nfs_commit_ops = { +@@ -1384,21 +1520,22 @@ int nfs_commit_inode(struct inode *inode + LIST_HEAD(head); + int may_wait = how & FLUSH_SYNC; + int res = 0; ++ int use_pnfs = 0; + + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out_mark_dirty; + spin_lock(&inode->i_lock); +- res = nfs_scan_commit(inode, &head, 0, 0); ++ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs); + spin_unlock(&inode->i_lock); + if (res) { +- int error = nfs_commit_list(inode, &head, how); ++ int error = nfs_commit_list(inode, &head, how, use_pnfs); + if (error < 0) + return error; +- if (may_wait) ++ if (may_wait) { + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); +- else ++ } else + goto out_mark_dirty; + } else + nfs_commit_clear_lock(NFS_I(inode)); +@@ -1451,7 +1588,18 @@ static int nfs_commit_unstable_pages(str + + int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) + { +- 
return nfs_commit_unstable_pages(inode, wbc); ++ int ret; ++ ret = nfs_commit_unstable_pages(inode, wbc); ++ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) { ++ int err, sync = wbc->sync_mode; ++ ++ if (wbc->nonblocking || wbc->for_background) ++ sync = 0; ++ err = pnfs_layoutcommit_inode(inode, sync); ++ if (err < 0) ++ ret = err; ++ } ++ return ret; + } + + /* +@@ -1459,6 +1607,7 @@ int nfs_write_inode(struct inode *inode, + */ + int nfs_wb_all(struct inode *inode) + { ++ int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, +@@ -1466,7 +1615,8 @@ int nfs_wb_all(struct inode *inode) + .range_end = LLONG_MAX, + }; + +- return sync_inode(inode, &wbc); ++ ret = sync_inode(inode, &wbc); ++ return ret; + } + + int nfs_wb_page_cancel(struct inode *inode, struct page *page) +diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/include/linux/exportfs.h +--- linux-2.6.34.noarch/include/linux/exportfs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/exportfs.h 2010-09-30 10:17:09.002005000 -0400 +@@ -2,6 +2,7 @@ + #define LINUX_EXPORTFS_H 1 + + #include ++#include + + struct dentry; + struct inode; +@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_pare + struct fid *fid, int fh_len, int fh_type, + struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); + ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct pnfs_filelayout_device; ++struct pnfs_filelayout_layout; ++ ++extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev); ++extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct list_head; ++ ++extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes); ++ ++extern enum nfsstat4 
blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *layouts); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_PNFSD) ++#include ++ ++struct pnfsd_cb_operations; ++ ++struct pnfsd_cb_ctl { ++ spinlock_t lock; ++ struct module *module; ++ const struct pnfsd_cb_operations *cb_op; ++}; ++ ++/* in expfs.c so that file systems can depend on it */ ++extern struct pnfsd_cb_ctl pnfsd_cb_ctl; ++ ++static inline int ++pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ int ret = -ENOENT; ++ ++ spin_lock(&pnfsd_cb_ctl.lock); ++ if (!pnfsd_cb_ctl.cb_op) ++ goto out; ++ if (!try_module_get(pnfsd_cb_ctl.module)) ++ goto out; ++ ctl->cb_op = pnfsd_cb_ctl.cb_op; ++ ctl->module = pnfsd_cb_ctl.module; ++ ret = 0; ++out: ++ spin_unlock(&pnfsd_cb_ctl.lock); ++ return ret; ++} ++ ++static inline void ++pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ module_put(ctl->module); ++} ++#endif /* CONFIG_PNFSD */ + #endif /* LINUX_EXPORTFS_H */ +diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h +--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig 2010-09-30 10:17:08.988005000 -0400 ++++ linux-2.6.34.noarch/include/linux/exp_xdr.h 2010-09-30 10:17:08.990007000 -0400 +@@ -0,0 +1,141 @@ ++#ifndef _LINUX_EXP_XDR_H ++#define _LINUX_EXP_XDR_H ++ ++#include ++#include ++#include ++ ++struct exp_xdr_stream { ++ __be32 *p; ++ __be32 *end; ++}; ++ ++/** ++ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline size_t ++exp_xdr_qwords(__u32 nbytes) ++{ ++ return DIV_ROUND_UP(nbytes, 4); ++} ++ ++/** ++ * exp_xdr_qbytes - Calculate the number of bytes holding qwords ++ * @qwords: number of quad-words to encode ++ */ ++static inline size_t ++exp_xdr_qbytes(size_t qwords) ++{ ++ return qwords << 2; ++} ++ ++/** ++ * exp_xdr_reserve_space - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nbytes: 
number of bytes to reserve ++ * ++ * Checks that we have enough buffer space to encode 'nbytes' more ++ * bytes of data. If so, update the xdr stream. ++ */ ++static inline __be32 * ++exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes) ++{ ++ __be32 *p = xdr->p; ++ __be32 *q; ++ ++ /* align nbytes on the next 32-bit boundary */ ++ q = p + exp_xdr_qwords(nbytes); ++ if (unlikely(q > xdr->end || q < p)) ++ return NULL; ++ xdr->p = q; ++ return p; ++} ++ ++/** ++ * exp_xdr_reserve_qwords - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @qwords: number of quad words (u32's) to reserve ++ */ ++static inline __be32 * ++exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords) ++{ ++ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords)); ++} ++ ++/** ++ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u32(__be32 *p, __u32 val) ++{ ++ *p = cpu_to_be32(val); ++ return p + 1; ++} ++ ++/** ++ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u64(__be32 *p, __u64 val) ++{ ++ put_unaligned_be64(val, p); ++ return p + 2; ++} ++ ++/** ++ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the array of bytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ if (likely(nbytes != 0)) { ++ unsigned int qwords = exp_xdr_qwords(nbytes); ++ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes; ++ ++ memcpy(p, ptr, nbytes); ++ if (padding != 0) ++ memset((char *)p + nbytes, 0, padding); ++ p += qwords; ++ } ++ return p; ++} ++ ++/** ++ * exp_xdr_encode_opaque - Encode an opaque type onto a
xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the opaque array ++ * @nbytes: number of bytes to encode ++ * ++ * Encodes the 32-bit opaque size in bytes followed by the opaque value. ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ p = exp_xdr_encode_u32(p, nbytes); ++ return exp_xdr_encode_bytes(p, ptr, nbytes); ++} ++ ++/** ++ * exp_xdr_encode_opaque_len - Encode the opaque length onto a xdr stream ++ * @lenp: pointer to the opaque length destination ++ * @endp: pointer to the end of the opaque array ++ * ++ * Encodes the 32-bit opaque size in bytes given the start and end pointers ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp) ++{ ++ size_t nbytes = (char *)endp - (char *)(lenp + 1); ++ ++ exp_xdr_encode_u32(lenp, nbytes); ++ return lenp + 1 + exp_xdr_qwords(nbytes); ++} ++#endif /* _LINUX_EXP_XDR_H */ +diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h +--- linux-2.6.34.noarch/include/linux/fs.h.orig 2010-09-30 10:15:16.980690000 -0400 ++++ linux-2.6.34.noarch/include/linux/fs.h 2010-09-30 10:17:09.015004000 -0400 +@@ -387,6 +387,7 @@ struct inodes_stat_t { + #include + + struct export_operations; ++struct pnfs_export_operations; + struct hd_geometry; + struct iovec; + struct nameidata; +@@ -1329,6 +1330,7 @@ struct super_block { + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; ++ const struct pnfs_export_operations *s_pnfs_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; +diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h +--- linux-2.6.34.noarch/include/linux/nfs4.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4.h 2010-09-30 10:17:09.047005000 -0400 +@@ -17,7 +17,10 @@ + + #define NFS4_BITMAP_SIZE 2 + #define
NFS4_VERIFIER_SIZE 8 +-#define NFS4_STATEID_SIZE 16 ++#define NFS4_CLIENTID_SIZE 8 ++#define NFS4_STATEID_SEQID_SIZE 4 ++#define NFS4_STATEID_OTHER_SIZE 12 ++#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE) + #define NFS4_FHSIZE 128 + #define NFS4_MAXPATHLEN PATH_MAX + #define NFS4_MAXNAMLEN NAME_MAX +@@ -119,6 +122,13 @@ + #define EXCHGID4_FLAG_MASK_A 0x40070003 + #define EXCHGID4_FLAG_MASK_R 0x80070003 + ++static inline bool ++is_ds_only_session(u32 exchange_flags) ++{ ++ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS; ++ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS; ++} ++ + #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 +@@ -166,8 +176,25 @@ struct nfs4_acl { + struct nfs4_ace aces[0]; + }; + ++struct nfs4_fsid { ++ u64 major; ++ u64 minor; ++}; ++ + typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; +-typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid; ++typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid; ++ ++struct nfs41_stateid { ++ __be32 seqid; ++ char other[NFS4_STATEID_OTHER_SIZE]; ++} __attribute__ ((packed)); ++ ++typedef struct { ++ union { ++ char data[NFS4_STATEID_SIZE]; ++ struct nfs41_stateid stateid; ++ } u; ++} nfs4_stateid; + + enum nfs_opnum4 { + OP_ACCESS = 3, +@@ -471,6 +498,8 @@ enum lock_type4 { + #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) + #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) + #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) ++#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) ++#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) + + #define NFSPROC4_NULL 0 + #define NFSPROC4_COMPOUND 1 +@@ -523,6 +552,7 @@ enum { + NFSPROC4_CLNT_GETACL, + NFSPROC4_CLNT_SETACL, + NFSPROC4_CLNT_FS_LOCATIONS, ++ NFSPROC4_CLNT_RELEASE_LOCKOWNER, + + /* nfs41 */ + NFSPROC4_CLNT_EXCHANGE_ID, +@@ -531,6 +561,13 @@ enum { + 
NFSPROC4_CLNT_SEQUENCE, + NFSPROC4_CLNT_GET_LEASE_TIME, + NFSPROC4_CLNT_RECLAIM_COMPLETE, ++ NFSPROC4_CLNT_LAYOUTGET, ++ NFSPROC4_CLNT_LAYOUTCOMMIT, ++ NFSPROC4_CLNT_LAYOUTRETURN, ++ NFSPROC4_CLNT_GETDEVICELIST, ++ NFSPROC4_CLNT_GETDEVICEINFO, ++ NFSPROC4_CLNT_PNFS_WRITE, ++ NFSPROC4_CLNT_PNFS_COMMIT, + }; + + /* nfs41 types */ +@@ -549,6 +586,43 @@ enum state_protect_how4 { + SP4_SSV = 2 + }; + ++enum pnfs_layouttype { ++ LAYOUT_NFSV4_1_FILES = 1, ++ LAYOUT_OSD2_OBJECTS = 2, ++ LAYOUT_BLOCK_VOLUME = 3, ++}; ++ ++/* used for both layout return and recall */ ++enum pnfs_layoutreturn_type { ++ RETURN_FILE = 1, ++ RETURN_FSID = 2, ++ RETURN_ALL = 3 ++}; ++ ++enum pnfs_iomode { ++ IOMODE_READ = 1, ++ IOMODE_RW = 2, ++ IOMODE_ANY = 3, ++}; ++ ++enum pnfs_notify_deviceid_type4 { ++ NOTIFY_DEVICEID4_CHANGE = 1 << 1, ++ NOTIFY_DEVICEID4_DELETE = 1 << 2, ++}; ++ ++#define NFL4_UFLG_MASK 0x0000003F ++#define NFL4_UFLG_DENSE 0x00000001 ++#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 ++#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0 ++ ++/* Encoded in the loh_body field of type layouthint4 */ ++enum filelayout_hint_care4 { ++ NFLH4_CARE_DENSE = NFL4_UFLG_DENSE, ++ NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS, ++ NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040, ++ NFLH4_CARE_STRIPE_COUNT = 0x00000080 ++}; ++ + #endif + #endif + +diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-09-30 10:17:09.057007000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-09-30 10:17:09.059005000 -0400 +@@ -0,0 +1,329 @@ ++/* ++ * include/linux/nfs4_pnfs.h ++ * ++ * Common data structures needed by the pnfs client and pnfs layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef LINUX_NFS4_PNFS_H ++#define LINUX_NFS4_PNFS_H ++ ++#include ++ ++enum pnfs_try_status { ++ PNFS_ATTEMPTED = 0, ++ PNFS_NOT_ATTEMPTED = 1, ++}; ++ ++#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 ++ ++/* Per-layout driver specific registration structure */ ++struct pnfs_layoutdriver_type { ++ const u32 id; ++ const char *name; ++ struct layoutdriver_io_operations *ld_io_ops; ++ struct layoutdriver_policy_operations *ld_policy_ops; ++}; ++ ++struct pnfs_fsdata { ++ int bypass_eof; ++ struct pnfs_layout_segment *lseg; ++ void *private; ++}; ++ ++#if defined(CONFIG_NFS_V4_1) ++ ++static inline struct nfs_inode * ++PNFS_NFS_INODE(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_I(lo->inode); ++} ++ ++static inline struct inode * ++PNFS_INODE(struct pnfs_layout_hdr *lo) ++{ ++ return lo->inode; ++} ++ ++static inline struct nfs_server * ++PNFS_NFS_SERVER(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo)); ++} ++ ++static inline struct pnfs_layoutdriver_type * ++PNFS_LD(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo))->pnfs_curr_ld; ++} ++ ++static inline struct layoutdriver_io_operations * ++PNFS_LD_IO_OPS(struct pnfs_layout_hdr *lo) ++{ ++ return PNFS_LD(lo)->ld_io_ops; ++} ++ ++static inline struct layoutdriver_policy_operations * ++PNFS_LD_POLICY_OPS(struct pnfs_layout_hdr *lo) ++{ ++ return PNFS_LD(lo)->ld_policy_ops; ++} ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout != NULL; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return has_layout(nfsi) && ++ test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state); ++} ++ ++extern void put_lseg(struct pnfs_layout_segment *lseg); ++extern void get_lseg(struct pnfs_layout_segment *lseg); ++ ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return false; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ 
return 0; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++struct pnfs_layout_segment { ++ struct list_head fi_list; ++ struct pnfs_layout_range range; ++ struct kref kref; ++ bool valid; ++ struct pnfs_layout_hdr *layout; ++ struct nfs4_deviceid *deviceid; ++ u8 ld_data[]; /* layout driver private data */ ++}; ++ ++static inline void * ++LSEG_LD_DATA(struct pnfs_layout_segment *lseg) ++{ ++ return lseg->ld_data; ++} ++ ++/* Layout driver I/O operations. ++ * Either the pagecache or non-pagecache read/write operations must be implemented ++ */ ++struct layoutdriver_io_operations { ++ /* Functions that use the pagecache. ++ * If use_pagecache == 1, then these functions must be implemented. ++ */ ++ /* read and write pagelist should return just 0 (to indicate that ++ * the layout code has taken control) or 1 (to indicate that the ++ * layout code wishes to fall back to normal nfs.) If 0 is returned, ++ * information can be passed back through nfs_data->res and ++ * nfs_data->task.tk_status, and the appropriate pnfs done function ++ * MUST be called. ++ */ ++ enum pnfs_try_status ++ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages); ++ enum pnfs_try_status ++ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how); ++ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page, ++ loff_t pos, unsigned count, ++ struct pnfs_fsdata *fsdata); ++ int (*write_end)(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ void (*write_end_cleanup)(struct file *filp, ++ struct pnfs_fsdata *fsdata); ++ ++ /* Consistency ops */ ++ /* 2 problems: ++ * 1) the page list contains nfs_pages, NOT pages ++ * 2) currently the NFS code doesn't create a page array (as it does with read/write) ++ */ ++ enum pnfs_try_status ++ (*commit) (struct nfs_write_data *nfs_data, int how); ++ ++ /* Layout information. 
For each inode, alloc_layout is executed once to retrieve an ++ * inode specific layout structure. Each subsequent layoutget operation results in ++ * a set_layout call to set the opaque layout in the layout driver.*/ ++ struct pnfs_layout_hdr * (*alloc_layout) (struct inode *inode); ++ void (*free_layout) (struct pnfs_layout_hdr *); ++ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); ++ void (*free_lseg) (struct pnfs_layout_segment *lseg); ++ ++ int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_args *args); ++ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args); ++ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_args *args, ++ int status); ++ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args); ++ ++ /* Registration information for a new mounted file system ++ */ ++ int (*initialize_mountpoint) (struct nfs_server *, ++ const struct nfs_fh * mntfh); ++ int (*uninitialize_mountpoint) (struct nfs_server *server); ++}; ++ ++enum layoutdriver_policy_flags { ++ /* Should the full nfs rpc cleanup code be used after io */ ++ PNFS_USE_RPC_CODE = 1 << 0, ++ ++ /* Should the NFS req. gather algorithm cross stripe boundaries? */ ++ PNFS_GATHER_ACROSS_STRIPES = 1 << 1, ++ ++ /* Should the pNFS client commit and return the layout upon a setattr */ ++ PNFS_LAYOUTRET_ON_SETATTR = 1 << 3, ++}; ++ ++struct layoutdriver_policy_operations { ++ unsigned flags; ++ ++ /* The stripe size of the file system */ ++ ssize_t (*get_stripesize) (struct pnfs_layout_hdr *layoutid); ++ ++ /* test for nfs page cache coalescing */ ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++ ++ /* Retrieve the block size of the file system.
++ * If gather_across_stripes == 1, then the file system will gather ++ * requests into the block size. ++ * TODO: Where will the layout driver get this info? It is hard ++ * coded in PVFS2. ++ */ ++ ssize_t (*get_blocksize) (void); ++}; ++ ++/* Should the full nfs rpc cleanup code be used after io */ ++static inline int ++pnfs_ld_use_rpc_code(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_USE_RPC_CODE; ++} ++ ++/* Should the NFS req. gather algorithm cross stripe boundaries? */ ++static inline int ++pnfs_ld_gather_across_stripes(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_GATHER_ACROSS_STRIPES; ++} ++ ++struct pnfs_device { ++ struct pnfs_deviceid dev_id; ++ unsigned int layout_type; ++ unsigned int mincount; ++ struct page **pages; ++ void *area; ++ unsigned int pgbase; ++ unsigned int pglen; ++ unsigned int dev_notify_types; ++}; ++ ++struct pnfs_devicelist { ++ unsigned int eof; ++ unsigned int num_devs; ++ struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; ++}; ++ ++/* ++ * Device ID RCU cache. A device ID is unique per client ID and layout type. 
++ */ ++#define NFS4_DEVICE_ID_HASH_BITS 5 ++#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) ++#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) ++ ++static inline u32 ++nfs4_deviceid_hash(struct pnfs_deviceid *id) ++{ ++ unsigned char *cptr = (unsigned char *)id->data; ++ unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE; ++ u32 x = 0; ++ ++ while (nbytes--) { ++ x *= 37; ++ x += *cptr++; ++ } ++ return x & NFS4_DEVICE_ID_HASH_MASK; ++} ++ ++struct nfs4_deviceid_cache { ++ spinlock_t dc_lock; ++ struct kref dc_kref; ++ void (*dc_free_callback)(struct kref *); ++ struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; ++}; ++ ++/* Device ID cache node */ ++struct nfs4_deviceid { ++ struct hlist_node de_node; ++ struct pnfs_deviceid de_id; ++ struct kref de_kref; ++}; ++ ++extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_put_deviceid_cache(struct nfs_client *); ++extern void nfs4_init_deviceid_node(struct nfs4_deviceid *); ++extern struct nfs4_deviceid *nfs4_find_get_deviceid( ++ struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++extern struct nfs4_deviceid *nfs4_add_get_deviceid(struct nfs4_deviceid_cache *, ++ struct nfs4_deviceid *); ++extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *); ++extern void nfs4_put_unset_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_delete_device(struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++ ++/* pNFS client callback functions. ++ * These operations allow the layout driver to access pNFS client ++ * specific information or call pNFS client->server operations. 
++ * E.g., getdeviceinfo, I/O callbacks, etc ++ */ ++struct pnfs_client_operations { ++ int (*nfs_getdevicelist) (struct nfs_server *, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++ int (*nfs_getdeviceinfo) (struct nfs_server *, ++ struct pnfs_device *dev); ++ ++ /* Post read callback. */ ++ void (*nfs_readlist_complete) (struct nfs_read_data *nfs_data); ++ ++ /* Post write callback. */ ++ void (*nfs_writelist_complete) (struct nfs_write_data *nfs_data); ++ ++ /* Post commit callback. */ ++ void (*nfs_commit_complete) (struct nfs_write_data *nfs_data); ++ void (*nfs_return_layout) (struct inode *); ++}; ++ ++extern struct pnfs_client_operations pnfs_ops; ++ ++extern struct pnfs_client_operations *pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); ++extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); ++ ++#define NFS4_PNFS_MAX_LAYOUTS 4 ++#define NFS4_PNFS_PRIVATE_LAYOUT 0x80000000 ++ ++#endif /* LINUX_NFS4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h +--- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-09-30 10:17:09.178011000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-09-30 10:17:09.180010000 -0400 +@@ -0,0 +1,101 @@ ++#ifndef NFSD4_BLOCK ++#define NFSD4_BLOCK ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_BLOCK_SUCCESS 1 ++#define PNFS_BLOCK_FAILURE 0 ++ ++#define PNFS_BLOCK_CTL_START 1 ++#define PNFS_BLOCK_CTL_STOP 2 ++#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current ++ * version from kernel via an upcall. 
++ */ ++ ++#define PNFS_UPCALL_MSG_STOP 0 ++#define PNFS_UPCALL_MSG_GETSIG 1 ++#define PNFS_UPCALL_MSG_GETSLICE 2 ++#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume ++#define PNFS_UPCALL_MSG_DMGET 4 ++#define PNFS_UPCALL_MSG_VERS 5 ++ ++#define PNFS_UPCALL_VERS 8 ++ ++typedef struct stripe_dev { ++ int major, ++ minor, ++ offset; ++} stripe_dev_t; ++ ++typedef struct bl_comm_res { ++ int res_status; ++ union { ++ struct { ++ long long start, ++ length; ++ } slice; ++ struct { ++ int num_stripes, ++ stripe_size; ++ stripe_dev_t devs[]; ++ } stripe; ++ struct { ++ long long sector; ++ int offset, ++ len; ++ char sig[]; ++ } sig; ++ int vers, ++ dm_vol; ++ } u; ++} bl_comm_res_t; ++ ++typedef struct bl_comm_msg { ++ int msg_type, ++ msg_status; ++ union { ++ dev_t msg_dev; ++ int msg_vers; ++ } u; ++ bl_comm_res_t *msg_res; ++} bl_comm_msg_t; ++ ++#ifdef __KERNEL__ ++ ++typedef struct bl_comm { ++ /* ---- protects access to this structure ---- */ ++ struct mutex lock; ++ /* ---- protects access to rpc pipe ---- */ ++ struct mutex pipe_lock; ++ struct dentry *pipe_dentry; ++ wait_queue_head_t pipe_wq; ++ bl_comm_msg_t msg; ++} bl_comm_t; ++ ++int pnfs_block_enabled(struct inode *, int); ++int bl_layout_type(struct super_block *sb); ++int bl_getdeviceiter(struct super_block *, u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int bl_layoutcommit(struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++int bl_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len); ++int bl_init_proc(void); ++int bl_upcall(bl_comm_t *, 
bl_comm_msg_t *, bl_comm_res_t **); ++ ++extern bl_comm_t *bl_comm_global; // Ugly... ++#endif /* __KERNEL__ */ ++ ++#endif /* NFSD4_BLOCK */ ++ +diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-09-30 10:17:09.190013000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-09-30 10:17:09.192012000 -0400 +@@ -0,0 +1,345 @@ ++/* ++ * include/linux/nfsd4_spnfs.h ++ * ++ * spNFS - simple pNFS implementation with userspace daemon ++ * ++ */ ++ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++******************************************************************************/ ++ ++#ifndef NFS_SPNFS_H ++#define NFS_SPNFS_H ++ ++ ++#ifdef __KERNEL__ ++#include "exportfs.h" ++#include "sunrpc/svc.h" ++#include "nfsd/nfsfh.h" ++#else ++#include ++#endif /* __KERNEL__ */ ++ ++#define SPNFS_STATUS_INVALIDMSG 0x01 ++#define SPNFS_STATUS_AGAIN 0x02 ++#define SPNFS_STATUS_FAIL 0x04 ++#define SPNFS_STATUS_SUCCESS 0x08 ++ ++#define SPNFS_TYPE_LAYOUTGET 0x01 ++#define SPNFS_TYPE_LAYOUTCOMMIT 0x02 ++#define SPNFS_TYPE_LAYOUTRETURN 0x03 ++#define SPNFS_TYPE_GETDEVICEITER 0x04 ++#define SPNFS_TYPE_GETDEVICEINFO 0x05 ++#define SPNFS_TYPE_SETATTR 0x06 ++#define SPNFS_TYPE_OPEN 0x07 ++#define SPNFS_TYPE_CLOSE 0x08 ++#define SPNFS_TYPE_CREATE 0x09 ++#define SPNFS_TYPE_REMOVE 0x0a ++#define SPNFS_TYPE_COMMIT 0x0b ++#define SPNFS_TYPE_READ 0x0c ++#define SPNFS_TYPE_WRITE 0x0d ++ ++#define SPNFS_MAX_DEVICES 1 ++#define SPNFS_MAX_DATA_SERVERS 16 ++#define SPNFS_MAX_IO 512 ++ ++/* layout */ ++struct spnfs_msg_layoutget_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_filelayout_list { ++ u_int32_t fh_len; ++ unsigned char fh_val[128]; /* DMXXX fix this const */ ++}; ++ ++struct spnfs_msg_layoutget_res { ++ int status; ++ u_int64_t devid; ++ u_int64_t stripe_size; ++ u_int32_t stripe_type; ++ u_int32_t stripe_count; ++ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++/* layoutcommit */ ++struct spnfs_msg_layoutcommit_args { ++ unsigned long inode; ++ unsigned long generation; ++ u_int64_t file_size; ++}; ++ ++struct spnfs_msg_layoutcommit_res { ++ int status; ++}; ++ ++/* layoutreturn */ ++/* No op for the daemon */ ++/* ++struct spnfs_msg_layoutreturn_args { ++}; ++ ++struct spnfs_msg_layoutreturn_res { ++}; ++*/ ++ ++/* getdeviceiter */ ++struct spnfs_msg_getdeviceiter_args { ++ unsigned long inode; ++ u_int64_t cookie; ++ u_int64_t verf; ++}; ++ ++struct spnfs_msg_getdeviceiter_res { ++ int status; ++ u_int64_t devid; ++ 
u_int64_t cookie; ++ u_int64_t verf; ++ u_int32_t eof; ++}; ++ ++/* getdeviceinfo */ ++struct spnfs_data_server { ++ u_int32_t dsid; ++ char netid[5]; ++ char addr[29]; ++}; ++ ++struct spnfs_device { ++ u_int64_t devid; ++ int dscount; ++ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++struct spnfs_msg_getdeviceinfo_args { ++ u_int64_t devid; ++}; ++ ++struct spnfs_msg_getdeviceinfo_res { ++ int status; ++ struct spnfs_device devinfo; ++}; ++ ++/* setattr */ ++struct spnfs_msg_setattr_args { ++ unsigned long inode; ++ unsigned long generation; ++ int file_size; ++}; ++ ++struct spnfs_msg_setattr_res { ++ int status; ++}; ++ ++/* open */ ++struct spnfs_msg_open_args { ++ unsigned long inode; ++ unsigned long generation; ++ int create; ++ int createmode; ++ int truncate; ++}; ++ ++struct spnfs_msg_open_res { ++ int status; ++}; ++ ++/* close */ ++/* No op for daemon */ ++struct spnfs_msg_close_args { ++ int x; ++}; ++ ++struct spnfs_msg_close_res { ++ int y; ++}; ++ ++/* create */ ++/* ++struct spnfs_msg_create_args { ++ int x; ++}; ++ ++struct spnfs_msg_create_res { ++ int y; ++}; ++*/ ++ ++/* remove */ ++struct spnfs_msg_remove_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_msg_remove_res { ++ int status; ++}; ++ ++/* commit */ ++/* ++struct spnfs_msg_commit_args { ++ int x; ++}; ++ ++struct spnfs_msg_commit_res { ++ int y; ++}; ++*/ ++ ++/* read */ ++struct spnfs_msg_read_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++}; ++ ++struct spnfs_msg_read_res { ++ int status; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++/* write */ ++struct spnfs_msg_write_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++struct spnfs_msg_write_res { ++ int status; ++}; ++ ++/* bundle args and responses */ ++union spnfs_msg_args { ++ struct spnfs_msg_layoutget_args layoutget_args; ++ struct 
spnfs_msg_layoutcommit_args layoutcommit_args; ++/* ++ struct spnfs_msg_layoutreturn_args layoutreturn_args; ++*/ ++ struct spnfs_msg_getdeviceiter_args getdeviceiter_args; ++ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args; ++ struct spnfs_msg_setattr_args setattr_args; ++ struct spnfs_msg_open_args open_args; ++ struct spnfs_msg_close_args close_args; ++/* ++ struct spnfs_msg_create_args create_args; ++*/ ++ struct spnfs_msg_remove_args remove_args; ++/* ++ struct spnfs_msg_commit_args commit_args; ++*/ ++ struct spnfs_msg_read_args read_args; ++ struct spnfs_msg_write_args write_args; ++}; ++ ++union spnfs_msg_res { ++ struct spnfs_msg_layoutget_res layoutget_res; ++ struct spnfs_msg_layoutcommit_res layoutcommit_res; ++/* ++ struct spnfs_msg_layoutreturn_res layoutreturn_res; ++*/ ++ struct spnfs_msg_getdeviceiter_res getdeviceiter_res; ++ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res; ++ struct spnfs_msg_setattr_res setattr_res; ++ struct spnfs_msg_open_res open_res; ++ struct spnfs_msg_close_res close_res; ++/* ++ struct spnfs_msg_create_res create_res; ++*/ ++ struct spnfs_msg_remove_res remove_res; ++/* ++ struct spnfs_msg_commit_res commit_res; ++*/ ++ struct spnfs_msg_read_res read_res; ++ struct spnfs_msg_write_res write_res; ++}; ++ ++/* a spnfs message, args and response */ ++struct spnfs_msg { ++ unsigned char im_type; ++ unsigned char im_status; ++ union spnfs_msg_args im_args; ++ union spnfs_msg_res im_res; ++}; ++ ++/* spnfs configuration info */ ++struct spnfs_config { ++ unsigned char dense_striping; ++ int stripe_size; ++ int num_ds; ++ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */ ++}; ++ ++#if defined(__KERNEL__) && defined(CONFIG_SPNFS) ++ ++#include ++ ++/* pipe mgmt structure. 
messages flow through here */ ++struct spnfs { ++ struct dentry *spnfs_dentry; /* dentry for pipe */ ++ wait_queue_head_t spnfs_wq; ++ struct spnfs_msg spnfs_im; /* spnfs message */ ++ struct mutex spnfs_lock; /* Serializes upcalls */ ++ struct mutex spnfs_plock; ++}; ++ ++struct nfsd4_open; ++ ++int spnfs_layout_type(struct super_block *); ++enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int spnfs_layoutcommit(void); ++int spnfs_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int spnfs_getdeviceiter(struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++int spnfs_setattr(void); ++int spnfs_open(struct inode *, struct nfsd4_open *); ++int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *); ++int spnfs_remove(unsigned long, unsigned long); ++__be32 spnfs_read(struct inode *, loff_t, unsigned long *, ++ int, struct svc_rqst *); ++__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *); ++int spnfs_getfh(int, struct nfs_fh *); ++int spnfs_test_layoutrecall(char *, u64, u64); ++int spnfs_layoutrecall(struct inode *, int, u64, u64); ++ ++int nfsd_spnfs_new(void); ++void nfsd_spnfs_delete(void); ++int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *); ++int spnfs_enabled(void); ++int spnfs_init_proc(void); ++ ++extern struct spnfs_config *spnfs_config; ++ ++#endif /* __KERNEL__ && CONFIG_SPNFS */ ++ ++#endif /* NFS_SPNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h +--- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-09-30 10:17:09.139009000 
-0400 +@@ -29,6 +29,7 @@ + #ifdef __KERNEL__ + + #include ++#include + + /* + * Largest number of bytes we need to allocate for an NFS +diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h +--- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-09-30 10:17:09.144010000 -0400 +@@ -32,6 +32,8 @@ + #define NFSDDBG_REPCACHE 0x0080 + #define NFSDDBG_XDR 0x0100 + #define NFSDDBG_LOCKD 0x0200 ++#define NFSDDBG_PNFS 0x0400 ++#define NFSDDBG_FILELAYOUT 0x0800 + #define NFSDDBG_ALL 0x7FFF + #define NFSDDBG_NOCHANGE 0xFFFF + +diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h +--- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-09-30 10:17:09.149010000 -0400 +@@ -100,6 +100,7 @@ struct svc_export { + uid_t ex_anon_uid; + gid_t ex_anon_gid; + int ex_fsid; ++ int ex_pnfs; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + int ex_nflavors; +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-09-30 10:17:09.153006000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-09-30 10:17:09.154012000 -0400 +@@ -0,0 +1,132 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef NFSD_NFS4LAYOUTXDR_H ++#define NFSD_NFS4LAYOUTXDR_H ++ ++#include ++#include ++ ++/* the nfsd4_pnfs_devlist dev_addr for the file layout type */ ++struct pnfs_filelayout_devaddr { ++ struct xdr_netobj r_netid; ++ struct xdr_netobj r_addr; ++}; ++ ++/* list of multipath servers */ ++struct pnfs_filelayout_multipath { ++ u32 fl_multipath_length; ++ struct pnfs_filelayout_devaddr *fl_multipath_list; ++}; ++ ++struct pnfs_filelayout_device { ++ u32 fl_stripeindices_length; ++ u32 *fl_stripeindices_list; ++ u32 fl_device_length; ++ struct pnfs_filelayout_multipath *fl_device_list; ++}; ++ ++struct pnfs_filelayout_layout { ++ u32 lg_layout_type; /* response */ ++ u32 lg_stripe_type; /* response */ ++ u32 lg_commit_through_mds; /* response */ ++ u64 lg_stripe_unit; /* response */ ++ u64 lg_pattern_offset; /* response */ ++ u32 lg_first_stripe_index; /* response */ ++ struct nfsd4_pnfs_deviceid device_id; /* response */ ++ u32 lg_fh_length; /* response */ ++ struct knfsd_fh *lg_fh_list; /* response */ ++}; ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++enum pnfs_block_extent_state4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, ++ PNFS_BLOCK_NONE_DATA = 3 ++}; ++ ++enum pnfs_block_volume_type4 { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, ++ PNFS_BLOCK_VOLUME_SLICE = 1, ++ PNFS_BLOCK_VOLUME_CONCAT = 2, ++ PNFS_BLOCK_VOLUME_STRIPE = 3, ++}; ++typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4; ++ ++enum bl_cache_state { ++ BLOCK_LAYOUT_NEW = 0, ++ BLOCK_LAYOUT_CACHE = 1, ++ BLOCK_LAYOUT_UPDATE = 2, ++}; ++ ++typedef struct pnfs_blocklayout_layout { ++ struct list_head bll_list; ++ struct nfsd4_pnfs_deviceid bll_vol_id; ++ u64 bll_foff; // file offset ++ u64 bll_len; ++ u64 bll_soff; // storage offset ++ int bll_recalled; ++ enum pnfs_block_extent_state4 bll_es; ++ enum bl_cache_state bll_cache_state; ++} pnfs_blocklayout_layout_t; ++ ++typedef struct 
pnfs_blocklayout_devinfo { ++ struct list_head bld_list; ++ pnfs_block_volume_type4 bld_type; ++ struct nfsd4_pnfs_deviceid bld_devid; ++ int bld_index_loc; ++ union { ++ struct { ++ u64 bld_offset; ++ u32 bld_sig_len, ++ *bld_sig; ++ } simple; ++ struct { ++ u64 bld_start, ++ bld_len; ++ u32 bld_index; /* Index of Simple Volume */ ++ } slice; ++ struct { ++ u32 bld_stripes; ++ u64 bld_chunk_size; ++ u32 *bld_stripe_indexs; ++ } stripe; ++ } u; ++} pnfs_blocklayout_devinfo_t; ++ ++#endif /* NFSD_NFS4LAYOUTXDR_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-09-30 10:17:09.157010000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-09-30 10:17:09.159008000 -0400 +@@ -0,0 +1,54 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++#include ++ ++/* ++ * Length of comma separated pnfs data server IPv4 addresses. Enough room for ++ * 32 addresses. ++ */ ++#define NFSD_DLM_DS_LIST_MAX 512 ++/* ++ * Length of colon separated pnfs dlm device of the form ++ * disk_name:comma separated data server IPv4 address ++ */ ++#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1) ++ ++#ifdef CONFIG_PNFSD ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++extern const struct pnfs_export_operations pnfs_dlm_export_ops; ++ ++int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len); ++ ++void nfsd4_pnfs_dlm_shutdown(void); ++ ++ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen); ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h +--- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-09-30 10:17:09.162007000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-09-30 10:17:09.163012000 -0400 +@@ -0,0 +1,271 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef _LINUX_NFSD_NFSD4_PNFS_H ++#define _LINUX_NFSD_NFSD4_PNFS_H ++ ++#include ++#include ++#include ++ ++struct nfsd4_pnfs_deviceid { ++ u64 sbid; /* per-superblock unique ID */ ++ u64 devid; /* filesystem-wide unique device ID */ ++}; ++ ++struct nfsd4_pnfs_dev_iter_res { ++ u64 gd_cookie; /* request/response */ ++ u64 gd_verf; /* request/response */ ++ u64 gd_devid; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++/* Arguments for set_device_notify */ ++struct pnfs_devnotify_arg { ++ struct nfsd4_pnfs_deviceid dn_devid; /* request */ ++ u32 dn_layout_type; /* request */ ++ u32 dn_notify_types; /* request/response */ ++}; ++ ++struct nfsd4_layout_seg { ++ u64 clientid; ++ u32 layout_type; ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++/* Used by layout_get to encode layout (loc_body var in spec) ++ * Args: ++ * minlength - min number of accessible bytes given by layout ++ * fsid - Major part of struct pnfs_deviceid. File system uses this ++ * to build the deviceid returned in the layout. 
++ * fh - fs can modify the file handle for use on data servers ++ * seg - layout info requested and layout info returned ++ * xdr - xdr info ++ * return_on_close - true if layout to be returned on file close ++ */ ++ ++struct nfsd4_pnfs_layoutget_arg { ++ u64 lg_minlength; ++ u64 lg_sbid; ++ const struct knfsd_fh *lg_fh; ++}; ++ ++struct nfsd4_pnfs_layoutget_res { ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_return_on_close; ++}; ++ ++struct nfsd4_pnfs_layoutcommit_arg { ++ struct nfsd4_layout_seg lc_seg; /* request */ ++ u32 lc_reclaim; /* request */ ++ u32 lc_newoffset; /* request */ ++ u64 lc_last_wr; /* request */ ++ struct nfstime4 lc_mtime; /* request */ ++ u32 lc_up_len; /* layout length */ ++ void *lc_up_layout; /* decoded by callback */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit_res { ++ u32 lc_size_chg; /* boolean for response */ ++ u64 lc_newsize; /* response */ ++}; ++ ++#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */ ++ ++struct nfsd4_pnfs_layoutreturn_arg { ++ u32 lr_return_type; /* request */ ++ struct nfsd4_layout_seg lr_seg; /* request */ ++ u32 lr_reclaim; /* request */ ++ u32 lrf_body_len; /* request */ ++ void *lrf_body; /* request */ ++ void *lr_cookie; /* fs private */ ++}; ++ ++/* pNFS Metadata to Data server state communication */ ++struct pnfs_get_state { ++ u32 dsid; /* request */ ++ u64 ino; /* request */ ++ nfs4_stateid stid; /* request/response */ ++ nfs4_clientid clid; /* response */ ++ u32 access; /* response */ ++ u32 stid_gen; /* response */ ++ u32 verifier[2]; /* response */ ++}; ++ ++/* ++ * pNFS export operations vector. ++ * ++ * The filesystem must implement the following methods: ++ * layout_type ++ * get_device_info ++ * layout_get ++ * ++ * All other methods are optional and can be set to NULL if not implemented. ++ */ ++struct pnfs_export_operations { ++ /* Returns the supported pnfs_layouttype4. 
*/ ++ int (*layout_type) (struct super_block *); ++ ++ /* Encode device info onto the xdr stream. */ ++ int (*get_device_info) (struct super_block *, ++ struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++ ++ /* Retrieve all available devices via an iterator. ++ * arg->cookie == 0 indicates the beginning of the list, ++ * otherwise arg->verf is used to verify that the list hasn't changed ++ * while retrieved. ++ * ++ * On output, the filesystem sets the devid based on the current cookie ++ * and sets res->cookie and res->verf corresponding to the next entry. ++ * When the last entry in the list is retrieved, res->eof is set to 1. ++ */ ++ int (*get_device_iter) (struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++ ++ int (*set_device_notify) (struct super_block *, ++ struct pnfs_devnotify_arg *); ++ ++ /* Retrieve and encode a layout for inode onto the xdr stream. ++ * arg->minlength is the minimum number of accessible bytes required ++ * by the client. ++ * The maximum number of bytes to encode the layout is given by ++ * the xdr stream end pointer. ++ * arg->fsid contains the major part of struct pnfs_deviceid. ++ * The file system uses this to build the deviceid returned ++ * in the layout. ++ * res->seg - layout segment requested and layout info returned. 
++ * res->fh - file handle; may be modified for use on data servers ++ * res->return_on_close - true if layout to be returned on file close ++ * ++ * return one of the following nfs errors: ++ * NFS_OK Success ++ * NFS4ERR_ACCESS Permission error ++ * NFS4ERR_BADIOMODE Server does not support requested iomode ++ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules ++ * NFS4ERR_INVAL Parameter other than layout is invalid ++ * NFS4ERR_IO I/O error ++ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later ++ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file ++ * NFS4ERR_LOCKED Lock conflict ++ * NFS4ERR_NOSPC Out-of-space error occurred ++ * NFS4ERR_RECALLCONFLICT Layout currently unavailable due to ++ * a conflicting CB_LAYOUTRECALL ++ * NFS4ERR_SERVERFAULT Server went berserk ++ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout ++ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file) ++ */ ++ enum nfsstat4 (*layout_get) (struct inode *, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++ ++ /* Commit changes to layout */ ++ int (*layout_commit) (struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++ ++ /* Returns the layout */ ++ int (*layout_return) (struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++ ++ /* Can layout segments be merged for this layout type? 
*/ ++ int (*can_merge_layouts) (u32 layout_type); ++ ++ /* pNFS Files layout specific operations */ ++ ++ /* Get the write verifier for DS (called on MDS only) */ ++ void (*get_verifier) (struct super_block *, u32 *p); ++ /* Call fs on DS only */ ++ int (*get_state) (struct inode *, struct knfsd_fh *, ++ struct pnfs_get_state *); ++}; ++ ++struct nfsd4_pnfs_cb_layout { ++ u32 cbl_recall_type; /* request */ ++ struct nfsd4_layout_seg cbl_seg; /* request */ ++ u32 cbl_layoutchanged; /* request */ ++ nfs4_stateid cbl_sid; /* request */ ++ struct nfs4_fsid cbl_fsid; ++ void *cbl_cookie; /* fs private */ ++}; ++ ++/* layoutrecall request (from exported filesystem) */ ++struct nfs4_layoutrecall { ++ struct kref clr_ref; ++ struct nfsd4_pnfs_cb_layout cb; /* request */ ++ struct list_head clr_perclnt; /* on cl_layoutrecalls */ ++ struct nfs4_client *clr_client; ++ struct nfs4_file *clr_file; ++ struct timespec clr_time; /* last activity */ ++ struct super_block *clr_sb; /* We might not have a file */ ++ struct nfs4_layoutrecall *parent; /* The initiating recall */ ++ ++ void *clr_args; /* nfsd internal */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_item { ++ u32 cbd_notify_type; /* request */ ++ u32 cbd_layout_type; /* request */ ++ struct nfsd4_pnfs_deviceid cbd_devid; /* request */ ++ u32 cbd_immediate; /* request */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_list { ++ u32 cbd_len; /* request */ ++ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */ ++}; ++ ++/* ++ * callbacks provided by the nfsd ++ */ ++struct pnfsd_cb_operations { ++ /* Generic callbacks */ ++ int (*cb_layout_recall) (struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++ int (*cb_device_notify) (struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++ ++ /* pNFS Files layout specific callbacks */ ++ ++ /* Callback from fs on MDS only */ ++ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *); ++ /* Callback from fs on DS only */ ++ int (*cb_change_state) (struct 
pnfs_get_state *); ++}; ++ ++#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ +diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h +--- linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-09-30 10:17:09.168010000 -0400 +@@ -29,6 +29,7 @@ + /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ + #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ + #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ ++#define NFSCTL_FD2FH 9 /* get a fh from a fd */ + + /* SVC */ + struct nfsctl_svc { +@@ -71,6 +72,11 @@ struct nfsctl_fsparm { + int gd_maxlen; + }; + ++/* FD2FH */ ++struct nfsctl_fd2fh { ++ int fd; ++}; ++ + /* + * This is the argument union. + */ +@@ -82,6 +88,7 @@ struct nfsctl_arg { + struct nfsctl_export u_export; + struct nfsctl_fdparm u_getfd; + struct nfsctl_fsparm u_getfs; ++ struct nfsctl_fd2fh u_fd2fh; + /* + * The following dummy member is needed to preserve binary compatibility + * on platforms where alignof(void*)>alignof(int). 
It's needed because +@@ -95,6 +102,7 @@ struct nfsctl_arg { + #define ca_export u.u_export + #define ca_getfd u.u_getfd + #define ca_getfs u.u_getfs ++#define ca_fd2fh u.u_fd2fh + }; + + union nfsctl_res { +diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h +--- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-09-30 10:15:17.949718000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-09-30 10:17:09.071005000 -0400 +@@ -72,13 +72,20 @@ struct nfs_access_entry { + int mask; + }; + ++struct nfs_lock_context { ++ atomic_t count; ++ struct list_head list; ++ struct nfs_open_context *open_context; ++ fl_owner_t lockowner; ++ pid_t pid; ++}; ++ + struct nfs4_state; + struct nfs_open_context { +- atomic_t count; ++ struct nfs_lock_context lock_context; + struct path path; + struct rpc_cred *cred; + struct nfs4_state *state; +- fl_owner_t lockowner; + fmode_t mode; + + unsigned long flags; +@@ -97,6 +104,27 @@ struct nfs_delegation; + + struct posix_acl; + ++struct pnfs_layout_hdr { ++ int refcount; ++ struct list_head layouts; /* other client layouts */ ++ struct list_head segs; /* layout segments list */ ++ int roc_iomode;/* return on close iomode, 0=none */ ++ seqlock_t seqlock; /* Protects the stateid */ ++ nfs4_stateid stateid; ++ unsigned long state; ++#define NFS_INO_RO_LAYOUT_FAILED 0 /* ro layoutget failed stop trying */ ++#define NFS_INO_RW_LAYOUT_FAILED 1 /* rw layoutget failed stop trying */ ++#define NFS_INO_LAYOUTCOMMIT 2 /* LAYOUTCOMMIT needed */ ++ ++ struct rpc_cred *cred; /* layoutcommit credential */ ++ /* DH: These vars keep track of the maximum write range ++ * so the values can be used for layoutcommit. 
++ */ ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ struct inode *inode; ++}; ++ + /* + * nfs fs inode data in memory + */ +@@ -181,6 +209,13 @@ struct nfs_inode { + struct nfs_delegation *delegation; + fmode_t delegation_state; + struct rw_semaphore rwsem; ++ ++ /* pNFS layout information */ ++#if defined(CONFIG_NFS_V4_1) ++ wait_queue_head_t lo_waitq; ++ struct pnfs_layout_hdr *layout; ++ time_t pnfs_layout_suspend; ++#endif /* CONFIG_NFS_V4_1 */ + #endif /* CONFIG_NFS_V4*/ + #ifdef CONFIG_NFS_FSCACHE + struct fscache_cookie *fscache; +@@ -353,6 +388,8 @@ extern void nfs_setattr_update_inode(str + extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); + extern void put_nfs_open_context(struct nfs_open_context *ctx); + extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); ++extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); ++extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); + extern u64 nfs_compat_user_ino64(u64 fileid); + extern void nfs_fattr_init(struct nfs_fattr *fattr); + +@@ -481,8 +518,12 @@ extern void nfs_unblock_sillyrename(stru + extern int nfs_congestion_kb; + extern int nfs_writepage(struct page *page, struct writeback_control *wbc); + extern int nfs_writepages(struct address_space *, struct writeback_control *); +-extern int nfs_flush_incompatible(struct file *file, struct page *page); +-extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); ++struct pnfs_layout_segment; ++extern int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg); ++extern int nfs_updatepage(struct file *, struct page *, ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata); + extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); + + /* +@@ -604,6 +645,8 @@ extern void * nfs_root_data(void); + 
#define NFSDBG_CLIENT 0x0200 + #define NFSDBG_MOUNT 0x0400 + #define NFSDBG_FSCACHE 0x0800 ++#define NFSDBG_PNFS 0x1000 ++#define NFSDBG_PNFS_LD 0x2000 + #define NFSDBG_ALL 0xFFFF + + #ifdef __KERNEL__ +diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-09-30 10:15:17.959722000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-09-30 10:17:09.083008000 -0400 +@@ -15,6 +15,7 @@ struct nlm_host; + struct nfs4_sequence_args; + struct nfs4_sequence_res; + struct nfs_server; ++struct nfs4_minor_version_ops; + + /* + * The nfs_client identifies our client state to the server. +@@ -70,11 +71,7 @@ struct nfs_client { + */ + char cl_ipaddr[48]; + unsigned char cl_id_uniquifier; +- int (* cl_call_sync)(struct nfs_server *server, +- struct rpc_message *msg, +- struct nfs4_sequence_args *args, +- struct nfs4_sequence_res *res, +- int cache_reply); ++ const struct nfs4_minor_version_ops *cl_mvops; + #endif /* CONFIG_NFS_V4 */ + + #ifdef CONFIG_NFS_V4_1 +@@ -85,6 +82,8 @@ struct nfs_client { + /* The flags used for obtaining the clientid during EXCHANGE_ID */ + u32 cl_exchange_flags; + struct nfs4_session *cl_session; /* sharred session */ ++ struct list_head cl_layouts; ++ struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ + #endif /* CONFIG_NFS_V4_1 */ + + #ifdef CONFIG_NFS_FSCACHE +@@ -92,6 +91,16 @@ struct nfs_client { + #endif + }; + ++static inline bool ++is_ds_only_client(struct nfs_client *clp) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ return is_ds_only_session(clp->cl_exchange_flags); ++#else ++ return false; ++#endif ++} ++ + /* + * NFS client parameters stored in the superblock. 
+ */ +@@ -136,7 +145,7 @@ struct nfs_server { + #endif + + #ifdef CONFIG_NFS_V4 +- u32 attr_bitmask[2];/* V4 bitmask representing the set ++ u32 attr_bitmask[3];/* V4 bitmask representing the set + of attributes supported on this + filesystem */ + u32 cache_consistency_bitmask[2]; +@@ -148,6 +157,15 @@ struct nfs_server { + that are supported on this + filesystem */ + #endif ++ ++#ifdef CONFIG_NFS_V4_1 ++ u32 pnfs_blksize; /* layout_blksize attr */ ++ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ ++ void *pnfs_ld_data; /* Per-mount data */ ++ unsigned int ds_rsize; /* Data server read size */ ++ unsigned int ds_wsize; /* Data server write size */ ++#endif /* CONFIG_NFS_V4_1 */ ++ + void (*destroy)(struct nfs_server *); + + atomic_t active; /* Keep trace of any activity to this server */ +diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h +--- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-09-30 10:17:09.110005000 -0400 +@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, ++ NFSIOS_PNFS_READ, ++ NFSIOS_PNFS_WRITE, ++ NFSIOS_PNFS_COMMIT, + __NFSIOS_COUNTSMAX, + }; + +diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h +--- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-09-30 10:17:09.122008000 -0400 +@@ -39,6 +39,7 @@ struct nfs_page { + struct list_head wb_list; /* Defines state of page: */ + struct page *wb_page; /* page to read in/write out */ + struct nfs_open_context *wb_context; /* File state context info */ ++ struct nfs_lock_context *wb_lock_context; /* lock context info */ + atomic_t wb_complete; /* i/os we're waiting for */ + pgoff_t wb_index; /* Offset >> PAGE_CACHE_SHIFT */ + 
unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ +@@ -47,6 +48,7 @@ struct nfs_page { + struct kref wb_kref; /* reference count */ + unsigned long wb_flags; + struct nfs_writeverf wb_verf; /* Commit cookie */ ++ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */ + }; + + struct nfs_pageio_descriptor { +@@ -60,6 +62,12 @@ struct nfs_pageio_descriptor { + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); + int pg_ioflags; + int pg_error; ++ struct pnfs_layout_segment *pg_lseg; ++#ifdef CONFIG_NFS_V4_1 ++ int pg_iswrite; ++ int pg_boundary; ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++#endif /* CONFIG_NFS_V4_1 */ + }; + + #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) +@@ -68,13 +76,15 @@ extern struct nfs_page *nfs_create_reque + struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int count); ++ unsigned int count, ++ struct pnfs_layout_segment *lseg); + extern void nfs_clear_request(struct nfs_page *req); + extern void nfs_release_request(struct nfs_page *req); + + + extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, +- pgoff_t idx_start, unsigned int npages, int tag); ++ pgoff_t idx_start, unsigned int npages, int tag, ++ int *use_pnfs); + extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), +diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h +--- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-09-30 10:15:17.965727000 -0400 ++++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-09-30 10:17:09.134006000 -0400 +@@ -3,6 +3,8 @@ + + #include + #include ++#include ++#include + + /* + * To change the maximum rsize and wsize supported by the NFS client, adjust +@@ -10,7 +12,7 @@ + * support a megabyte or more. 
The default is left at 4096 bytes, which is + * reasonable for NFS over UDP. + */ +-#define NFS_MAX_FILE_IO_SIZE (1048576U) ++#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U) + #define NFS_DEF_FILE_IO_SIZE (4096U) + #define NFS_MIN_FILE_IO_SIZE (1024U) + +@@ -113,6 +115,10 @@ struct nfs_fsinfo { + __u32 dtpref; /* pref. readdir transfer size */ + __u64 maxfilesize; + __u32 lease_time; /* in seconds */ ++#if defined(CONFIG_NFS_V4_1) ++ __u32 layouttype; /* supported pnfs layout driver */ ++ __u32 blksize; /* preferred pnfs io block size */ ++#endif + }; + + struct nfs_fsstat { +@@ -185,6 +191,125 @@ struct nfs4_get_lease_time_res { + struct nfs4_sequence_res lr_seq_res; + }; + ++#define PNFS_LAYOUT_MAXSIZE 4096 ++#define NFS4_PNFS_DEVICEID4_SIZE 16 ++ ++struct pnfs_deviceid { ++ char data[NFS4_PNFS_DEVICEID4_SIZE]; ++}; ++ ++struct nfs4_layoutdriver_data { ++ __u32 len; ++ void *buf; ++}; ++ ++struct pnfs_layout_range { ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++struct nfs4_layoutget_args { ++ __u32 type; ++ struct pnfs_layout_range range; ++ __u64 minlength; ++ __u32 maxcount; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutget_res { ++ __u32 return_on_close; ++ struct pnfs_layout_range range; ++ __u32 type; ++ nfs4_stateid stateid; ++ struct nfs4_layoutdriver_data layout; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_layoutget { ++ struct nfs4_layoutget_args args; ++ struct nfs4_layoutget_res res; ++ struct pnfs_layout_segment **lsegpp; ++ int status; ++}; ++ ++struct nfs4_layoutcommit_args { ++ nfs4_stateid stateid; ++ __u64 lastbytewritten; ++ __u32 time_modify_changed; ++ struct timespec time_modify; ++ const u32 *bitmask; ++ struct nfs_fh *fh; ++ struct inode *inode; ++ ++ /* Values set by layout driver */ ++ struct pnfs_layout_range range; ++ __u32 layout_type; ++ void *layoutdriver_data; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutcommit_res { ++ __u32 sizechanged; ++ 
__u64 newsize; ++ struct nfs_fattr *fattr; ++ const struct nfs_server *server; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_layoutcommit_data { ++ struct rpc_task task; ++ struct rpc_cred *cred; ++ struct nfs_fattr fattr; ++ struct nfs4_layoutcommit_args args; ++ struct nfs4_layoutcommit_res res; ++ int status; ++}; ++ ++struct nfs4_layoutreturn_args { ++ __u32 reclaim; ++ __u32 layout_type; ++ __u32 return_type; ++ struct pnfs_layout_range range; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutreturn_res { ++ struct nfs4_sequence_res seq_res; ++ u32 lrs_present; ++ nfs4_stateid stateid; ++}; ++ ++struct nfs4_layoutreturn { ++ struct nfs4_layoutreturn_args args; ++ struct nfs4_layoutreturn_res res; ++ struct rpc_cred *cred; ++ int rpc_status; ++}; ++ ++struct nfs4_getdevicelist_args { ++ const struct nfs_fh *fh; ++ u32 layoutclass; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_getdevicelist_res { ++ struct pnfs_devicelist *devlist; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_getdeviceinfo_args { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_getdeviceinfo_res { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_res seq_res; ++}; ++ + /* + * Arguments to the open call. 
+ */ +@@ -196,8 +321,10 @@ struct nfs_openargs { + __u64 clientid; + __u64 id; + union { +- struct iattr * attrs; /* UNCHECKED, GUARDED */ +- nfs4_verifier verifier; /* EXCLUSIVE */ ++ struct { ++ struct iattr * attrs; /* UNCHECKED, GUARDED */ ++ nfs4_verifier verifier; /* EXCLUSIVE */ ++ }; + nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ + fmode_t delegation_type; /* CLAIM_PREVIOUS */ + } u; +@@ -313,6 +440,10 @@ struct nfs_lockt_res { + struct nfs4_sequence_res seq_res; + }; + ++struct nfs_release_lockowner_args { ++ struct nfs_lowner lock_owner; ++}; ++ + struct nfs4_delegreturnargs { + const struct nfs_fh *fhandle; + const nfs4_stateid *stateid; +@@ -332,6 +463,7 @@ struct nfs4_delegreturnres { + struct nfs_readargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + unsigned int pgbase; +@@ -352,6 +484,7 @@ struct nfs_readres { + struct nfs_writeargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + enum nfs3_stable_how stable; +@@ -846,7 +979,7 @@ struct nfs4_server_caps_arg { + }; + + struct nfs4_server_caps_res { +- u32 attr_bitmask[2]; ++ u32 attr_bitmask[3]; + u32 acl_bitmask; + u32 has_links; + u32 has_symlinks; +@@ -961,6 +1094,27 @@ struct nfs_page; + + #define NFS_PAGEVEC_SIZE (8U) + ++#if defined(CONFIG_NFS_V4_1) ++/* pnfsflag values */ ++#define PNFS_NO_RPC 0x0001 /* non rpc result callback switch */ ++ ++/* pnfs-specific data needed for read, write, and commit calls */ ++struct pnfs_call_data { ++ struct pnfs_layout_segment *lseg; ++ const struct rpc_call_ops *call_ops; ++ u32 orig_count; /* for retry via MDS */ ++ int pnfs_error; ++ u8 pnfsflags; ++ u8 how; /* for FLUSH_STABLE */ ++}; ++ ++/* files layout-type specific data for read, write, and commit */ ++struct pnfs_fl_call_data { ++ struct nfs_client *ds_nfs_client; ++ __u64 orig_offset; ++}; ++#endif /* CONFIG_NFS_V4_1 */ ++ 
+ struct nfs_read_data { + int flags; + struct rpc_task task; +@@ -976,10 +1130,16 @@ struct nfs_read_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + + struct nfs_write_data { ++ struct kref refcount; /* For pnfs commit splitting */ ++ struct nfs_write_data *parent; /* For pnfs commit splitting */ + int flags; + struct rpc_task task; + struct inode *inode; +@@ -995,6 +1155,10 @@ struct nfs_write_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + +@@ -1008,6 +1172,7 @@ struct nfs_rpc_ops { + const struct dentry_operations *dentry_ops; + const struct inode_operations *dir_inode_ops; + const struct inode_operations *file_inode_ops; ++ const struct file_operations *file_ops; + + int (*getroot) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); +@@ -1072,6 +1237,7 @@ struct nfs_rpc_ops { + extern const struct nfs_rpc_ops nfs_v2_clientops; + extern const struct nfs_rpc_ops nfs_v3_clientops; + extern const struct nfs_rpc_ops nfs_v4_clientops; ++extern const struct nfs_rpc_ops pnfs_v4_clientops; + extern struct rpc_version nfs_version2; + extern struct rpc_version nfs_version3; + extern struct rpc_version nfs_version4; +diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h +--- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-09-30 10:17:09.202009000 -0400 ++++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-09-30 10:17:09.204008000 -0400 +@@ -0,0 +1,57 @@ ++#ifndef _PANFS_SHIM_API_H ++#define _PANFS_SHIM_API_H ++ ++/* ++ * imported panfs 
functions ++ */ ++struct panfs_export_operations { ++ int (*convert_rc)(pan_status_t rc); ++ ++ int (*sm_sec_t_get_size_otw)( ++ pan_sm_sec_otw_t *var, ++ pan_size_t *core_sizep, ++ pan_size_t *wire_size, ++ void *buf_end); ++ ++ int (*sm_sec_t_unmarshall)( ++ pan_sm_sec_otw_t *in, ++ pan_sm_sec_t *out, ++ void *buf, ++ pan_size_t size, ++ pan_size_t *otw_consumed, ++ pan_size_t *in_core_consumed); ++ ++ int (*ucreds_get)(void **ucreds_pp); ++ ++ void (*ucreds_put)(void *ucreds); ++ ++ int (*sam_read)( ++ pan_sam_access_flags_t flags, ++ pan_sam_read_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_read_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p); ++ ++ int (*sam_write)( ++ pan_sam_access_flags_t flags, ++ pan_sam_write_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_write_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p); ++}; ++ ++extern int ++panfs_shim_register(struct panfs_export_operations *ops); ++ ++extern int ++panfs_shim_unregister(void); ++ ++#endif /* _PANFS_SHIM_API_H */ +diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h +--- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-09-30 10:17:09.214010000 -0400 ++++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-09-30 10:17:09.215014000 -0400 +@@ -0,0 +1,439 @@ ++/* ++ * pnfs_osd_xdr.h ++ * ++ * pNFS-osd on-the-wire data structures ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. 
++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#ifndef __PNFS_OSD_XDR_H__ ++#define __PNFS_OSD_XDR_H__ ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_OSD_OSDNAME_MAXSIZE 256 ++ ++/* ++ * START OF "GENERIC" DECODE ROUTINES. ++ * These may look a little ugly since they are imported from a "generic" ++ * set of XDR encode/decode routines which are intended to be shared by ++ * all of our NFSv4 implementations (OpenBSD, MacOS X...). 
++ * ++ * If the pain of reading these is too great, it should be a straightforward ++ * task to translate them into Linux-specific versions which are more ++ * consistent with the style used in NFSv2/v3... ++ */ ++#define READ32(x) ((x) = ntohl(*p++)) ++#define READ64(x) do { \ ++ (x) = (u64)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++ ++/* ++ * draft-ietf-nfsv4-minorversion-22 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* Layout Structure */ ++ ++enum pnfs_osd_raid_algorithm4 { ++ PNFS_OSD_RAID_0 = 1, ++ PNFS_OSD_RAID_4 = 2, ++ PNFS_OSD_RAID_5 = 3, ++ PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ ++}; ++ ++/* struct pnfs_osd_data_map4 { ++ * uint32_t odm_num_comps; ++ * length4 odm_stripe_unit; ++ * uint32_t odm_group_width; ++ * uint32_t odm_group_depth; ++ * uint32_t odm_mirror_cnt; ++ * pnfs_osd_raid_algorithm4 odm_raid_algorithm; ++ * }; ++ */ ++struct pnfs_osd_data_map { ++ u32 odm_num_comps; ++ u64 odm_stripe_unit; ++ u32 odm_group_width; ++ u32 odm_group_depth; ++ u32 odm_mirror_cnt; ++ u32 odm_raid_algorithm; ++}; ++ ++static inline int ++pnfs_osd_data_map_xdr_sz(void) ++{ ++ return 1 + 2 + 1 + 1 + 1 + 1; ++} ++ ++static inline size_t ++pnfs_osd_data_map_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_data_map); ++} ++ ++/* struct pnfs_osd_objid4 { ++ * deviceid4 oid_device_id; ++ * uint64_t oid_partition_id; ++ * uint64_t oid_object_id; ++ * }; ++ */ ++struct pnfs_osd_objid { ++ struct pnfs_deviceid oid_device_id; ++ u64 oid_partition_id; ++ u64 oid_object_id; ++}; ++ ++/* For printout.
I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */ ++#define _DEVID_LO(oid_device_id) \ ++ (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data) ++ ++#define _DEVID_HI(oid_device_id) \ ++ (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1) ++ ++static inline int ++pnfs_osd_objid_xdr_sz(void) ++{ ++ return (NFS4_PNFS_DEVICEID4_SIZE / 4) + 2 + 2; ++} ++ ++static inline size_t ++pnfs_osd_objid_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_objid); ++} ++ ++enum pnfs_osd_version { ++ PNFS_OSD_MISSING = 0, ++ PNFS_OSD_VERSION_1 = 1, ++ PNFS_OSD_VERSION_2 = 2 ++}; ++ ++struct pnfs_osd_opaque_cred { ++ u32 cred_len; ++ u8 *cred; ++}; ++ ++static inline int ++pnfs_osd_opaque_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ READ32(n); ++ p += XDR_QUADLEN(n); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_opaque_cred_incore_sz(u32 *p) ++{ ++ u32 n; ++ ++ READ32(n); ++ return XDR_QUADLEN(n) * 4; ++} ++ ++enum pnfs_osd_cap_key_sec { ++ PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ PNFS_OSD_CAP_KEY_SEC_SSV = 1, ++}; ++ ++/* struct pnfs_osd_object_cred4 { ++ * pnfs_osd_objid4 oc_object_id; ++ * pnfs_osd_version4 oc_osd_version; ++ * pnfs_osd_cap_key_sec4 oc_cap_key_sec; ++ * opaque oc_capability_key<>; ++ * opaque oc_capability<>; ++ * }; ++ */ ++struct pnfs_osd_object_cred { ++ struct pnfs_osd_objid oc_object_id; ++ u32 oc_osd_version; ++ u32 oc_cap_key_sec; ++ struct pnfs_osd_opaque_cred oc_cap_key; ++ struct pnfs_osd_opaque_cred oc_cap; ++}; ++ ++static inline int ++pnfs_osd_object_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_object_cred_incore_sz(u32 *p) ++{ ++ size_t sz = sizeof(struct pnfs_osd_object_cred); ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ sz += pnfs_osd_opaque_cred_incore_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ sz += 
pnfs_osd_opaque_cred_incore_sz(p); ++ return sz; ++} ++ ++/* struct pnfs_osd_layout4 { ++ * pnfs_osd_data_map4 olo_map; ++ * uint32_t olo_comps_index; ++ * pnfs_osd_object_cred4 olo_components<>; ++ * }; ++ */ ++struct pnfs_osd_layout { ++ struct pnfs_osd_data_map olo_map; ++ u32 olo_comps_index; ++ u32 olo_num_comps; ++ struct pnfs_osd_object_cred *olo_comps; ++}; ++ ++static inline int ++pnfs_osd_layout_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ while ((int)(n--) > 0) ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_layout_incore_sz(u32 *p) ++{ ++ u32 n; ++ size_t sz; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ sz = sizeof(struct pnfs_osd_layout); ++ while ((int)(n--) > 0) { ++ sz += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ return sz; ++} ++ ++/* Device Address */ ++ ++enum pnfs_osd_targetid_type { ++ OBJ_TARGET_ANON = 1, ++ OBJ_TARGET_SCSI_NAME = 2, ++ OBJ_TARGET_SCSI_DEVICE_ID = 3, ++}; ++ ++/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) { ++ * case OBJ_TARGET_SCSI_NAME: ++ * string oti_scsi_name<>; ++ * ++ * case OBJ_TARGET_SCSI_DEVICE_ID: ++ * opaque oti_scsi_device_id<>; ++ * ++ * default: ++ * void; ++ * }; ++ * ++ * union pnfs_osd_targetaddr4 switch (bool ota_available) { ++ * case TRUE: ++ * netaddr4 ota_netaddr; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_deviceaddr4 { ++ * pnfs_osd_targetid4 oda_targetid; ++ * pnfs_osd_targetaddr4 oda_targetaddr; ++ * uint64_t oda_lun; ++ * opaque oda_systemid<>; ++ * pnfs_osd_object_cred4 oda_root_obj_cred; ++ * opaque oda_osdname<>; ++ * }; ++ */ ++struct pnfs_osd_targetid { ++ u32 oti_type; ++ struct nfs4_string oti_scsi_device_id; ++}; ++ ++enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 }; ++ ++/* struct netaddr4 { ++ * // see struct rpcb in RFC1833 ++ * string r_netid<>; // network id 
++ * string r_addr<>; // universal address ++ * }; ++ */ ++struct pnfs_osd_net_addr { ++ struct nfs4_string r_netid; ++ struct nfs4_string r_addr; ++}; ++ ++struct pnfs_osd_targetaddr { ++ u32 ota_available; ++ struct pnfs_osd_net_addr ota_netaddr; ++}; ++ ++enum { ++ NETWORK_ID_MAX = 16 / 4, ++ UNIVERSAL_ADDRESS_MAX = 64 / 4, ++ PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX, ++}; ++ ++struct pnfs_osd_deviceaddr { ++ struct pnfs_osd_targetid oda_targetid; ++ struct pnfs_osd_targetaddr oda_targetaddr; ++ u8 oda_lun[8]; ++ struct nfs4_string oda_systemid; ++ struct pnfs_osd_object_cred oda_root_obj_cred; ++ struct nfs4_string oda_osdname; ++}; ++ ++enum { ++ ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4, ++ PNFS_OSD_DEVICEADDR_MAX = ++ PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX + ++ 2 /*oda_lun*/ + ++ 1 + OSD_SYSTEMID_LEN + ++ 1 + ODA_OSDNAME_MAX, ++}; ++ ++/* LAYOUTCOMMIT: layoutupdate */ ++ ++/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) { ++ * case TRUE: ++ * int64_t dsu_delta; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_layoutupdate4 { ++ * pnfs_osd_deltaspaceused4 olu_delta_space_used; ++ * bool olu_ioerr_flag; ++ * }; ++ */ ++struct pnfs_osd_layoutupdate { ++ u32 dsu_valid; ++ s64 dsu_delta; ++ u32 olu_ioerr_flag; ++}; ++ ++/* LAYOUTRETURN: I/O Error Report */ ++ ++enum pnfs_osd_errno { ++ PNFS_OSD_ERR_EIO = 1, ++ PNFS_OSD_ERR_NOT_FOUND = 2, ++ PNFS_OSD_ERR_NO_SPACE = 3, ++ PNFS_OSD_ERR_BAD_CRED = 4, ++ PNFS_OSD_ERR_NO_ACCESS = 5, ++ PNFS_OSD_ERR_UNREACHABLE = 6, ++ PNFS_OSD_ERR_RESOURCE = 7 ++}; ++ ++/* struct pnfs_osd_ioerr4 { ++ * pnfs_osd_objid4 oer_component; ++ * length4 oer_comp_offset; ++ * length4 oer_comp_length; ++ * bool oer_iswrite; ++ * pnfs_osd_errno4 oer_errno; ++ * }; ++ */ ++struct pnfs_osd_ioerr { ++ struct pnfs_osd_objid oer_component; ++ u64 oer_comp_offset; ++ u64 oer_comp_length; ++ u32 oer_iswrite; ++ u32 oer_errno; ++}; ++ ++static inline unsigned ++pnfs_osd_ioerr_xdr_sz(void)
++{ ++ return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1; ++} ++ ++/* OSD XDR API */ ++ ++/* Layout helpers */ ++extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout( ++ struct pnfs_osd_layout *layout, u32 *p); ++ ++extern int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *layout); ++ ++/* Device Info helpers */ ++ ++/* First pass calculate total size for space needed */ ++extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p); ++ ++/* Note: some strings pointed to inside @deviceaddr might point ++ * to space inside @p. @p should stay valid while @deviceaddr ++ * is in use. ++ * It is assumed that @deviceaddr points to bigger memory of size ++ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz() ++ */ ++extern void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p); ++ ++/* For Servers */ ++extern int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr); ++ ++/* layoutupdate (layout_commit) xdr helpers */ ++extern int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou); ++extern __be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p); ++ ++/* osd_ioerror encoding/decoding (layout_return) */ ++extern int ++pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr); ++extern __be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p); ++ ++#endif /* __PNFS_OSD_XDR_H__ */ +diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h +--- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-09-30 10:17:09.227023000 -0400 +@@ -8,6 +8,7 @@ + #ifndef __LINUX_POSIX_ACL_H + #define __LINUX_POSIX_ACL_H + ++#include + #include + + #define ACL_UNDEFINED_ID (-1) +diff -up 
linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h +--- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-09-30 10:17:09.233014000 -0400 +@@ -14,6 +14,8 @@ + /* size of an XDR encoding unit in bytes, i.e. 32bit */ + #define XDR_UNIT (4) + ++#include ++ + /* spec defines authentication flavor as an unsigned 32 bit integer */ + typedef u32 rpc_authflavor_t; + +diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-09-30 10:17:09.238025000 -0400 +@@ -3,6 +3,7 @@ + + #ifdef __KERNEL__ + ++#include + #include + + struct rpc_pipe_msg { +@@ -11,6 +12,10 @@ struct rpc_pipe_msg { + size_t len; + size_t copied; + int errno; ++#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */ ++#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */ ++#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA ++ u8 flags; + }; + + struct rpc_pipe_ops { +diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h +--- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-09-30 10:17:09.242015000 -0400 ++++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-09-30 10:17:09.244014000 -0400 +@@ -0,0 +1,111 @@ ++/* ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. 
++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#ifndef _SIMPLE_RPC_PIPEFS_H_ ++#define _SIMPLE_RPC_PIPEFS_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#define payload_of(headerp) ((void *)((headerp) + 1)) ++ ++/* ++ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs. ++ * Messages may simply be the header itself, although having an optional ++ * data payload follow the header allows much more flexibility.
++ * ++ * Messages are created using pipefs_alloc_init_msg() and ++ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an ++ * (optional) data payload. ++ * ++ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data ++ * can be accessed using: struct foo *foop = payload_of(msg) ++ */ ++struct pipefs_hdr { ++ u32 msgid; ++ u8 type; ++ u8 flags; ++ u16 totallen; /* length of entire message, including hdr itself */ ++ u32 status; ++}; ++ ++/* ++ * struct pipefs_list -- a type of list used for tracking callers who've made an ++ * upcall and are blocked waiting for a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply(). ++ */ ++struct pipefs_list { ++ struct list_head list; ++ spinlock_t list_lock; ++}; ++ ++ ++/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions. */ ++extern struct dentry *pipefs_mkpipe(const char *name, ++ const struct rpc_pipe_ops *ops, ++ int wait_for_open); ++extern void pipefs_closepipe(struct dentry *pipe); ++extern void pipefs_init_list(struct pipefs_list *list); ++extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen); ++extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, ++ u8 flags, void *data, ++ u16 datalen, u16 padlen); ++extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list ++ *uplist, u8 upflags, ++ u32 timeout); ++extern int pipefs_queue_upcall_noreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, u8 upflags); ++extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist); ++extern struct pipefs_hdr *pipefs_readmsg(struct file *filp, ++ const char __user *src, size_t len); ++extern ssize_t pipefs_generic_upcall(struct file *filp, ++ struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen); ++extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg); 
++
++#endif /* _SIMPLE_RPC_PIPEFS_H_ */
+diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h
+--- linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig	2010-05-16 17:17:36.000000000 -0400
++++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h	2010-09-30 10:17:09.249016000 -0400
+@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con
+ 
+ 	return buf;
+ }
++
++/*
++ * Print a network address in a universal format (see rfc1833 and nfsv4.1)
++ *
++ * The address in @addr is formatted into the buffer at @na->data, with
++ * @na->len passed to snprintf() as the buffer size.  The port follows the
++ * address as two decimal fields, high byte first:
++ * "<address>.<port >> 8>.<port & 0xff>".
++ *
++ * Returns the snprintf() result for AF_INET and AF_INET6 addresses.  For
++ * any other family a diagnostic string is written into @na->data and
++ * -EINVAL is returned.
++ *
++ * NOTE(review): snprintf() reports the length the complete string would
++ * need, so the return value can exceed what fits in @na->len when the
++ * buffer is too small -- confirm that callers check for this.
++ */
++static inline int __svc_print_netaddr(struct sockaddr *addr,
++				      struct xdr_netobj *na)
++{
++	u16 port;
++	ssize_t len;
++
++	switch (addr->sa_family) {
++	case AF_INET: {
++		struct sockaddr_in *sin = (struct sockaddr_in *)addr;
++		port = ntohs(sin->sin_port);
++
++		len = snprintf(na->data, na->len, "%pI4.%u.%u",
++			       &sin->sin_addr,
++			       port >> 8, port & 0xff);
++		break;
++	}
++	case AF_INET6: {
++		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
++		port = ntohs(sin6->sin6_port);
++
++		len = snprintf(na->data, na->len, "%pI6.%u.%u",
++			       &sin6->sin6_addr,
++			       port >> 8, port & 0xff);
++		break;
++	}
++	default:
++		/* Leave a readable marker in the buffer, then fail. */
++		snprintf(na->data, na->len, "unknown address type: %d",
++			 addr->sa_family);
++		len = -EINVAL;
++		break;
++	}
++	return len;
++}
+ #endif /* SUNRPC_SVC_XPRT_H */
+diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h
+--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig	2010-09-30 10:15:18.029721000 -0400
++++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h	2010-09-30 10:17:09.254021000 -0400
+@@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp)
+ 	return p + 2;
+ }
+ 
++/*
++ * Copy @len bytes of fixed-length opaque data from the XDR words at @p
++ * into @ptr, and return the position XDR_QUADLEN(@len) words past @p.
++ */
++static inline __be32 *
++xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len)
++{
++	memcpy(ptr, p, len);
++	return p + XDR_QUADLEN(len);
++}
++
+ /*
+  * Adjust kvec to reflect end of xdr'ed data (RPC client XDR)
+  */
+@@ -197,6 +204,7 @@ struct xdr_stream {
+ 
+ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
+ extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
++extern __be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q);
+ extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
+ 		unsigned int base, unsigned int len);
+ extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
+diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile
+--- linux-2.6.34.noarch/net/sunrpc/Makefile.orig	2010-05-16 17:17:36.000000000 -0400
++++ linux-2.6.34.noarch/net/sunrpc/Makefile	2010-09-30 10:17:09.263013000 -0400
+@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt
+ 	    svc.o svcsock.o svcauth.o svcauth_unix.o \
+ 	    addr.o rpcb_clnt.o timer.o xdr.o \
+ 	    sunrpc_syms.o cache.o rpc_pipe.o \
+-	    svc_xprt.o
++	    svc_xprt.o simple_rpc_pipefs.o
+ sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o
+ sunrpc-$(CONFIG_PROC_FS) += stats.o
+ sunrpc-$(CONFIG_SYSCTL) += sysctl.o
+diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c
+--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig	2010-09-30 10:17:09.267010000 -0400
++++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c	2010-09-30 10:17:09.268015000 -0400
+@@ -0,0 +1,424 @@
++/*
++ * net/sunrpc/simple_rpc_pipefs.c
++ *
++ * Copyright (c) 2008 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * David M. Richter
++ *
++ * Drawing on work done by Andy Adamson and
++ * Marius Eriksen . Thanks for the help over the
++ * years, guys.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2.
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs ++ * filesystem. ++ * ++ * If @wait_for_open is non-zero and an upcall is later queued but the userland ++ * end of the pipe has not yet been opened, the upcall will remain queued until ++ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE. 
++ */
++struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops,
++			     int wait_for_open)
++{
++	struct dentry *dir, *pipe;
++	struct vfsmount *mnt;
++
++	/* Pin rpc_pipefs; pipefs_closepipe() drops this reference again. */
++	mnt = rpc_get_mount();
++	if (IS_ERR(mnt))
++		return ERR_CAST(mnt);
++
++	dir = mnt->mnt_root;
++	if (!dir) {
++		pipe = ERR_PTR(-ENOENT);
++		goto out_put;
++	}
++	pipe = rpc_mkpipe(dir, name, NULL, ops,
++			  wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0);
++	if (!IS_ERR(pipe))
++		return pipe;
++out_put:
++	/*
++	 * No pipe was created, so pipefs_closepipe() will never run for it:
++	 * release the mount reference here instead of leaking it.
++	 */
++	rpc_put_mount();
++	return pipe;
++}
++EXPORT_SYMBOL(pipefs_mkpipe);
++
++/*
++ * Shutdown a pipe made by pipefs_mkpipe(): unlink @pipe and drop the
++ * rpc_pipefs mount reference that pipefs_mkpipe() took.
++ * XXX: do we need to retain an extra reference on the mount?
++ */
++void pipefs_closepipe(struct dentry *pipe)
++{
++	rpc_unlink(pipe);
++	rpc_put_mount();
++}
++EXPORT_SYMBOL(pipefs_closepipe);
++
++/*
++ * Initialize a struct pipefs_list -- which is a way to keep track of callers
++ * who're blocked having made an upcall and are awaiting a reply.
++ *
++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply() for
++ * how to use them.
++ *
++ * (Not marked inline: this is an exported, out-of-line entry point.)
++ */
++void pipefs_init_list(struct pipefs_list *list)
++{
++	INIT_LIST_HEAD(&list->list);
++	spin_lock_init(&list->list_lock);
++}
++EXPORT_SYMBOL(pipefs_init_list);
++
++/*
++ * Alloc/init a generic pipefs message header and copy into its message body
++ * an arbitrary data payload.
++ *
++ * struct pipefs_hdr's are meant to serve as generic, general-purpose message
++ * headers for easy rpc_pipefs I/O. When an upcall is made, the
++ * struct pipefs_hdr is assigned to a struct rpc_pipe_msg and delivered
++ * therein. --And yes, the naming can seem a little confusing at first:
++ *
++ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a
++ * struct pipefs_hdr (possibly with an attached message body). A
++ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real"
++ * message is delivered and processed.
++ */
++struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags,
++						void *data, u16 datalen, u16 padlen)
++{
++	struct pipefs_hdr *msg;
++	size_t totallen;
++
++	/*
++	 * Sum in size_t: a u16 total would silently wrap for a large
++	 * @datalen + @padlen, slip past the size check below and leave the
++	 * memcpy() writing beyond an undersized allocation.  The total must
++	 * also still fit the u16 msg->totallen field.
++	 */
++	totallen = sizeof(*msg) + datalen + padlen;
++	if (totallen > PAGE_SIZE || totallen != (u16)totallen)
++		return ERR_PTR(-E2BIG);
++
++	msg = kzalloc(totallen, GFP_KERNEL);
++	if (!msg)
++		return ERR_PTR(-ENOMEM);
++
++	msg->msgid = msgid;
++	msg->type = type;
++	msg->flags = flags;
++	msg->totallen = totallen;
++	/* The payload is optional; pad bytes stay zeroed by kzalloc(). */
++	if (data && datalen)
++		memcpy(payload_of(msg), data, datalen);
++	return msg;
++}
++EXPORT_SYMBOL(pipefs_alloc_init_msg_padded);
++
++/*
++ * See the description of pipefs_alloc_init_msg_padded(); this variant adds
++ * no padding after the payload.
++ */
++struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags,
++					 void *data, u16 datalen)
++{
++	return pipefs_alloc_init_msg_padded(msgid, type, flags, data,
++					    datalen, 0);
++}
++EXPORT_SYMBOL(pipefs_alloc_init_msg);
++
++
++/*
++ * Zero @rpcmsg and make it carry @msg: ->data points at the header, ->len is
++ * the message's totallen and ->flags is @upflags.
++ */
++static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg,
++			       struct pipefs_hdr *msg, u8 upflags)
++{
++	memset(rpcmsg, 0, sizeof(*rpcmsg));
++	rpcmsg->data = msg;
++	rpcmsg->len = msg->totallen;
++	rpcmsg->flags = upflags;
++}
++
++/*
++ * Allocate a struct rpc_pipe_msg and initialize it for @msg.  Returns
++ * ERR_PTR(-ENOMEM) if the allocation fails.
++ */
++static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg,
++						     u8 upflags)
++{
++	struct rpc_pipe_msg *rpcmsg;
++
++	rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL);
++	if (!rpcmsg)
++		return ERR_PTR(-ENOMEM);
++
++	pipefs_init_rpcmsg(rpcmsg, msg, upflags);
++	return rpcmsg;
++}
++
++
++/* represents an upcall that'll block and wait for a reply */
++struct pipefs_upcall {
++	u32 msgid;			/* copied from the upcall message */
++	struct rpc_pipe_msg rpcmsg;	/* carries the upcall message */
++	struct list_head list;		/* linkage on a struct pipefs_list */
++	wait_queue_head_t waitq;	/* the caller sleeps here */
++	struct pipefs_hdr *reply;	/* NULL until a reply is assigned */
++};
++
++
++/* Prepare @upcall for sending @msg: no reply yet, not on any list. */
++static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall,
++					 struct pipefs_hdr *msg, u8 upflags)
++{
++	upcall->reply = NULL;
++	upcall->msgid = msg->msgid;
++	INIT_LIST_HEAD(&upcall->list);
++	init_waitqueue_head(&upcall->waitq);
++	pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags);
++}
++
++/*
++ * Link @upcall onto @uplist, queue its message on @pipe and sleep until a
++ * reply has been stored in @upcall->reply or @timeout jiffies have elapsed
++ * (a @timeout of 0 waits without limit).
++ *
++ * Returns 0 once a reply is present, -ETIMEDOUT if none arrived in time, or
++ * the negative error from rpc_queue_upcall().
++ *
++ * NOTE(review): @upcall->rpcmsg lives in the caller's stack frame; after a
++ * timeout it may still be queued on the pipe -- confirm nothing can touch
++ * it once this function has returned.
++ */
++static int __pipefs_queue_upcall_waitreply(struct dentry *pipe,
++					   struct pipefs_upcall *upcall,
++					   struct pipefs_list *uplist,
++					   u32 timeout)
++{
++	int err;
++
++	/* Be findable before the message can reach userspace. */
++	spin_lock(&uplist->list_lock);
++	list_add(&upcall->list, &uplist->list);
++	spin_unlock(&uplist->list_lock);
++
++	err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg);
++	if (err < 0)
++		goto out;
++
++	/*
++	 * wait_event*() test the condition after the task has been put on
++	 * the wait queue, so a reply that lands between queueing the upcall
++	 * and going to sleep is not missed, and a stray wakeup does not end
++	 * the wait early.  Both sleep uninterruptibly, as before.
++	 */
++	if (timeout) {
++		if (!wait_event_timeout(upcall->waitq, upcall->reply != NULL,
++					timeout) &&
++		    upcall->reply == NULL)
++			err = -ETIMEDOUT;
++	} else {
++		wait_event(upcall->waitq, upcall->reply != NULL);
++	}
++
++out:
++	spin_lock(&uplist->list_lock);
++	list_del_init(&upcall->list);
++	spin_unlock(&uplist->list_lock);
++	return err;
++}
++
++/*
++ * Queue a pipefs msg for an upcall to userspace, place the calling thread
++ * on @uplist, and block the thread to wait for a reply. If @timeout is
++ * nonzero, the thread will be blocked for at most @timeout jiffies.
++ *
++ * (To convert time units into jiffies, consider the functions
++ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and
++ * timespec_to_jiffies().)
++ *
++ * Once a reply is received by your downcall handler, call
++ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall,
++ * assign the reply, and wake the waiting thread.
++ *
++ * This function's return value pointer may be an error and should be checked
++ * with IS_ERR() before attempting to access the reply message.
++ *
++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg()
++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG
++ * flag is set in @upflags. See also rpc_pipe_fs.h.
++ */
++struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe,
++						 struct pipefs_hdr *msg,
++						 struct pipefs_list *uplist,
++						 u8 upflags, u32 timeout)
++{
++	int err = 0;
++	struct pipefs_upcall upcall;	/* lives only for this call */
++
++	pipefs_init_upcall_waitreply(&upcall, msg, upflags);
++	err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout);
++	if (err < 0) {
++		/* Discard any reply that was stored despite the failure. */
++		kfree(upcall.reply);
++		upcall.reply = ERR_PTR(err);
++	}
++
++	return upcall.reply;
++}
++EXPORT_SYMBOL(pipefs_queue_upcall_waitreply);
++
++/*
++ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e.,
++ * no reply is expected).
++ *
++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg()
++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG
++ * flag is set in @upflags. See also rpc_pipe_fs.h.
++ *
++ * Returns 0 on success or a negative errno.  On failure @msg is not freed
++ * here, whatever @upflags says.
++ */
++int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg,
++				u8 upflags)
++{
++	int err;
++	struct rpc_pipe_msg *rpcmsg;
++
++	/* The wrapper is ours; ask the destroy_msg callback to free it. */
++	upflags |= PIPEFS_AUTOFREE_RPCMSG;
++	rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags);
++	if (IS_ERR(rpcmsg))
++		return PTR_ERR(rpcmsg);
++
++	err = rpc_queue_upcall(pipe->d_inode, rpcmsg);
++	if (err < 0) {
++		/*
++		 * The message was never queued, so ->destroy_msg() will not
++		 * run for it: free the wrapper here rather than leak it.
++		 */
++		kfree(rpcmsg);
++	}
++	return err;
++}
++EXPORT_SYMBOL(pipefs_queue_upcall_noreply);
++
++
++/*
++ * Find the waiting upcall on @uplist whose msgid is @msgid, or NULL.
++ *
++ * The caller must hold @uplist->list_lock, and must keep holding it for as
++ * long as it uses the result: the entry is unlinked by its waiter under that
++ * lock and belongs to the waiter's stack frame.
++ */
++static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid,
++						 struct pipefs_list *uplist)
++{
++	struct pipefs_upcall *upcall;
++
++	list_for_each_entry(upcall, &uplist->list, list)
++		if (upcall->msgid == msgid)
++			return upcall;
++	return NULL;
++}
++
++/*
++ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall
++ * message and have determined that it is a reply to a waiting upcall,
++ * you can use this function to find the appropriate upcall, assign the result,
++ * and wake the upcall thread.
++ *
++ * The reply message must have the same msgid as the original upcall message's.
++ *
++ * Returns 0 if the reply was handed to a waiter, or -ENOENT if no upcall
++ * with that msgid is waiting on @uplist (in which case @reply is untouched).
++ *
++ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg().
++ */
++int pipefs_assign_upcall_reply(struct pipefs_hdr *reply,
++			       struct pipefs_list *uplist)
++{
++	int err = 0;
++	struct pipefs_upcall *upcall;
++
++	/*
++	 * Look up, assign and wake all under the list lock.  The waiter
++	 * takes the same lock to unlink itself before it returns, so the
++	 * upcall cannot vanish (e.g. on a timeout) while we write to it.
++	 */
++	spin_lock(&uplist->list_lock);
++	upcall = pipefs_find_upcall_msgid(reply->msgid, uplist);
++	if (upcall) {
++		upcall->reply = reply;
++		wake_up(&upcall->waitq);
++	} else {
++		err = -ENOENT;
++	}
++	spin_unlock(&uplist->list_lock);
++
++	if (err)
++		printk(KERN_ERR "%s: ERROR: have reply but no matching upcall "
++		       "for msgid %u\n", __func__, reply->msgid);
++	return err;
++}
++EXPORT_SYMBOL(pipefs_assign_upcall_reply);
++
++/*
++ * Generic method to read-in and return a newly-allocated message which begins
++ * with a struct pipefs_hdr.
++ *
++ * @len bytes are copied from @src.  Returns the new message, or an ERR_PTR:
++ * -EINVAL if @len is smaller than the header, -ENOMEM, or -EFAULT if the
++ * copy from userspace fails.
++ *
++ * NOTE(review): the header's totallen field is not compared against @len,
++ * and @len itself has no upper bound here -- confirm callers validate both.
++ */
++struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src,
++				  size_t len)
++{
++	int err = 0, hdrsize;
++	struct pipefs_hdr *msg = NULL;
++
++	hdrsize = sizeof(*msg);
++	if (len < hdrsize) {
++		printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n",
++		       __func__, (int) len, hdrsize);
++		err = -EINVAL;
++		goto out;
++	}
++
++	msg = kzalloc(len, GFP_KERNEL);
++	if (!msg) {
++		err = -ENOMEM;
++		goto out;
++	}
++	if (copy_from_user(msg, src, len))
++		err = -EFAULT;
++out:
++	if (err) {
++		/* msg is NULL or a buffer we own; kfree() accepts either. */
++		kfree(msg);
++		msg = ERR_PTR(err);
++	}
++	return msg;
++}
++EXPORT_SYMBOL(pipefs_readmsg);
++
++/*
++ * Generic rpc_pipe_ops->upcall() handler implementation.
++ *
++ * Don't call this directly: to make an upcall, use
++ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply().
++ */
++ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg,
++			      char __user *dst, size_t buflen)
++{
++	/* Resume where the previous partial read of this message stopped. */
++	char *data = (char *)rpcmsg->data + rpcmsg->copied;
++	size_t len = rpcmsg->len - rpcmsg->copied;
++	unsigned long left;
++
++	if (len > buflen)
++		len = buflen;
++
++	/*
++	 * copy_to_user() returns the number of bytes it could NOT copy and
++	 * never a negative errno.  If nothing at all could be copied, @dst
++	 * is bad: report -EFAULT instead of a zero-length read.
++	 */
++	left = copy_to_user(dst, data, len);
++	if (len && left == len) {
++		rpcmsg->errno = -EFAULT;
++		return -EFAULT;
++	}
++
++	/* Account only for the bytes that actually reached userspace. */
++	len -= left;
++	rpcmsg->copied += len;
++	rpcmsg->errno = 0;
++	return len;
++}
++EXPORT_SYMBOL(pipefs_generic_upcall);
++
++/*
++ * Generic rpc_pipe_ops->destroy_msg() handler implementation.
++ *
++ * Items are only freed if @rpcmsg->flags has been set appropriately:
++ * PIPEFS_AUTOFREE_UPCALL_MSG frees @rpcmsg->data, PIPEFS_AUTOFREE_RPCMSG
++ * frees @rpcmsg itself.
++ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h.
++ */
++void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg)
++{
++	if (rpcmsg->flags & PIPEFS_AUTOFREE_UPCALL_MSG)
++		kfree(rpcmsg->data);
++	/* Must come last: @rpcmsg is not touched after this. */
++	if (rpcmsg->flags & PIPEFS_AUTOFREE_RPCMSG)
++		kfree(rpcmsg);
++}
++EXPORT_SYMBOL(pipefs_generic_destroy_msg);
+diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c
+--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig	2010-09-30 10:15:18.189725000 -0400
++++ linux-2.6.34.noarch/net/sunrpc/xdr.c	2010-09-30 10:17:09.274010000 -0400
+@@ -395,24 +395,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf, 
+ {
+ 	struct kvec *tail;
+ 	size_t copy;
+-	char *p;
+ 	unsigned int pglen = buf->page_len;
++	unsigned int tailbuf_len;
+ 
+ 	tail = buf->tail;
+ 	BUG_ON (len > pglen);
+ 
++	tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
++
+ 	/* Shift the tail first */
+-	if (tail->iov_len != 0) {
+-		p = (char *)tail->iov_base + len;
++	if (tailbuf_len != 0) {
++		unsigned int free_space = tailbuf_len - tail->iov_len;
++
++		if (len < free_space)
++			free_space = len;
++		tail->iov_len += free_space;
++
++		copy = len;
+ 		if (tail->iov_len > len) {
+-			copy = tail->iov_len - len;
+-			memmove(p, tail->iov_base, copy);
++			char *p = (char *)tail->iov_base + len;
++			memmove(p, tail->iov_base, tail->iov_len - len);
+ 		} else
+-			buf->buflen -= len;
+-		/* Copy from the inlined pages into the tail */
+-		copy = len;
+-		if (copy > tail->iov_len)
+ 			copy = tail->iov_len;
++		/* Copy from the inlined pages into the tail */
+ 		_copy_from_pages((char *)tail->iov_base,
+ 				buf->pages, buf->page_base + pglen - len,
+ 				copy);
+@@ -496,6 +501,27 @@ __be32 * xdr_reserve_space(struct xdr_st
+ EXPORT_SYMBOL_GPL(xdr_reserve_space);
+ 
+ /**
++ * xdr_rewind_stream - rewind a stream back to some checkpoint
++ * @xdr: pointer to xdr_stream
++ * @q: some checkpoint at historical place of @xdr
++ *
++ * Restores an xdr stream to some historical point. @q must be
++ * a logical xdr point in the past that was sampled by @q = @xdr->p.
++ */
++__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q)
++{
++	size_t nbytes = (xdr->p - q) << 2;
++
++	BUG_ON(xdr->p < q);
++	BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len);
++	xdr->p = q;
++	xdr->iov->iov_len -= nbytes;
++	xdr->buf->len -= nbytes;
++	return q;
++}
++EXPORT_SYMBOL_GPL(xdr_rewind_stream);
++
++/**
+  * xdr_write_pages - Insert a list of pages into an XDR buffer for sending
+  * @xdr: pointer to xdr_stream
+  * @pages: list of pages