From e851db5b05408b89b9a9429a66814b79fabee2a1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Jun 2008 18:45:30 -0400 Subject: [PATCH 01/59] SUNRPC: Add address family field to svc_serv data structure Introduce and initialize an address family field in the svc_serv structure. This field will determine what family to use for the service's listener sockets and what families are advertised via the local rpcbind daemon. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/svc.c | 2 +- fs/nfs/callback.c | 3 ++- fs/nfsd/nfssvc.c | 1 + include/linux/sunrpc/svc.h | 9 +++++---- net/sunrpc/svc.c | 11 ++++++----- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 5bd9bf0fa9d..1553fecc567 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -266,7 +266,7 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index f447f4b4476..6a09760c596 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -105,7 +105,8 @@ int nfs_callback_up(void) mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, + AF_INET, NULL); ret = -ENOMEM; if (!serv) goto out_err; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 80292ff5e92..7f3d76a7839 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -229,6 +229,7 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index dc69068d94c..23143f38b12 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -66,6 +66,7 @@ struct svc_serv { struct list_head sv_tempsocks; /* all temporary sockets */ int sv_tmpcnt; /* count of temporary sockets */ struct timer_list sv_temptimer; /* timer for aging temporary sockets */ + sa_family_t sv_family; /* listener's address family */ char * sv_name; /* service name */ @@ -381,14 +382,14 @@ struct svc_procedure { /* * Function prototypes. 
*/ -struct svc_serv * svc_create(struct svc_program *, unsigned int, - void (*shutdown)(struct svc_serv*)); +struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t, + void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - void (*shutdown)(struct svc_serv*), svc_thread_fn, - struct module *); + sa_family_t, void (*shutdown)(struct svc_serv *), + svc_thread_fn, struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5a32cb7c4bb..9ba17044109 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -357,7 +357,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - void (*shutdown)(struct svc_serv *serv)) + sa_family_t family, void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; unsigned int vers; @@ -366,6 +366,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; + serv->sv_family = family; serv->sv_name = prog->pg_name; serv->sv_program = prog; serv->sv_nrthreads = 1; @@ -425,21 +426,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv)) + sa_family_t family, void (*shutdown)(struct svc_serv *serv)) { - return __svc_create(prog, bufsize, /*npools*/1, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, family, shutdown); } EXPORT_SYMBOL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv), + sa_family_t family, void (*shutdown)(struct svc_serv *serv), svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, shutdown); + serv = __svc_create(prog, bufsize, npools, family, shutdown); if (serv != NULL) { serv->sv_function = func; From 5dd248f6f1ffe1f691fd66749e2a3dc8f8eb7b5e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Jun 2008 18:45:37 -0400 Subject: [PATCH 02/59] SUNRPC: Use proper INADDR_ANY when setting up RPC services on IPv6 Teach svc_create_xprt() to use the correct ANY address for AF_INET6 based RPC services. No caller uses AF_INET6 yet. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- net/sunrpc/svc_xprt.c | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e46c825f495..bf5b5cdafeb 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -159,15 +159,44 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, } EXPORT_SYMBOL_GPL(svc_xprt_init); -int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, - int flags) +static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, + struct svc_serv *serv, + unsigned short port, int flags) { - struct svc_xprt_class *xcl; struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; + struct sockaddr *sap; + size_t len; + + switch (serv->sv_family) { + case AF_INET: + sap = (struct sockaddr *)&sin; + len = sizeof(sin); + break; + case AF_INET6: + sap = (struct sockaddr *)&sin6; + len = sizeof(sin6); + break; + default: + return ERR_PTR(-EAFNOSUPPORT); + } + + return xcl->xcl_ops->xpo_create(serv, sap, len, flags); +} + +int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, + int flags) +{ + struct svc_xprt_class *xcl; + dprintk("svc: creating transport %s[%d]\n", xprt_name, port); spin_lock(&svc_xprt_class_lock); list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { @@ -180,9 +209,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = xcl->xcl_ops-> - xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin), - flags); + newxprt = __svc_xpo_create(xcl, serv, port, flags); if (IS_ERR(newxprt)) { module_put(xcl->xcl_owner); return PTR_ERR(newxprt); From 04716e6621ff4abb422d64ba7b48718f52716a3e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 7 Aug 2008 13:00:20 -0400 Subject: [PATCH 03/59] nfsd: permit unauthenticated stat of export root RFC 2623 section 2.3.2 permits the server to bypass gss authentication checks for certain operations that a client may perform when mounting. In the case of a client that doesn't have some form of credentials available to it on boot, this allows it to perform the mount unattended. (Presumably real file access won't be needed until a user with credentials logs in.) Being slightly more lenient allows lots of old clients to access krb5-only exports, with the only loss being a small amount of information leaked about the root directory of the export. This affects only v2 and v3; v4 still requires authentication for all access. Thanks to Peter Staubach for testing against a Solaris client, which suggested the addition of v3 getattr to the list, and to Trond for noting that doing so exposes no additional information. Signed-off-by: J.
Bruce Fields Cc: Peter Staubach Cc: Trond Myklebust --- fs/nfsd/nfs3proc.c | 8 +++++--- fs/nfsd/nfsfh.c | 30 ++++++++++++++++++++---------- fs/nfsd/nfsproc.c | 6 ++++-- fs/nfsd/vfs.c | 4 ++-- include/linux/nfsd/nfsd.h | 3 ++- 5 files changed, 33 insertions(+), 18 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 4d617ea28cf..9dbd2eb9128 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); if (nfserr) RETURN_STATUS(nfserr); @@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: FSSTAT(3) %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); fh_put(&argp->fh); RETURN_STATUS(nfserr); } @@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; - nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &argp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); /* Check special features of the file system. May request * different read/write sizes for file systems known to have diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ea37c96f044..cd25d91895a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -302,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) if (error) goto out; - if (!(access & NFSD_MAY_LOCK)) { - /* - * pseudoflavor restrictions are not enforced on NLM, - * which clients virtually always use auth_sys for, - * even while using RPCSEC_GSS for NFS. - */ - error = check_nfsd_access(exp, rqstp); - if (error) - goto out; - } + /* + * pseudoflavor restrictions are not enforced on NLM, + * which clients virtually always use auth_sys for, + * even while using RPCSEC_GSS for NFS. + */ + if (access & NFSD_MAY_LOCK) + goto skip_pseudoflavor_check; + /* + * Clients may expect to be able to use auth_sys during mount, + * even if they use gss for everything else; see section 2.3.2 + * of rfc 2623. + */ + if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT + && exp->ex_path.dentry == dentry) + goto skip_pseudoflavor_check; + error = check_nfsd_access(exp, rqstp); + if (error) + goto out; + +skip_pseudoflavor_check: /* Finally, check access permissions. 
*/ error = nfsd_permission(rqstp, exp, dentry, access); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0766f95d236..5cffeca7ace 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); return nfsd_return_attrs(nfserr, resp); } @@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, + NFSD_MAY_BYPASS_GSS_ON_ROOT); fh_put(&argp->fh); return nfserr; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 18060bed526..1319e8027d5 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1866,9 +1866,9 @@ out: * N.B. After this call fhp needs an fh_put */ __be32 -nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) +nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) { - __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); if (!err && vfs_statfs(fhp->fh_dentry,stat)) err = nfserr_io; return err; diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 108f47e5fd9..21269405ffe 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -38,6 +38,7 @@ #define NFSD_MAY_LOCK 32 #define NFSD_MAY_OWNER_OVERRIDE 64 #define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) @@ -125,7 +126,7 @@ int nfsd_truncate(struct svc_rqst *, struct svc_fh *, __be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, loff_t *, struct readdir_cd *, filldir_t); __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, - struct kstatfs *); + struct kstatfs *, int access); int nfsd_notify_change(struct inode *, struct iattr *); __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, From bfcd17a6c5529bc37234cfa720a047cf9397bcfc Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 6 Aug 2008 15:12:22 +0200 Subject: [PATCH 04/59] Configure out file locking features This patch adds the CONFIG_FILE_LOCKING option, which makes it possible to remove support for advisory locks. When this option is disabled, the flock() system call, the F_GETLK, F_SETLK and F_SETLKW operations of fcntl(), and NFS support are disabled. These features are not necessarily needed on embedded systems. Removing them saves ~11 KB of kernel code and data: text data bss dec hex filename 1125436 118764 212992 1457192 163c28 vmlinux.old 1114299 118564 212992 1445855 160fdf vmlinux -11137 -200 0 -11337 -2C49 +/- This patch was originally written by Matt Mackall and is part of the Linux Tiny project. Signed-off-by: Thomas Petazzoni Signed-off-by: Matt Mackall Cc: matthew@wil.cx Cc: linux-fsdevel@vger.kernel.org Cc: mpm@selenic.com Cc: akpm@linux-foundation.org Signed-off-by: J.
Bruce Fields --- fs/Kconfig | 8 +++++++ fs/Makefile | 3 ++- fs/proc/proc_misc.c | 4 ++++ include/linux/fs.h | 57 +++++++++++++++++++++++++++++++++++++++------ kernel/sys_ni.c | 1 + kernel/sysctl.c | 6 ++++- 6 files changed, 70 insertions(+), 9 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index abccb5dab9a..c6ae4d4842e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -419,6 +419,14 @@ config FS_POSIX_ACL bool default n +config FILE_LOCKING + bool "Enable POSIX file locking API" if EMBEDDED + default y + help + This option enables standard file locking support, required + for filesystems like NFS and for the flock() system + call. Disabling this option saves about 11k. + source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index a1482a5eff1..4b86d433baa 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -7,7 +7,7 @@ obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ - ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ + ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ @@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o nfsd-$(CONFIG_NFSD) := nfsctl.o diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 29e20c6b1f7..1aabbe2592e 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -684,6 +684,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_FILE_LOCKING static int locks_open(struct inode *inode, struct file *filp) { return seq_open(filp, &locks_seq_operations); @@ -695,6 +696,7 @@ static const struct file_operations proc_locks_operations = { .llseek = seq_lseek, .release = seq_release, }; +#endif /* CONFIG_FILE_LOCKING */ static int execdomains_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -888,7 +890,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_PRINTK proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); #endif +#ifdef CONFIG_FILE_LOCKING proc_create("locks", 0, NULL, &proc_locks_operations); +#endif proc_create("devices", 0, NULL, &proc_devinfo_operations); proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); #ifdef CONFIG_BLOCK diff --git a/include/linux/fs.h b/include/linux/fs.h index 580b513668f..9f540165a07 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -983,6 +983,13 @@ struct file_lock { #include +extern void send_sigio(struct fown_struct *fown, int fd, int band); + +/* fs/sync.c */ +extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, + loff_t endbyte, unsigned int flags); + +#ifdef CONFIG_FILE_LOCKING extern int fcntl_getlk(struct file *, struct flock __user *); extern int fcntl_setlk(unsigned int, struct file *, unsigned int, struct flock __user *); @@ -993,14 +1000,9 @@ extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, struct flock64 __user *); #endif -extern void send_sigio(struct fown_struct *fown, int fd, int band); extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); extern int fcntl_getlease(struct file *filp); -/* fs/sync.c */ -extern int do_sync_mapping_range(struct address_space 
*mapping, loff_t offset, - loff_t endbyte, unsigned int flags); - /* fs/locks.c */ extern void locks_init_lock(struct file_lock *); extern void locks_copy_lock(struct file_lock *, struct file_lock *); @@ -1023,6 +1025,37 @@ extern int lease_modify(struct file_lock **, int); extern int lock_may_read(struct inode *, loff_t start, unsigned long count); extern int lock_may_write(struct inode *, loff_t start, unsigned long count); extern struct seq_operations locks_seq_operations; +#else /* !CONFIG_FILE_LOCKING */ +#define fcntl_getlk(a, b) ({ -EINVAL; }) +#define fcntl_setlk(a, b, c, d) ({ -EACCES; }) +#if BITS_PER_LONG == 32 +#define fcntl_getlk64(a, b) ({ -EINVAL; }) +#define fcntl_setlk64(a, b, c, d) ({ -EACCES; }) +#endif +#define fcntl_setlease(a, b, c) ({ 0; }) +#define fcntl_getlease(a) ({ 0; }) +#define locks_init_lock(a) ({ }) +#define __locks_copy_lock(a, b) ({ }) +#define locks_copy_lock(a, b) ({ }) +#define locks_remove_posix(a, b) ({ }) +#define locks_remove_flock(a) ({ }) +#define posix_test_lock(a, b) ({ 0; }) +#define posix_lock_file(a, b, c) ({ -ENOLCK; }) +#define posix_lock_file_wait(a, b) ({ -ENOLCK; }) +#define posix_unblock_lock(a, b) (-ENOENT) +#define vfs_test_lock(a, b) ({ 0; }) +#define vfs_lock_file(a, b, c, d) (-ENOLCK) +#define vfs_cancel_lock(a, b) ({ 0; }) +#define flock_lock_file_wait(a, b) ({ -ENOLCK; }) +#define __break_lease(a, b) ({ 0; }) +#define lease_get_mtime(a, b) ({ }) +#define generic_setlease(a, b, c) ({ -EINVAL; }) +#define vfs_setlease(a, b, c) ({ -EINVAL; }) +#define lease_modify(a, b) ({ -EINVAL; }) +#define lock_may_read(a, b, c) ({ 1; }) +#define lock_may_write(a, b, c) ({ 1; }) +#endif /* !CONFIG_FILE_LOCKING */ + struct fasync_struct { int magic; @@ -1554,9 +1587,12 @@ extern int vfs_statfs(struct dentry *, struct kstatfs *); /* /sys/fs */ extern struct kobject *fs_kobj; +extern int rw_verify_area(int, struct file *, loff_t *, size_t); + #define FLOCK_VERIFY_READ 1 #define FLOCK_VERIFY_WRITE 2 +#ifdef CONFIG_FILE_LOCKING extern int locks_mandatory_locked(struct inode *); extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); @@ -1587,8 +1623,6 @@ static inline int locks_verify_locked(struct inode *inode) return 0; } -extern int rw_verify_area(int, struct file *, loff_t *, size_t); - static inline int locks_verify_truncate(struct inode *inode, struct file *filp, loff_t size) @@ -1609,6 +1643,15 @@ static inline int break_lease(struct inode *inode, unsigned int mode) return __break_lease(inode, mode); return 0; } +#else /* !CONFIG_FILE_LOCKING */ +#define locks_mandatory_locked(a) ({ 0; }) +#define locks_mandatory_area(a, b, c, d, e) ({ 0; }) +#define __mandatory_lock(a) ({ 0; }) +#define mandatory_lock(a) ({ 0; }) +#define locks_verify_locked(a) ({ 0; }) +#define locks_verify_truncate(a, b, c) ({ 0; }) +#define break_lease(a, b) ({ 0; }) +#endif /* CONFIG_FILE_LOCKING */ /* fs/open.c */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 08d6e1bb99a..503d8d4eb80 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -125,6 +125,7 @@ cond_syscall(sys_vm86old); cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); +cond_syscall(sys_flock); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 50ec0886fa3..4588b2cf2ec 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,7 +97,7 @@ static int sixty = 60; static int neg_one = -1; #endif -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) && 
defined(CONFIG_FILE_LOCKING) static int two = 2; #endif @@ -1261,6 +1261,7 @@ static struct ctl_table fs_table[] = { .extra1 = &minolduid, .extra2 = &maxolduid, }, +#ifdef CONFIG_FILE_LOCKING { .ctl_name = FS_LEASES, .procname = "leases-enable", @@ -1269,6 +1270,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif #ifdef CONFIG_DNOTIFY { .ctl_name = FS_DIR_NOTIFY, @@ -1280,6 +1282,7 @@ static struct ctl_table fs_table[] = { }, #endif #ifdef CONFIG_MMU +#ifdef CONFIG_FILE_LOCKING { .ctl_name = FS_LEASE_TIME, .procname = "lease-break-time", @@ -1291,6 +1294,7 @@ static struct ctl_table fs_table[] = { .extra1 = &zero, .extra2 = &two, }, +#endif { .procname = "aio-nr", .data = &aio_nr, From 8e40741494bd08a15af17b8cfd26a1f7ad1f5081 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 12 Aug 2008 20:42:51 +0300 Subject: [PATCH 05/59] nfsd: properly xdr-encode stateid4.seqid as uint32_t for cb_recall Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 702fa577aa6..ab13e02c568 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); WRITE32(OP_CB_RECALL); - WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t)); + WRITE32(cb_rec->cbr_stateid.si_generation); + WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); WRITE32(cb_rec->cbr_trunc); WRITE32(len); WRITEMEM(cb_rec->cbr_fhval, len); From c47b2ca42e848e2dce122acbefa0de59320f5683 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 12 Aug 2008 20:43:37 +0300 Subject: [PATCH 06/59] nfsd: properly xdr-encode deleg stateid returned from open Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 14ba4d9b285..d7e4bd50ed3 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2149,7 +2149,9 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op break; case NFS4_OPEN_DELEGATE_READ: RESERVE_SPACE(20 + sizeof(stateid_t)); - WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + WRITE32(open->op_delegate_stateid.si_generation); + WRITEMEM(&open->op_delegate_stateid.si_opaque, + sizeof(stateid_opaque_t)); WRITE32(open->op_recall); /* @@ -2163,7 +2165,9 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op break; case NFS4_OPEN_DELEGATE_WRITE: RESERVE_SPACE(32 + sizeof(stateid_t)); - WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + WRITE32(open->op_delegate_stateid.si_generation); + WRITEMEM(&open->op_delegate_stateid.si_opaque, + sizeof(stateid_opaque_t)); WRITE32(0); /* From 5033b77a931a12bc7395c1834fa50f6d477be3ae Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 12 Aug 2008 20:44:41 +0300 Subject: [PATCH 07/59] nfsd: fix nfsd4_encode_open buffer space reservation nfsd4_encode_open first reservation is currently for 36 + sizeof(stateid_t) while it writes after the stateid a cinfo (20 bytes) and 5 more 4-bytes words, for a total of 40 + sizeof(stateid_t). Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d7e4bd50ed3..ace39349670 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2133,7 +2133,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op if (nfserr) goto out; - RESERVE_SPACE(36 + sizeof(stateid_t)); + RESERVE_SPACE(40 + sizeof(stateid_t)); WRITE32(open->op_stateid.si_generation); WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t)); WRITECINFO(open->op_cinfo); From e2f282b9f0538e4f63255ffa35bf3b902f5fbde2 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 12 Aug 2008 20:45:07 +0300 Subject: [PATCH 08/59] nfsd: nfs4xdr encode_stateid helper function Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 76 ++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 44 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ace39349670..c560af8c69d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1950,6 +1950,17 @@ fail: return -EINVAL; } +static void +nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) +{ + ENCODE_HEAD; + + RESERVE_SPACE(sizeof(stateid_t)); + WRITE32(sid->si_generation); + WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + ADJUST_ARGS(); +} + static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { @@ -1969,12 +1980,9 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(close->cl_stateid.si_generation); - WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + if (!nfserr) + nfsd4_encode_stateid(resp, &close->cl_stateid); + ENCODE_SEQID_OP_TAIL(close->cl_stateowner); return nfserr; } @@ -2074,12 +2082,9 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(4 + sizeof(stateid_t)); - WRITE32(lock->lk_resp_stateid.si_generation); - WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } else if (nfserr == nfserr_denied) + if (!nfserr) + nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); + else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); @@ -2099,13 +2104,9 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(locku->lu_stateid.si_generation); - WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } - + if (!nfserr) + nfsd4_encode_stateid(resp, &locku->lu_stateid); + ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); return nfserr; } @@ -2133,9 +2134,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op if (nfserr) goto out; - RESERVE_SPACE(40 + sizeof(stateid_t)); - WRITE32(open->op_stateid.si_generation); - WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t)); + nfsd4_encode_stateid(resp, &open->op_stateid); + RESERVE_SPACE(40); WRITECINFO(open->op_cinfo); WRITE32(open->op_rflags); WRITE32(2); @@ -2148,10 +2148,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op case NFS4_OPEN_DELEGATE_NONE: break; case NFS4_OPEN_DELEGATE_READ: - RESERVE_SPACE(20 + 
sizeof(stateid_t)); - WRITE32(open->op_delegate_stateid.si_generation); - WRITEMEM(&open->op_delegate_stateid.si_opaque, - sizeof(stateid_opaque_t)); + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(20); WRITE32(open->op_recall); /* @@ -2164,10 +2162,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op ADJUST_ARGS(); break; case NFS4_OPEN_DELEGATE_WRITE: - RESERVE_SPACE(32 + sizeof(stateid_t)); - WRITE32(open->op_delegate_stateid.si_generation); - WRITEMEM(&open->op_delegate_stateid.si_opaque, - sizeof(stateid_opaque_t)); + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(32); WRITE32(0); /* @@ -2199,13 +2195,9 @@ static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { ENCODE_SEQID_OP_HEAD; - - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(oc->oc_resp_stateid.si_generation); - WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + + if (!nfserr) + nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); return nfserr; @@ -2215,13 +2207,9 @@ static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { ENCODE_SEQID_OP_HEAD; - - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(od->od_stateid.si_generation); - WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + + if (!nfserr) + nfsd4_encode_stateid(resp, &od->od_stateid); ENCODE_SEQID_OP_TAIL(od->od_stateowner); return nfserr; From 1b6b2257dc5a88aae375ff44485f8d6c4ee800e4 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 12 Aug 2008 20:45:28 +0300 Subject: [PATCH 09/59] nfsd: don't declare p in ENCODE_SEQID_OP_HEAD After using the encode_stateid helper the "p" pointer declared by ENCODE_SEQID_OP_HEAD is warned as unused. In the single site where it is still needed it can be declared separately using the ENCODE_HEAD macro. Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c560af8c69d..1b81f1656bc 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1183,7 +1183,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) * Header routine to setup seqid operation replay cache */ #define ENCODE_SEQID_OP_HEAD \ - __be32 *p; \ __be32 *save; \ \ save = resp->p; @@ -2129,6 +2128,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { + ENCODE_HEAD; ENCODE_SEQID_OP_HEAD; if (nfserr) From 5bf8c6911fe88bea4f9850007796fbacf9fbfb88 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 12 Aug 2008 20:45:51 +0300 Subject: [PATCH 10/59] nfsd: properly xdr-decode NFS4_OPEN_CLAIM_DELEGATE_CUR stateid Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1b81f1656bc..48a0cc17d5c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -679,7 +679,9 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) break; case NFS4_OPEN_CLAIM_DELEGATE_CUR: READ_BUF(sizeof(stateid_t) + 4); - COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + READ32(open->op_delegate_stateid.si_generation); + COPYMEM(&open->op_delegate_stateid.si_opaque, + sizeof(stateid_opaque_t)); READ32(open->op_fname.len); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); From e31a1b662f40fd460e982ef87582c66c51596cd0 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 12 Aug 2008 20:46:18 +0300 Subject: [PATCH 11/59] nfsd: nfs4xdr decode_stateid helper function Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 99 ++++++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 48a0cc17d5c..afcdf4b7684 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -412,6 +412,18 @@ out_nfserr: goto out; } +static __be32 +nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + READ32(sid->si_generation); + COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) { @@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) DECODE_HEAD; close->cl_stateowner = NULL; - READ_BUF(4 + sizeof(stateid_t)); + READ_BUF(4); READ32(close->cl_seqid); - READ32(close->cl_stateid.si_generation); - COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t)); + return nfsd4_decode_stateid(argp, &close->cl_stateid); DECODE_TAIL; } @@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create static inline __be32 nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) { - DECODE_HEAD; - - READ_BUF(sizeof(stateid_t)); - READ32(dr->dr_stateid.si_generation); - COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t)); - - DECODE_TAIL; + return nfsd4_decode_stateid(argp, &dr->dr_stateid); } static inline __be32 @@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) READ32(lock->lk_is_new); if (lock->lk_is_new) { - READ_BUF(36); + READ_BUF(4); READ32(lock->lk_new_open_seqid); - READ32(lock->lk_new_open_stateid.si_generation); - - COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); + if (status) + return status; + READ_BUF(8 + sizeof(clientid_t)); READ32(lock->lk_new_lock_seqid); COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); READ32(lock->lk_new_owner.len); READ_BUF(lock->lk_new_owner.len); READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); } else { - READ_BUF(20); - READ32(lock->lk_old_lock_stateid.si_generation); - COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + READ_BUF(4); READ32(lock->lk_old_lock_seqid); } @@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) DECODE_HEAD; 
locku->lu_stateowner = NULL; - READ_BUF(24 + sizeof(stateid_t)); + READ_BUF(8); READ32(locku->lu_type); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) goto xdr_error; READ32(locku->lu_seqid); - READ32(locku->lu_stateid.si_generation); - COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + if (status) + return status; + READ_BUF(16); READ64(locku->lu_offset); READ64(locku->lu_length); @@ -678,10 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ32(open->op_delegate_type); break; case NFS4_OPEN_CLAIM_DELEGATE_CUR: - READ_BUF(sizeof(stateid_t) + 4); - READ32(open->op_delegate_stateid.si_generation); - COPYMEM(&open->op_delegate_stateid.si_opaque, - sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + READ_BUF(4); READ32(open->op_fname.len); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); @@ -701,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con DECODE_HEAD; open_conf->oc_stateowner = NULL; - READ_BUF(4 + sizeof(stateid_t)); - READ32(open_conf->oc_req_stateid.si_generation); - COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + if (status) + return status; + READ_BUF(4); READ32(open_conf->oc_seqid); DECODE_TAIL; @@ -715,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d DECODE_HEAD; open_down->od_stateowner = NULL; - READ_BUF(12 + sizeof(stateid_t)); - READ32(open_down->od_stateid.si_generation); - COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + if (status) + return status; + READ_BUF(12); READ32(open_down->od_seqid); READ32(open_down->od_share_access); READ32(open_down->od_share_deny); @@ -745,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) { DECODE_HEAD; - READ_BUF(sizeof(stateid_t) + 12); - READ32(read->rd_stateid.si_generation); - COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &read->rd_stateid); + if (status) + return status; + READ_BUF(12); READ64(read->rd_offset); READ32(read->rd_length); @@ -836,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) { - DECODE_HEAD; + __be32 status; - READ_BUF(sizeof(stateid_t)); - READ32(setattr->sa_stateid.si_generation); - COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t)); - if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl))) - goto out; - - DECODE_TAIL; + status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + if (status) + return status; + return nfsd4_decode_fattr(argp, setattr->sa_bmval, + &setattr->sa_iattr, &setattr->sa_acl); } static __be32 @@ -929,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) int len; DECODE_HEAD; - READ_BUF(sizeof(stateid_opaque_t) + 20); - READ32(write->wr_stateid.si_generation); - COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &write->wr_stateid); + if (status) + return status; + READ_BUF(16); READ64(write->wr_offset); READ32(write->wr_stable_how); if 
(write->wr_stable_how > 2) From 54a66e548079f12a6f54c3cae96812a9ed9b54ae Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 13 Aug 2008 22:03:27 -0400 Subject: [PATCH 12/59] knfsd: allocate readahead cache in individual chunks I had a report from someone building a large NFS server that they were unable to start more than 585 nfsd threads. It was reported against an older kernel using the slab allocator, and I tracked it down to the large allocation in nfsd_racache_init failing. It appears that the slub allocator handles large allocations better, but large contiguous allocations can often be problematic. There doesn't seem to be any reason that the racache has to be allocated as a single large chunk. This patch breaks this up so that the racache is built up from separate allocations. (Thanks also to Takashi Iwai for a bugfix.) Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields Cc: Takashi Iwai --- fs/nfsd/vfs.c | 59 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 1319e8027d5..aa1d0d6489a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -83,7 +83,6 @@ struct raparm_hbucket { spinlock_t pb_lock; } ____cacheline_aligned_in_smp; -static struct raparms * raparml; #define RAPARM_HASH_BITS 4 #define RAPARM_HASH_SIZE (1<p_next; + kfree(last_raparm); + } + raparm_hash[i].pb_head = NULL; + } } /* * Initialize readahead param cache @@ -1981,35 +1989,38 @@ nfsd_racache_init(int cache_size) int i; int j = 0; int nperbucket; + struct raparms **raparm = NULL; - if (raparml) + if (raparm_hash[0].pb_head) return 0; - if (cache_size < 2*RAPARM_HASH_SIZE) - cache_size = 2*RAPARM_HASH_SIZE; - raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); - - if (!raparml) { - printk(KERN_WARNING - "nfsd: Could not allocate memory read-ahead cache.\n"); - return -ENOMEM; - } + nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); + if (nperbucket < 2) + nperbucket = 2; + cache_size = nperbucket * RAPARM_HASH_SIZE; dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); - for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { - raparm_hash[i].pb_head = NULL; + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { spin_lock_init(&raparm_hash[i].pb_lock); - } - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - for (i = 0; i < cache_size - 1; i++) { - if (i % nperbucket == 0) - raparm_hash[j++].pb_head = raparml + i; - if (i % nperbucket < nperbucket-1) - raparml[i].p_next = raparml + i + 1; + + raparm = &raparm_hash[i].pb_head; + for (j = 0; j < nperbucket; j++) { + *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); + if (!*raparm) + goto out_nomem; + raparm = &(*raparm)->p_next; + } + *raparm = NULL; } nfsdstats.ra_size = cache_size; return 0; + +out_nomem: + dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); + nfsd_racache_shutdown(); + return -ENOMEM; } #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) From 8fafa90082ab18859d97627fc454edf12f7efbff Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 24 Jan 2008 11:11:34 -0500 Subject: [PATCH 13/59] locks: allow lockd to process blocked locks during grace period The check here is currently harmless but unnecessary, since, as the comment notes, there aren't any blocked-lock callbacks to process during the grace period anyway. 
And eventually we want to allow multiple grace periods that come and go for different filesystems over the course of the lifetime of lockd, at which point this check is just going to get in the way. Signed-off-by: J. Bruce Fields --- fs/lockd/svc.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 1553fecc567..bdc607bb25e 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -158,15 +158,9 @@ lockd(void *vrqstp) continue; } - /* - * Retry any blocked locks that have been notified by - * the VFS. Don't do this during grace period. - * (Theoretically, there shouldn't even be blocked locks - * during grace period). - */ - if (!nlmsvc_grace_period) { - timeout = nlmsvc_retry_blocked(); - } else if (time_before(grace_period_expire, jiffies)) + timeout = nlmsvc_retry_blocked(); + + if (time_before(grace_period_expire, jiffies)) clear_grace_period(); /* From c8ab5f2a13fb41a878863c61a1e27d78f1844b5e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 18 Mar 2008 19:00:19 -0400 Subject: [PATCH 14/59] lockd: don't depend on lockd main loop to end grace End lockd's grace period using schedule_delayed_work() instead of a check on every pass through the main loop. After a later patch, we'll depend on lockd to end its grace period even if it's not currently handling requests; so it shouldn't depend on being woken up from the main loop to do so. Also, Nakano Hiroaki (who independently produced a similar patch) noticed that the current behavior is buggy in the face of jiffies wraparound: "lockd uses time_before() to determine whether the grace period has expired. This would seem to be enough to avoid timer wrap-around issues, but, unfortunately, that is not the case. The time_* family of comparison functions can be safely used to compare jiffies relatively close in time, but they stop working after approximately LONG_MAX/2 ticks. nfsd can suffer this problem because the time_before() comparison in lockd() is not performed until the first request comes in, which means that if there is no lockd traffic for more than LONG_MAX/2 ticks we are screwed. "The implication of this is that once time_before() starts misbehaving any attempt from a NFS client to execute fcntl() will be received with a NLM_LCK_DENIED_GRACE_PERIOD message for 25 days (assuming HZ=1000). In other words, the 50 seconds grace period could turn into a grace period of 50 days or more. "Note: This bug was analyzed independently by Oda-san and myself." Signed-off-by: J. 
Bruce Fields Cc: Nakano Hiroaki Cc: Itsuro Oda --- fs/lockd/svc.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index bdc607bb25e..f345ef7fb8a 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -97,17 +97,22 @@ unsigned long get_nfs_grace_period(void) } EXPORT_SYMBOL(get_nfs_grace_period); -static unsigned long set_grace_period(void) -{ - nlmsvc_grace_period = 1; - return get_nfs_grace_period() + jiffies; -} - -static inline void clear_grace_period(void) +static void grace_ender(struct work_struct *not_used) { nlmsvc_grace_period = 0; } +static DECLARE_DELAYED_WORK(grace_period_end, grace_ender); + +static void set_grace_period(void) +{ + unsigned long grace_period = get_nfs_grace_period() + jiffies; + + nlmsvc_grace_period = 1; + cancel_delayed_work_sync(&grace_period_end); + schedule_delayed_work(&grace_period_end, grace_period); +} + /* * This is the lockd kernel thread */ @@ -116,7 +121,6 @@ lockd(void *vrqstp) { int err = 0, preverr = 0; struct svc_rqst *rqstp = vrqstp; - unsigned long grace_period_expire; /* try_to_freeze() is called from svc_recv() */ set_freezable(); @@ -139,7 +143,7 @@ lockd(void *vrqstp) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; - grace_period_expire = set_grace_period(); + set_grace_period(); /* * The main request loop. We don't terminate until the last @@ -153,16 +157,13 @@ lockd(void *vrqstp) flush_signals(current); if (nlmsvc_ops) { nlmsvc_invalidate_all(); - grace_period_expire = set_grace_period(); + set_grace_period(); } continue; } timeout = nlmsvc_retry_blocked(); - if (time_before(grace_period_expire, jiffies)) - clear_grace_period(); - /* * Find a socket with data available and call its * recvfrom routine. @@ -189,6 +190,7 @@ lockd(void *vrqstp) svc_process(rqstp); } flush_signals(current); + cancel_delayed_work_sync(&grace_period_end); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); From b6632339e3afbcbb438a3c8935190ea22464fc99 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Aug 2008 19:33:44 -0400 Subject: [PATCH 15/59] SUNRPC: Set V6ONLY socket option for RPC listener sockets My plan is to use an AF_INET listener on systems that support only IPv4, and an AF_INET6 listener on systems that can support IPv6. Incoming IPv4 packets will be posted to an AF_INET6 listener with a mapped IPv4 address. Max Matveev says: Creating a single listener can be dangerous - if net.ipv6.bindv6only is enabled then it's possible to create another listener in v4 namespace on the same port and steal the traffic from the "unifed" listener. You need to disable V6ONLY explicitly via a sockopt to stop that. Set appropriate socket option on RPC server listener sockets to prevent this. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 3e65719f1ef..f91377c1495 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1114,6 +1114,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int val; dprintk("svc: svc_setup_socket %p\n", sock); if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { @@ -1146,6 +1147,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); + /* + * We start one listener per sv_serv. 
We want AF_INET + * requests to be automatically shunted to our AF_INET6 + * listener using a mapped IPv4 address. Make sure + * no-one starts an equivalent IPv4 listener, which + * would steal our incoming connections. + */ + val = 0; + if (serv->sv_family == AF_INET6) + kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, + (char *)&val, sizeof(val)); + dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); From 14aeb2118d6e9fd9ee988324c740a00c80979093 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Aug 2008 19:34:00 -0400 Subject: [PATCH 16/59] SUNRPC: Simplify rpcb_register() API Bruce suggested there's no need to expose the difference between an error sending the PMAP_SET request and an error reply from the portmapper to rpcb_register's callers. The user space equivalent of rpcb_register() is pmap_set(3), which returns a bool_t : either the PMAP set worked, or it didn't. Simple. So let's remove the "*okay" argument from rpcb_register() and rpcb_v4_register(), and simply return an error if any part of the call didn't work. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/clnt.h | 4 +-- net/sunrpc/rpcb_clnt.c | 65 ++++++++++++++++--------------------- net/sunrpc/svc.c | 8 ++--- 3 files changed, 32 insertions(+), 45 deletions(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index e5bfe01ee30..8ac8e75243a 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -124,10 +124,10 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); -int rpcb_register(u32, u32, int, unsigned short, int *); +int rpcb_register(u32, u32, int, unsigned short); int rpcb_v4_register(const u32 program, const u32 version, const struct sockaddr *address, - const char *netid, int *result); + const char *netid); int rpcb_getport_sync(struct sockaddr_in *, u32, u32, int); void rpcb_getport_async(struct rpc_task *); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 24db2b4d12d..cc7250d4659 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -176,13 +176,12 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, } static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, - u32 version, struct rpc_message *msg, - int *result) + u32 version, struct rpc_message *msg) { struct rpc_clnt *rpcb_clnt; - int error = 0; + int result, error = 0; - *result = 0; + msg->rpc_resp = &result; rpcb_clnt = rpcb_create_local(addr, addrlen, version); if (!IS_ERR(rpcb_clnt)) { @@ -191,12 +190,19 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, } else error = PTR_ERR(rpcb_clnt); - if (error < 0) + if (error < 0) { printk(KERN_WARNING "RPC: failed to contact local rpcbind " "server (errno %d).\n", -error); - dprintk("RPC: registration status %d/%d\n", error, *result); + return error; + } - return error; + if (!result) { + dprintk("RPC: registration failed\n"); + return -EACCES; + } + + dprintk("RPC: registration succeeded\n"); + return 0; } /** @@ -205,7 +211,11 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, * @vers: RPC version number to bind * @prot: transport protocol to register * @port: port value to register - * @okay: OUT: result code + * + * Returns zero if the registration request was dispatched successfully + * and the rpcbind daemon returned success. 
Otherwise, returns an errno + * value that reflects the nature of the error (request could not be + * dispatched, timed out, or rpcbind returned an error). * * RPC services invoke this function to advertise their contact * information via the system's rpcbind daemon. RPC services @@ -217,15 +227,6 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, * all registered transports for [program, version] from the local * rpcbind database. * - * Returns zero if the registration request was dispatched - * successfully and a reply was received. The rpcbind daemon's - * boolean result code is stored in *okay. - * - * Returns an errno value and sets *result to zero if there was - * some problem that prevented the rpcbind request from being - * dispatched, or if the rpcbind daemon did not respond within - * the timeout. - * * This function uses rpcbind protocol version 2 to contact the * local rpcbind daemon. * @@ -236,7 +237,7 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, * IN6ADDR_ANY (ie available for all AF_INET and AF_INET6 * addresses). */ -int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) +int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) { struct rpcbind_args map = { .r_prog = prog, @@ -246,7 +247,6 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) }; struct rpc_message msg = { .rpc_argp = &map, - .rpc_resp = okay, }; dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " @@ -259,7 +259,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, sizeof(rpcb_inaddr_loopback), - RPCBVERS_2, &msg, okay); + RPCBVERS_2, &msg); } /* @@ -290,7 +290,7 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register, return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, sizeof(rpcb_inaddr_loopback), - RPCBVERS_4, msg, msg->rpc_resp); + RPCBVERS_4, msg); } /* @@ -321,7 +321,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback, sizeof(rpcb_in6addr_loopback), - RPCBVERS_4, msg, msg->rpc_resp); + RPCBVERS_4, msg); } /** @@ -330,7 +330,11 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, * @version: RPC version number of service to (un)register * @address: address family, IP address, and port to (un)register * @netid: netid of transport protocol to (un)register - * @result: result code from rpcbind RPC call + * + * Returns zero if the registration request was dispatched successfully + * and the rpcbind daemon returned success. Otherwise, returns an errno + * value that reflects the nature of the error (request could not be + * dispatched, timed out, or rpcbind returned an error). * * RPC services invoke this function to advertise their contact * information via the system's rpcbind daemon. RPC services @@ -342,15 +346,6 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, * to zero. Callers pass a netid of "" to unregister all * transport netids associated with [program, version, address]. * - * Returns zero if the registration request was dispatched - * successfully and a reply was received. The rpcbind daemon's - * result code is stored in *result. 
- * - * Returns an errno value and sets *result to zero if there was - * some problem that prevented the rpcbind request from being - * dispatched, or if the rpcbind daemon did not respond within - * the timeout. - * * This function uses rpcbind protocol version 4 to contact the * local rpcbind daemon. The local rpcbind daemon must support * version 4 of the rpcbind protocol in order for these functions @@ -372,8 +367,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, * advertises the service on all IPv4 and IPv6 addresses. */ int rpcb_v4_register(const u32 program, const u32 version, - const struct sockaddr *address, const char *netid, - int *result) + const struct sockaddr *address, const char *netid) { struct rpcbind_args map = { .r_prog = program, @@ -383,11 +377,8 @@ int rpcb_v4_register(const u32 program, const u32 version, }; struct rpc_message msg = { .rpc_argp = &map, - .rpc_resp = result, }; - *result = 0; - switch (address->sa_family) { case AF_INET: return rpcb_register_netid4((struct sockaddr_in *)address, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 9ba17044109..9805143d066 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -730,7 +730,7 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) struct svc_program *progp; unsigned long flags; unsigned int i; - int error = 0, dummy; + int error = 0; if (!port) clear_thread_flag(TIF_SIGPENDING); @@ -751,13 +751,9 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) if (progp->pg_vers[i]->vs_hidden) continue; - error = rpcb_register(progp->pg_prog, i, proto, port, &dummy); + error = rpcb_register(progp->pg_prog, i, proto, port); if (error < 0) break; - if (port && !dummy) { - error = -EACCES; - break; - } } } From 7252d575ab0e8771269a3d245c36a05ace5152bd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Aug 2008 19:34:08 -0400 Subject: [PATCH 17/59] SUNRPC: Split portmap unregister API into separate function Create a separate server-level interface for unregistering RPC services. The mechanics of, and the API for, registering and unregistering RPC services will diverge further as support for IPv6 is added. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- net/sunrpc/svc.c | 62 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 9805143d066..9eb78a771da 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -28,6 +28,8 @@ #define RPCDBG_FACILITY RPCDBG_SVCDSP +static void svc_unregister(const struct svc_serv *serv); + #define svc_serv_is_pooled(serv) ((serv)->sv_function) /* @@ -417,9 +419,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, spin_lock_init(&pool->sp_lock); } - /* Remove any stale portmap registrations */ - svc_register(serv, 0, 0); + svc_unregister(serv); return serv; } @@ -487,8 +488,7 @@ svc_destroy(struct svc_serv *serv) if (svc_serv_is_pooled(serv)) svc_pool_map_put(); - /* Unregister service with the portmapper */ - svc_register(serv, 0, 0); + svc_unregister(serv); kfree(serv->sv_pools); kfree(serv); } @@ -728,12 +728,10 @@ int svc_register(struct svc_serv *serv, int proto, unsigned short port) { struct svc_program *progp; - unsigned long flags; unsigned int i; int error = 0; - if (!port) - clear_thread_flag(TIF_SIGPENDING); + BUG_ON(proto == 0 && port == 0); for (progp = serv->sv_program; progp; progp = progp->pg_next) { for (i = 0; i < progp->pg_nvers; i++) { @@ -757,13 +755,53 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) } } - if (!port) { - spin_lock_irqsave(¤t->sighand->siglock, flags); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return error; +} + +/* + * All transport protocols and ports for this service are removed + * from the local rpcbind database if the service is not hidden. + * + * The result of unregistration is reported via dprintk for those + * who want verification of the result, but is otherwise not + * important. + * + * The local rpcbind daemon listens on either only IPv6 or only + * IPv4. The kernel can't tell how it's configured. However, + * AF_INET addresses are mapped to AF_INET6 in IPv6-only config- + * urations, so even an unregistration request on AF_INET will + * get to a local rpcbind daemon listening only on AF_INET6. So + * we always unregister via AF_INET. + * + * At this point we don't need rpcbind version 4 for unregis- + * tration: A v2 UNSET request will clear all transports (netids), + * addresses, and address families for [program, version]. + */ +static void svc_unregister(const struct svc_serv *serv) +{ + struct svc_program *progp; + unsigned long flags; + unsigned int i; + int error; + + clear_thread_flag(TIF_SIGPENDING); + + for (progp = serv->sv_program; progp; progp = progp->pg_next) { + for (i = 0; i < progp->pg_nvers; i++) { + if (progp->pg_vers[i] == NULL) + continue; + if (progp->pg_vers[i]->vs_hidden) + continue; + + error = rpcb_register(progp->pg_prog, i, 0, 0); + dprintk("svc: svc_unregister(%sv%u), error %d\n", + progp->pg_name, i, error); + } } - return error; + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); } /* From a26cfad6e0a308a2c68df1f1ef50aabd48b17e6d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Aug 2008 19:34:16 -0400 Subject: [PATCH 18/59] SUNRPC: Support IPv6 when registering kernel RPC services In order to advertise NFS-related services on IPv6 interfaces via rpcbind, the kernel RPC server implementation must use rpcb_v4_register() instead of rpcb_register(). 
A new kernel build option allows distributions to use the legacy v2 call until they integrate an appropriate user-space rpcbind daemon that can support IPv6 RPC services. I tried adding some automatic logic to fall back if registering with a v4 protocol request failed, but there are too many corner cases. So I just made it a compile-time switch that distributions can throw when they've replaced portmapper with rpcbind. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/Kconfig | 22 +++++++++ include/linux/sunrpc/svc.h | 4 +- net/sunrpc/svc.c | 95 +++++++++++++++++++++++++++++++++++--- 3 files changed, 113 insertions(+), 8 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index c6ae4d4842e..ed57a5a3725 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1773,6 +1773,28 @@ config SUNRPC_XPRT_RDMA If unsure, say N. +config SUNRPC_REGISTER_V4 + bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)" + depends on SUNRPC && EXPERIMENTAL + default n + help + Sun added support for registering RPC services at an IPv6 + address by creating two new versions of the rpcbind protocol + (RFC 1833). + + This option enables support in the kernel RPC server for + registering kernel RPC services via version 4 of the rpcbind + protocol. If you enable this option, you must run a portmapper + daemon that supports rpcbind protocol version 4. + + Serving NFS over IPv6 from knfsd (the kernel's NFS server) + requires that you enable this option and use a portmapper that + supports rpcbind version 4. + + If unsure, say N to get traditional behavior (register kernel + RPC services using only rpcbind version 2). Distributions + using the legacy Linux portmapper daemon must say N here. + config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 23143f38b12..54a79e1ad63 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -393,7 +393,9 @@ struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); -int svc_register(struct svc_serv *, int, unsigned short); +int svc_register(const struct svc_serv *, const unsigned short, + const unsigned short); + void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 9eb78a771da..c43ccb62805 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -719,13 +719,92 @@ svc_exit_thread(struct svc_rqst *rqstp) } EXPORT_SYMBOL(svc_exit_thread); +#ifdef CONFIG_SUNRPC_REGISTER_V4 /* - * Register an RPC service with the local portmapper. - * To unregister a service, call this routine with - * proto and port == 0. + * Registering kernel RPC services with rpcbind version 2 will work + * over either IPv4 or IPv6, since the Linux kernel always registers + * services for the "any" address. + * + * However, the local rpcbind daemon listens on either only AF_INET + * or AF_INET6 (never both). When it listens on AF_INET6, an rpcbind + * version 2 registration will result in registering the service at + * IN6ADDR_ANY, even if the RPC service being registered is not + * IPv6-enabled. + * + * Rpcbind version 4 allows us to be a little more specific. 
Kernel + * RPC services that don't yet support AF_INET6 can register + * themselves as IPv4-only with the local rpcbind daemon, even if the + * daemon is listening only on AF_INET6. + * + * And, registering IPv6-enabled kernel RPC services via AF_INET6 + * verifies that the local user space rpcbind daemon is properly + * configured to support remote AF_INET6 rpcbind requests. + * + * An AF_INET6 registration request will fail if the local rpcbind + * daemon is not set up to listen on AF_INET6. Likewise, we fail + * AF_INET6 registration requests if svc_register() is configured to + * support only rpcbind version 2. */ -int -svc_register(struct svc_serv *serv, int proto, unsigned short port) +static int __svc_register(const u32 program, const u32 version, + const sa_family_t family, + const unsigned short protocol, + const unsigned short port) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + .sin_port = htons(port), + }; + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; + struct sockaddr *sap; + char *netid; + + switch (family) { + case AF_INET: + sap = (struct sockaddr *)&sin; + netid = RPCBIND_NETID_TCP; + if (protocol == IPPROTO_UDP) + netid = RPCBIND_NETID_UDP; + break; + case AF_INET6: + sap = (struct sockaddr *)&sin6; + netid = RPCBIND_NETID_TCP6; + if (protocol == IPPROTO_UDP) + netid = RPCBIND_NETID_UDP6; + break; + default: + return -EAFNOSUPPORT; + } + + return rpcb_v4_register(program, version, sap, netid); +} +#else +static int __svc_register(const u32 program, const u32 version, + sa_family_t family, + const unsigned short protocol, + const unsigned short port) +{ + if (family != AF_INET) + return -EAFNOSUPPORT; + + return rpcb_register(program, version, protocol, port); +} +#endif + +/** + * svc_register - register an RPC service with the local portmapper + * @serv: svc_serv struct for the service to register + * @proto: transport protocol number to advertise + * @port: port to advertise + * + * Service is registered for any address in serv's address family + */ +int svc_register(const struct svc_serv *serv, const unsigned short proto, + const unsigned short port) { struct svc_program *progp; unsigned int i; @@ -738,8 +817,9 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) if (progp->pg_vers[i] == NULL) continue; - dprintk("svc: svc_register(%s, %s, %d, %d)%s\n", + dprintk("svc: svc_register(%s, %u, %s, %u, %d)%s\n", progp->pg_name, + serv->sv_family, proto == IPPROTO_UDP? "udp" : "tcp", port, i, @@ -749,7 +829,8 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) if (progp->pg_vers[i]->vs_hidden) continue; - error = rpcb_register(progp->pg_prog, i, proto, port); + error = __svc_register(progp->pg_prog, i, + serv->sv_family, proto, port); if (error < 0) break; } From c2526f42711d93f3455f92a82b5e586880fc44be Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 27 Aug 2008 16:57:15 -0400 Subject: [PATCH 19/59] NLM: Clean up before introducing new debugging messages We're about to introduce some extra debugging messages in nlm_lookup_host(). Bring the coding style up to date first so we can cleanly introduce the new debugging messages. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- fs/lockd/host.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index a17664c7eac..cb26e3d952a 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -103,16 +103,19 @@ static struct nlm_host *nlm_lookup_host(int server, nlm_get_host(host); goto out; } + + /* + * The host wasn't in our hash table. If we don't + * have an NSM handle for it yet, create one. + */ if (nsm) atomic_inc(&nsm->sm_count); - - host = NULL; - - /* Sadly, the host isn't in our hash table yet. See if - * we have an NSM handle for it. If not, create one. - */ - if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len))) - goto out; + else { + host = NULL; + nsm = nsm_find(sin, hostname, hostname_len); + if (!nsm) + goto out; + } host = kzalloc(sizeof(*host), GFP_KERNEL); if (!host) { From 1b333c54a15a746ff6b04a684b0845a66daacef2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 27 Aug 2008 16:57:23 -0400 Subject: [PATCH 20/59] lockd: address-family independent printable addresses Knowing which source address is used for communicating with remote NLM services can be helpful for debugging configuration problems on hosts with multiple addresses. Keep the dprintk debugging here, but adapt it so it displays AF_INET6 addresses properly. There are also a couple of dprintk clean-ups as well. At some point we will aggregate the helpers that display presentation format addresses into a single set of shared helpers. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/host.c | 78 +++++++++++++++++++++++++++++-------- include/linux/lockd/lockd.h | 4 ++ 2 files changed, 65 insertions(+), 17 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index cb26e3d952a..e5dcfa57e09 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -11,12 +11,14 @@ #include #include #include +#include #include #include #include #include #include +#include #define NLMDBG_FACILITY NLMDBG_HOSTCACHE #define NLM_HOST_NRHASH 32 @@ -38,6 +40,32 @@ static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, const char *hostname, unsigned int hostname_len); +static void nlm_display_address(const struct sockaddr *sap, + char *buf, const size_t len) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + + switch (sap->sa_family) { + case AF_UNSPEC: + snprintf(buf, len, "unspecified"); + break; + case AF_INET: + snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr)); + break; + case AF_INET6: + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + snprintf(buf, len, NIPQUAD_FMT, + NIPQUAD(sin6->sin6_addr.s6_addr32[3])); + else + snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr)); + break; + default: + snprintf(buf, len, "unsupported address family"); + break; + } +} + /* * Common host lookup routine for server & client */ @@ -54,14 +82,10 @@ static struct nlm_host *nlm_lookup_host(int server, struct nsm_handle *nsm = NULL; int hash; - dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT - ", p=%d, v=%u, my role=%s, name=%.*s)\n", - NIPQUAD(ssin->sin_addr.s_addr), - NIPQUAD(sin->sin_addr.s_addr), proto, version, - server? "server" : "client", - hostname_len, - hostname? hostname : ""); - + dprintk("lockd: nlm_lookup_host(proto=%d, vers=%u," + " my role is %s, hostname=%.*s)\n", + proto, version, server ? "server" : "client", + hostname_len, hostname ? 
hostname : ""); hash = NLM_ADDRHASH(sin->sin_addr.s_addr); @@ -101,6 +125,8 @@ static struct nlm_host *nlm_lookup_host(int server, hlist_add_head(&host->h_hash, chain); nlm_get_host(host); + dprintk("lockd: nlm_lookup_host found host %s (%s)\n", + host->h_name, host->h_addrbuf); goto out; } @@ -113,13 +139,17 @@ static struct nlm_host *nlm_lookup_host(int server, else { host = NULL; nsm = nsm_find(sin, hostname, hostname_len); - if (!nsm) + if (!nsm) { + dprintk("lockd: nlm_lookup_host failed; " + "no nsm handle\n"); goto out; + } } host = kzalloc(sizeof(*host), GFP_KERNEL); if (!host) { nsm_release(nsm); + dprintk("lockd: nlm_lookup_host failed; no memory\n"); goto out; } host->h_name = nsm->sm_name; @@ -146,6 +176,15 @@ static struct nlm_host *nlm_lookup_host(int server, INIT_LIST_HEAD(&host->h_reclaim); nrhosts++; + + nlm_display_address((struct sockaddr *)&host->h_addr, + host->h_addrbuf, sizeof(host->h_addrbuf)); + nlm_display_address((struct sockaddr *)&host->h_saddr, + host->h_saddrbuf, sizeof(host->h_saddrbuf)); + + dprintk("lockd: nlm_lookup_host created host %s\n", + host->h_name); + out: mutex_unlock(&nlm_host_mutex); return host; @@ -210,9 +249,8 @@ nlm_bind_host(struct nlm_host *host) { struct rpc_clnt *clnt; - dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n", - NIPQUAD(host->h_saddr.sin_addr), - NIPQUAD(host->h_addr.sin_addr)); + dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", + host->h_name, host->h_addrbuf, host->h_saddrbuf); /* Lock host handle */ mutex_lock(&host->h_mutex); @@ -224,7 +262,7 @@ nlm_bind_host(struct nlm_host *host) if (time_after_eq(jiffies, host->h_nextrebind)) { rpc_force_rebind(clnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; - dprintk("lockd: next rebind in %ld jiffies\n", + dprintk("lockd: next rebind in %lu jiffies\n", host->h_nextrebind - jiffies); } } else { @@ -327,12 +365,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin, struct nsm_handle *nsm; struct nlm_host *host; - dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n", - hostname, NIPQUAD(sin->sin_addr)); - /* Find the NSM handle for this peer */ - if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0))) + nsm = __nsm_find(sin, hostname, hostname_len, 0); + if (nsm == NULL) { + dprintk("lockd: never saw rebooted peer '%.*s' before\n", + hostname_len, hostname); return; + } + + dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n", + hostname_len, hostname, nsm->sm_addrbuf); /* When reclaiming locks on this peer, make sure that * we set up a new notification */ @@ -516,6 +558,8 @@ retry: nsm->sm_name = (char *) (nsm + 1); memcpy(nsm->sm_name, hostname, hostname_len); nsm->sm_name[hostname_len] = '\0'; + nlm_display_address((struct sockaddr *)&nsm->sm_addr, + nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf)); atomic_set(&nsm->sm_count, 1); goto retry; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index dbb87ab282e..0691efbd0b3 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -61,6 +61,9 @@ struct nlm_host { struct list_head h_granted; /* Locks in GRANTED state */ struct list_head h_reclaim; /* Locks in RECLAIM state */ struct nsm_handle * h_nsmhandle; /* NSM status handle */ + + char h_addrbuf[48], /* address eyecatchers */ + h_saddrbuf[48]; }; struct nsm_handle { @@ -70,6 +73,7 @@ struct nsm_handle { struct sockaddr_in sm_addr; unsigned int sm_monitored : 1, sm_sticky : 1; /* don't unmonitor */ + char sm_addrbuf[48]; /* address eyecatcher */ }; /* From 2860a0227b700feb8d6e5c4f07a62a1b40d96022 Mon Sep 17 00:00:00 2001 
From: Chuck Lever Date: Wed, 27 Aug 2008 16:57:31 -0400 Subject: [PATCH 21/59] lockd: Specify address family for source address Make sure an address family is specified for source addresses passed to nlm_lookup_host(). nlm_lookup_host() will need this when it becomes capable of dealing with AF_INET6 addresses. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/host.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index e5dcfa57e09..22423abea28 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -220,10 +220,12 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, const char *hostname, unsigned int hostname_len) { - struct sockaddr_in ssin = {0}; + const struct sockaddr_in source = { + .sin_family = AF_UNSPEC, + }; return nlm_lookup_host(0, sin, proto, version, - hostname, hostname_len, &ssin); + hostname, hostname_len, &source); } /* @@ -233,12 +235,14 @@ struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *rqstp, const char *hostname, unsigned int hostname_len) { - struct sockaddr_in ssin = {0}; + const struct sockaddr_in source = { + .sin_family = AF_INET, + .sin_addr = rqstp->rq_daddr.addr, + }; - ssin.sin_addr = rqstp->rq_daddr.addr; return nlm_lookup_host(1, svc_addr_in(rqstp), rqstp->rq_prot, rqstp->rq_vers, - hostname, hostname_len, &ssin); + hostname, hostname_len, &source); } /* From 396cb3d003c2ce72b50c8c06fddfbb7516f30eb1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 27 Aug 2008 16:57:38 -0400 Subject: [PATCH 22/59] lockd: Add address family-agnostic helper for zeroing the port number Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/host.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 22423abea28..008e4026f54 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -40,6 +40,18 @@ static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, const char *hostname, unsigned int hostname_len); +static void nlm_clear_port(struct sockaddr *sap) +{ + switch (sap->sa_family) { + case AF_INET: + ((struct sockaddr_in *)sap)->sin_port = 0; + break; + case AF_INET6: + ((struct sockaddr_in6 *)sap)->sin6_port = 0; + break; + } +} + static void nlm_display_address(const struct sockaddr *sap, char *buf, const size_t len) { @@ -154,7 +166,7 @@ static struct nlm_host *nlm_lookup_host(int server, } host->h_name = nsm->sm_name; host->h_addr = *sin; - host->h_addr.sin_port = 0; /* ouch! */ + nlm_clear_port((struct sockaddr *)&host->h_addr); host->h_saddr = *ssin; host->h_version = version; host->h_proto = proto; From 5344b12d4f97d4a9a62d806425977a6ff64b6baf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 27 Aug 2008 16:57:46 -0400 Subject: [PATCH 23/59] SUNRPC: Make svc_addr's argument a constant Clean up: Add extra type safety and squelch a few compiler complaints in upcoming patches. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 54a79e1ad63..3afe7fb403b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -266,17 +266,17 @@ struct svc_rqst { /* * Rigorous type checking on sockaddr type conversions */ -static inline struct sockaddr_in *svc_addr_in(struct svc_rqst *rqst) +static inline struct sockaddr_in *svc_addr_in(const struct svc_rqst *rqst) { return (struct sockaddr_in *) &rqst->rq_addr; } -static inline struct sockaddr_in6 *svc_addr_in6(struct svc_rqst *rqst) +static inline struct sockaddr_in6 *svc_addr_in6(const struct svc_rqst *rqst) { return (struct sockaddr_in6 *) &rqst->rq_addr; } -static inline struct sockaddr *svc_addr(struct svc_rqst *rqst) +static inline struct sockaddr *svc_addr(const struct svc_rqst *rqst) { return (struct sockaddr *) &rqst->rq_addr; } From b4ed58fd34d4def88bda59f9cc566ec9fca6a096 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Sep 2008 14:35:39 -0400 Subject: [PATCH 24/59] lockd: Use sockaddr_storage + length for h_addr field To store larger addresses in the nlm_host structure, make h_addr a sockaddr_storage, and add an address length field. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/clntlock.c | 2 +- fs/lockd/host.c | 11 ++++++----- include/linux/lockd/lockd.h | 16 +++++++++++++++- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 0b45fd3a4bf..0df5587f804 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -166,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) + if (!nlm_cmp_addr(nlm_addr_in(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 008e4026f54..8c7022eeae6 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -116,7 +116,7 @@ static struct nlm_host *nlm_lookup_host(int server, */ chain = &nlm_hosts[hash]; hlist_for_each_entry(host, pos, chain, h_hash) { - if (!nlm_cmp_addr(&host->h_addr, sin)) + if (!nlm_cmp_addr(nlm_addr_in(host), sin)) continue; /* See if we have an NSM handle for this client */ @@ -165,8 +165,9 @@ static struct nlm_host *nlm_lookup_host(int server, goto out; } host->h_name = nsm->sm_name; - host->h_addr = *sin; - nlm_clear_port((struct sockaddr *)&host->h_addr); + memcpy(nlm_addr(host), sin, sizeof(*sin)); + host->h_addrlen = sizeof(*sin); + nlm_clear_port(nlm_addr(host)); host->h_saddr = *ssin; host->h_version = version; host->h_proto = proto; @@ -291,8 +292,8 @@ nlm_bind_host(struct nlm_host *host) }; struct rpc_create_args args = { .protocol = host->h_proto, - .address = (struct sockaddr *)&host->h_addr, - .addrsize = sizeof(host->h_addr), + .address = nlm_addr(host), + .addrsize = host->h_addrlen, .saddress = (struct sockaddr *)&host->h_saddr, .timeout = &timeparms, .servername = host->h_name, diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 0691efbd0b3..41d7a8e61ce 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -38,7 +38,8 @@ */ struct nlm_host { struct hlist_node h_hash; /* doubly linked list */ - struct sockaddr_in h_addr; /* peer address */ + struct sockaddr_storage h_addr; /* peer address */ + size_t 
h_addrlen; struct sockaddr_in h_saddr; /* our address (optional) */ struct rpc_clnt * h_rpcclnt; /* RPC client to talk to peer */ char * h_name; /* remote hostname */ @@ -76,6 +77,19 @@ struct nsm_handle { char sm_addrbuf[48]; /* address eyecatcher */ }; +/* + * Rigorous type checking on sockaddr type conversions + */ +static inline struct sockaddr_in *nlm_addr_in(const struct nlm_host *host) +{ + return (struct sockaddr_in *)&host->h_addr; +} + +static inline struct sockaddr *nlm_addr(const struct nlm_host *host) +{ + return (struct sockaddr *)&host->h_addr; +} + /* * Map an fl_owner_t into a unique 32-bit "pid" */ From 90151e6e4d00a3150d03d52170c246734b274622 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Sep 2008 14:35:46 -0400 Subject: [PATCH 25/59] lockd: Use sockaddr_storage for h_saddr field To store larger addresses in the nlm_host structure, make h_saddr a sockaddr_storage. And let's call it something more self-explanatory: "saddr" could easily be mistaken for "server address". Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/host.c | 12 ++++++------ fs/lockd/svcsubs.c | 2 +- include/linux/lockd/lockd.h | 14 ++++++++++++-- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 8c7022eeae6..3ce2702d036 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -129,7 +129,7 @@ static struct nlm_host *nlm_lookup_host(int server, continue; if (host->h_server != server) continue; - if (!nlm_cmp_addr(&host->h_saddr, ssin)) + if (!nlm_cmp_addr(nlm_srcaddr_in(host), ssin)) continue; /* Move to head of hash chain. */ @@ -168,7 +168,7 @@ static struct nlm_host *nlm_lookup_host(int server, memcpy(nlm_addr(host), sin, sizeof(*sin)); host->h_addrlen = sizeof(*sin); nlm_clear_port(nlm_addr(host)); - host->h_saddr = *ssin; + memcpy(nlm_srcaddr(host), ssin, sizeof(*ssin)); host->h_version = version; host->h_proto = proto; host->h_rpcclnt = NULL; @@ -192,8 +192,8 @@ static struct nlm_host *nlm_lookup_host(int server, nlm_display_address((struct sockaddr *)&host->h_addr, host->h_addrbuf, sizeof(host->h_addrbuf)); - nlm_display_address((struct sockaddr *)&host->h_saddr, - host->h_saddrbuf, sizeof(host->h_saddrbuf)); + nlm_display_address((struct sockaddr *)&host->h_srcaddr, + host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf)); dprintk("lockd: nlm_lookup_host created host %s\n", host->h_name); @@ -267,7 +267,7 @@ nlm_bind_host(struct nlm_host *host) struct rpc_clnt *clnt; dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", - host->h_name, host->h_addrbuf, host->h_saddrbuf); + host->h_name, host->h_addrbuf, host->h_srcaddrbuf); /* Lock host handle */ mutex_lock(&host->h_mutex); @@ -294,7 +294,7 @@ nlm_bind_host(struct nlm_host *host) .protocol = host->h_proto, .address = nlm_addr(host), .addrsize = host->h_addrlen, - .saddress = (struct sockaddr *)&host->h_saddr, + .saddress = nlm_srcaddr(host), .timeout = &timeparms, .servername = host->h_name, .program = &nlm_program, diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 198b4e55b37..d3d1330d7c2 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); static int nlmsvc_match_ip(void *datap, struct nlm_host *host) { - return nlm_cmp_addr(&host->h_saddr, datap); + return nlm_cmp_addr(nlm_srcaddr_in(host), datap); } /** diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 41d7a8e61ce..964e6c93830 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -40,7 +40,7 @@ 
struct nlm_host { struct hlist_node h_hash; /* doubly linked list */ struct sockaddr_storage h_addr; /* peer address */ size_t h_addrlen; - struct sockaddr_in h_saddr; /* our address (optional) */ + struct sockaddr_storage h_srcaddr; /* our address (optional) */ struct rpc_clnt * h_rpcclnt; /* RPC client to talk to peer */ char * h_name; /* remote hostname */ u32 h_version; /* interface version */ @@ -64,7 +64,7 @@ struct nlm_host { struct nsm_handle * h_nsmhandle; /* NSM status handle */ char h_addrbuf[48], /* address eyecatchers */ - h_saddrbuf[48]; + h_srcaddrbuf[48]; }; struct nsm_handle { @@ -90,6 +90,16 @@ static inline struct sockaddr *nlm_addr(const struct nlm_host *host) return (struct sockaddr *)&host->h_addr; } +static inline struct sockaddr_in *nlm_srcaddr_in(const struct nlm_host *host) +{ + return (struct sockaddr_in *)&host->h_srcaddr; +} + +static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host) +{ + return (struct sockaddr *)&host->h_srcaddr; +} + /* * Map an fl_owner_t into a unique 32-bit "pid" */ From 7e9d7746bfd40121438b155023793796499497d8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Sep 2008 14:35:54 -0400 Subject: [PATCH 26/59] NSM: Use sockaddr_storage for sm_addr field To store larger addresses in the nsm_handle structure, make sm_addr a sockaddr_storage. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/host.c | 5 +++-- fs/lockd/mon.c | 2 +- include/linux/lockd/lockd.h | 13 ++++++++++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 3ce2702d036..510ebcf485f 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -551,7 +551,7 @@ retry: if (strlen(pos->sm_name) != hostname_len || memcmp(pos->sm_name, hostname, hostname_len)) continue; - } else if (!nlm_cmp_addr(&pos->sm_addr, sin)) + } else if (!nlm_cmp_addr(nsm_addr_in(pos), sin)) continue; atomic_inc(&pos->sm_count); kfree(nsm); @@ -571,7 +571,8 @@ retry: if (nsm == NULL) return NULL; - nsm->sm_addr = *sin; + memcpy(nsm_addr(nsm), sin, sizeof(*sin)); + nsm->sm_addrlen = sizeof(*sin); nsm->sm_name = (char *) (nsm + 1); memcpy(nsm->sm_name, hostname, hostname_len); nsm->sm_name[hostname_len] = '\0'; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index e4d563543b1..4e7e958e8f6 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) memset(&args, 0, sizeof(args)); args.mon_name = nsm->sm_name; - args.addr = nsm->sm_addr.sin_addr.s_addr; + args.addr = nsm_addr_in(nsm)->sin_addr.s_addr; args.prog = NLM_PROGRAM; args.vers = 3; args.proc = NLMPROC_NSM_NOTIFY; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 964e6c93830..b1dfa0b1d1b 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -71,7 +71,8 @@ struct nsm_handle { struct list_head sm_link; atomic_t sm_count; char * sm_name; - struct sockaddr_in sm_addr; + struct sockaddr_storage sm_addr; + size_t sm_addrlen; unsigned int sm_monitored : 1, sm_sticky : 1; /* don't unmonitor */ char sm_addrbuf[48]; /* address eyecatcher */ @@ -100,6 +101,16 @@ static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host) return (struct sockaddr *)&host->h_srcaddr; } +static inline struct sockaddr_in *nsm_addr_in(const struct nsm_handle *handle) +{ + return (struct sockaddr_in *)&handle->sm_addr; +} + +static inline struct sockaddr *nsm_addr(const struct nsm_handle *handle) +{ + return (struct sockaddr *)&handle->sm_addr; +} + /* * Map an 
fl_owner_t into a unique 32-bit "pid" */ From 781b61a6f4ff94cb8c14cf598b547f5d5c490969 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Sep 2008 14:36:01 -0400 Subject: [PATCH 27/59] lockd: Teach nlm_cmp_addr() to support AF_INET6 addresses Update the nlm_cmp_addr() helper to support AF_INET6 as well as AF_INET addresses. New version takes two "struct sockaddr *" arguments instead of "struct sockaddr_in *" arguments. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/clntlock.c | 3 ++- fs/lockd/host.c | 6 +++--- fs/lockd/svcsubs.c | 2 +- include/linux/lockd/lockd.h | 38 ++++++++++++++++++++++++++++++++----- 4 files changed, 39 insertions(+), 10 deletions(-) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 0df5587f804..237224a3c42 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -166,7 +166,8 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(nlm_addr_in(block->b_host), addr)) + if (!nlm_cmp_addr(nlm_addr(block->b_host), + (struct sockaddr *)addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 510ebcf485f..dbf3fe620a0 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -116,7 +116,7 @@ static struct nlm_host *nlm_lookup_host(int server, */ chain = &nlm_hosts[hash]; hlist_for_each_entry(host, pos, chain, h_hash) { - if (!nlm_cmp_addr(nlm_addr_in(host), sin)) + if (!nlm_cmp_addr(nlm_addr(host), (struct sockaddr *)sin)) continue; /* See if we have an NSM handle for this client */ @@ -129,7 +129,7 @@ static struct nlm_host *nlm_lookup_host(int server, continue; if (host->h_server != server) continue; - if (!nlm_cmp_addr(nlm_srcaddr_in(host), ssin)) + if (!nlm_cmp_addr(nlm_srcaddr(host), (struct sockaddr *)ssin)) continue; /* Move to head of hash chain. 
*/ @@ -551,7 +551,7 @@ retry: if (strlen(pos->sm_name) != hostname_len || memcmp(pos->sm_name, hostname, hostname_len)) continue; - } else if (!nlm_cmp_addr(nsm_addr_in(pos), sin)) + } else if (!nlm_cmp_addr(nsm_addr(pos), (struct sockaddr *)sin)) continue; atomic_inc(&pos->sm_count); kfree(nsm); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index d3d1330d7c2..34c2766e27c 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); static int nlmsvc_match_ip(void *datap, struct nlm_host *host) { - return nlm_cmp_addr(nlm_srcaddr_in(host), datap); + return nlm_cmp_addr(nlm_srcaddr(host), datap); } /** diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index b1dfa0b1d1b..ec8af115843 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -12,6 +12,8 @@ #ifdef __KERNEL__ #include +#include +#include #include #include #include @@ -272,15 +274,41 @@ static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) return file->f_file->f_path.dentry->d_inode; } -/* - * Compare two host addresses (needs modifying for ipv6) - */ -static inline int nlm_cmp_addr(const struct sockaddr_in *sin1, - const struct sockaddr_in *sin2) +static inline int __nlm_cmp_addr4(const struct sockaddr *sap1, + const struct sockaddr *sap2) { + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2; return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } +static inline int __nlm_cmp_addr6(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; + return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); +} + +/* + * Compare two host addresses + * + * Return TRUE if the addresses are the same; otherwise FALSE. + */ +static inline int nlm_cmp_addr(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + if (sap1->sa_family == sap2->sa_family) { + switch (sap1->sa_family) { + case AF_INET: + return __nlm_cmp_addr4(sap1, sap2); + case AF_INET6: + return __nlm_cmp_addr6(sap1, sap2); + } + } + return 0; +} + /* * Compare two NLM locks. * When the second lock is of type F_UNLCK, this acts like a wildcard. From ede2fea099cf1dabe41e5b9563558bc7aee82248 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Sep 2008 14:36:08 -0400 Subject: [PATCH 28/59] lockd: Support AF_INET6 when hashing addresses in nlm_lookup_host Adopt an approach similar to the RPC server's auth cache (from Aurelien Charbon and Brian Haley). Note nlm_lookup_host()'s existing IP address hash function has the same issue with correctness on little-endian systems as the original IPv4 auth cache hash function, so I've also updated it with a hash function similar to the new auth cache hash function. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- fs/lockd/host.c | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index dbf3fe620a0..1f9d72a7a03 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -22,7 +22,6 @@ #define NLMDBG_FACILITY NLMDBG_HOSTCACHE #define NLM_HOST_NRHASH 32 -#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1)) #define NLM_HOST_REBIND (60 * HZ) #define NLM_HOST_EXPIRE (300 * HZ) #define NLM_HOST_COLLECT (120 * HZ) @@ -40,6 +39,48 @@ static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, const char *hostname, unsigned int hostname_len); +/* + * Hash function must work well on big- and little-endian platforms + */ +static unsigned int __nlm_hash32(const __be32 n) +{ + unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16); + return hash ^ (hash >> 8); +} + +static unsigned int __nlm_hash_addr4(const struct sockaddr *sap) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + return __nlm_hash32(sin->sin_addr.s_addr); +} + +static unsigned int __nlm_hash_addr6(const struct sockaddr *sap) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr addr = sin6->sin6_addr; + return __nlm_hash32(addr.s6_addr32[0]) ^ + __nlm_hash32(addr.s6_addr32[1]) ^ + __nlm_hash32(addr.s6_addr32[2]) ^ + __nlm_hash32(addr.s6_addr32[3]); +} + +static unsigned int nlm_hash_address(const struct sockaddr *sap) +{ + unsigned int hash; + + switch (sap->sa_family) { + case AF_INET: + hash = __nlm_hash_addr4(sap); + break; + case AF_INET6: + hash = __nlm_hash_addr6(sap); + break; + default: + hash = 0; + } + return hash & (NLM_HOST_NRHASH - 1); +} + static void nlm_clear_port(struct sockaddr *sap) { switch (sap->sa_family) { @@ -92,16 +133,12 @@ static struct nlm_host *nlm_lookup_host(int server, struct hlist_node *pos; struct nlm_host *host; struct nsm_handle *nsm = NULL; - int hash; dprintk("lockd: nlm_lookup_host(proto=%d, vers=%u," " my role is %s, hostname=%.*s)\n", proto, version, server ? "server" : "client", hostname_len, hostname ? hostname : ""); - hash = NLM_ADDRHASH(sin->sin_addr.s_addr); - - /* Lock hash table */ mutex_lock(&nlm_host_mutex); if (time_after_eq(jiffies, next_gc)) @@ -114,7 +151,7 @@ static struct nlm_host *nlm_lookup_host(int server, * different NLM rpc_clients into one single nlm_host object. * This would allow us to have one nlm_host per address. */ - chain = &nlm_hosts[hash]; + chain = &nlm_hosts[nlm_hash_address((struct sockaddr *)sin)]; hlist_for_each_entry(host, pos, chain, h_hash) { if (!nlm_cmp_addr(nlm_addr(host), (struct sockaddr *)sin)) continue; From bc48e4d6371137b1b06e985ea76c1254e9c06e83 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Sep 2008 14:36:16 -0400 Subject: [PATCH 29/59] lockd: Combine __nsm_find() and nsm_find(). Clean up: Having two separate functions doesn't add clarity, so eliminate one of them. Use contemporary kernel coding conventions where appropriate. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- fs/lockd/host.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 1f9d72a7a03..b9eeafe99a6 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -31,13 +31,11 @@ static unsigned long next_gc; static int nrhosts; static DEFINE_MUTEX(nlm_host_mutex); - static void nlm_gc_hosts(void); -static struct nsm_handle * __nsm_find(const struct sockaddr_in *, - const char *, unsigned int, int); -static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, - const char *hostname, - unsigned int hostname_len); +static struct nsm_handle *nsm_find(const struct sockaddr_in *sin, + const char *hostname, + const size_t hostname_len, + const int create); /* * Hash function must work well on big- and little-endian platforms @@ -187,7 +185,7 @@ static struct nlm_host *nlm_lookup_host(int server, atomic_inc(&nsm->sm_count); else { host = NULL; - nsm = nsm_find(sin, hostname, hostname_len); + nsm = nsm_find(sin, hostname, hostname_len, 1); if (!nsm) { dprintk("lockd: nlm_lookup_host failed; " "no nsm handle\n"); @@ -419,8 +417,7 @@ void nlm_host_rebooted(const struct sockaddr_in *sin, struct nsm_handle *nsm; struct nlm_host *host; - /* Find the NSM handle for this peer */ - nsm = __nsm_find(sin, hostname, hostname_len, 0); + nsm = nsm_find(sin, hostname, hostname_len, 0); if (nsm == NULL) { dprintk("lockd: never saw rebooted peer '%.*s' before\n", hostname_len, hostname); @@ -560,10 +557,10 @@ nlm_gc_hosts(void) static LIST_HEAD(nsm_handles); static DEFINE_SPINLOCK(nsm_lock); -static struct nsm_handle * -__nsm_find(const struct sockaddr_in *sin, - const char *hostname, unsigned int hostname_len, - int create) +static struct nsm_handle *nsm_find(const struct sockaddr_in *sin, + const char *hostname, + const size_t hostname_len, + const int create) { struct nsm_handle *nsm = NULL; struct nsm_handle *pos; @@ -575,7 +572,7 @@ __nsm_find(const struct sockaddr_in *sin, if (printk_ratelimit()) { printk(KERN_WARNING "Invalid hostname \"%.*s\" " "in NFS lock request\n", - hostname_len, hostname); + (int)hostname_len, hostname); } return NULL; } @@ -623,13 +620,6 @@ found: return nsm; } -static struct nsm_handle * -nsm_find(const struct sockaddr_in *sin, const char *hostname, - unsigned int hostname_len) -{ - return __nsm_find(sin, hostname, hostname_len, 1); -} - /* * Release an NSM handle */ From e018040a824ab48211a1fcb86acebc9fc84759b0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Sep 2008 14:36:23 -0400 Subject: [PATCH 30/59] lockd: Update nsm_find() to support non-AF_INET addresses Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- fs/lockd/host.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index b9eeafe99a6..be8f19d5318 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -32,7 +32,8 @@ static int nrhosts; static DEFINE_MUTEX(nlm_host_mutex); static void nlm_gc_hosts(void); -static struct nsm_handle *nsm_find(const struct sockaddr_in *sin, +static struct nsm_handle *nsm_find(const struct sockaddr *sap, + const size_t salen, const char *hostname, const size_t hostname_len, const int create); @@ -185,7 +186,8 @@ static struct nlm_host *nlm_lookup_host(int server, atomic_inc(&nsm->sm_count); else { host = NULL; - nsm = nsm_find(sin, hostname, hostname_len, 1); + nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), + hostname, hostname_len, 1); if (!nsm) { dprintk("lockd: nlm_lookup_host failed; " "no nsm handle\n"); @@ -417,7 +419,8 @@ void nlm_host_rebooted(const struct sockaddr_in *sin, struct nsm_handle *nsm; struct nlm_host *host; - nsm = nsm_find(sin, hostname, hostname_len, 0); + nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), + hostname, hostname_len, 0); if (nsm == NULL) { dprintk("lockd: never saw rebooted peer '%.*s' before\n", hostname_len, hostname); @@ -557,7 +560,8 @@ nlm_gc_hosts(void) static LIST_HEAD(nsm_handles); static DEFINE_SPINLOCK(nsm_lock); -static struct nsm_handle *nsm_find(const struct sockaddr_in *sin, +static struct nsm_handle *nsm_find(const struct sockaddr *sap, + const size_t salen, const char *hostname, const size_t hostname_len, const int create) @@ -565,7 +569,7 @@ static struct nsm_handle *nsm_find(const struct sockaddr_in *sin, struct nsm_handle *nsm = NULL; struct nsm_handle *pos; - if (!sin) + if (!sap) return NULL; if (hostname && memchr(hostname, '/', hostname_len) != NULL) { @@ -585,7 +589,7 @@ retry: if (strlen(pos->sm_name) != hostname_len || memcmp(pos->sm_name, hostname, hostname_len)) continue; - } else if (!nlm_cmp_addr(nsm_addr(pos), (struct sockaddr *)sin)) + } else if (!nlm_cmp_addr(nsm_addr(pos), sap)) continue; atomic_inc(&pos->sm_count); kfree(nsm); @@ -605,8 +609,8 @@ retry: if (nsm == NULL) return NULL; - memcpy(nsm_addr(nsm), sin, sizeof(*sin)); - nsm->sm_addrlen = sizeof(*sin); + memcpy(nsm_addr(nsm), sap, salen); + nsm->sm_addrlen = salen; nsm->sm_name = (char *) (nsm + 1); memcpy(nsm->sm_name, hostname, hostname_len); nsm->sm_name[hostname_len] = '\0'; From 2c7eb0b206b8408d92c518033a359f4374c75314 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 15 Sep 2008 16:27:23 -0500 Subject: [PATCH 31/59] SUNRPC: Register both netids for AF_INET6 servers TI-RPC is a user-space library of RPC functions that replaces ONC RPC and allows RPC to operate in the new world of IPv6. TI-RPC combines the concept of a transport protocol (UDP and TCP) and a protocol family (PF_INET and PF_INET6) into a single identifier called a "netid." For example, "udp" means UDP over IPv4, and "udp6" means UDP over IPv6. For rpcbind, then, the RPC service tuple that is registered and advertised is: [RPC program, RPC version, service address and port, netid] instead of [RPC program, RPC version, port, protocol] Service address is typically ANYADDR, but can be a specific address of one of the interfaces on a multi-homed host. The third item in the new tuple is expressed as a universal address. The current Linux rpcbind implementation registers a netid for both protocol families when RPCB_SET is done for just the PF_INET6 version of the netid (ie udp6 or tcp6). 
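For illustration only (this helper is not part of the patch and its name is made up), a universal address is just the ANY address with the port appended as two decimal octets, formatted the same way the rpcb client does elsewhere in this series:

/* Illustrative only: universal address for the IPv6 ANY address */
static void example_universal_address(char *buf, const size_t len,
				      const unsigned short port)
{
	snprintf(buf, len, "::.%u.%u", port >> 8, port & 0xff);
}

For port 2049 this yields "::.8.1" (2049 is 0x0801, so the two octets are 8 and 1). The question, then, is whether that automatic extra "udp" registration is actually the right thing to do.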
So registering "udp6" causes a registration for "udp" to appear automatically as well. We've recently determined that this is incorrect behavior. In the TI-RPC world, "udp6" is not meant to imply that the registered RPC service handles requests from AF_INET as well, even if the listener socket does address mapping. "udp" and "udp6" are entirely separate capabilities, and must be registered separately. The Linux kernel, unlike TI-RPC, leverages address mapping to allow a single listener socket to handle requests for both AF_INET and AF_INET6. This is still OK, but the kernel currently assumes registering "udp6" will cover "udp" as well. It registers only "udp6" for it's AF_INET6 services, even though they handle both AF_INET and AF_INET6 on the same port. So svc_register() actually needs to register both "udp" and "udp6" explicitly (and likewise for TCP). Until rpcbind is fixed, the kernel can ignore the return code for the second RPCB_SET call. Please merge this with commit 15231312: SUNRPC: Support IPv6 when registering kernel RPC services Signed-off-by: Chuck Lever Cc: Olaf Kirch Signed-off-by: J. Bruce Fields --- net/sunrpc/svc.c | 145 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 101 insertions(+), 44 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c43ccb62805..b8d2fcd0f71 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -720,69 +720,125 @@ svc_exit_thread(struct svc_rqst *rqstp) EXPORT_SYMBOL(svc_exit_thread); #ifdef CONFIG_SUNRPC_REGISTER_V4 + /* - * Registering kernel RPC services with rpcbind version 2 will work - * over either IPv4 or IPv6, since the Linux kernel always registers - * services for the "any" address. + * Register an "inet" protocol family netid with the local + * rpcbind daemon via an rpcbind v4 SET request. * - * However, the local rpcbind daemon listens on either only AF_INET - * or AF_INET6 (never both). When it listens on AF_INET6, an rpcbind - * version 2 registration will result in registering the service at - * IN6ADDR_ANY, even if the RPC service being registered is not - * IPv6-enabled. + * No netconfig infrastructure is available in the kernel, so + * we map IP_ protocol numbers to netids by hand. * - * Rpcbind version 4 allows us to be a little more specific. Kernel - * RPC services that don't yet support AF_INET6 can register - * themselves as IPv4-only with the local rpcbind daemon, even if the - * daemon is listening only on AF_INET6. - * - * And, registering IPv6-enabled kernel RPC services via AF_INET6 - * verifies that the local user space rpcbind daemon is properly - * configured to support remote AF_INET6 rpcbind requests. - * - * An AF_INET6 registration request will fail if the local rpcbind - * daemon is not set up to listen on AF_INET6. Likewise, we fail - * AF_INET6 registration requests if svc_register() is configured to - * support only rpcbind version 2. + * Returns zero on success; a negative errno value is returned + * if any error occurs. 
*/ -static int __svc_register(const u32 program, const u32 version, - const sa_family_t family, - const unsigned short protocol, - const unsigned short port) +static int __svc_rpcb_register4(const u32 program, const u32 version, + const unsigned short protocol, + const unsigned short port) { struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; + char *netid; + + switch (protocol) { + case IPPROTO_UDP: + netid = RPCBIND_NETID_UDP; + break; + case IPPROTO_TCP: + netid = RPCBIND_NETID_TCP; + break; + default: + return -EPROTONOSUPPORT; + } + + return rpcb_v4_register(program, version, + (struct sockaddr *)&sin, netid); +} + +/* + * Register an "inet6" protocol family netid with the local + * rpcbind daemon via an rpcbind v4 SET request. + * + * No netconfig infrastructure is available in the kernel, so + * we map IP_ protocol numbers to netids by hand. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. + */ +static int __svc_rpcb_register6(const u32 program, const u32 version, + const unsigned short protocol, + const unsigned short port) +{ struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, .sin6_port = htons(port), }; - struct sockaddr *sap; char *netid; + switch (protocol) { + case IPPROTO_UDP: + netid = RPCBIND_NETID_UDP6; + break; + case IPPROTO_TCP: + netid = RPCBIND_NETID_TCP6; + break; + default: + return -EPROTONOSUPPORT; + } + + return rpcb_v4_register(program, version, + (struct sockaddr *)&sin6, netid); +} + +/* + * Register a kernel RPC service via rpcbind version 4. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. + */ +static int __svc_register(const u32 program, const u32 version, + const sa_family_t family, + const unsigned short protocol, + const unsigned short port) +{ + int error; + switch (family) { case AF_INET: - sap = (struct sockaddr *)&sin; - netid = RPCBIND_NETID_TCP; - if (protocol == IPPROTO_UDP) - netid = RPCBIND_NETID_UDP; - break; + return __svc_rpcb_register4(program, version, + protocol, port); case AF_INET6: - sap = (struct sockaddr *)&sin6; - netid = RPCBIND_NETID_TCP6; - if (protocol == IPPROTO_UDP) - netid = RPCBIND_NETID_UDP6; - break; - default: - return -EAFNOSUPPORT; + error = __svc_rpcb_register6(program, version, + protocol, port); + if (error < 0) + return error; + + /* + * Work around bug in some versions of Linux rpcbind + * which don't allow registration of both inet and + * inet6 netids. + * + * Error return ignored for now. + */ + __svc_rpcb_register4(program, version, + protocol, port); + return 0; } - return rpcb_v4_register(program, version, sap, netid); + return -EAFNOSUPPORT; } -#else + +#else /* CONFIG_SUNRPC_REGISTER_V4 */ + +/* + * Register a kernel RPC service via rpcbind version 2. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. 
+ */ static int __svc_register(const u32 program, const u32 version, sa_family_t family, const unsigned short protocol, @@ -793,7 +849,8 @@ static int __svc_register(const u32 program, const u32 version, return rpcb_register(program, version, protocol, port); } -#endif + +#endif /* CONFIG_SUNRPC_REGISTER_V4 */ /** * svc_register - register an RPC service with the local portmapper @@ -817,12 +874,12 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto, if (progp->pg_vers[i] == NULL) continue; - dprintk("svc: svc_register(%s, %u, %s, %u, %d)%s\n", + dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n", progp->pg_name, - serv->sv_family, + i, proto == IPPROTO_UDP? "udp" : "tcp", port, - i, + serv->sv_family, progp->pg_vers[i]->vs_hidden? " (but not telling portmap)" : ""); From 9d548b9c955c0709d1229d21d0bc14afa6b356de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 15 Sep 2008 16:27:30 -0500 Subject: [PATCH 32/59] SUNRPC: Use short-hand IPv6 ANYADDR for RPCB_SET Clean up: When doing an RPCB_SET, make the kernel's rpcb client use the shorthand "::" for the universal form of the IPv6 ANY address. Without this patch, rpcbind will advertise: 0000:0000:0000:0000:0000:0000:0000:0000.x.y This is cosmetic only. It cleans up the display of information from /sbin/rpcinfo. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/rpcb_clnt.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index cc7250d4659..0fa1086cf99 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -304,10 +305,13 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, char buf[64]; /* Construct AF_INET6 universal address */ - snprintf(buf, sizeof(buf), - NIP6_FMT".%u.%u", - NIP6(address_to_register->sin6_addr), - port >> 8, port & 0xff); + if (ipv6_addr_any(&address_to_register->sin6_addr)) + snprintf(buf, sizeof(buf), "::.%u.%u", + port >> 8, port & 0xff); + else + snprintf(buf, sizeof(buf), NIP6_FMT".%u.%u", + NIP6(address_to_register->sin6_addr), + port >> 8, port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " From f6fb3f6f591b50fa4f51962ad06ee0d8782e1bc8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Sep 2008 11:56:57 -0400 Subject: [PATCH 33/59] SUNRPC: Fix up svc_unregister() With the new rpcbind code, a PMAP_UNSET will not have any effect on services registered via rpcbind v3 or v4. Implement a version of svc_unregister() that uses an RPCB_UNSET with an empty netid string to make sure we have cleared *all* entries for a kernel RPC service when shutting down, or before starting a fresh instance of the service. Use the new version only when CONFIG_SUNRPC_REGISTER_V4 is enabled; otherwise, the legacy PMAP version is used to ensure complete backwards-compatibility with the Linux portmapper daemon. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- net/sunrpc/svc.c | 58 +++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b8d2fcd0f71..54c98d87684 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -896,31 +896,51 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto, return error; } +#ifdef CONFIG_SUNRPC_REGISTER_V4 + +static void __svc_unregister(const u32 program, const u32 version, + const char *progname) +{ + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = 0, + }; + int error; + + error = rpcb_v4_register(program, version, + (struct sockaddr *)&sin6, ""); + dprintk("svc: %s(%sv%u), error %d\n", + __func__, progname, version, error); +} + +#else /* CONFIG_SUNRPC_REGISTER_V4 */ + +static void __svc_unregister(const u32 program, const u32 version, + const char *progname) +{ + int error; + + error = rpcb_register(program, version, 0, 0); + dprintk("svc: %s(%sv%u), error %d\n", + __func__, progname, version, error); +} + +#endif /* CONFIG_SUNRPC_REGISTER_V4 */ + /* - * All transport protocols and ports for this service are removed - * from the local rpcbind database if the service is not hidden. + * All netids, bind addresses and ports registered for [program, version] + * are removed from the local rpcbind database (if the service is not + * hidden) to make way for a new instance of the service. * - * The result of unregistration is reported via dprintk for those - * who want verification of the result, but is otherwise not - * important. - * - * The local rpcbind daemon listens on either only IPv6 or only - * IPv4. The kernel can't tell how it's configured. However, - * AF_INET addresses are mapped to AF_INET6 in IPv6-only config- - * urations, so even an unregistration request on AF_INET will - * get to a local rpcbind daemon listening only on AF_INET6. So - * we always unregister via AF_INET. - * - * At this point we don't need rpcbind version 4 for unregis- - * tration: A v2 UNSET request will clear all transports (netids), - * addresses, and address families for [program, version]. + * The result of unregistration is reported via dprintk for those who want + * verification of the result, but is otherwise not important. */ static void svc_unregister(const struct svc_serv *serv) { struct svc_program *progp; unsigned long flags; unsigned int i; - int error; clear_thread_flag(TIF_SIGPENDING); @@ -931,9 +951,7 @@ static void svc_unregister(const struct svc_serv *serv) if (progp->pg_vers[i]->vs_hidden) continue; - error = rpcb_register(progp->pg_prog, i, 0, 0); - dprintk("svc: svc_unregister(%sv%u), error %d\n", - progp->pg_name, i, error); + __svc_unregister(progp->pg_prog, i, progp->pg_name); } } From db820d6376aa81accf5b648651e160fd76e363e2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Sep 2008 11:57:05 -0400 Subject: [PATCH 34/59] SUNRPC: Clean up debug messages in rpcb_clnt.c The RPCB XDR functions are used for multiple procedures. For instance, rpcb_encode_getaddr() is used for RPCB_GETADDR, RPCB_SET, and RPCB_UNSET. Make the XDR debug messages more generic so they are less confusing. And, unlike in other RPC consumers in the kernel, a single debug flag enables all levels of debug messages in the RPC bind client, including XDR debug messages. Since the XDR decoders already report success or failure in this case, remove redundant debug messages in the mid-level rpcb_register_call() function. 
Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/rpcb_clnt.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 0fa1086cf99..34abc91058d 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -197,12 +197,8 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, return error; } - if (!result) { - dprintk("RPC: registration failed\n"); + if (!result) return -EACCES; - } - - dprintk("RPC: registration succeeded\n"); return 0; } @@ -628,7 +624,7 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p, struct rpcbind_args *rpcb) { - dprintk("RPC: rpcb_encode_mapping(%u, %u, %d, %u)\n", + dprintk("RPC: encoding rpcb request (%u, %u, %d, %u)\n", rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port); *p++ = htonl(rpcb->r_prog); *p++ = htonl(rpcb->r_vers); @@ -643,7 +639,7 @@ static int rpcb_decode_getport(struct rpc_rqst *req, __be32 *p, unsigned short *portp) { *portp = (unsigned short) ntohl(*p++); - dprintk("RPC: rpcb_decode_getport result %u\n", + dprintk("RPC: rpcb getport result: %u\n", *portp); return 0; } @@ -652,7 +648,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p, unsigned int *boolp) { *boolp = (unsigned int) ntohl(*p++); - dprintk("RPC: rpcb_decode_set: call %s\n", + dprintk("RPC: rpcb set/unset call %s\n", (*boolp ? "succeeded" : "failed")); return 0; } @@ -660,7 +656,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p, static int rpcb_encode_getaddr(struct rpc_rqst *req, __be32 *p, struct rpcbind_args *rpcb) { - dprintk("RPC: rpcb_encode_getaddr(%u, %u, %s)\n", + dprintk("RPC: encoding rpcb request (%u, %u, %s)\n", rpcb->r_prog, rpcb->r_vers, rpcb->r_addr); *p++ = htonl(rpcb->r_prog); *p++ = htonl(rpcb->r_vers); From 97eb89bb0e5d9ab20dbc677cb18fad1421473287 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 26 Sep 2008 15:14:13 +0300 Subject: [PATCH 35/59] nfsd: do_probe_callback should not clear rpc stats Now that cb_stats are static (since commit ff7d9756b501744540be65e172d27ee321d86103) there's no need to clear them. Initially I thought it might make sense to do that every callback probing but since the stats are per-program and they are shared between possibly several client callback instances, zeroing them out seems like the wrong thing to do. Note that that commit also introduced a bug since stats.program is also being cleared in the process and it is not restored after the memset as it used to be. Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4callback.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index ab13e02c568..f7c793a5b80 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -397,9 +397,6 @@ static int do_probe_callback(void *data) addr.sin_port = htons(cb->cb_port); addr.sin_addr.s_addr = htonl(cb->cb_addr); - /* Initialize rpc_stat */ - memset(args.program->stats, 0, sizeof(struct rpc_stat)); - /* Create RPC client */ client = rpc_create(&args); if (IS_ERR(client)) { From d5b337b4877f7c4e1d761434ee04d045b0201e03 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 28 Sep 2008 09:21:26 +0300 Subject: [PATCH 36/59] nfsd: use nfs client rpc callback program since commit ff7d9756b501744540be65e172d27ee321d86103 "nfsd: use static memory for callback program and stats" do_probe_callback uses a static callback program (NFS4_CALLBACK) rather than the one set in clp->cl_callback.cb_prog as passed in by the client in setclientid (4.0) or create_session (4.1). This patches introduces rpc_create_args.prognumber that allows overriding program->number when creating rpc_clnt. Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 1 + include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index f7c793a5b80..094747a1227 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -380,6 +380,7 @@ static int do_probe_callback(void *data) .addrsize = sizeof(addr), .timeout = &timeparms, .program = &cb_program, + .prognumber = cb->cb_prog, .version = nfs_cb_version[1]->number, .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 8ac8e75243a..6f0ee1b84a4 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -104,6 +104,7 @@ struct rpc_create_args { const struct rpc_timeout *timeout; char *servername; struct rpc_program *program; + u32 prognumber; /* overrides program->number */ u32 version; rpc_authflavor_t authflavor; unsigned long flags; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 76739e928d0..da0789fa1b8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -174,7 +174,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru clnt->cl_procinfo = version->procs; clnt->cl_maxproc = version->nrprocs; clnt->cl_protname = program->name; - clnt->cl_prog = program->number; + clnt->cl_prog = args->prognumber ? : program->number; clnt->cl_vers = version->number; clnt->cl_stats = program->stats; clnt->cl_metrics = rpc_alloc_iostats(clnt); From af558e33bedab672f5cfd3260bce7445e353fe21 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 6 Sep 2007 12:34:25 -0400 Subject: [PATCH 37/59] nfsd: common grace period control Rewrite grace period code to unify management of grace period across lockd and nfsd. The current code has lockd and nfsd cooperate to compute a grace period which is satisfactory to them both, and then individually enforce it. This creates a slight race condition, since the enforcement is not coordinated. It's also more complicated than necessary. Here instead we have lockd and nfsd each inform common code when they enter the grace period, and when they're ready to leave the grace period, and allow normal locking only after both of them are ready to leave. 
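As shown below in the new fs/lockd/grace.c, the interface is three small functions. A minimal usage sketch, not taken from the patch (the example_ names are made up), of how a lock manager is expected to drive it:

#include <linux/fs.h>	/* locks_*_grace() declarations added by this patch */

static struct lock_manager example_manager;

/* service startup: enter the grace period, reclaims only */
static void example_enter_grace(void)
{
	locks_start_grace(&example_manager);
}

/* reclaim finished; safe to call more than once */
static void example_leave_grace(void)
{
	locks_end_grace(&example_manager);
}

/* request path: ordinary locks are refused while any manager is in grace */
static int example_can_grant_ordinary_lock(void)
{
	return !locks_in_grace();
}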
We also expect the locks_start_grace()/locks_end_grace() interface here to be simpler to build on for future cluster/high-availability work, which may require (for example) putting individual filesystems into grace, or enforcing grace periods across multiple cluster nodes. Signed-off-by: J. Bruce Fields --- fs/lockd/Makefile | 2 +- fs/lockd/grace.c | 59 ++++++++++++++++++++++++++++++++++++++ fs/lockd/svc.c | 20 ++++--------- fs/lockd/svc4proc.c | 12 ++++---- fs/lockd/svcproc.c | 12 ++++---- fs/nfsd/lockd.c | 1 - fs/nfsd/nfs4proc.c | 8 +++--- fs/nfsd/nfs4state.c | 34 ++++++++++------------ include/linux/fs.h | 8 ++++++ include/linux/lockd/bind.h | 9 ------ 10 files changed, 104 insertions(+), 61 deletions(-) create mode 100644 fs/lockd/grace.c diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 7725a0a9a55..97f6073ab33 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -5,6 +5,6 @@ obj-$(CONFIG_LOCKD) += lockd.o lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ - svcproc.o svcsubs.o mon.o xdr.o + svcproc.o svcsubs.o mon.o xdr.o grace.o lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c new file mode 100644 index 00000000000..183cc1f0af1 --- /dev/null +++ b/fs/lockd/grace.c @@ -0,0 +1,59 @@ +/* + * Common code for control of lockd and nfsv4 grace periods. + */ + +#include +#include + +static LIST_HEAD(grace_list); +static DEFINE_SPINLOCK(grace_lock); + +/** + * locks_start_grace + * @lm: who this grace period is for + * + * A grace period is a period during which locks should not be given + * out. Currently grace periods are only enforced by the two lock + * managers (lockd and nfsd), using the locks_in_grace() function to + * check when they are in a grace period. + * + * This function is called to start a grace period. + */ +void locks_start_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_add(&lm->list, &grace_list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_start_grace); + +/** + * locks_end_grace + * @lm: who this grace period is for + * + * Call this function to state that the given lock manager is ready to + * resume regular locking. The grace period will not end until all lock + * managers that called locks_start_grace() also call locks_end_grace(). + * Note that callers count on it being safe to call this more than once, + * and the second call should be a no-op. + */ +void locks_end_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_del_init(&lm->list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_end_grace); + +/** + * locks_in_grace + * + * Lock managers call this function to determine when it is OK for them + * to answer ordinary lock requests, and when they should accept only + * lock reclaims. 
+ */ +int locks_in_grace(void) +{ + return !list_empty(&grace_list); +} +EXPORT_SYMBOL_GPL(locks_in_grace); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index f345ef7fb8a..f013aed1153 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -51,7 +51,6 @@ static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; -int nlmsvc_grace_period; unsigned long nlmsvc_timeout; /* @@ -85,30 +84,21 @@ static unsigned long get_lockd_grace_period(void) return nlm_timeout * 5 * HZ; } -unsigned long get_nfs_grace_period(void) -{ - unsigned long lockdgrace = get_lockd_grace_period(); - unsigned long nfsdgrace = 0; - - if (nlmsvc_ops) - nfsdgrace = nlmsvc_ops->get_grace_period(); - - return max(lockdgrace, nfsdgrace); -} -EXPORT_SYMBOL(get_nfs_grace_period); +static struct lock_manager lockd_manager = { +}; static void grace_ender(struct work_struct *not_used) { - nlmsvc_grace_period = 0; + locks_end_grace(&lockd_manager); } static DECLARE_DELAYED_WORK(grace_period_end, grace_ender); static void set_grace_period(void) { - unsigned long grace_period = get_nfs_grace_period() + jiffies; + unsigned long grace_period = get_lockd_grace_period(); - nlmsvc_grace_period = 1; + locks_start_grace(&lockd_manager); cancel_delayed_work_sync(&grace_period_end); schedule_delayed_work(&grace_period_end, grace_period); } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4a714f64515..7ca617367b3 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -89,7 +89,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept test requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rc; } @@ -123,7 +123,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rc; } @@ -169,7 +169,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -202,7 +202,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -341,7 +341,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -374,7 +374,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 76262c1986f..1b013e19880 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -118,7 +118,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, 
resp->cookie = argp->cookie; /* Don't accept test requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rc; } @@ -153,7 +153,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rc; } @@ -199,7 +199,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -232,7 +232,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -373,7 +373,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -406,7 +406,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 15c6faeec77..b2786a5f9af 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -70,7 +70,6 @@ nlm_fclose(struct file *filp) static struct nlmsvc_binding nfsd_nlm_ops = { .fopen = nlm_fopen, /* open file for locking */ .fclose = nlm_fclose, /* close file */ - .get_grace_period = get_nfs4_grace_period, }; void diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e5b51ffafc6..669461e291a 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -201,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Openowner is now set, so sequence id will get bumped. 
Now we need * these checks before we do any creates: */ status = nfserr_grace; - if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) goto out; status = nfserr_no_grace; - if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) goto out; switch (open->op_claim_type) { @@ -575,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - if (nfs4_in_grace()) + if (locks_in_grace()) return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); @@ -596,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!cstate->save_fh.fh_dentry) return status; - if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags + if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1578d7a2667..0cc7ff5d5ab 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -61,7 +61,6 @@ static time_t lease_time = 90; /* default lease time */ static time_t user_lease_time = 90; static time_t boot_time; -static int in_grace = 1; static u32 current_ownerid = 1; static u32 current_fileid = 1; static u32 current_delegid = 1; @@ -1640,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta case NFS4_OPEN_CLAIM_NULL: /* Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... */ - if (nfs4_in_grace()) + if (locks_in_grace()) goto out; if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) goto out; @@ -1816,12 +1815,15 @@ out: return status; } +struct lock_manager nfsd4_manager = { +}; + static void -end_grace(void) +nfsd4_end_grace(void) { dprintk("NFSD: end of grace period\n"); nfsd4_recdir_purge_old(); - in_grace = 0; + locks_end_grace(&nfsd4_manager); } static time_t @@ -1838,8 +1840,8 @@ nfs4_laundromat(void) nfs4_lock_state(); dprintk("NFSD: laundromat service - starting\n"); - if (in_grace) - end_grace(); + if (locks_in_grace()) + nfsd4_end_grace(); list_for_each_safe(pos, next, &client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { @@ -1974,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) return nfserr_bad_stateid; else if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; - else if (nfs4_in_grace()) { + else if (locks_in_grace()) { /* Answer in remaining cases depends on existance of * conflicting state; so we must wait out the grace period. 
*/ return nfserr_grace; @@ -1993,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) static inline int io_during_grace_disallowed(struct inode *inode, int flags) { - return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE)) + return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) && mandatory_lock(inode); } @@ -2693,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, filp = lock_stp->st_vfs_file; status = nfserr_grace; - if (nfs4_in_grace() && !lock->lk_reclaim) + if (locks_in_grace() && !lock->lk_reclaim) goto out; status = nfserr_no_grace; - if (!nfs4_in_grace() && lock->lk_reclaim) + if (!locks_in_grace() && lock->lk_reclaim) goto out; locks_init_lock(&file_lock); @@ -2779,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int error; __be32 status; - if (nfs4_in_grace()) + if (locks_in_grace()) return nfserr_grace; if (check_lock_length(lockt->lt_offset, lockt->lt_length)) @@ -3192,9 +3194,9 @@ __nfs4_state_start(void) unsigned long grace_time; boot_time = get_seconds(); - grace_time = get_nfs_grace_period(); + grace_time = get_nfs4_grace_period(); lease_time = user_lease_time; - in_grace = 1; + locks_start_grace(&nfsd4_manager); printk(KERN_INFO "NFSD: starting %ld-second grace period\n", grace_time/HZ); laundry_wq = create_singlethread_workqueue("nfsd4"); @@ -3213,12 +3215,6 @@ nfs4_state_start(void) return; } -int -nfs4_in_grace(void) -{ - return in_grace; -} - time_t nfs4_lease_time(void) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 9f540165a07..27cfa723b92 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -942,6 +942,14 @@ struct lock_manager_operations { int (*fl_change)(struct file_lock **, int); }; +struct lock_manager { + struct list_head list; +}; + +void locks_start_grace(struct lock_manager *); +void locks_end_grace(struct lock_manager *); +int locks_in_grace(void); + /* that will die - we need it for nfs_lock_info */ #include diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 3d25bcd139d..1f0465c374d 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -27,7 +27,6 @@ struct nlmsvc_binding { struct nfs_fh *, struct file **); void (*fclose)(struct file *); - unsigned long (*get_grace_period)(void); }; extern struct nlmsvc_binding * nlmsvc_ops; @@ -56,12 +55,4 @@ extern int nlmclnt_proc(struct nlm_host *host, int cmd, extern int lockd_up(int proto); extern void lockd_down(void); -unsigned long get_nfs_grace_period(void); - -#ifdef CONFIG_NFSD_V4 -unsigned long get_nfs4_grace_period(void); -#else -static inline unsigned long get_nfs4_grace_period(void) {return 0;} -#endif - #endif /* LINUX_LOCKD_BIND_H */ From b2b5028905226f85075a408b1118857c9aa48bb3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 6 Feb 2008 13:59:23 -0500 Subject: [PATCH 38/59] lockd: move grace period checks to common code Do all the grace period checks in svclock.c. This simplifies the code a bit, and will ease some later changes. Signed-off-by: J. 
Bruce Fields --- fs/lockd/svc4proc.c | 15 ++------------- fs/lockd/svclock.c | 14 +++++++++++++- fs/lockd/svcproc.c | 15 ++------------- include/linux/lockd/lockd.h | 2 +- 4 files changed, 18 insertions(+), 28 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 7ca617367b3..f6f18fa5cf8 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -88,12 +88,6 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, dprintk("lockd: TEST4 called\n"); resp->cookie = argp->cookie; - /* Don't accept test requests during grace period */ - if (locks_in_grace()) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -122,12 +116,6 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; - /* Don't accept new lock requests during grace period */ - if (locks_in_grace() && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -146,7 +134,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, /* Now try to lock the file */ resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, - argp->block, &argp->cookie); + argp->block, &argp->cookie, + argp->reclaim); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index cf0d5c2c318..808d246ada4 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -360,7 +360,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) __be32 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, - struct nlm_cookie *cookie) + struct nlm_cookie *cookie, int reclaim) { struct nlm_block *block = NULL; int error; @@ -406,6 +406,11 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + if (locks_in_grace() && !reclaim) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); @@ -502,6 +507,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + if (locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } error = vfs_test_lock(file->f_file, &lock->fl); if (error == FILE_LOCK_DEFERRED) { ret = nlmsvc_defer_lock_rqst(rqstp, block); @@ -582,6 +591,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); + if (locks_in_grace()) + return nlm_lck_denied_grace_period; + mutex_lock(&file->f_mutex); block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 1b013e19880..a587b81338b 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -117,12 +117,6 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, dprintk("lockd: TEST called\n"); resp->cookie = argp->cookie; - /* Don't accept test requests during grace period */ - if (locks_in_grace()) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? 
rpc_drop_reply :rpc_success; @@ -152,12 +146,6 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; - /* Don't accept new lock requests during grace period */ - if (locks_in_grace() && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -176,7 +164,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, /* Now try to lock the file */ resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, - argp->block, &argp->cookie)); + argp->block, &argp->cookie, + argp->reclaim)); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index ec8af115843..973ab1d6e86 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -242,7 +242,7 @@ typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref); */ __be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, struct nlm_host *, struct nlm_lock *, int, - struct nlm_cookie *); + struct nlm_cookie *, int); __be32 nlmsvc_unlock(struct nlm_file *, struct nlm_lock *); __be32 nlmsvc_testlock(struct svc_rqst *, struct nlm_file *, struct nlm_host *, struct nlm_lock *, From d22b1cff099737f74f3ac5950094508b4cddec1e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 6 Feb 2008 15:05:12 -0500 Subject: [PATCH 39/59] lockd: reject reclaims outside the grace period The current lockd does not reject reclaims that arrive outside of the grace period. Accepting a reclaim means promising to the client that no conflicting locks were granted since last it held the lock. We can meet that promise if we assume the only lockers are nfs clients, and that they are sufficiently well-behaved to reclaim only locks that they held before, and that only reclaim locks have been permitted so far. Once we leave the grace period (and start permitting non-reclaims), we can no longer keep that promise. So we must start rejecting reclaims at that point. Signed-off-by: J. Bruce Fields --- fs/lockd/svclock.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 808d246ada4..6063a8e4b9f 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -410,6 +410,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ret = nlm_lck_denied_grace_period; goto out; } + if (reclaim && !locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; From 0d3ebb9ae9f9c887518fd4c81a68084111d154d7 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 30 Sep 2008 13:06:13 -0500 Subject: [PATCH 40/59] svcrdma: Add Fast Reg MR Data Types Add data types to track Fast Reg Memory Regions. The core data type is svc_rdma_fastreg_mr that associates a device MR with a host kva and page list. A field is added to the WR context to keep track of the FRMR used to map the local memory for an RPC. An FRMR list and spin lock are added to the transport instance to keep track of all FRMR allocated for the transport. Also added are device capability flags to indicate what the memory registration capabilities are for the underlying device and whether or not fast memory registration is supported. 
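A condensed, illustrative sketch of how a transport setup path might distill these capabilities (not part of this patch; svc_rdma_probe_fastreg is a hypothetical helper, and it assumes the verbs layer advertises fast registration support via IB_DEVICE_MEM_MGT_EXTENSIONS in the device attributes):

	#include <rdma/ib_verbs.h>
	#include <linux/sunrpc/svc_rdma.h>

	static void svc_rdma_probe_fastreg(struct svcxprt_rdma *newxprt,
					   const struct ib_device_attr *devattr)
	{
		/* Record how many pages one fast-reg work request can map
		 * and mark the transport as fast-registration capable. */
		if (devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
			newxprt->sc_frmr_pg_list_len =
				devattr->max_fast_reg_page_list_len;
			newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
		}
	}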
Signed-off-by: Tom Tucker --- include/linux/sunrpc/svc_rdma.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index dc05b54bd3a..49e458d9894 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -72,6 +72,7 @@ extern atomic_t rdma_stat_sq_prod; */ struct svc_rdma_op_ctxt { struct svc_rdma_op_ctxt *read_hdr; + struct svc_rdma_fastreg_mr *frmr; int hdr_count; struct xdr_buf arg; struct list_head dto_q; @@ -103,16 +104,30 @@ struct svc_rdma_chunk_sge { int start; /* sge no for this chunk */ int count; /* sge count for this chunk */ }; +struct svc_rdma_fastreg_mr { + struct ib_mr *mr; + void *kva; + struct ib_fast_reg_page_list *page_list; + int page_list_len; + unsigned long access_flags; + unsigned long map_len; + enum dma_data_direction direction; + struct list_head frmr_list; +}; struct svc_rdma_req_map { + struct svc_rdma_fastreg_mr *frmr; unsigned long count; union { struct kvec sge[RPCSVC_MAXPAGES]; struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES]; }; }; - +#define RDMACTXT_F_FAST_UNREG 1 #define RDMACTXT_F_LAST_CTXT 2 +#define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */ +#define SVCRDMA_DEVCAP_READ_W_INV 2 /* read w/ invalidate */ + struct svcxprt_rdma { struct svc_xprt sc_xprt; /* SVC transport structure */ struct rdma_cm_id *sc_cm_id; /* RDMA connection id */ @@ -136,6 +151,11 @@ struct svcxprt_rdma { struct ib_cq *sc_rq_cq; struct ib_cq *sc_sq_cq; struct ib_mr *sc_phys_mr; /* MR for server memory */ + u32 sc_dev_caps; /* distilled device caps */ + u32 sc_dma_lkey; /* local dma key */ + unsigned int sc_frmr_pg_list_len; + struct list_head sc_frmr_q; + spinlock_t sc_frmr_q_lock; spinlock_t sc_lock; /* transport lock */ From 7f1ed18bd3aa1e8008cf5cc768a141787633da18 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Oct 2008 12:50:07 -0400 Subject: [PATCH 41/59] NLM: Convert nlm_lookup_host() to use a single argument The nlm_lookup_host() function already has a large number of arguments, and I'm about to add a few more. As a clean up, convert the function to use a single data structure argument. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- fs/lockd/host.c | 86 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index be8f19d5318..3c4dc33c1be 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -38,6 +38,17 @@ static struct nsm_handle *nsm_find(const struct sockaddr *sap, const size_t hostname_len, const int create); +struct nlm_lookup_host_info { + const int server; /* search for server|client */ + const struct sockaddr_in *sin; /* address to search for */ + const unsigned short protocol; /* transport to search for*/ + const u32 version; /* NLM version to search for */ + const char *hostname; /* remote's hostname */ + const size_t hostname_len; /* it's length */ + const struct sockaddr_in *src_sin; /* our address (optional) */ + const size_t src_len; /* it's length */ +}; + /* * Hash function must work well on big- and little-endian platforms */ @@ -121,23 +132,13 @@ static void nlm_display_address(const struct sockaddr *sap, /* * Common host lookup routine for server & client */ -static struct nlm_host *nlm_lookup_host(int server, - const struct sockaddr_in *sin, - int proto, u32 version, - const char *hostname, - unsigned int hostname_len, - const struct sockaddr_in *ssin) +static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) { struct hlist_head *chain; struct hlist_node *pos; struct nlm_host *host; struct nsm_handle *nsm = NULL; - dprintk("lockd: nlm_lookup_host(proto=%d, vers=%u," - " my role is %s, hostname=%.*s)\n", - proto, version, server ? "server" : "client", - hostname_len, hostname ? hostname : ""); - mutex_lock(&nlm_host_mutex); if (time_after_eq(jiffies, next_gc)) @@ -150,22 +151,23 @@ static struct nlm_host *nlm_lookup_host(int server, * different NLM rpc_clients into one single nlm_host object. * This would allow us to have one nlm_host per address. */ - chain = &nlm_hosts[nlm_hash_address((struct sockaddr *)sin)]; + chain = &nlm_hosts[nlm_hash_address((struct sockaddr *)ni->sin)]; hlist_for_each_entry(host, pos, chain, h_hash) { - if (!nlm_cmp_addr(nlm_addr(host), (struct sockaddr *)sin)) + if (!nlm_cmp_addr(nlm_addr(host), (struct sockaddr *)ni->sin)) continue; /* See if we have an NSM handle for this client */ if (!nsm) nsm = host->h_nsmhandle; - if (host->h_proto != proto) + if (host->h_proto != ni->protocol) continue; - if (host->h_version != version) + if (host->h_version != ni->version) continue; - if (host->h_server != server) + if (host->h_server != ni->server) continue; - if (!nlm_cmp_addr(nlm_srcaddr(host), (struct sockaddr *)ssin)) + if (!nlm_cmp_addr(nlm_srcaddr(host), + (struct sockaddr *)ni->src_sin)) continue; /* Move to head of hash chain. 
*/ @@ -186,8 +188,9 @@ static struct nlm_host *nlm_lookup_host(int server, atomic_inc(&nsm->sm_count); else { host = NULL; - nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), - hostname, hostname_len, 1); + nsm = nsm_find((struct sockaddr *)ni->sin, + sizeof(struct sockaddr_in), + ni->hostname, ni->hostname_len, 1); if (!nsm) { dprintk("lockd: nlm_lookup_host failed; " "no nsm handle\n"); @@ -202,12 +205,12 @@ static struct nlm_host *nlm_lookup_host(int server, goto out; } host->h_name = nsm->sm_name; - memcpy(nlm_addr(host), sin, sizeof(*sin)); - host->h_addrlen = sizeof(*sin); + memcpy(nlm_addr(host), ni->sin, sizeof(struct sockaddr_in)); + host->h_addrlen = sizeof(struct sockaddr_in); nlm_clear_port(nlm_addr(host)); - memcpy(nlm_srcaddr(host), ssin, sizeof(*ssin)); - host->h_version = version; - host->h_proto = proto; + memcpy(nlm_srcaddr(host), ni->src_sin, sizeof(struct sockaddr_in)); + host->h_version = ni->version; + host->h_proto = ni->protocol; host->h_rpcclnt = NULL; mutex_init(&host->h_mutex); host->h_nextrebind = jiffies + NLM_HOST_REBIND; @@ -218,7 +221,7 @@ static struct nlm_host *nlm_lookup_host(int server, host->h_state = 0; /* pseudo NSM state */ host->h_nsmstate = 0; /* real NSM state */ host->h_nsmhandle = nsm; - host->h_server = server; + host->h_server = ni->server; hlist_add_head(&host->h_hash, chain); INIT_LIST_HEAD(&host->h_lockowners); spin_lock_init(&host->h_lock); @@ -273,9 +276,21 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, const struct sockaddr_in source = { .sin_family = AF_UNSPEC, }; + struct nlm_lookup_host_info ni = { + .server = 0, + .sin = sin, + .protocol = proto, + .version = version, + .hostname = hostname, + .hostname_len = hostname_len, + .src_sin = &source, + }; - return nlm_lookup_host(0, sin, proto, version, - hostname, hostname_len, &source); + dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, + (hostname ? hostname : ""), version, + (proto == IPPROTO_UDP ? "udp" : "tcp")); + + return nlm_lookup_host(&ni); } /* @@ -289,10 +304,21 @@ nlmsvc_lookup_host(struct svc_rqst *rqstp, .sin_family = AF_INET, .sin_addr = rqstp->rq_daddr.addr, }; + struct nlm_lookup_host_info ni = { + .server = 1, + .sin = svc_addr_in(rqstp), + .protocol = rqstp->rq_prot, + .version = rqstp->rq_vers, + .hostname = hostname, + .hostname_len = hostname_len, + .src_sin = &source, + }; - return nlm_lookup_host(1, svc_addr_in(rqstp), - rqstp->rq_prot, rqstp->rq_vers, - hostname, hostname_len, &source); + dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + (int)hostname_len, hostname, rqstp->rq_vers, + (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + + return nlm_lookup_host(&ni); } /* From 88541c848746442ddff45dea05ddea6b734d88b5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Oct 2008 12:50:14 -0400 Subject: [PATCH 42/59] lockd: Support non-AF_INET addresses in nlm_lookup_host() Use struct sockaddr * and length in nlm_lookup_host_info to all callers to pass in either AF_INET or AF_INET6 addresses. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- fs/lockd/host.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 3c4dc33c1be..5876b0e4c0b 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -40,12 +40,13 @@ static struct nsm_handle *nsm_find(const struct sockaddr *sap, struct nlm_lookup_host_info { const int server; /* search for server|client */ - const struct sockaddr_in *sin; /* address to search for */ + const struct sockaddr *sap; /* address to search for */ + const size_t salen; /* it's length */ const unsigned short protocol; /* transport to search for*/ const u32 version; /* NLM version to search for */ const char *hostname; /* remote's hostname */ const size_t hostname_len; /* it's length */ - const struct sockaddr_in *src_sin; /* our address (optional) */ + const struct sockaddr *src_sap; /* our address (optional) */ const size_t src_len; /* it's length */ }; @@ -151,9 +152,9 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) * different NLM rpc_clients into one single nlm_host object. * This would allow us to have one nlm_host per address. */ - chain = &nlm_hosts[nlm_hash_address((struct sockaddr *)ni->sin)]; + chain = &nlm_hosts[nlm_hash_address(ni->sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { - if (!nlm_cmp_addr(nlm_addr(host), (struct sockaddr *)ni->sin)) + if (!nlm_cmp_addr(nlm_addr(host), ni->sap)) continue; /* See if we have an NSM handle for this client */ @@ -166,8 +167,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) continue; if (host->h_server != ni->server) continue; - if (!nlm_cmp_addr(nlm_srcaddr(host), - (struct sockaddr *)ni->src_sin)) + if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; /* Move to head of hash chain. 
*/ @@ -188,8 +188,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) atomic_inc(&nsm->sm_count); else { host = NULL; - nsm = nsm_find((struct sockaddr *)ni->sin, - sizeof(struct sockaddr_in), + nsm = nsm_find(ni->sap, ni->salen, ni->hostname, ni->hostname_len, 1); if (!nsm) { dprintk("lockd: nlm_lookup_host failed; " @@ -205,10 +204,10 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) goto out; } host->h_name = nsm->sm_name; - memcpy(nlm_addr(host), ni->sin, sizeof(struct sockaddr_in)); - host->h_addrlen = sizeof(struct sockaddr_in); + memcpy(nlm_addr(host), ni->sap, ni->salen); + host->h_addrlen = ni->salen; nlm_clear_port(nlm_addr(host)); - memcpy(nlm_srcaddr(host), ni->src_sin, sizeof(struct sockaddr_in)); + memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); host->h_version = ni->version; host->h_proto = ni->protocol; host->h_rpcclnt = NULL; @@ -273,17 +272,19 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, const char *hostname, unsigned int hostname_len) { - const struct sockaddr_in source = { - .sin_family = AF_UNSPEC, + const struct sockaddr source = { + .sa_family = AF_UNSPEC, }; struct nlm_lookup_host_info ni = { .server = 0, - .sin = sin, + .sap = (struct sockaddr *)sin, + .salen = sizeof(*sin), .protocol = proto, .version = version, .hostname = hostname, .hostname_len = hostname_len, - .src_sin = &source, + .src_sap = &source, + .src_len = sizeof(source), }; dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, @@ -306,12 +307,14 @@ nlmsvc_lookup_host(struct svc_rqst *rqstp, }; struct nlm_lookup_host_info ni = { .server = 1, - .sin = svc_addr_in(rqstp), + .sap = svc_addr(rqstp), + .salen = rqstp->rq_addrlen, .protocol = rqstp->rq_prot, .version = rqstp->rq_vers, .hostname = hostname, .hostname_len = hostname_len, - .src_sin = &source, + .src_sap = (struct sockaddr *)&source, + .src_len = sizeof(source), }; dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, From d7d204403b31beb83b1aefef7bd76f5209369555 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Oct 2008 12:50:21 -0400 Subject: [PATCH 43/59] lockd: Adjust nlmclnt_lookup_host() signature to accomodate non-AF_INET Pass a struct sockaddr * and a length to nlmclnt_lookup_host() to accomodate non-AF_INET family addresses. As a side benefit, eliminate the hostname_len argument, as the hostname is always NUL-terminated. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/clntlock.c | 5 ++--- fs/lockd/host.c | 32 +++++++++++++++++++++----------- include/linux/lockd/lockd.h | 9 +++++---- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 237224a3c42..9eaf306d15f 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -58,10 +58,9 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) if (status < 0) return ERR_PTR(status); - host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address, + host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, nlm_init->protocol, nlm_version, - nlm_init->hostname, - strlen(nlm_init->hostname)); + nlm_init->hostname); if (host == NULL) { lockd_down(); return ERR_PTR(-ENOLCK); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 5876b0e4c0b..cbd2398e594 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -264,32 +264,42 @@ nlm_destroy_host(struct nlm_host *host) kfree(host); } -/* - * Find an NLM server handle in the cache. If there is none, create it. 
+/** + * nlmclnt_lookup_host - Find an NLM host handle matching a remote server + * @sap: network address of server + * @salen: length of server address + * @protocol: transport protocol to use + * @version: NLM protocol version + * @hostname: '\0'-terminated hostname of server + * + * Returns an nlm_host structure that matches the passed-in + * [server address, transport protocol, NLM version, server hostname]. + * If one doesn't already exist in the host cache, a new handle is + * created and returned. */ -struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, - int proto, u32 version, - const char *hostname, - unsigned int hostname_len) +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, + const size_t salen, + const unsigned short protocol, + const u32 version, const char *hostname) { const struct sockaddr source = { .sa_family = AF_UNSPEC, }; struct nlm_lookup_host_info ni = { .server = 0, - .sap = (struct sockaddr *)sin, - .salen = sizeof(*sin), - .protocol = proto, + .sap = sap, + .salen = salen, + .protocol = protocol, .version = version, .hostname = hostname, - .hostname_len = hostname_len, + .hostname_len = strlen(hostname), .src_sap = &source, .src_len = sizeof(source), }; dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, (hostname ? hostname : ""), version, - (proto == IPPROTO_UDP ? "udp" : "tcp")); + (protocol == IPPROTO_UDP ? "udp" : "tcp")); return nlm_lookup_host(&ni); } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 973ab1d6e86..90a996d2f00 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -215,10 +215,11 @@ void nlmclnt_next_cookie(struct nlm_cookie *); /* * Host cache */ -struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, - int proto, u32 version, - const char *hostname, - unsigned int hostname_len); +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, + const size_t salen, + const unsigned short protocol, + const u32 version, + const char *hostname); struct nlm_host *nlmsvc_lookup_host(struct svc_rqst *, const char *, unsigned int); struct rpc_clnt * nlm_bind_host(struct nlm_host *); From 6bfbe8af4674458e6d88aef8f0136bd1b8855b11 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Oct 2008 12:50:29 -0400 Subject: [PATCH 44/59] lockd: Adjust nlmsvc_lookup_host() to accomodate AF_INET6 addresses Fix up nlmsvc_lookup_host() to pass AF_INET6 source addresses to nlm_lookup_host(). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/host.c | 47 ++++++++++++++++++++++++++++++------- include/linux/lockd/lockd.h | 5 ++-- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index cbd2398e594..9fd8889097b 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -304,16 +304,33 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, return nlm_lookup_host(&ni); } -/* - * Find an NLM client handle in the cache. If there is none, create it. +/** + * nlmsvc_lookup_host - Find an NLM host handle matching a remote client + * @rqstp: incoming NLM request + * @hostname: name of client host + * @hostname_len: length of client hostname + * + * Returns an nlm_host structure that matches the [client address, + * transport protocol, NLM version, client hostname] of the passed-in + * NLM request. If one doesn't already exist in the host cache, a + * new handle is created and returned. 
+ * + * Before possibly creating a new nlm_host, construct a sockaddr + * for a specific source address in case the local system has + * multiple network addresses. The family of the address in + * rq_daddr is guaranteed to be the same as the family of the + * address in rq_addr, so it's safe to use the same family for + * the source address. */ -struct nlm_host * -nlmsvc_lookup_host(struct svc_rqst *rqstp, - const char *hostname, unsigned int hostname_len) +struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, + const char *hostname, + const size_t hostname_len) { - const struct sockaddr_in source = { + struct sockaddr_in sin = { .sin_family = AF_INET, - .sin_addr = rqstp->rq_daddr.addr, + }; + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, }; struct nlm_lookup_host_info ni = { .server = 1, @@ -323,14 +340,26 @@ nlmsvc_lookup_host(struct svc_rqst *rqstp, .version = rqstp->rq_vers, .hostname = hostname, .hostname_len = hostname_len, - .src_sap = (struct sockaddr *)&source, - .src_len = sizeof(source), + .src_len = rqstp->rq_addrlen, }; dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, (int)hostname_len, hostname, rqstp->rq_vers, (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + switch (ni.sap->sa_family) { + case AF_INET: + sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; + ni.src_sap = (struct sockaddr *)&sin; + break; + case AF_INET6: + ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); + ni.src_sap = (struct sockaddr *)&sin6; + break; + default: + return NULL; + } + return nlm_lookup_host(&ni); } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 90a996d2f00..16ff2e88f05 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -220,8 +220,9 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const unsigned short protocol, const u32 version, const char *hostname); -struct nlm_host *nlmsvc_lookup_host(struct svc_rqst *, const char *, - unsigned int); +struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, + const char *hostname, + const size_t hostname_len); struct rpc_clnt * nlm_bind_host(struct nlm_host *); void nlm_rebind_host(struct nlm_host *); struct nlm_host * nlm_get_host(struct nlm_host *); From dcff09f124f71d1d4fe61eb63c79e52f488ac22e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Oct 2008 12:50:36 -0400 Subject: [PATCH 45/59] lockd: change nlmclnt_grant() to take a "struct sockaddr *" Adjust the signature and callers of nlmclnt_grant() to pass a "struct sockaddr *" instead of a "struct sockaddr_in *" in order to support IPv6 addresses. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- fs/lockd/clntlock.c | 5 ++--- fs/lockd/svc4proc.c | 2 +- fs/lockd/svcproc.c | 2 +- include/linux/lockd/lockd.h | 3 ++- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 9eaf306d15f..2976bf0f414 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -141,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) /* * The server lockd has called us back to tell us the lock was granted */ -__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) +__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) { const struct file_lock *fl = &lock->fl; const struct nfs_fh *fh = &lock->fh; @@ -165,8 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(nlm_addr(block->b_host), - (struct sockaddr *)addr)) + if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index f6f18fa5cf8..50ee8eb139a 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -220,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index a587b81338b..935ce967a6a 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -250,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 16ff2e88f05..e6b07097928 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -207,7 +207,8 @@ int nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *); struct nlm_wait * nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl); void nlmclnt_finish_block(struct nlm_wait *block); int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout); -__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *); +__be32 nlmclnt_grant(const struct sockaddr *addr, + const struct nlm_lock *lock); void nlmclnt_recovery(struct nlm_host *); int nlmclnt_reclaim(struct nlm_host *, struct file_lock *); void nlmclnt_next_cookie(struct nlm_cookie *); From b85e4676344fc4d7ec5e0f62c3d3712e48bbe223 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Oct 2008 12:50:44 -0400 Subject: [PATCH 46/59] lockd: Add helper to sanity check incoming NOTIFY requests lockd accepts SM_NOTIFY calls only from a privileged process on the local system. If lockd uses an AF_INET6 listener, the sender's address (ie the local rpc.statd) will be the IPv6 loopback address, not the IPv4 loopback address. 
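For example (a condensed sketch, not the patch text; the helper added below is split per address family and guarded by CONFIG_IPV6, and my_privileged_request is a hypothetical name), the test must treat both loopback forms, combined with a reserved source port, as local and privileged:

	#include <linux/in.h>
	#include <linux/in6.h>
	#include <net/ipv6.h>

	static int my_privileged_request(const struct sockaddr *sap)
	{
		const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
		const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap;

		switch (sap->sa_family) {
		case AF_INET:
			/* 127.0.0.1 and a privileged (reserved) source port */
			return sin->sin_addr.s_addr == htonl(INADDR_LOOPBACK) &&
			       ntohs(sin->sin_port) < 1024;
		case AF_INET6:
			/* ::1 and a privileged (reserved) source port */
			return (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LOOPBACK) &&
			       ntohs(sin6->sin6_port) < 1024;
		}
		return 0;
	}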
Make sure the privilege test in nlmsvc_proc_sm_notify() and nlm4svc_proc_sm_notify() works for both AF_INET and AF_INET6 family addresses by refactoring the test into a helper and adding support for IPv6 addresses. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/svc4proc.c | 6 ++---- fs/lockd/svcproc.c | 6 ++---- include/linux/lockd/lockd.h | 41 +++++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 50ee8eb139a..014f6ce4817 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -421,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, { struct sockaddr_in saddr; - memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); - dprintk("lockd: SM_NOTIFY called\n"); - if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) - || ntohs(saddr.sin_port) >= 1024) { + + if (!nlm_privileged_requester(rqstp)) { char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 935ce967a6a..548b0bb2b84 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -453,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, { struct sockaddr_in saddr; - memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); - dprintk("lockd: SM_NOTIFY called\n"); - if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) - || ntohs(saddr.sin_port) >= 1024) { + + if (!nlm_privileged_requester(rqstp)) { char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index e6b07097928..b56d5aa9b19 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -277,6 +277,47 @@ static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) return file->f_file->f_path.dentry->d_inode; } +static inline int __nlm_privileged_request4(const struct sockaddr *sap) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + return (sin->sin_addr.s_addr == htonl(INADDR_LOOPBACK)) && + (ntohs(sin->sin_port) < 1024); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static inline int __nlm_privileged_request6(const struct sockaddr *sap) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + return (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LOOPBACK) && + (ntohs(sin6->sin6_port) < 1024); +} +#else /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ +static inline int __nlm_privileged_request6(const struct sockaddr *sap) +{ + return 0; +} +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + +/* + * Ensure incoming requests are from local privileged callers. + * + * Return TRUE if sender is local and is connecting via a privileged port; + * otherwise return FALSE. 
+ */ +static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) +{ + const struct sockaddr *sap = svc_addr(rqstp); + + switch (sap->sa_family) { + case AF_INET: + return __nlm_privileged_request4(sap); + case AF_INET6: + return __nlm_privileged_request6(sap); + default: + return 0; + } +} + static inline int __nlm_cmp_addr4(const struct sockaddr *sap1, const struct sockaddr *sap2) { From 9a38a83880c224c6a3fd973ac9ae30a043487f0f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Oct 2008 12:50:51 -0400 Subject: [PATCH 47/59] lockd: Remove unused fields in the nlm_reboot structure The nlm_reboot structure is used to store information provided by the NSM_NOTIFY procedure. This procedure is not specified by the NLM or NSM protocols, other than to say that the procedure can be used to transmit information private to a particular NLM/NSM implementation. For Linux, the callback arguments include the name of the monitored host, the new NSM state of the host, and a 16-byte private opaque. As a clean up, remove the unused fields and the server-side XDR logic that decodes them. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/xdr.c | 2 -- fs/lockd/xdr4.c | 2 -- include/linux/lockd/xdr.h | 2 -- 3 files changed, 6 deletions(-) diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 3e459e18cc3..1f226290c67 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) argp->state = ntohl(*p++); /* Preserve the address in network byte order */ argp->addr = *p++; - argp->vers = *p++; - argp->proto = *p++; return xdr_argsize_check(rqstp, p); } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 43ff9397e6c..50c493a8ad8 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp argp->state = ntohl(*p++); /* Preserve the address in network byte order */ argp->addr = *p++; - argp->vers = *p++; - argp->proto = *p++; return xdr_argsize_check(rqstp, p); } diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index df18fa053bc..d6b3a802c04 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -81,8 +81,6 @@ struct nlm_reboot { unsigned int len; u32 state; __be32 addr; - __be32 vers; - __be32 proto; }; /* From 8c3916f4bdf9c8388bd70d0b399b3a43daf2087a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Oct 2008 17:15:23 -0400 Subject: [PATCH 48/59] NLM: Always start both UDP and TCP listeners Commit 24e36663, which first appeared in 2.6.19, changed lockd so that the client side starts a UDP listener only if there is a UDP NFSv2/v3 mount. Its description notes: This... means that lockd will *not* listen on UDP if the only mounts are TCP mount (and nfsd hasn't started). The latter is the only one that concerns me at all - I don't know if this might be a problem with some servers. Unfortunately it is a problem for Linux itself. The rpc.statd daemon on Linux uses UDP for contacting the local lockd, no matter which protocol is used for NFS mounts. Without a local lockd UDP listener, NFSv2/v3 lock recovery from Linux NFS clients always fails. Revert parts of commit 24e36663 so lockd_up() always starts both listeners. Signed-off-by: Chuck Lever Cc: Neil Brown Signed-off-by: J. 
Bruce Fields --- fs/lockd/svc.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index f013aed1153..36396fc058c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -189,25 +189,28 @@ lockd(void *vrqstp) } /* - * Make any sockets that are needed but not present. - * If nlm_udpport or nlm_tcpport were set as module - * options, make those sockets unconditionally + * Ensure there are active UDP and TCP listeners for lockd. + * + * Even if we have only TCP NFS mounts and/or TCP NFSDs, some + * local services (such as rpc.statd) still require UDP, and + * some NFS servers do not yet support NLM over TCP. + * + * Returns zero if all listeners are available; otherwise a + * negative errno value is returned. */ -static int make_socks(struct svc_serv *serv, int proto) +static int make_socks(struct svc_serv *serv) { static int warned; struct svc_xprt *xprt; int err = 0; - if (proto == IPPROTO_UDP || nlm_udpport) { - xprt = svc_find_xprt(serv, "udp", 0, 0); - if (!xprt) - err = svc_create_xprt(serv, "udp", nlm_udpport, - SVC_SOCK_DEFAULTS); - else - svc_xprt_put(xprt); - } - if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) { + xprt = svc_find_xprt(serv, "udp", 0, 0); + if (!xprt) + err = svc_create_xprt(serv, "udp", nlm_udpport, + SVC_SOCK_DEFAULTS); + else + svc_xprt_put(xprt); + if (err >= 0) { xprt = svc_find_xprt(serv, "tcp", 0, 0); if (!xprt) err = svc_create_xprt(serv, "tcp", nlm_tcpport, @@ -237,11 +240,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ /* * Check whether we're already up and running. */ - if (nlmsvc_rqst) { - if (proto) - error = make_socks(nlmsvc_rqst->rq_server, proto); + if (nlmsvc_rqst) goto out; - } /* * Sanity check: if there's no pid, @@ -258,7 +258,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ goto out; } - if ((error = make_socks(serv, proto)) < 0) + error = make_socks(serv); + if (error < 0) goto destroy_and_out; /* From 26a414092353590ceaa5955bcb53f863d6ea7549 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Oct 2008 17:15:30 -0400 Subject: [PATCH 49/59] NLM: Remove "proto" argument from lockd_up() Clean up: Now that lockd_up() starts listeners for both transports, the "proto" argument is no longer needed. Signed-off-by: Chuck Lever Cc: Neil Brown Signed-off-by: J. Bruce Fields --- fs/lockd/clntlock.c | 4 ++-- fs/lockd/svc.c | 3 +-- fs/nfsd/nfsctl.c | 5 ++--- fs/nfsd/nfssvc.c | 19 +++++++------------ include/linux/lockd/bind.h | 2 +- 5 files changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 2976bf0f414..8307dd64bf4 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -54,7 +54,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) u32 nlm_version = (nlm_init->nfs_version == 2) ? 
1 : 4; int status; - status = lockd_up(nlm_init->protocol); + status = lockd_up(); if (status < 0) return ERR_PTR(status); @@ -215,7 +215,7 @@ reclaimer(void *ptr) /* This one ensures that our parent doesn't terminate while the * reclaim is in progress */ lock_kernel(); - lockd_up(0); /* note: this cannot fail as lockd is already running */ + lockd_up(); /* note: this cannot fail as lockd is already running */ dprintk("lockd: reclaiming locks for host %s\n", host->h_name); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 36396fc058c..c631a83931c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -230,8 +230,7 @@ static int make_socks(struct svc_serv *serv) /* * Bring up the lockd process if it's not already up. */ -int -lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ +int lockd_up(void) { struct svc_serv *serv; int error = 0; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c53e65f8f3a..862dff5247f 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -614,10 +614,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) return -EINVAL; err = nfsd_create_serv(); if (!err) { - int proto = 0; - err = svc_addsock(nfsd_serv, fd, buf, &proto); + err = svc_addsock(nfsd_serv, fd, buf, NULL); if (err >= 0) { - err = lockd_up(proto); + err = lockd_up(); if (err < 0) svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf); } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 7f3d76a7839..59eeb46f82c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -244,25 +244,20 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = lockd_up(IPPROTO_UDP); - if (error >= 0) { - error = svc_create_xprt(nfsd_serv, "udp", port, + error = svc_create_xprt(nfsd_serv, "udp", port, SVC_SOCK_DEFAULTS); - if (error < 0) - lockd_down(); - } if (error < 0) return error; - error = lockd_up(IPPROTO_TCP); - if (error >= 0) { - error = svc_create_xprt(nfsd_serv, "tcp", port, + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); - if (error < 0) - lockd_down(); - } if (error < 0) return error; + + error = lockd_up(); + if (error < 0) + return error; + return 0; } diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 1f0465c374d..e5872dc994c 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -52,7 +52,7 @@ extern void nlmclnt_done(struct nlm_host *host); extern int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl); -extern int lockd_up(int proto); +extern int lockd_up(void); extern void lockd_down(void); #endif /* LINUX_LOCKD_BIND_H */ From 2937391385807b3da9cd7a39345259caf550b032 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 3 Oct 2008 17:15:38 -0400 Subject: [PATCH 50/59] NLM: Remove unused argument from svc_addsock() function Clean up: The svc_addsock() function no longer uses its "proto" argument, so remove it. Signed-off-by: Chuck Lever Cc: Neil Brown Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfsctl.c | 2 +- include/linux/sunrpc/svcsock.h | 5 +---- net/sunrpc/svcsock.c | 4 +--- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 862dff5247f..97543df5824 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -614,7 +614,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) return -EINVAL; err = nfsd_create_serv(); if (!err) { - err = svc_addsock(nfsd_serv, fd, buf, NULL); + err = svc_addsock(nfsd_serv, fd, buf); if (err >= 0) { err = lockd_up(); if (err < 0) diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 8cff696dedf..483e10380aa 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -39,10 +39,7 @@ int svc_send(struct svc_rqst *); void svc_drop(struct svc_rqst *); void svc_sock_update_bufs(struct svc_serv *serv); int svc_sock_names(char *buf, struct svc_serv *serv, char *toclose); -int svc_addsock(struct svc_serv *serv, - int fd, - char *name_return, - int *proto); +int svc_addsock(struct svc_serv *serv, int fd, char *name_return); void svc_init_xprt_sock(void); void svc_cleanup_xprt_sock(void); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f91377c1495..95293f549e9 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1167,8 +1167,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, int svc_addsock(struct svc_serv *serv, int fd, - char *name_return, - int *proto) + char *name_return) { int err = 0; struct socket *so = sockfd_lookup(fd, &err); @@ -1203,7 +1202,6 @@ int svc_addsock(struct svc_serv *serv, sockfd_put(so); return err; } - if (proto) *proto = so->sk->sk_protocol; return one_sock_name(name_return, svsk); } EXPORT_SYMBOL_GPL(svc_addsock); From 64be8608c163bd480cf5ec4b34366f11e0f3c87f Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Mon, 6 Oct 2008 14:45:18 -0500 Subject: [PATCH 51/59] svcrdma: Add FRMR get/put services Add services for the allocating, freeing, and unmapping Fast Reg MR. These services will be used by the transport connection setup, send and receive routines. 
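An illustrative sketch of the intended get/put pattern (not part of this patch; the real users are the connection setup, send and receive paths added in later patches, and my_map_with_frmr/my_unmap_frmr are hypothetical names):

	#include <linux/err.h>
	#include <linux/sunrpc/svc_rdma.h>

	static int my_map_with_frmr(struct svcxprt_rdma *xprt,
				    struct svc_rdma_op_ctxt *ctxt)
	{
		struct svc_rdma_fastreg_mr *frmr;

		/* Reuse a cached MR from sc_frmr_q, or allocate a fresh one. */
		frmr = svc_rdma_get_frmr(xprt);
		if (IS_ERR(frmr))
			return PTR_ERR(frmr);
		ctxt->frmr = frmr;
		/* ... build the page list, post a fast-reg WR, do the I/O ... */
		return 0;
	}

	static void my_unmap_frmr(struct svcxprt_rdma *xprt,
				  struct svc_rdma_op_ctxt *ctxt)
	{
		/* DMA-unmaps the FRMR's page list and parks it back on
		 * the transport's sc_frmr_q for reuse. */
		svc_rdma_put_frmr(xprt, ctxt->frmr);
		ctxt->frmr = NULL;
	}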
Signed-off-by: Tom Tucker --- include/linux/sunrpc/svc_rdma.h | 3 + net/sunrpc/xprtrdma/svc_rdma_transport.c | 116 ++++++++++++++++++++++- 2 files changed, 114 insertions(+), 5 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 49e458d9894..34252683671 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -214,6 +214,9 @@ extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); extern struct svc_rdma_req_map *svc_rdma_get_req_map(void); extern void svc_rdma_put_req_map(struct svc_rdma_req_map *); +extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); +extern void svc_rdma_put_frmr(struct svcxprt_rdma *, + struct svc_rdma_fastreg_mr *); extern void svc_sq_reap(struct svcxprt_rdma *); extern void svc_rq_reap(struct svcxprt_rdma *); extern struct svc_xprt_class svc_rdma_class; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 900cb69728c..f0b5c5f2f62 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -100,6 +100,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) ctxt->xprt = xprt; INIT_LIST_HEAD(&ctxt->dto_q); ctxt->count = 0; + ctxt->frmr = NULL; atomic_inc(&xprt->sc_ctxt_used); return ctxt; } @@ -109,11 +110,19 @@ static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) struct svcxprt_rdma *xprt = ctxt->xprt; int i; for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { - atomic_dec(&xprt->sc_dma_used); - ib_dma_unmap_single(xprt->sc_cm_id->device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); + /* + * Unmap the DMA addr in the SGE if the lkey matches + * the sc_dma_lkey, otherwise, ignore it since it is + * an FRMR lkey and will be unmapped later when the + * last WR that uses it completes. 
+ */ + if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_single(xprt->sc_cm_id->device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); + } } } @@ -150,6 +159,7 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void) schedule_timeout_uninterruptible(msecs_to_jiffies(500)); } map->count = 0; + map->frmr = NULL; return map; } @@ -425,10 +435,12 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); + INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); + spin_lock_init(&cma_xprt->sc_frmr_q_lock); cma_xprt->sc_ord = svcrdma_ord; @@ -686,6 +698,97 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, return ERR_PTR(ret); } +static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) +{ + struct ib_mr *mr; + struct ib_fast_reg_page_list *pl; + struct svc_rdma_fastreg_mr *frmr; + + frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); + if (!frmr) + goto err; + + mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES); + if (!mr) + goto err_free_frmr; + + pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device, + RPCSVC_MAXPAGES); + if (!pl) + goto err_free_mr; + + frmr->mr = mr; + frmr->page_list = pl; + INIT_LIST_HEAD(&frmr->frmr_list); + return frmr; + + err_free_mr: + ib_dereg_mr(mr); + err_free_frmr: + kfree(frmr); + err: + return ERR_PTR(-ENOMEM); +} + +static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_fastreg_mr *frmr; + + while (!list_empty(&xprt->sc_frmr_q)) { + frmr = list_entry(xprt->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + ib_dereg_mr(frmr->mr); + ib_free_fast_reg_page_list(frmr->page_list); + kfree(frmr); + } +} + +struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_fastreg_mr *frmr = NULL; + + spin_lock_bh(&rdma->sc_frmr_q_lock); + if (!list_empty(&rdma->sc_frmr_q)) { + frmr = list_entry(rdma->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + frmr->map_len = 0; + frmr->page_list_len = 0; + } + spin_unlock_bh(&rdma->sc_frmr_q_lock); + if (frmr) + return frmr; + + return rdma_alloc_frmr(rdma); +} + +static void frmr_unmap_dma(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + int page_no; + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { + dma_addr_t addr = frmr->page_list->page_list[page_no]; + if (ib_dma_mapping_error(frmr->mr->device, addr)) + continue; + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE, + frmr->direction); + } +} + +void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, + struct svc_rdma_fastreg_mr *frmr) +{ + if (frmr) { + frmr_unmap_dma(rdma, frmr); + spin_lock_bh(&rdma->sc_frmr_q_lock); + BUG_ON(!list_empty(&frmr->frmr_list)); + list_add(&frmr->frmr_list, &rdma->sc_frmr_q); + spin_unlock_bh(&rdma->sc_frmr_q_lock); + } +} + /* * This is the xpo_recvfrom function for listening endpoints. Its * purpose is to accept incoming connections. 
The CMA callback handler @@ -961,6 +1064,9 @@ static void __svc_rdma_free(struct work_struct *work) WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); + /* De-allocate fastreg mr */ + rdma_dealloc_frmr_q(rdma); + /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_destroy_qp(rdma->sc_qp); From 3a5c63803d0552a3ad93b85c262f12cd86471443 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 30 Sep 2008 13:46:13 -0500 Subject: [PATCH 52/59] svcrdma: Query device for Fast Reg support during connection setup Query the device capabilities in the svc_rdma_accept function to determine what advanced memory management capabilities are supported by the device. Based on the query, select the most secure model available given the requirements of the transport and capabilities of the adapter. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 76 ++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f0b5c5f2f62..a8ec4b1eec5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -807,6 +807,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; struct ib_device_attr devattr; + int dma_mr_acc; + int need_dma_mr; int ret; int i; @@ -922,15 +924,77 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) } newxprt->sc_qp = newxprt->sc_cm_id->qp; - /* Register all of physical memory */ - newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd, - IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE); - if (IS_ERR(newxprt->sc_phys_mr)) { - dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret); + /* + * Use the most secure set of MR resources based on the + * transport type and available memory management features in + * the device. Here's the table implemented below: + * + * Fast Global DMA Remote WR + * Reg LKEY MR Access + * Sup'd Sup'd Needed Needed + * + * IWARP N N Y Y + * N Y Y Y + * Y N Y N + * Y Y N - + * + * IB N N Y N + * N Y N - + * Y N Y N + * Y Y N - + * + * NB: iWARP requires remote write access for the data sink + * of an RDMA_READ. IB does not. 
+ */ + if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + newxprt->sc_frmr_pg_list_len = + devattr.max_fast_reg_page_list_len; + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; + } + + /* + * Determine if a DMA MR is required and if so, what privs are required + */ + switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) { + case RDMA_TRANSPORT_IWARP: + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { + need_dma_mr = 1; + dma_mr_acc = + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); + } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + case RDMA_TRANSPORT_IB: + if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + default: goto errout; } + /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ + if (need_dma_mr) { + /* Register all of physical memory */ + newxprt->sc_phys_mr = + ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); + if (IS_ERR(newxprt->sc_phys_mr)) { + dprintk("svcrdma: Failed to create DMA MR ret=%d\n", + ret); + goto errout; + } + newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; + } else + newxprt->sc_dma_lkey = + newxprt->sc_cm_id->device->local_dma_lkey; + /* Post receive buffers */ for (i = 0; i < newxprt->sc_max_requests; i++) { ret = svc_rdma_post_recv(newxprt); From e1183210625cc8e02ce13eec78fb7a246567fc59 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Fri, 3 Oct 2008 15:22:18 -0500 Subject: [PATCH 53/59] svcrdma: Add a service to register a Fast Reg MR with the device Fast Reg MR introduces a new WR type. Add a service to register the region with the adapter and update the completion handling to support completions with a NULL WR context. 
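The completion-handling change is easier to see outside diff form; a condensed sketch of the send-queue poll loop as it looks after this patch (names and calls taken from the hunks below, control flow simplified):

	while (ib_poll_cq(cq, 1, &wc) > 0) {
		struct svc_rdma_op_ctxt *ctxt;

		if (wc.status != IB_WC_SUCCESS)
			/* Close the transport */
			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);

		/* Decrement used SQ WR count */
		atomic_dec(&xprt->sc_sq_count);
		wake_up(&xprt->sc_send_wait);

		/* The FAST_REG_MR WR built by svc_rdma_fastreg() carries
		 * no context (wr_id is left at zero), so NULL is expected
		 * here and simply skipped. */
		ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
		if (ctxt)
			process_context(xprt, ctxt);

		svc_xprt_put(&xprt->sc_xprt);
	}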
Signed-off-by: Tom Tucker --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_transport.c | 111 ++++++++++++++++------- 2 files changed, 77 insertions(+), 35 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 34252683671..1402d193b39 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -214,6 +214,7 @@ extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); extern struct svc_rdma_req_map *svc_rdma_get_req_map(void); extern void svc_rdma_put_req_map(struct svc_rdma_req_map *); +extern int svc_rdma_fastreg(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); extern void svc_rdma_put_frmr(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index a8ec4b1eec5..c3e8db0eeb3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -325,6 +325,45 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) svc_xprt_enqueue(&xprt->sc_xprt); } +/* + * Processs a completion context + */ +static void process_context(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt) +{ + svc_rdma_unmap_dma(ctxt); + + switch (ctxt->wr_op) { + case IB_WR_SEND: + svc_rdma_put_context(ctxt, 1); + break; + + case IB_WR_RDMA_WRITE: + svc_rdma_put_context(ctxt, 0); + break; + + case IB_WR_RDMA_READ: + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; + BUG_ON(!read_hdr); + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + } + svc_rdma_put_context(ctxt, 0); + break; + + default: + printk(KERN_ERR "svcrdma: unexpected completion type, " + "opcode=%d\n", + ctxt->wr_op); + break; + } +} + /* * Send Queue Completion Handler - potentially called on interrupt context. 
* @@ -337,17 +376,12 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) struct ib_cq *cq = xprt->sc_sq_cq; int ret; - if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) return; ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_sq_poll); while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; - xprt = ctxt->xprt; - - svc_rdma_unmap_dma(ctxt); if (wc.status != IB_WC_SUCCESS) /* Close the transport */ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); @@ -356,35 +390,10 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) atomic_dec(&xprt->sc_sq_count); wake_up(&xprt->sc_send_wait); - switch (ctxt->wr_op) { - case IB_WR_SEND: - svc_rdma_put_context(ctxt, 1); - break; + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; + if (ctxt) + process_context(xprt, ctxt); - case IB_WR_RDMA_WRITE: - svc_rdma_put_context(ctxt, 0); - break; - - case IB_WR_RDMA_READ: - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { - struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; - BUG_ON(!read_hdr); - spin_lock_bh(&xprt->sc_rq_dto_lock); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - list_add_tail(&read_hdr->dto_q, - &xprt->sc_read_complete_q); - spin_unlock_bh(&xprt->sc_rq_dto_lock); - svc_xprt_enqueue(&xprt->sc_xprt); - } - svc_rdma_put_context(ctxt, 0); - break; - - default: - printk(KERN_ERR "svcrdma: unexpected completion type, " - "opcode=%d, status=%d\n", - wc.opcode, wc.status); - break; - } svc_xprt_put(&xprt->sc_xprt); } @@ -1184,6 +1193,40 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) return 1; } +/* + * Attempt to register the kvec representing the RPC memory with the + * device. + * + * Returns: + * NULL : The device does not support fastreg or there were no more + * fastreg mr. + * frmr : The kvec register request was successfully posted. + * <0 : An error was encountered attempting to register the kvec. + */ +int svc_rdma_fastreg(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + struct ib_send_wr fastreg_wr; + u8 key; + + /* Bump the key */ + key = (u8)(frmr->mr->lkey & 0x000000FF); + ib_update_fast_reg_key(frmr->mr, ++key); + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof fastreg_wr); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.send_flags = IB_SEND_SIGNALED; + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; + fastreg_wr.wr.fast_reg.page_list = frmr->page_list; + fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = frmr->map_len; + fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; + fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; + return svc_rdma_send(xprt, &fastreg_wr); +} + int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) { struct ib_send_wr *bad_wr; @@ -1193,8 +1236,6 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) return -ENOTCONN; BUG_ON(wr->send_flags != IB_SEND_SIGNALED); - BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != - wr->opcode); /* If the SQ is full, wait until an SQ entry is available */ while (1) { spin_lock_bh(&xprt->sc_lock); From a5abf4e81545d9c7280c49cae853cc45fd769ddf Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 30 Sep 2008 14:05:41 -0500 Subject: [PATCH 54/59] svcrdma: Modify post recv path to use local dma key Update the svc_rdma_post_recv routine to use the adapter's global LKEY instead of sc_phys_mr which is only valid when using a DMA MR. 
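For context, a reduced sketch of how sc_dma_lkey was chosen during connection setup earlier in this series, which is what makes it usable here whether or not a DMA MR exists (DMA MR creation elided):

	/* From svc_rdma_accept(): pick the lkey the transport will use
	 * for locally mapped buffers. */
	if (need_dma_mr)
		newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
	else
		newxprt->sc_dma_lkey =
			newxprt->sc_cm_id->device->local_dma_lkey;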
Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index c3e8db0eeb3..d9183cbcd96 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -483,7 +483,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) struct ib_recv_wr recv_wr, *bad_recv_wr; struct svc_rdma_op_ctxt *ctxt; struct page *page; - unsigned long pa; + dma_addr_t pa; int sge_no; int buflen; int ret; @@ -495,13 +495,15 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) BUG_ON(sge_no >= xprt->sc_max_sge); page = svc_rdma_get_page(); ctxt->pages[sge_no] = page; - atomic_inc(&xprt->sc_dma_used); pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) + goto err_put_ctxt; + atomic_inc(&xprt->sc_dma_used); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; buflen += PAGE_SIZE; } ctxt->count = sge_no; @@ -517,6 +519,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) svc_rdma_put_context(ctxt, 1); } return ret; + + err_put_ctxt: + svc_rdma_put_context(ctxt, 1); + return -ENOMEM; } /* From 5b180a9a64ca2217a658bd515ef910eafefc5e5a Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Mon, 11 Aug 2008 14:10:19 -0500 Subject: [PATCH 55/59] svcrdma: Add support to svc_rdma_send to handle chained WR WR can be submitted as linked lists of WR. Update the svc_rdma_send routine to handle WR chains. This will be used to submit a WR that uses an FRMR with another WR that invalidates the FRMR. 
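An illustrative WR chain of the kind this change enables; frmr and xprt are assumed to be in scope, and the receive-path patch later in the series builds essentially this shape:

	struct ib_send_wr read_wr, inv_wr;
	int ret;

	memset(&read_wr, 0, sizeof(read_wr));
	memset(&inv_wr, 0, sizeof(inv_wr));

	/* RDMA_READ into the FRMR-mapped data sink ... */
	read_wr.opcode     = IB_WR_RDMA_READ;
	read_wr.send_flags = IB_SEND_SIGNALED;
	read_wr.next       = &inv_wr;		/* link a second WR */

	/* ... followed by a LOCAL_INV that invalidates the FRMR. */
	inv_wr.opcode             = IB_WR_LOCAL_INV;
	inv_wr.send_flags         = IB_SEND_SIGNALED;
	inv_wr.ex.invalidate_rkey = frmr->mr->lkey;

	/* svc_rdma_send() now counts the WRs in the chain, reserves that
	 * many SQ slots and transport references, and posts the whole
	 * list with a single ib_post_send(). */
	ret = svc_rdma_send(xprt, &read_wr);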
Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 29 +++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index d9183cbcd96..f22f5876766 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1235,17 +1235,23 @@ int svc_rdma_fastreg(struct svcxprt_rdma *xprt, int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) { - struct ib_send_wr *bad_wr; + struct ib_send_wr *bad_wr, *n_wr; + int wr_count; + int i; int ret; if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) return -ENOTCONN; BUG_ON(wr->send_flags != IB_SEND_SIGNALED); + wr_count = 1; + for (n_wr = wr->next; n_wr; n_wr = n_wr->next) + wr_count++; + /* If the SQ is full, wait until an SQ entry is available */ while (1) { spin_lock_bh(&xprt->sc_lock); - if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { + if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) { spin_unlock_bh(&xprt->sc_lock); atomic_inc(&rdma_stat_sq_starve); @@ -1260,19 +1266,26 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) return 0; continue; } - /* Bumped used SQ WR count and post */ - svc_xprt_get(&xprt->sc_xprt); + /* Take a transport ref for each WR posted */ + for (i = 0; i < wr_count; i++) + svc_xprt_get(&xprt->sc_xprt); + + /* Bump used SQ WR count and post */ + atomic_add(wr_count, &xprt->sc_sq_count); ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); - if (!ret) - atomic_inc(&xprt->sc_sq_count); - else { - svc_xprt_put(&xprt->sc_xprt); + if (ret) { + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + atomic_sub(wr_count, &xprt->sc_sq_count); + for (i = 0; i < wr_count; i ++) + svc_xprt_put(&xprt->sc_xprt); dprintk("svcrdma: failed to post SQ WR rc=%d, " "sc_sq_count=%d, sc_sq_depth=%d\n", ret, atomic_read(&xprt->sc_sq_count), xprt->sc_sq_depth); } spin_unlock_bh(&xprt->sc_lock); + if (ret) + wake_up(&xprt->sc_send_wait); break; } return ret; From 146b6df6a537939570c5772ebd7db826fdbd5d82 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 12 Aug 2008 15:12:10 -0500 Subject: [PATCH 56/59] svcrdma: Modify the RPC recv path to use FRMR when available RPCRDMA requests that specify a read-list are fetched with RDMA_READ. Using an FRMR to map the data sink improves NFSRDMA security on transports that place the RDMA_READ data sink LKEY on the wire because the valid lifetime of the MR is only the duration of the RDMA_READ. The LKEY is invalidated when the last RDMA_READ WR completes. Mapping the data sink also allows for very large amounts to data to be fetched with a single WR, so if the client is also using FRMR, the entire RPC read-list can be fetched with a single WR. 
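A reduced sketch of how the receive path now picks its mapping strategy, taken from the rdma_read_xdr() hunk below (error handling elided):

	/* sc_frmr_pg_list_len is non-zero only when connection setup
	 * detected fast-registration support, so this selects either
	 * the legacy per-page mapping or the FRMR path. */
	if (!xprt->sc_frmr_pg_list_len)
		sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
					    rpl_map, chl_map, ch_count,
					    byte_count);
	else
		sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt,
						 rmsgp, rpl_map, chl_map,
						 ch_count, byte_count);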
Signed-off-by: Tom Tucker --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 187 ++++++++++++++++++++--- net/sunrpc/xprtrdma/svc_rdma_transport.c | 5 +- 3 files changed, 171 insertions(+), 22 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 1402d193b39..c14fe86dac5 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -212,6 +212,7 @@ extern int svc_rdma_post_recv(struct svcxprt_rdma *); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); +extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); extern struct svc_rdma_req_map *svc_rdma_get_req_map(void); extern void svc_rdma_put_req_map(struct svc_rdma_req_map *); extern int svc_rdma_fastreg(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 74de31a0661..a4756576d68 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -116,7 +116,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, * * Assumptions: * - chunk[0]->position points to pages[0] at an offset of 0 - * - pages[] is not physically or virtually contigous and consists of + * - pages[] is not physically or virtually contiguous and consists of * PAGE_SIZE elements. * * Output: @@ -125,7 +125,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, * chunk in the read list * */ -static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, +static int map_read_chunks(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, struct rpcrdma_msg *rmsgp, @@ -211,26 +211,128 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, return sge_no; } -static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, - struct svc_rdma_op_ctxt *ctxt, - struct kvec *vec, - u64 *sgl_offset, - int count) +/* Map a read-chunk-list to an XDR and fast register the page-list. + * + * Assumptions: + * - chunk[0] position points to pages[0] at an offset of 0 + * - pages[] will be made physically contiguous by creating a one-off memory + * region using the fastreg verb. + * - byte_count is # of bytes in read-chunk-list + * - ch_count is # of chunks in read-chunk-list + * + * Output: + * - sge array pointing into pages[] array. 
+ * - chunk_sge array specifying sge index and count for each + * chunk in the read list + */ +static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + struct rpcrdma_msg *rmsgp, + struct svc_rdma_req_map *rpl_map, + struct svc_rdma_req_map *chl_map, + int ch_count, + int byte_count) +{ + int page_no; + int ch_no; + u32 offset; + struct rpcrdma_read_chunk *ch; + struct svc_rdma_fastreg_mr *frmr; + int ret = 0; + + frmr = svc_rdma_get_frmr(xprt); + if (IS_ERR(frmr)) + return -ENOMEM; + + head->frmr = frmr; + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->hdr_count = head->count; /* save count of hdr pages */ + head->arg.page_base = 0; + head->arg.page_len = byte_count; + head->arg.len = rqstp->rq_arg.len + byte_count; + head->arg.buflen = rqstp->rq_arg.buflen + byte_count; + + /* Fast register the page list */ + frmr->kva = page_address(rqstp->rq_arg.pages[0]); + frmr->direction = DMA_FROM_DEVICE; + frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); + frmr->map_len = byte_count; + frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { + frmr->page_list->page_list[page_no] = + ib_dma_map_single(xprt->sc_cm_id->device, + page_address(rqstp->rq_arg.pages[page_no]), + PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; + } + head->count += page_no; + + /* rq_respages points one past arg pages */ + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; + + /* Create the reply and chunk maps */ + offset = 0; + ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + for (ch_no = 0; ch_no < ch_count; ch_no++) { + rpl_map->sge[ch_no].iov_base = frmr->kva + offset; + rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length; + chl_map->ch[ch_no].count = 1; + chl_map->ch[ch_no].start = ch_no; + offset += ch->rc_target.rs_length; + ch++; + } + + ret = svc_rdma_fastreg(xprt, frmr); + if (ret) + goto fatal_err; + + return ch_no; + + fatal_err: + printk("svcrdma: error fast registering xdr for xprt %p", xprt); + svc_rdma_put_frmr(xprt, frmr); + return -EIO; +} + +static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_fastreg_mr *frmr, + struct kvec *vec, + u64 *sgl_offset, + int count) { int i; ctxt->count = count; ctxt->direction = DMA_FROM_DEVICE; for (i = 0; i < count; i++) { - atomic_inc(&xprt->sc_dma_used); - ctxt->sge[i].addr = - ib_dma_map_single(xprt->sc_cm_id->device, - vec[i].iov_base, vec[i].iov_len, - DMA_FROM_DEVICE); + ctxt->sge[i].length = 0; /* in case map fails */ + if (!frmr) { + ctxt->sge[i].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + vec[i].iov_base, + vec[i].iov_len, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + ctxt->sge[i].addr)) + return -EINVAL; + ctxt->sge[i].lkey = xprt->sc_dma_lkey; + atomic_inc(&xprt->sc_dma_used); + } else { + ctxt->sge[i].addr = (unsigned long)vec[i].iov_base; + ctxt->sge[i].lkey = frmr->mr->lkey; + } ctxt->sge[i].length = vec[i].iov_len; - ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey; *sgl_offset = *sgl_offset + vec[i].iov_len; } + return 0; } static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) @@ -278,6 +380,7 @@ static int 
rdma_read_xdr(struct svcxprt_rdma *xprt, struct svc_rdma_op_ctxt *hdr_ctxt) { struct ib_send_wr read_wr; + struct ib_send_wr inv_wr; int err = 0; int ch_no; int ch_count; @@ -301,9 +404,20 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); if (ch_count > RPCSVC_MAXPAGES) return -EINVAL; - sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, - rpl_map, chl_map, - ch_count, byte_count); + + if (!xprt->sc_frmr_pg_list_len) + sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, + rpl_map, chl_map, ch_count, + byte_count); + else + sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, + rpl_map, chl_map, ch_count, + byte_count); + if (sge_count < 0) { + err = -EIO; + goto out; + } + sgl_offset = 0; ch_no = 0; @@ -312,13 +426,16 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, next_sge: ctxt = svc_rdma_get_context(xprt); ctxt->direction = DMA_FROM_DEVICE; + ctxt->frmr = hdr_ctxt->frmr; + ctxt->read_hdr = NULL; clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); /* Prepare READ WR */ memset(&read_wr, 0, sizeof read_wr); - ctxt->wr_op = IB_WR_RDMA_READ; read_wr.wr_id = (unsigned long)ctxt; read_wr.opcode = IB_WR_RDMA_READ; + ctxt->wr_op = read_wr.opcode; read_wr.send_flags = IB_SEND_SIGNALED; read_wr.wr.rdma.rkey = ch->rc_target.rs_handle; read_wr.wr.rdma.remote_addr = @@ -327,10 +444,15 @@ next_sge: read_wr.sg_list = ctxt->sge; read_wr.num_sge = rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); - rdma_set_ctxt_sge(xprt, ctxt, - &rpl_map->sge[chl_map->ch[ch_no].start], - &sgl_offset, - read_wr.num_sge); + err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr, + &rpl_map->sge[chl_map->ch[ch_no].start], + &sgl_offset, + read_wr.num_sge); + if (err) { + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + goto out; + } if (((ch+1)->rc_discrim == 0) && (read_wr.num_sge == chl_map->ch[ch_no].count)) { /* @@ -339,6 +461,29 @@ next_sge: * the client and the RPC needs to be enqueued. */ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + if (hdr_ctxt->frmr) { + set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + /* + * Invalidate the local MR used to map the data + * sink. 
+ */ + if (xprt->sc_dev_caps & + SVCRDMA_DEVCAP_READ_W_INV) { + read_wr.opcode = + IB_WR_RDMA_READ_WITH_INV; + ctxt->wr_op = read_wr.opcode; + read_wr.ex.invalidate_rkey = + ctxt->frmr->mr->lkey; + } else { + /* Prepare INVALIDATE WR */ + memset(&inv_wr, 0, sizeof inv_wr); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED; + inv_wr.ex.invalidate_rkey = + hdr_ctxt->frmr->mr->lkey; + read_wr.next = &inv_wr; + } + } ctxt->read_hdr = hdr_ctxt; } /* Post the read */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f22f5876766..fb0dff5e53e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -105,7 +105,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) return ctxt; } -static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) +void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) { struct svcxprt_rdma *xprt = ctxt->xprt; int i; @@ -343,9 +343,12 @@ static void process_context(struct svcxprt_rdma *xprt, break; case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; BUG_ON(!read_hdr); + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) + svc_rdma_put_frmr(xprt, ctxt->frmr); spin_lock_bh(&xprt->sc_rq_dto_lock); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); list_add_tail(&read_hdr->dto_q, From afd566ea080572499cc01d42d2f578bf4b54f20f Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Fri, 3 Oct 2008 15:45:03 -0500 Subject: [PATCH 57/59] svcrdma: Modify the RPC reply path to use FRMR when available Use FRMR to map local RPC reply data. This allows RDMA_WRITE to send reply data using a single WR. The FRMR is invalidated by linking the LOCAL_INV WR to the RDMA_SEND message used to complete the reply. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 255 +++++++++++++++++++---- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 + 2 files changed, 217 insertions(+), 40 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 84d328329d9..9a7a8e7ae03 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -69,9 +69,127 @@ * array is only concerned with the reply we are assured that we have * on extra page for the RPCRMDA header. */ -static void xdr_to_sge(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) +int fast_reg_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) +{ + int sge_no; + u32 sge_bytes; + u32 page_bytes; + u32 page_off; + int page_no = 0; + u8 *frva; + struct svc_rdma_fastreg_mr *frmr; + + frmr = svc_rdma_get_frmr(xprt); + if (IS_ERR(frmr)) + return -ENOMEM; + vec->frmr = frmr; + + /* Skip the RPCRDMA header */ + sge_no = 1; + + /* Map the head. 
*/ + frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK); + vec->sge[sge_no].iov_base = xdr->head[0].iov_base; + vec->sge[sge_no].iov_len = xdr->head[0].iov_len; + vec->count = 2; + sge_no++; + + /* Build the FRMR */ + frmr->kva = frva; + frmr->direction = DMA_TO_DEVICE; + frmr->access_flags = 0; + frmr->map_len = PAGE_SIZE; + frmr->page_list_len = 1; + frmr->page_list->page_list[page_no] = + ib_dma_map_single(xprt->sc_cm_id->device, + (void *)xdr->head[0].iov_base, + PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + + page_off = xdr->page_base; + page_bytes = xdr->page_len + page_off; + if (!page_bytes) + goto encode_tail; + + /* Map the pages */ + vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; + vec->sge[sge_no].iov_len = page_bytes; + sge_no++; + while (page_bytes) { + struct page *page; + + page = xdr->pages[page_no++]; + sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); + page_bytes -= sge_bytes; + + frmr->page_list->page_list[page_no] = + ib_dma_map_page(xprt->sc_cm_id->device, page, 0, + PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + + atomic_inc(&xprt->sc_dma_used); + page_off = 0; /* reset for next time through loop */ + frmr->map_len += PAGE_SIZE; + frmr->page_list_len++; + } + vec->count++; + + encode_tail: + /* Map tail */ + if (0 == xdr->tail[0].iov_len) + goto done; + + vec->count++; + vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; + + if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) == + ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) { + /* + * If head and tail use the same page, we don't need + * to map it again. 
+ */ + vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; + } else { + void *va; + + /* Map another page for the tail */ + page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; + va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK); + vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; + + frmr->page_list->page_list[page_no] = + ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + frmr->map_len += PAGE_SIZE; + frmr->page_list_len++; + } + + done: + if (svc_rdma_fastreg(xprt, frmr)) + goto fatal_err; + + return 0; + + fatal_err: + printk("svcrdma: Error fast registering memory for xprt %p\n", xprt); + svc_rdma_put_frmr(xprt, frmr); + return -EIO; +} + +static int map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; int sge_no; @@ -83,6 +201,9 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt, BUG_ON(xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); + if (xprt->sc_frmr_pg_list_len) + return fast_reg_xdr(xprt, xdr, vec); + /* Skip the first sge, this is for the RPCRDMA header */ sge_no = 1; @@ -116,9 +237,12 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt, BUG_ON(sge_no > sge_max); vec->count = sge_no; + return 0; } /* Assumptions: + * - We are using FRMR + * - or - * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, @@ -158,30 +282,35 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge_no = 0; /* Copy the remaining SGE */ - while (bc != 0 && xdr_sge_no < vec->count) { - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; - sge_bytes = min((size_t)bc, - (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off)); + while (bc != 0) { + sge_bytes = min_t(size_t, + bc, vec->sge[xdr_sge_no].iov_len-sge_off); sge[sge_no].length = sge_bytes; - atomic_inc(&xprt->sc_dma_used); - sge[sge_no].addr = - ib_dma_map_single(xprt->sc_cm_id->device, - (void *) - vec->sge[xdr_sge_no].iov_base + sge_off, - sge_bytes, DMA_TO_DEVICE); - if (dma_mapping_error(xprt->sc_cm_id->device->dma_device, - sge[sge_no].addr)) - goto err; + if (!vec->frmr) { + sge[sge_no].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + (void *) + vec->sge[xdr_sge_no].iov_base + sge_off, + sge_bytes, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + sge[sge_no].addr)) + goto err; + atomic_inc(&xprt->sc_dma_used); + sge[sge_no].lkey = xprt->sc_dma_lkey; + } else { + sge[sge_no].addr = (unsigned long) + vec->sge[xdr_sge_no].iov_base + sge_off; + sge[sge_no].lkey = vec->frmr->mr->lkey; + } + ctxt->count++; + ctxt->frmr = vec->frmr; sge_off = 0; sge_no++; - ctxt->count++; xdr_sge_no++; + BUG_ON(xdr_sge_no > vec->count); bc -= sge_bytes; } - BUG_ON(bc != 0); - BUG_ON(xdr_sge_no > vec->count); - /* Prepare WRITE WR */ memset(&write_wr, 0, sizeof write_wr); ctxt->wr_op = IB_WR_RDMA_WRITE; @@ -226,7 +355,10 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[1]; - max_write = xprt->sc_max_sge * PAGE_SIZE; + if (vec->frmr) + max_write = vec->frmr->map_len; + else + max_write = xprt->sc_max_sge * PAGE_SIZE; /* Write chunks start at the pagelist */ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; @@ -297,7 +429,10 @@ static int 
send_reply_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[2]; - max_write = xprt->sc_max_sge * PAGE_SIZE; + if (vec->frmr) + max_write = vec->frmr->map_len; + else + max_write = xprt->sc_max_sge * PAGE_SIZE; /* xdr offset starts at RPC message */ for (xdr_off = 0, chunk_no = 0; @@ -307,7 +442,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, ch = &arg_ary->wc_array[chunk_no].wc_target; write_len = min(xfer_len, ch->rs_length); - /* Prepare the reply chunk given the length actually * written */ rs_offset = get_unaligned(&(ch->rs_offset)); @@ -366,6 +500,7 @@ static int send_reply(struct svcxprt_rdma *rdma, int byte_count) { struct ib_send_wr send_wr; + struct ib_send_wr inv_wr; int sge_no; int sge_bytes; int page_no; @@ -385,27 +520,45 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the context */ ctxt->pages[0] = page; ctxt->count = 1; + ctxt->frmr = vec->frmr; + if (vec->frmr) + set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + else + clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); /* Prepare the SGE for the RPCRDMA Header */ - atomic_inc(&rdma->sc_dma_used); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, PAGE_SIZE, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->direction = DMA_TO_DEVICE; + ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); - ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; + ctxt->sge[0].lkey = rdma->sc_dma_lkey; /* Determine how many of our SGE are to be transmitted */ for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; - atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].addr = - ib_dma_map_single(rdma->sc_cm_id->device, - vec->sge[sge_no].iov_base, - sge_bytes, DMA_TO_DEVICE); + if (!vec->frmr) { + ctxt->sge[sge_no].addr = + ib_dma_map_single(rdma->sc_cm_id->device, + vec->sge[sge_no].iov_base, + sge_bytes, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, + ctxt->sge[sge_no].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + } else { + ctxt->sge[sge_no].addr = (unsigned long) + vec->sge[sge_no].iov_base; + ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey; + } ctxt->sge[sge_no].length = sge_bytes; - ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey; } BUG_ON(byte_count != 0); @@ -417,11 +570,16 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; ctxt->count++; rqstp->rq_respages[page_no] = NULL; - /* If there are more pages than SGE, terminate SGE list */ + /* + * If there are more pages than SGE, terminate SGE + * list so that svc_rdma_unmap_dma doesn't attempt to + * unmap garbage. 
+ */ if (page_no+1 >= sge_no) ctxt->sge[page_no+1].length = 0; } BUG_ON(sge_no > rdma->sc_max_sge); + BUG_ON(sge_no > ctxt->count); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; send_wr.wr_id = (unsigned long)ctxt; @@ -429,12 +587,26 @@ static int send_reply(struct svcxprt_rdma *rdma, send_wr.num_sge = sge_no; send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; + if (vec->frmr) { + /* Prepare INVALIDATE WR */ + memset(&inv_wr, 0, sizeof inv_wr); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED; + inv_wr.ex.invalidate_rkey = + vec->frmr->mr->lkey; + send_wr.next = &inv_wr; + } ret = svc_rdma_send(rdma, &send_wr); if (ret) - svc_rdma_put_context(ctxt, 1); + goto err; - return ret; + return 0; + + err: + svc_rdma_put_frmr(rdma, vec->frmr); + svc_rdma_put_context(ctxt, 1); + return -EIO; } void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) @@ -477,8 +649,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; vec = svc_rdma_get_req_map(); - xdr_to_sge(rdma, &rqstp->rq_res, vec); - + ret = map_xdr(rdma, &rqstp->rq_res, vec); + if (ret) + goto err0; inline_bytes = rqstp->rq_res.len; /* Create the RDMA response header */ @@ -498,7 +671,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (ret < 0) { printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", ret); - goto error; + goto err1; } inline_bytes -= ret; @@ -508,7 +681,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (ret < 0) { printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", ret); - goto error; + goto err1; } inline_bytes -= ret; @@ -517,9 +690,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) svc_rdma_put_req_map(vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; - error: + + err1: + put_page(res_page); + err0: svc_rdma_put_req_map(vec); svc_rdma_put_context(ctxt, 0); - put_page(res_page); return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index fb0dff5e53e..98f945c5a00 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -335,6 +335,8 @@ static void process_context(struct svcxprt_rdma *xprt, switch (ctxt->wr_op) { case IB_WR_SEND: + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) + svc_rdma_put_frmr(xprt, ctxt->frmr); svc_rdma_put_context(ctxt, 1); break; From 04911b539c9817aa88a6da8f563e65e3e0bc974b Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Mon, 11 Aug 2008 15:14:53 -0500 Subject: [PATCH 58/59] svcrdma: Update svc_rdma_send_error to use DMA LKEY Update the svc_rdma_send_error code to use the DMA LKEY which is valid regardless of the memory registration strategy in use. 
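A reduced view of the error-reply SGE setup after this change, condensed from the hunk below: the mapping is verified before it is accounted, and the transport's DMA LKEY replaces the DMA MR's lkey:

	sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
				   p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) {
		put_page(p);
		return;
	}
	atomic_inc(&xprt->sc_dma_used);
	sge.lkey   = xprt->sc_dma_lkey;
	sge.length = length;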
Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 98f945c5a00..c0cd334fe1c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1314,10 +1314,14 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); /* Prepare SGE for local address */ - atomic_inc(&xprt->sc_dma_used); sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, p, 0, PAGE_SIZE, DMA_FROM_DEVICE); - sge.lkey = xprt->sc_phys_mr->lkey; + if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) { + put_page(p); + return; + } + atomic_inc(&xprt->sc_dma_used); + sge.lkey = xprt->sc_dma_lkey; sge.length = length; ctxt = svc_rdma_get_context(xprt); @@ -1338,6 +1342,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, if (ret) { dprintk("svcrdma: Error %d posting send for protocol error\n", ret); + ib_dma_unmap_page(xprt->sc_cm_id->device, + sge.addr, PAGE_SIZE, + DMA_FROM_DEVICE); svc_rdma_put_context(ctxt, 1); } } From 67080c82361b7510b602c87b83399421aa2d2895 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Fri, 3 Oct 2008 12:41:14 -0500 Subject: [PATCH 59/59] svcrdma: Fix IRD/ORD polarity The initiator/responder resources in the event have been swapped. They now represent what the local peer would set their values to in order to match the peer. Note that iWARP does not exchange these on the wire and the provider is simply putting in the local device max. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index c0cd334fe1c..6fb493cbd29 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -598,7 +598,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " "event=%d\n", cma_id, cma_id->context, event->event); handle_connect_req(cma_id, - event->param.conn.responder_resources); + event->param.conn.initiator_depth); break; case RDMA_CM_EVENT_ESTABLISHED: