aha/net/rds/page.c

/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/highmem.h>

#include "rds.h"

/*
 * Per-cpu cache of a partially consumed page.
 *
 * r_page is the cached page, or NULL when nothing is cached.  r_offset is
 * the offset within r_page of the first byte that has not been handed out
 * yet by rds_page_remainder_alloc().
 */
struct rds_page_remainder {
	struct page	*r_page;
	unsigned long	r_offset;
};

/* One remainder slot per cpu; accessed through per_cpu() in this file. */
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);

/*
 * Copy @bytes bytes between @page (starting at @offset within the page)
 * and the user buffer @ptr.  A non-zero @to_user copies from the page to
 * userspace; zero copies from userspace into the page.  The matching
 * s_copy_to_user / s_copy_from_user statistic is bumped by @bytes.
 *
 * returns 0 on success or -EFAULT if the user range could not be copied
 * in full.
 *
 * Only the checked copy_to_user()/copy_from_user() helpers are used here.
 * The __copy_*_inatomic() variants do not perform the access_ok() range
 * check, and nothing in this function validates @ptr, so using them as a
 * fast path would let a caller-supplied kernel address be read or written
 * whenever the atomic copy happened to succeed.
 *
 * kmap() may sleep, so this must be called from a context that can sleep.
 *
 * We don't have to worry about flush_dcache_page() as this only works
 * with private pages.  If, say, we were to do directed receive to pinned
 * user pages we'd have to worry more about cache coherence.  (Though
 * the flush_dcache_page() in get_user_pages() would probably be enough).
 */
int rds_page_copy_user(struct page *page, unsigned long offset,
		       void __user *ptr, unsigned long bytes,
		       int to_user)
{
	unsigned long ret;
	void *addr;

	addr = kmap(page);
	if (to_user) {
		rds_stats_add(s_copy_to_user, bytes);
		ret = copy_to_user(ptr, addr + offset, bytes);
	} else {
		rds_stats_add(s_copy_from_user, bytes);
		ret = copy_from_user(addr + offset, ptr, bytes);
	}
	kunmap(page);

	/* copy_*_user() return the number of bytes left uncopied */
	return ret ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(rds_page_copy_user);

/*
 * Message allocation uses this to build up regions of a message.
 *
 * @scat - the scatterlist entry that is filled in with the page, offset
 *         and length of the allocated region on success.
 * @bytes - the number of bytes needed.
 * @gfp - the waiting behaviour of the allocation
 *
 * Returns 0 on success or -ENOMEM if alloc_page() fails.  @scat is only
 * written on success.
 *
 * @gfp is always ored with __GFP_HIGHMEM.  Callers must be prepared to
 * kmap the pages, etc.
 *
 * If @bytes is at least a full page then this just returns a page from
 * alloc_page().  The entry length is set to PAGE_SIZE in that case, even
 * when @bytes is larger.
 *
 * If @bytes is a partial page then this stores the unused region of the
 * page in a per-cpu structure.  Future partial-page allocations may be
 * satisfied from that cached region.  This lets us waste less memory on
 * small allocations with minimal complexity.  It works because the transmit
 * path passes read-only page regions down to devices.  They hold a page
 * reference until they are done with the region.
 *
 * Reference counting: the per-cpu cache keeps the reference obtained from
 * alloc_page(); every fragment handed out takes an additional reference
 * with get_page().  The cache drops its own reference with __free_page()
 * once the page is used up or is tossed.
 */
int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
			     gfp_t gfp)
{
	struct rds_page_remainder *rem;
	unsigned long flags;
	struct page *page;
	int ret;

	gfp |= __GFP_HIGHMEM;

	/* a full page (or more) bypasses the per-cpu remainder cache */
	if (bytes >= PAGE_SIZE) {
		page = alloc_page(gfp);
		if (page == NULL) {
			ret = -ENOMEM;
		} else {
			sg_set_page(scat, page, PAGE_SIZE, 0);
			ret = 0;
		}
		goto out;
	}

	/*
	 * Stay on this cpu and keep interrupts off while the per-cpu
	 * remainder is inspected and modified.
	 */
	rem = &per_cpu(rds_page_remainders, get_cpu());
	local_irq_save(flags);

	while (1) {
		/* avoid a tiny region getting stuck by tossing it */
		if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
			rds_stats_inc(s_page_remainder_miss);
			__free_page(rem->r_page);
			rem->r_page = NULL;
		}

		/* hand out a fragment from the cached page */
		if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
			sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
			/* the fragment carries its own page reference */
			get_page(sg_page(scat));

			/* the first fragment of a fresh page is not a hit */
			if (rem->r_offset != 0)
				rds_stats_inc(s_page_remainder_hit);

			rem->r_offset += bytes;
			/* page fully consumed: drop the cache's reference */
			if (rem->r_offset == PAGE_SIZE) {
				__free_page(rem->r_page);
				rem->r_page = NULL;
			}
			ret = 0;
			break;
		}

		/*
		 * alloc if there is nothing for us to use.  Interrupts and
		 * the cpu pin are released around alloc_page(), which may
		 * need to block depending on @gfp.
		 */
		local_irq_restore(flags);
		put_cpu();

		page = alloc_page(gfp);

		/*
		 * We may be running on a different cpu by now, so look the
		 * remainder up again before touching it.
		 */
		rem = &per_cpu(rds_page_remainders, get_cpu());
		local_irq_save(flags);

		/* leave the loop with irqs off; they are restored below */
		if (page == NULL) {
			ret = -ENOMEM;
			break;
		}

		/* did someone race to fill the remainder before us? */
		if (rem->r_page) {
			__free_page(page);
			continue;
		}

		/*
		 * otherwise install our page and loop around so the
		 * fragment is handed out from it
		 */
		rem->r_page = page;
		rem->r_offset = 0;
	}

	local_irq_restore(flags);
	put_cpu();
out:
	/* @scat is only meaningful on success, so don't read it on error */
	rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
		 ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
		 ret ? 0 : scat->length);
	return ret;
}

/*
 * Notifier-style callback for the per-cpu page remainder cache.
 *
 * @self   - unused.
 * @action - the event; only CPU_DEAD is acted upon.
 * @hcpu   - the cpu number, carried in a pointer-sized value.
 *
 * On CPU_DEAD the page cached for that cpu (if any) is released and the
 * slot is cleared.  Always returns 0.
 */
static int rds_page_remainder_cpu_notify(struct notifier_block *self,
					 unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;
	struct rds_page_remainder *cached = &per_cpu(rds_page_remainders, cpu);

	rdsdebug("cpu %ld action 0x%lx\n", cpu, action);

	/* nothing to do for any other event */
	if (action != CPU_DEAD)
		return 0;

	if (cached->r_page)
		__free_page(cached->r_page);
	cached->r_page = NULL;

	return 0;
}

/*
 * Notifier block wrapping rds_page_remainder_cpu_notify().  Within this
 * file it is only passed as the @self argument by rds_page_exit(); no
 * registration with a notifier chain is visible here.
 */
static struct notifier_block rds_page_remainder_nb = {
	.notifier_call = rds_page_remainder_cpu_notify,
};

void rds_page_exit(void)
{
	int i;

	for_each_possible_cpu(i)
		rds_page_remainder_cpu_notify(&rds_page_remainder_nb,
					      (unsigned long)CPU_DEAD,
					      (void *)(long)i);
}
RDS: Message parsing Parsing of newly-received RDS message headers (including ext. headers) and copy-to/from-user routines. page.c implements a per-cpu page remainder cache, to reduce the number of allocations needed for small datagrams. Signed-off-by: Andy Grover <andy.grover@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2009-02-24 15:30:26 +00:00			`/*`
			`* Copyright (c) 2006 Oracle. All rights reserved.`
			`*`
			`* This software is available to you under a choice of one of two`
			`* licenses. You may choose to be licensed under the terms of the GNU`
			`* General Public License (GPL) Version 2, available from the file`
			`* COPYING in the main directory of this source tree, or the`
			`* OpenIB.org BSD license below:`
			`*`
			`* Redistribution and use in source and binary forms, with or`
			`* without modification, are permitted provided that the following`
			`* conditions are met:`
			`*`
			`* - Redistributions of source code must retain the above`
			`* copyright notice, this list of conditions and the following`
			`* disclaimer.`
			`*`
			`* - Redistributions in binary form must reproduce the above`
			`* copyright notice, this list of conditions and the following`
			`* disclaimer in the documentation and/or other materials`
			`* provided with the distribution.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,`
			`* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF`
			`* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND`
			`* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS`
			`* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN`
			`* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN`
			`* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE`
			`* SOFTWARE.`
			`*`
			`*/`
			`#include <linux/highmem.h>`

			`#include "rds.h"`

			`struct rds_page_remainder {`
			`struct page *r_page;`
			`unsigned long r_offset;`
			`};`

percpu: use DEFINE_PER_CPU_SHARED_ALIGNED() There are a few places where ___cacheline_aligned* is used with DEFINE_PER_CPU(). Use DEFINE_PER_CPU_SHARED_ALIGNED() instead. DEFINE_PER_CPU_SHARED_ALIGNED() applies alignment only on SMPs. While all other converted places used _in_smp variant or only get compiled for SMP, net/rds used unconditional ____cacheline_aligned. I don't see any reason these data structures should be aligned on UP and thus converted together. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Tony Luck <tony.luck@intel.com> Cc: Andy Grover <andy.grover@oracle.com> 2009-06-24 06:13:47 +00:00			`DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);`
RDS: Message parsing Parsing of newly-received RDS message headers (including ext. headers) and copy-to/from-user routines. page.c implements a per-cpu page remainder cache, to reduce the number of allocations needed for small datagrams. Signed-off-by: Andy Grover <andy.grover@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2009-02-24 15:30:26 +00:00
			`/*`
			`* returns 0 on success or -errno on failure.`
			`*`
			`* We don't have to worry about flush_dcache_page() as this only works`
			`* with private pages. If, say, we were to do directed receive to pinned`
			`* user pages we'd have to worry more about cache coherence. (Though`
			`* the flush_dcache_page() in get_user_pages() would probably be enough).`
			`*/`
			`int rds_page_copy_user(struct page *page, unsigned long offset,`
			`void __user *ptr, unsigned long bytes,`
			`int to_user)`
			`{`
			`unsigned long ret;`
			`void *addr;`

			`if (to_user)`
			`rds_stats_add(s_copy_to_user, bytes);`
			`else`
			`rds_stats_add(s_copy_from_user, bytes);`

			`addr = kmap_atomic(page, KM_USER0);`
			`if (to_user)`
			`ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);`
			`else`
			`ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);`
			`kunmap_atomic(addr, KM_USER0);`

			`if (ret) {`
			`addr = kmap(page);`
			`if (to_user)`
			`ret = copy_to_user(ptr, addr + offset, bytes);`
			`else`
			`ret = copy_from_user(addr + offset, ptr, bytes);`
			`kunmap(page);`
			`if (ret)`
			`return -EFAULT;`
			`}`

			`return 0;`
			`}`
RDS: Export symbols from core RDS Now that rdma and tcp transports will be modularized, we need to export a number of functions so they can call them. Signed-off-by: Andy Grover <andy.grover@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2009-08-21 12:28:32 +00:00			`EXPORT_SYMBOL_GPL(rds_page_copy_user);`
RDS: Message parsing Parsing of newly-received RDS message headers (including ext. headers) and copy-to/from-user routines. page.c implements a per-cpu page remainder cache, to reduce the number of allocations needed for small datagrams. Signed-off-by: Andy Grover <andy.grover@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2009-02-24 15:30:26 +00:00
			`/*`
			`* Message allocation uses this to build up regions of a message.`
			`*`
			`* @bytes - the number of bytes needed.`
			`* @gfp - the waiting behaviour of the allocation`
			`*`
			`* @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to`
			`* kmap the pages, etc.`
			`*`
			`* If @bytes is at least a full page then this just returns a page from`
			`* alloc_page().`
			`*`
			`* If @bytes is a partial page then this stores the unused region of the`
			`* page in a per-cpu structure. Future partial-page allocations may be`
			`* satisfied from that cached region. This lets us waste less memory on`
			`* small allocations with minimal complexity. It works because the transmit`
			`* path passes read-only page regions down to devices. They hold a page`
			`* reference until they are done with the region.`
			`*/`
			`int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,`
			`gfp_t gfp)`
			`{`
			`struct rds_page_remainder *rem;`
			`unsigned long flags;`
			`struct page *page;`
			`int ret;`

			`gfp |= __GFP_HIGHMEM;`

			`/* jump straight to allocation if we're trying for a huge page */`
			`if (bytes >= PAGE_SIZE) {`
			`page = alloc_page(gfp);`
			`if (page == NULL) {`
			`ret = -ENOMEM;`
			`} else {`
			`sg_set_page(scat, page, PAGE_SIZE, 0);`
			`ret = 0;`
			`}`
			`goto out;`
			`}`

			`rem = &per_cpu(rds_page_remainders, get_cpu());`
			`local_irq_save(flags);`

			`while (1) {`
			`/* avoid a tiny region getting stuck by tossing it */`
			`if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {`
			`rds_stats_inc(s_page_remainder_miss);`
			`__free_page(rem->r_page);`
			`rem->r_page = NULL;`
			`}`

			`/* hand out a fragment from the cached page */`
			`if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {`
			`sg_set_page(scat, rem->r_page, bytes, rem->r_offset);`
			`get_page(sg_page(scat));`

			`if (rem->r_offset != 0)`
			`rds_stats_inc(s_page_remainder_hit);`

			`rem->r_offset += bytes;`
			`if (rem->r_offset == PAGE_SIZE) {`
			`__free_page(rem->r_page);`
			`rem->r_page = NULL;`
			`}`
			`ret = 0;`
			`break;`
			`}`

			`/* alloc if there is nothing for us to use */`
			`local_irq_restore(flags);`
			`put_cpu();`

			`page = alloc_page(gfp);`

			`rem = &per_cpu(rds_page_remainders, get_cpu());`
			`local_irq_save(flags);`

			`if (page == NULL) {`
			`ret = -ENOMEM;`
			`break;`
			`}`

			`/* did someone race to fill the remainder before us? */`
			`if (rem->r_page) {`
			`__free_page(page);`
			`continue;`
			`}`

			`/* otherwise install our page and loop around to alloc */`
			`rem->r_page = page;`
			`rem->r_offset = 0;`
			`}`

			`local_irq_restore(flags);`
			`put_cpu();`
			`out:`
			`rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,`
			`ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,`
			`ret ? 0 : scat->length);`
			`return ret;`
			`}`

			`static int rds_page_remainder_cpu_notify(struct notifier_block *self,`
			`unsigned long action, void *hcpu)`
			`{`
			`struct rds_page_remainder *rem;`
			`long cpu = (long)hcpu;`

			`rem = &per_cpu(rds_page_remainders, cpu);`

			`rdsdebug("cpu %ld action 0x%lx\n", cpu, action);`

			`switch (action) {`
			`case CPU_DEAD:`
			`if (rem->r_page)`
			`__free_page(rem->r_page);`
			`rem->r_page = NULL;`
			`break;`
			`}`

			`return 0;`
			`}`

			`static struct notifier_block rds_page_remainder_nb = {`
			`.notifier_call = rds_page_remainder_cpu_notify,`
			`};`

			`void rds_page_exit(void)`
			`{`
			`int i;`

			`for_each_possible_cpu(i)`
			`rds_page_remainder_cpu_notify(&rds_page_remainder_nb,`
			`(unsigned long)CPU_DEAD,`
			`(void *)(long)i);`
			`}`