mirror of
https://github.com/adulau/aha.git
synced 2024-12-28 03:36:19 +00:00
MIPS: Outline udelay and fix a few issues.
Outlining fixes the issue where, on certain CPUs such as the R10000 family, the delay loop would need an extra cycle if it overlaps a cacheline boundary. The rewrite also fixes build errors with GCC 4.4, which was changed in a way incompatible with the kernel's inline assembly. Relying on pure C for computation of the delay value removes the need for explicit inline assembly. The price we pay is a slight slowdown of the computation - to be fixed on another day. Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
This commit is contained in:
parent
3a553147ea
commit
5636919b5c
5 changed files with 66 additions and 92 deletions
|
@ -39,8 +39,8 @@ struct cache_desc {
|
|||
#define MIPS_CACHE_PINDEX 0x00000020 /* Physically indexed cache */
|
||||
|
||||
struct cpuinfo_mips {
|
||||
unsigned long udelay_val;
|
||||
unsigned long asid_cache;
|
||||
unsigned int udelay_val;
|
||||
unsigned int asid_cache;
|
||||
|
||||
/*
|
||||
* Capability and feature descriptor structure for MIPS CPU
|
||||
|
|
|
@ -11,94 +11,12 @@
|
|||
#ifndef _ASM_DELAY_H
|
||||
#define _ASM_DELAY_H
|
||||
|
||||
#include <linux/param.h>
|
||||
#include <linux/smp.h>
|
||||
extern void __delay(unsigned int loops);
|
||||
/* Out-of-line nanosecond delay; parameter type matches the definition in lib/delay.c. */
extern void __ndelay(unsigned long ns);
|
||||
/* Out-of-line microsecond delay; parameter type matches the definition in lib/delay.c. */
extern void __udelay(unsigned long us);
|
||||
|
||||
#include <asm/compiler.h>
|
||||
#include <asm/war.h>
|
||||
|
||||
static inline void __delay(unsigned long loops)
|
||||
{
|
||||
if (sizeof(long) == 4)
|
||||
__asm__ __volatile__ (
|
||||
" .set noreorder \n"
|
||||
" .align 3 \n"
|
||||
"1: bnez %0, 1b \n"
|
||||
" subu %0, 1 \n"
|
||||
" .set reorder \n"
|
||||
: "=r" (loops)
|
||||
: "0" (loops));
|
||||
else if (sizeof(long) == 8 && !DADDI_WAR)
|
||||
__asm__ __volatile__ (
|
||||
" .set noreorder \n"
|
||||
" .align 3 \n"
|
||||
"1: bnez %0, 1b \n"
|
||||
" dsubu %0, 1 \n"
|
||||
" .set reorder \n"
|
||||
: "=r" (loops)
|
||||
: "0" (loops));
|
||||
else if (sizeof(long) == 8 && DADDI_WAR)
|
||||
__asm__ __volatile__ (
|
||||
" .set noreorder \n"
|
||||
" .align 3 \n"
|
||||
"1: bnez %0, 1b \n"
|
||||
" dsubu %0, %2 \n"
|
||||
" .set reorder \n"
|
||||
: "=r" (loops)
|
||||
: "0" (loops), "r" (1));
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Division by multiplication: you don't have to worry about
|
||||
* loss of precision.
|
||||
*
|
||||
* Use only for very small delays ( < 1 msec). Should probably use a
|
||||
* lookup table, really, as the multiplications take much too long with
|
||||
* short delays. This is a "reasonable" implementation, though (and the
|
||||
* first constant multiplications gets optimized away if the delay is
|
||||
* a constant)
|
||||
*/
|
||||
|
||||
static inline void __udelay(unsigned long usecs, unsigned long lpj)
|
||||
{
|
||||
unsigned long hi, lo;
|
||||
|
||||
/*
|
||||
* The rates of 128 is rounded wrongly by the catchall case
|
||||
* for 64-bit. Excessive precission? Probably ...
|
||||
*/
|
||||
#if defined(CONFIG_64BIT) && (HZ == 128)
|
||||
usecs *= 0x0008637bd05af6c7UL; /* 2**64 / (1000000 / HZ) */
|
||||
#elif defined(CONFIG_64BIT)
|
||||
usecs *= (0x8000000000000000UL / (500000 / HZ));
|
||||
#else /* 32-bit junk follows here */
|
||||
usecs *= (unsigned long) (((0x8000000000000000ULL / (500000 / HZ)) +
|
||||
0x80000000ULL) >> 32);
|
||||
#endif
|
||||
|
||||
if (sizeof(long) == 4)
|
||||
__asm__("multu\t%2, %3"
|
||||
: "=h" (usecs), "=l" (lo)
|
||||
: "r" (usecs), "r" (lpj)
|
||||
: GCC_REG_ACCUM);
|
||||
else if (sizeof(long) == 8 && !R4000_WAR)
|
||||
__asm__("dmultu\t%2, %3"
|
||||
: "=h" (usecs), "=l" (lo)
|
||||
: "r" (usecs), "r" (lpj)
|
||||
: GCC_REG_ACCUM);
|
||||
else if (sizeof(long) == 8 && R4000_WAR)
|
||||
__asm__("dmultu\t%3, %4\n\tmfhi\t%0"
|
||||
: "=r" (usecs), "=h" (hi), "=l" (lo)
|
||||
: "r" (usecs), "r" (lpj)
|
||||
: GCC_REG_ACCUM);
|
||||
|
||||
__delay(usecs);
|
||||
}
|
||||
|
||||
#define __udelay_val cpu_data[raw_smp_processor_id()].udelay_val
|
||||
|
||||
#define udelay(usecs) __udelay((usecs), __udelay_val)
|
||||
/* ndelay(ns): delay for @ns nanoseconds; must use __ndelay(), not the microsecond helper. */
#define ndelay(ns) __ndelay(ns)
|
||||
/* udelay(us): delay for @us microseconds by calling the out-of-line __udelay(). */
#define udelay(us) __udelay(us)
|
||||
|
||||
/* make sure "usecs *= ..." in udelay do not overflow. */
|
||||
#if HZ >= 1000
|
||||
|
|
|
@ -42,7 +42,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
|
|||
seq_printf(m, fmt, __cpu_name[n],
|
||||
(version >> 4) & 0x0f, version & 0x0f,
|
||||
(fp_vers >> 4) & 0x0f, fp_vers & 0x0f);
|
||||
seq_printf(m, "BogoMIPS\t\t: %lu.%02lu\n",
|
||||
seq_printf(m, "BogoMIPS\t\t: %u.%02u\n",
|
||||
cpu_data[n].udelay_val / (500000/HZ),
|
||||
(cpu_data[n].udelay_val / (5000/HZ)) % 100);
|
||||
seq_printf(m, "wait instruction\t: %s\n", cpu_wait ? "yes" : "no");
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
# Makefile for MIPS-specific library files..
|
||||
#
|
||||
|
||||
lib-y += csum_partial.o memcpy.o memcpy-inatomic.o memset.o strlen_user.o \
|
||||
strncpy_user.o strnlen_user.o uncached.o
|
||||
lib-y += csum_partial.o delay.o memcpy.o memcpy-inatomic.o memset.o \
|
||||
strlen_user.o strncpy_user.o strnlen_user.o uncached.o
|
||||
|
||||
obj-y += iomap.o
|
||||
obj-$(CONFIG_PCI) += iomap-pci.o
|
||||
|
|
56
arch/mips/lib/delay.c
Normal file
56
arch/mips/lib/delay.c
Normal file
|
@ -0,0 +1,56 @@
|
|||
/*
|
||||
* This file is subject to the terms and conditions of the GNU General Public
|
||||
* License. See the file "COPYING" in the main directory of this archive
|
||||
* for more details.
|
||||
*
|
||||
* Copyright (C) 1994 by Waldorf Electronics
|
||||
* Copyright (C) 1995 - 2000, 01, 03 by Ralf Baechle
|
||||
* Copyright (C) 1999, 2000 Silicon Graphics, Inc.
|
||||
* Copyright (C) 2007 Maciej W. Rozycki
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/param.h>
|
||||
#include <linux/smp.h>
|
||||
|
||||
#include <asm/compiler.h>
|
||||
#include <asm/war.h>
|
||||
|
||||
/*
 * __delay - spin for @loops iterations of a two-instruction countdown loop.
 * @loops: number of iterations to burn.
 *
 * This is intentionally a plain external function rather than "inline":
 * it is exported below and declared extern in <asm/delay.h>, and a bare
 * "inline" definition emits no external symbol under C99 inline semantics.
 * Keeping a single out-of-line copy also means the loop's alignment is
 * decided once, here, instead of at every call site.
 */
void __delay(unsigned int loops)
{
	__asm__ __volatile__ (
	/* noreorder: the assembler must leave the subu right after the branch */
	" .set noreorder \n"
	/* align the loop start on a 2^3 = 8 byte boundary */
	" .align 3 \n"
	/* branch back while the counter is non-zero ... */
	"1: bnez %0, 1b \n"
	/* ... and decrement it in the instruction slot following the branch */
	" subu %0, 1 \n"
	" .set reorder \n"
	: "=r" (loops)
	: "0" (loops));
}
|
||||
EXPORT_SYMBOL(__delay);
|
||||
|
||||
/*
|
||||
* Division by multiplication: you don't have to worry about
|
||||
* loss of precision.
|
||||
*
|
||||
* Use only for very small delays ( < 1 msec). Should probably use a
|
||||
* lookup table, really, as the multiplications take much too long with
|
||||
* short delays. This is a "reasonable" implementation, though (and the
|
||||
* first constant multiplications gets optimized away if the delay is
|
||||
* a constant)
|
||||
*/
|
||||
|
||||
void __udelay(unsigned long us)
|
||||
{
|
||||
unsigned int lpj = current_cpu_data.udelay_val;
|
||||
|
||||
__delay((us * 0x000010c7 * HZ * lpj) >> 32);
|
||||
}
|
||||
EXPORT_SYMBOL(__udelay);
|
||||
|
||||
void __ndelay(unsigned long ns)
|
||||
{
|
||||
unsigned int lpj = current_cpu_data.udelay_val;
|
||||
|
||||
__delay((us * 0x00000005 * HZ * lpj) >> 32);
|
||||
}
|
||||
EXPORT_SYMBOL(__ndelay);
|
Loading…
Reference in a new issue