kernel_optimize_test/arch/mips/lib/delay.c
Maciej W. Rozycki 2db4bc3418 MIPS: __delay CPU_DADDI_WORKAROUNDS bug fix
With CPU_DADDI_WORKAROUNDS enabled __delay assembles with a macro in a
branch delay slot:

{standard input}: Assembler messages:
{standard input}:18: Warning: Macro instruction expanded into multiple
instructions in a branch delay slot

and broken code results:

0000000000000000 <__delay>:
   0:	1480ffff 	bnez	a0,0 <__delay>
   4:	24010001 	li	at,1
   8:	0081202f 	dsubu	a0,a0,at
   c:	03e00008 	jr	ra
  10:	00000000 	nop
  14:	00000000 	nop

Consequently the function loops indefinitely, showing up prominently as a
hang in the delay loop calibration at bootstrap.

This change corrects the problem by forcing the immediate 1 into a
register while keeping code produced identical where CPU_DADDI_WORKAROUNDS
is disabled.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/6669/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
2014-05-13 00:29:36 +02:00

67 lines
1.6 KiB
C

/*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
* Copyright (C) 1994 by Waldorf Electronics
* Copyright (C) 1995 - 2000, 01, 03 by Ralf Baechle
* Copyright (C) 1999, 2000 Silicon Graphics, Inc.
* Copyright (C) 2007, 2014 Maciej W. Rozycki
*/
#include <linux/module.h>
#include <linux/param.h>
#include <linux/smp.h>
#include <asm/compiler.h>
#include <asm/war.h>
/*
 * Inline-asm constraint used for the constant 1 operand in __delay().
 *
 * Without CONFIG_CPU_DADDI_WORKAROUNDS the "I" constraint is used, so the
 * constant can be given to the subtract instruction as an immediate.
 * With the workarounds enabled the assembler would expand a subtract of
 * an immediate into a multi-instruction macro, which must not sit in a
 * branch delay slot; "r" therefore forces the constant into a register
 * before the loop.
 */
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
#define GCC_DADDI_IMM_ASM() "I"
#else
#define GCC_DADDI_IMM_ASM() "r"
#endif
/*
 * __delay - busy-wait by counting @loops down in a two-instruction loop.
 *
 * The loop consists of a bnez on the counter with the subtraction of 1
 * placed in its branch delay slot (".set noreorder" stops the assembler
 * from rearranging the two instructions).  Because the delay-slot
 * instruction runs whether or not the branch is taken, the counter is
 * decremented once more on the final pass when the branch falls through.
 *
 * The subtract is subu on 32-bit builds and dsubu otherwise, selected by
 * BITS_PER_LONG.  Operand %1 is the constant 1, passed with the constraint
 * chosen by GCC_DADDI_IMM_ASM(); operand %0 is @loops, used as both input
 * and output via the matching "0" constraint.
 */
void __delay(unsigned long loops)
{
__asm__ __volatile__ (
" .set noreorder \n"
/* Align the loop start to a 2^3 = 8-byte boundary. */
" .align 3 \n"
/* Loop while the counter is non-zero; the next instruction is the delay slot. */
"1: bnez %0, 1b \n"
#if BITS_PER_LONG == 32
" subu %0, %1 \n"
#else
" dsubu %0, %1 \n"
#endif
" .set reorder \n"
: "=r" (loops)
: GCC_DADDI_IMM_ASM() (1), "0" (loops));
}
EXPORT_SYMBOL(__delay);
/*
 * __udelay - busy-wait for approximately @us microseconds.
 *
 * The division by 10^6 is done as a multiplication followed by a shift:
 * 0x10c7 (4295) is 2^32 / 10^6 rounded up, and the 64-bit product is
 * shifted right by 32, so there is no loss of precision to worry about.
 *
 * Use only for very small delays (< 1 msec).  A lookup table would
 * probably be better, as the multiplications take much too long with
 * short delays.  This is a "reasonable" implementation, though (and the
 * first constant multiplication gets optimized away if the delay is a
 * constant).
 */
void __udelay(unsigned long us)
{
	/* Per-CPU delay calibration value (presumably loops per jiffy). */
	unsigned int cpu_udelay_val = raw_current_cpu_data.udelay_val;
	/* Evaluated left to right in 64 bits; the ull constant widens @us. */
	unsigned long long scaled = us * 0x000010c7ull * HZ * cpu_udelay_val;

	__delay(scaled >> 32);
}
EXPORT_SYMBOL(__udelay);
/*
 * __ndelay - busy-wait for approximately @ns nanoseconds.
 *
 * The division by 10^9 is done as a multiplication followed by a shift:
 * 5 is 2^32 / 10^9 (about 4.29) rounded up, and the 64-bit product is
 * shifted right by 32 to obtain the loop count handed to __delay().
 */
void __ndelay(unsigned long ns)
{
	/* Per-CPU delay calibration value (presumably loops per jiffy). */
	unsigned int cpu_udelay_val = raw_current_cpu_data.udelay_val;
	/* Evaluated left to right in 64 bits; the ull constant widens @ns. */
	unsigned long long scaled = ns * 0x00000005ull * HZ * cpu_udelay_val;

	__delay(scaled >> 32);
}
EXPORT_SYMBOL(__ndelay);