kernel_optimize_test/arch/sparc64/kernel/trampoline.S

431 lines
9.3 KiB
ArmAsm
Raw Normal View History

/* $Id: trampoline.S,v 1.26 2002/02/09 19:49:30 davem Exp $
* trampoline.S: Jump start slave processors on sparc64.
*
* Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
*/
#include <asm/head.h>
#include <asm/asi.h>
#include <asm/lsu.h>
#include <asm/dcr.h>
#include <asm/dcu.h>
#include <asm/pstate.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/spitfire.h>
#include <asm/processor.h>
#include <asm/thread_info.h>
#include <asm/mmu.h>
#include <asm/hypervisor.h>
#include <asm/cpudata.h>
.data
.align 8
call_method:
.asciz "call-method"
.align 8
itlb_load:
.asciz "SUNW,itlb-load"
.align 8
dtlb_load:
.asciz "SUNW,dtlb-load"
.text
.align 8
.globl sparc64_cpu_startup, sparc64_cpu_startup_end
sparc64_cpu_startup:
flushw
BRANCH_IF_SUN4V(g1, niagara_startup)
BRANCH_IF_CHEETAH_BASE(g1, g5, cheetah_startup)
BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1, g5, cheetah_plus_startup)
ba,pt %xcc, spitfire_startup
nop
cheetah_plus_startup:
/* Preserve OBP chosen DCU and DCR register settings. */
ba,pt %xcc, cheetah_generic_startup
nop
cheetah_startup:
mov DCR_BPE | DCR_RPE | DCR_SI | DCR_IFPOE | DCR_MS, %g1
wr %g1, %asr18
sethi %uhi(DCU_ME|DCU_RE|DCU_HPE|DCU_SPE|DCU_SL|DCU_WE), %g5
or %g5, %ulo(DCU_ME|DCU_RE|DCU_HPE|DCU_SPE|DCU_SL|DCU_WE), %g5
sllx %g5, 32, %g5
or %g5, DCU_DM | DCU_IM | DCU_DC | DCU_IC, %g5
stxa %g5, [%g0] ASI_DCU_CONTROL_REG
membar #Sync
cheetah_generic_startup:
mov TSB_EXTENSION_P, %g3
stxa %g0, [%g3] ASI_DMMU
stxa %g0, [%g3] ASI_IMMU
membar #Sync
mov TSB_EXTENSION_S, %g3
stxa %g0, [%g3] ASI_DMMU
membar #Sync
mov TSB_EXTENSION_N, %g3
stxa %g0, [%g3] ASI_DMMU
stxa %g0, [%g3] ASI_IMMU
membar #Sync
/* fallthru */
niagara_startup:
/* Disable STICK_INT interrupts. */
sethi %hi(0x80000000), %g5
sllx %g5, 32, %g5
wr %g5, %asr25
ba,pt %xcc, startup_continue
nop
spitfire_startup:
mov (LSU_CONTROL_IC | LSU_CONTROL_DC | LSU_CONTROL_IM | LSU_CONTROL_DM), %g1
stxa %g1, [%g0] ASI_LSU_CONTROL
membar #Sync
startup_continue:
wrpr %g0, 15, %pil
sethi %hi(0x80000000), %g2
sllx %g2, 32, %g2
wr %g2, 0, %tick_cmpr
BRANCH_IF_SUN4V(g1, niagara_lock_tlb)
/* Call OBP by hand to lock KERNBASE into i/d tlbs.
* We lock 2 consequetive entries if we are 'bigkernel'.
*/
mov %o0, %l0
sethi %hi(prom_entry_lock), %g2
1: ldstub [%g2 + %lo(prom_entry_lock)], %g1
membar #StoreLoad | #StoreStore
brnz,pn %g1, 1b
nop
sethi %hi(p1275buf), %g2
or %g2, %lo(p1275buf), %g2
ldx [%g2 + 0x10], %l2
mov %sp, %l1
add %l2, -(192 + 128), %sp
flushw
sethi %hi(call_method), %g2
or %g2, %lo(call_method), %g2
stx %g2, [%sp + 2047 + 128 + 0x00]
mov 5, %g2
stx %g2, [%sp + 2047 + 128 + 0x08]
mov 1, %g2
stx %g2, [%sp + 2047 + 128 + 0x10]
sethi %hi(itlb_load), %g2
or %g2, %lo(itlb_load), %g2
stx %g2, [%sp + 2047 + 128 + 0x18]
sethi %hi(prom_mmu_ihandle_cache), %g2
lduw [%g2 + %lo(prom_mmu_ihandle_cache)], %g2
stx %g2, [%sp + 2047 + 128 + 0x20]
sethi %hi(KERNBASE), %g2
stx %g2, [%sp + 2047 + 128 + 0x28]
sethi %hi(kern_locked_tte_data), %g2
ldx [%g2 + %lo(kern_locked_tte_data)], %g2
stx %g2, [%sp + 2047 + 128 + 0x30]
mov 15, %g2
BRANCH_IF_ANY_CHEETAH(g1,g5,1f)
mov 63, %g2
1:
stx %g2, [%sp + 2047 + 128 + 0x38]
sethi %hi(p1275buf), %g2
or %g2, %lo(p1275buf), %g2
ldx [%g2 + 0x08], %o1
call %o1
add %sp, (2047 + 128), %o0
sethi %hi(bigkernel), %g2
lduw [%g2 + %lo(bigkernel)], %g2
brz,pt %g2, do_dtlb
nop
sethi %hi(call_method), %g2
or %g2, %lo(call_method), %g2
stx %g2, [%sp + 2047 + 128 + 0x00]
mov 5, %g2
stx %g2, [%sp + 2047 + 128 + 0x08]
mov 1, %g2
stx %g2, [%sp + 2047 + 128 + 0x10]
sethi %hi(itlb_load), %g2
or %g2, %lo(itlb_load), %g2
stx %g2, [%sp + 2047 + 128 + 0x18]
sethi %hi(prom_mmu_ihandle_cache), %g2
lduw [%g2 + %lo(prom_mmu_ihandle_cache)], %g2
stx %g2, [%sp + 2047 + 128 + 0x20]
sethi %hi(KERNBASE + 0x400000), %g2
stx %g2, [%sp + 2047 + 128 + 0x28]
sethi %hi(kern_locked_tte_data), %g2
ldx [%g2 + %lo(kern_locked_tte_data)], %g2
sethi %hi(0x400000), %g1
add %g2, %g1, %g2
stx %g2, [%sp + 2047 + 128 + 0x30]
mov 14, %g2
BRANCH_IF_ANY_CHEETAH(g1,g5,1f)
mov 62, %g2
1:
stx %g2, [%sp + 2047 + 128 + 0x38]
sethi %hi(p1275buf), %g2
or %g2, %lo(p1275buf), %g2
ldx [%g2 + 0x08], %o1
call %o1
add %sp, (2047 + 128), %o0
do_dtlb:
sethi %hi(call_method), %g2
or %g2, %lo(call_method), %g2
stx %g2, [%sp + 2047 + 128 + 0x00]
mov 5, %g2
stx %g2, [%sp + 2047 + 128 + 0x08]
mov 1, %g2
stx %g2, [%sp + 2047 + 128 + 0x10]
sethi %hi(dtlb_load), %g2
or %g2, %lo(dtlb_load), %g2
stx %g2, [%sp + 2047 + 128 + 0x18]
sethi %hi(prom_mmu_ihandle_cache), %g2
lduw [%g2 + %lo(prom_mmu_ihandle_cache)], %g2
stx %g2, [%sp + 2047 + 128 + 0x20]
sethi %hi(KERNBASE), %g2
stx %g2, [%sp + 2047 + 128 + 0x28]
sethi %hi(kern_locked_tte_data), %g2
ldx [%g2 + %lo(kern_locked_tte_data)], %g2
stx %g2, [%sp + 2047 + 128 + 0x30]
mov 15, %g2
BRANCH_IF_ANY_CHEETAH(g1,g5,1f)
mov 63, %g2
1:
stx %g2, [%sp + 2047 + 128 + 0x38]
sethi %hi(p1275buf), %g2
or %g2, %lo(p1275buf), %g2
ldx [%g2 + 0x08], %o1
call %o1
add %sp, (2047 + 128), %o0
sethi %hi(bigkernel), %g2
lduw [%g2 + %lo(bigkernel)], %g2
brz,pt %g2, do_unlock
nop
sethi %hi(call_method), %g2
or %g2, %lo(call_method), %g2
stx %g2, [%sp + 2047 + 128 + 0x00]
mov 5, %g2
stx %g2, [%sp + 2047 + 128 + 0x08]
mov 1, %g2
stx %g2, [%sp + 2047 + 128 + 0x10]
sethi %hi(dtlb_load), %g2
or %g2, %lo(dtlb_load), %g2
stx %g2, [%sp + 2047 + 128 + 0x18]
sethi %hi(prom_mmu_ihandle_cache), %g2
lduw [%g2 + %lo(prom_mmu_ihandle_cache)], %g2
stx %g2, [%sp + 2047 + 128 + 0x20]
sethi %hi(KERNBASE + 0x400000), %g2
stx %g2, [%sp + 2047 + 128 + 0x28]
sethi %hi(kern_locked_tte_data), %g2
ldx [%g2 + %lo(kern_locked_tte_data)], %g2
sethi %hi(0x400000), %g1
add %g2, %g1, %g2
stx %g2, [%sp + 2047 + 128 + 0x30]
mov 14, %g2
BRANCH_IF_ANY_CHEETAH(g1,g5,1f)
mov 62, %g2
1:
stx %g2, [%sp + 2047 + 128 + 0x38]
sethi %hi(p1275buf), %g2
or %g2, %lo(p1275buf), %g2
ldx [%g2 + 0x08], %o1
call %o1
add %sp, (2047 + 128), %o0
do_unlock:
sethi %hi(prom_entry_lock), %g2
stb %g0, [%g2 + %lo(prom_entry_lock)]
membar #StoreStore | #StoreLoad
ba,pt %xcc, after_lock_tlb
nop
niagara_lock_tlb:
mov HV_FAST_MMU_MAP_PERM_ADDR, %o5
sethi %hi(KERNBASE), %o0
clr %o1
sethi %hi(kern_locked_tte_data), %o2
ldx [%o2 + %lo(kern_locked_tte_data)], %o2
mov HV_MMU_IMMU, %o3
ta HV_FAST_TRAP
mov HV_FAST_MMU_MAP_PERM_ADDR, %o5
sethi %hi(KERNBASE), %o0
clr %o1
sethi %hi(kern_locked_tte_data), %o2
ldx [%o2 + %lo(kern_locked_tte_data)], %o2
mov HV_MMU_DMMU, %o3
ta HV_FAST_TRAP
sethi %hi(bigkernel), %g2
lduw [%g2 + %lo(bigkernel)], %g2
brz,pt %g2, after_lock_tlb
nop
mov HV_FAST_MMU_MAP_PERM_ADDR, %o5
sethi %hi(KERNBASE + 0x400000), %o0
clr %o1
sethi %hi(kern_locked_tte_data), %o2
ldx [%o2 + %lo(kern_locked_tte_data)], %o2
sethi %hi(0x400000), %o3
add %o2, %o3, %o2
mov HV_MMU_IMMU, %o3
ta HV_FAST_TRAP
mov HV_FAST_MMU_MAP_PERM_ADDR, %o5
sethi %hi(KERNBASE + 0x400000), %o0
clr %o1
sethi %hi(kern_locked_tte_data), %o2
ldx [%o2 + %lo(kern_locked_tte_data)], %o2
sethi %hi(0x400000), %o3
add %o2, %o3, %o2
mov HV_MMU_DMMU, %o3
ta HV_FAST_TRAP
after_lock_tlb:
mov %l1, %sp
flushw
mov %l0, %o0
wrpr %g0, (PSTATE_PRIV | PSTATE_PEF), %pstate
wr %g0, 0, %fprs
/* XXX Buggy PROM... */
srl %o0, 0, %o0
ldx [%o0], %g6
wr %g0, ASI_P, %asi
mov PRIMARY_CONTEXT, %g7
661: stxa %g0, [%g7] ASI_DMMU
.section .sun4v_1insn_patch, "ax"
.word 661b
stxa %g0, [%g7] ASI_MMU
.previous
membar #Sync
mov SECONDARY_CONTEXT, %g7
661: stxa %g0, [%g7] ASI_DMMU
.section .sun4v_1insn_patch, "ax"
.word 661b
stxa %g0, [%g7] ASI_MMU
.previous
membar #Sync
mov 1, %g5
sllx %g5, THREAD_SHIFT, %g5
sub %g5, (STACKFRAME_SZ + STACK_BIAS), %g5
add %g6, %g5, %sp
mov 0, %fp
wrpr %g0, 0, %wstate
wrpr %g0, 0, %tl
[SPARC64]: Elminate all usage of hard-coded trap globals. UltraSPARC has special sets of global registers which are switched to for certain trap types. There is one set for MMU related traps, one set of Interrupt Vector processing, and another set (called the Alternate globals) for all other trap types. For what seems like forever we've hard coded the values in some of these trap registers. Some examples include: 1) Interrupt Vector global %g6 holds current processors interrupt work struct where received interrupts are managed for IRQ handler dispatch. 2) MMU global %g7 holds the base of the page tables of the currently active address space. 3) Alternate global %g6 held the current_thread_info() value. Such hardcoding has resulted in some serious issues in many areas. There are some code sequences where having another register available would help clean up the implementation. Taking traps such as cross-calls from the OBP firmware requires some trick code sequences wherein we have to save away and restore all of the special sets of global registers when we enter/exit OBP. We were also using the IMMU TSB register on SMP to hold the per-cpu area base address, which doesn't work any longer now that we actually use the TSB facility of the cpu. The implementation is pretty straight forward. One tricky bit is getting the current processor ID as that is different on different cpu variants. We use a stub with a fancy calling convention which we patch at boot time. The calling convention is that the stub is branched to and the (PC - 4) to return to is in register %g1. The cpu number is left in %g6. This stub can be invoked by using the __GET_CPUID macro. We use an array of per-cpu trap state to store the current thread and physical address of the current address space's page tables. The TRAP_LOAD_THREAD_REG loads %g6 with the current thread from this table, it uses __GET_CPUID and also clobbers %g1. TRAP_LOAD_IRQ_WORK is used by the interrupt vector processing to load the current processor's IRQ software state into %g6. It also uses __GET_CPUID and clobbers %g1. Finally, TRAP_LOAD_PGD_PHYS loads the physical address base of the current address space's page tables into %g7, it clobbers %g1 and uses __GET_CPUID. Many refinements are possible, as well as some tuning, with this stuff in place. Signed-off-by: David S. Miller <davem@davemloft.net>
2006-02-27 15:24:22 +08:00
/* Load TBA, then we can resurface. */
sethi %hi(sparc64_ttable_tl0), %g5
wrpr %g5, %tba
ldx [%g6 + TI_TASK], %g4
wrpr %g0, 0, %wstate
call init_irqwork_curcpu
nop
sethi %hi(tlb_type), %g3
lduw [%g3 + %lo(tlb_type)], %g2
cmp %g2, 3
bne,pt %icc, 1f
nop
call sun4v_init_mondo_queues
mov 0, %o0
1: call init_cur_cpu_trap
[SPARC64]: Elminate all usage of hard-coded trap globals. UltraSPARC has special sets of global registers which are switched to for certain trap types. There is one set for MMU related traps, one set of Interrupt Vector processing, and another set (called the Alternate globals) for all other trap types. For what seems like forever we've hard coded the values in some of these trap registers. Some examples include: 1) Interrupt Vector global %g6 holds current processors interrupt work struct where received interrupts are managed for IRQ handler dispatch. 2) MMU global %g7 holds the base of the page tables of the currently active address space. 3) Alternate global %g6 held the current_thread_info() value. Such hardcoding has resulted in some serious issues in many areas. There are some code sequences where having another register available would help clean up the implementation. Taking traps such as cross-calls from the OBP firmware requires some trick code sequences wherein we have to save away and restore all of the special sets of global registers when we enter/exit OBP. We were also using the IMMU TSB register on SMP to hold the per-cpu area base address, which doesn't work any longer now that we actually use the TSB facility of the cpu. The implementation is pretty straight forward. One tricky bit is getting the current processor ID as that is different on different cpu variants. We use a stub with a fancy calling convention which we patch at boot time. The calling convention is that the stub is branched to and the (PC - 4) to return to is in register %g1. The cpu number is left in %g6. This stub can be invoked by using the __GET_CPUID macro. We use an array of per-cpu trap state to store the current thread and physical address of the current address space's page tables. The TRAP_LOAD_THREAD_REG loads %g6 with the current thread from this table, it uses __GET_CPUID and also clobbers %g1. TRAP_LOAD_IRQ_WORK is used by the interrupt vector processing to load the current processor's IRQ software state into %g6. It also uses __GET_CPUID and clobbers %g1. Finally, TRAP_LOAD_PGD_PHYS loads the physical address base of the current address space's page tables into %g7, it clobbers %g1 and uses __GET_CPUID. Many refinements are possible, as well as some tuning, with this stuff in place. Signed-off-by: David S. Miller <davem@davemloft.net>
2006-02-27 15:24:22 +08:00
nop
/* Start using proper page size encodings in ctx register. */
sethi %hi(sparc64_kern_pri_context), %g3
ldx [%g3 + %lo(sparc64_kern_pri_context)], %g2
mov PRIMARY_CONTEXT, %g1
661: stxa %g2, [%g1] ASI_DMMU
.section .sun4v_1insn_patch, "ax"
.word 661b
stxa %g2, [%g1] ASI_MMU
.previous
membar #Sync
rdpr %pstate, %o1
or %o1, PSTATE_IE, %o1
wrpr %o1, 0, %pstate
sethi %hi(is_sun4v), %o0
lduw [%o0 + %lo(is_sun4v)], %o0
brz,pt %o0, 1f
nop
TRAP_LOAD_TRAP_BLOCK(%g2, %g3)
add %g2, TRAP_PER_CPU_FAULT_INFO, %g2
stxa %g2, [%g0] ASI_SCRATCHPAD
/* Compute physical address:
*
* paddr = kern_base + (mmfsa_vaddr - KERNBASE)
*/
sethi %hi(KERNBASE), %g3
sub %g2, %g3, %g2
sethi %hi(kern_base), %g3
ldx [%g3 + %lo(kern_base)], %g3
add %g2, %g3, %o1
call prom_set_trap_table_sun4v
sethi %hi(sparc64_ttable_tl0), %o0
ba,pt %xcc, 2f
nop
1: call prom_set_trap_table
sethi %hi(sparc64_ttable_tl0), %o0
2: call smp_callin
nop
call cpu_idle
mov 0, %o0
call cpu_panic
nop
1: b,a,pt %xcc, 1b
.align 8
sparc64_cpu_startup_end: