Skip to content

Commit 0509666

Browse files
Al Viro authored and mattst88 committed
alpha: lazy FPU switching
On each context switch we save the FPU registers on stack of old process and restore FPU registers from the stack of new one. That allows us to avoid doing that each time we enter/leave the kernel mode; however, that can get suboptimal in some cases.

For one thing, we don't need to bother saving anything for kernel threads. For another, if between entering and leaving the kernel a thread gives CPU up more than once, it will do useless work, saving the same values every time, only to discard the saved copy as soon as it returns from switch_to().

Alternative solution:

* move the array we save into from switch_stack to thread_info
* have a (thread-synchronous) flag set when we save them
* have another flag set when they should be restored on return to userland.
* do *NOT* save/restore them in do_switch_stack()/undo_switch_stack().
* restore on the exit to user mode if the restore flag had been set. Clear both flags.
* on context switch, entry to fork/clone/vfork, before entry into do_signal() and on entry into straced syscall save the registers and set the 'saved' flag unless it had been already set.
* on context switch set the 'restore' flag as well.
* have copy_thread() set both flags for child, so the registers would be restored once the child returns to userland.
* use the saved data in setup_sigcontext(); have restore_sigcontext() set both flags and copy from sigframe to save area.
* teach ptrace to look for FPU registers in thread_info instead of switch_stack.
* teach isolated accesses to FPU registers (rdfpcr, wrfpcr, etc.) to check the 'saved' flag (under preempt_disable()) and work with the save area if it's been set; if 'saved' flag is found upon write access, set 'restore' flag as well.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Matt Turner <mattst88@gmail.com>
1 parent a7acb18 commit 0509666

9 files changed

Lines changed: 192 additions & 119 deletions

File tree

arch/alpha/include/asm/fpu.h

Lines changed: 37 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -15,21 +15,27 @@ rdfpcr(void)
1515
{
1616
unsigned long tmp, ret;
1717

18+
preempt_disable();
19+
if (current_thread_info()->status & TS_SAVED_FP) {
20+
ret = current_thread_info()->fp[31];
21+
} else {
1822
#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67)
19-
__asm__ __volatile__ (
20-
"ftoit $f0,%0\n\t"
21-
"mf_fpcr $f0\n\t"
22-
"ftoit $f0,%1\n\t"
23-
"itoft %0,$f0"
24-
: "=r"(tmp), "=r"(ret));
23+
__asm__ __volatile__ (
24+
"ftoit $f0,%0\n\t"
25+
"mf_fpcr $f0\n\t"
26+
"ftoit $f0,%1\n\t"
27+
"itoft %0,$f0"
28+
: "=r"(tmp), "=r"(ret));
2529
#else
26-
__asm__ __volatile__ (
27-
"stt $f0,%0\n\t"
28-
"mf_fpcr $f0\n\t"
29-
"stt $f0,%1\n\t"
30-
"ldt $f0,%0"
31-
: "=m"(tmp), "=m"(ret));
30+
__asm__ __volatile__ (
31+
"stt $f0,%0\n\t"
32+
"mf_fpcr $f0\n\t"
33+
"stt $f0,%1\n\t"
34+
"ldt $f0,%0"
35+
: "=m"(tmp), "=m"(ret));
3236
#endif
37+
}
38+
preempt_enable();
3339

3440
return ret;
3541
}
@@ -39,21 +45,28 @@ wrfpcr(unsigned long val)
3945
{
4046
unsigned long tmp;
4147

48+
preempt_disable();
49+
if (current_thread_info()->status & TS_SAVED_FP) {
50+
current_thread_info()->status |= TS_RESTORE_FP;
51+
current_thread_info()->fp[31] = val;
52+
} else {
4253
#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67)
43-
__asm__ __volatile__ (
44-
"ftoit $f0,%0\n\t"
45-
"itoft %1,$f0\n\t"
46-
"mt_fpcr $f0\n\t"
47-
"itoft %0,$f0"
48-
: "=&r"(tmp) : "r"(val));
54+
__asm__ __volatile__ (
55+
"ftoit $f0,%0\n\t"
56+
"itoft %1,$f0\n\t"
57+
"mt_fpcr $f0\n\t"
58+
"itoft %0,$f0"
59+
: "=&r"(tmp) : "r"(val));
4960
#else
50-
__asm__ __volatile__ (
51-
"stt $f0,%0\n\t"
52-
"ldt $f0,%1\n\t"
53-
"mt_fpcr $f0\n\t"
54-
"ldt $f0,%0"
55-
: "=m"(tmp) : "m"(val));
61+
__asm__ __volatile__ (
62+
"stt $f0,%0\n\t"
63+
"ldt $f0,%1\n\t"
64+
"mt_fpcr $f0\n\t"
65+
"ldt $f0,%0"
66+
: "=m"(tmp) : "m"(val));
5667
#endif
68+
}
69+
preempt_enable();
5770
}
5871

5972
static inline unsigned long

arch/alpha/include/asm/thread_info.h

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ struct thread_info {
2626
int bpt_nsaved;
2727
unsigned long bpt_addr[2]; /* breakpoint handling */
2828
unsigned int bpt_insn[2];
29+
unsigned long fp[32];
2930
};
3031

3132
/*
@@ -83,6 +84,9 @@ register unsigned long *current_stack_pointer __asm__ ("$30");
8384
#define TS_UAC_NOFIX 0x0002 /* ! flags as they match */
8485
#define TS_UAC_SIGBUS 0x0004 /* ! userspace part of 'osf_sysinfo' */
8586

87+
#define TS_SAVED_FP 0x0008
88+
#define TS_RESTORE_FP 0x0010
89+
8690
#define SET_UNALIGN_CTL(task,value) ({ \
8791
__u32 status = task_thread_info(task)->status & ~UAC_BITMASK; \
8892
if (value & PR_UNALIGN_NOPRINT) \
@@ -106,5 +110,17 @@ register unsigned long *current_stack_pointer __asm__ ("$30");
106110
put_user(res, (int __user *)(value)); \
107111
})
108112

113+
#ifndef __ASSEMBLY__
114+
extern void __save_fpu(void);
115+
116+
static inline void save_fpu(void)
117+
{
118+
if (!(current_thread_info()->status & TS_SAVED_FP)) {
119+
current_thread_info()->status |= TS_SAVED_FP;
120+
__save_fpu();
121+
}
122+
}
123+
#endif
124+
109125
#endif /* __KERNEL__ */
110126
#endif /* _ALPHA_THREAD_INFO_H */

arch/alpha/include/uapi/asm/ptrace.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,9 @@ struct switch_stack {
6464
unsigned long r14;
6565
unsigned long r15;
6666
unsigned long r26;
67+
#ifndef __KERNEL__
6768
unsigned long fp[32]; /* fp[31] is fpcr */
69+
#endif
6870
};
6971

7072

arch/alpha/kernel/asm-offsets.c

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@ void foo(void)
1717
DEFINE(TI_TASK, offsetof(struct thread_info, task));
1818
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1919
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
20+
DEFINE(TI_FP, offsetof(struct thread_info, fp));
21+
DEFINE(TI_STATUS, offsetof(struct thread_info, status));
2022
BLANK();
2123

2224
DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked));

arch/alpha/kernel/entry.S

Lines changed: 73 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717

1818
/* Stack offsets. */
1919
#define SP_OFF 184
20-
#define SWITCH_STACK_SIZE 320
20+
#define SWITCH_STACK_SIZE 64
2121

2222
.macro CFI_START_OSF_FRAME func
2323
.align 4
@@ -159,7 +159,6 @@
159159
.cfi_rel_offset $13, 32
160160
.cfi_rel_offset $14, 40
161161
.cfi_rel_offset $15, 48
162-
/* We don't really care about the FP registers for debugging. */
163162
.endm
164163

165164
.macro UNDO_SWITCH_STACK
@@ -498,6 +497,10 @@ ret_to_user:
498497
and $17, _TIF_WORK_MASK, $2
499498
bne $2, work_pending
500499
restore_all:
500+
ldl $2, TI_STATUS($8)
501+
and $2, TS_SAVED_FP | TS_RESTORE_FP, $3
502+
bne $3, restore_fpu
503+
restore_other:
501504
.cfi_remember_state
502505
RESTORE_ALL
503506
call_pal PAL_rti
@@ -506,7 +509,7 @@ ret_to_kernel:
506509
.cfi_restore_state
507510
lda $16, 7
508511
call_pal PAL_swpipl
509-
br restore_all
512+
br restore_other
510513

511514
.align 3
512515
$syscall_error:
@@ -570,6 +573,14 @@ $work_notifysig:
570573
.type strace, @function
571574
strace:
572575
/* set up signal stack, call syscall_trace */
576+
// NB: if anyone adds preemption, this block will need to be protected
577+
ldl $1, TI_STATUS($8)
578+
and $1, TS_SAVED_FP, $3
579+
or $1, TS_SAVED_FP, $2
580+
bne $3, 1f
581+
stl $2, TI_STATUS($8)
582+
bsr $26, __save_fpu
583+
1:
573584
DO_SWITCH_STACK
574585
jsr $26, syscall_trace_enter /* returns the syscall number */
575586
UNDO_SWITCH_STACK
@@ -649,40 +660,6 @@ do_switch_stack:
649660
stq $14, 40($sp)
650661
stq $15, 48($sp)
651662
stq $26, 56($sp)
652-
stt $f0, 64($sp)
653-
stt $f1, 72($sp)
654-
stt $f2, 80($sp)
655-
stt $f3, 88($sp)
656-
stt $f4, 96($sp)
657-
stt $f5, 104($sp)
658-
stt $f6, 112($sp)
659-
stt $f7, 120($sp)
660-
stt $f8, 128($sp)
661-
stt $f9, 136($sp)
662-
stt $f10, 144($sp)
663-
stt $f11, 152($sp)
664-
stt $f12, 160($sp)
665-
stt $f13, 168($sp)
666-
stt $f14, 176($sp)
667-
stt $f15, 184($sp)
668-
stt $f16, 192($sp)
669-
stt $f17, 200($sp)
670-
stt $f18, 208($sp)
671-
stt $f19, 216($sp)
672-
stt $f20, 224($sp)
673-
stt $f21, 232($sp)
674-
stt $f22, 240($sp)
675-
stt $f23, 248($sp)
676-
stt $f24, 256($sp)
677-
stt $f25, 264($sp)
678-
stt $f26, 272($sp)
679-
stt $f27, 280($sp)
680-
mf_fpcr $f0 # get fpcr
681-
stt $f28, 288($sp)
682-
stt $f29, 296($sp)
683-
stt $f30, 304($sp)
684-
stt $f0, 312($sp) # save fpcr in slot of $f31
685-
ldt $f0, 64($sp) # dont let "do_switch_stack" change fp state.
686663
ret $31, ($1), 1
687664
.cfi_endproc
688665
.size do_switch_stack, .-do_switch_stack
@@ -701,54 +678,71 @@ undo_switch_stack:
701678
ldq $14, 40($sp)
702679
ldq $15, 48($sp)
703680
ldq $26, 56($sp)
704-
ldt $f30, 312($sp) # get saved fpcr
705-
ldt $f0, 64($sp)
706-
ldt $f1, 72($sp)
707-
ldt $f2, 80($sp)
708-
ldt $f3, 88($sp)
709-
mt_fpcr $f30 # install saved fpcr
710-
ldt $f4, 96($sp)
711-
ldt $f5, 104($sp)
712-
ldt $f6, 112($sp)
713-
ldt $f7, 120($sp)
714-
ldt $f8, 128($sp)
715-
ldt $f9, 136($sp)
716-
ldt $f10, 144($sp)
717-
ldt $f11, 152($sp)
718-
ldt $f12, 160($sp)
719-
ldt $f13, 168($sp)
720-
ldt $f14, 176($sp)
721-
ldt $f15, 184($sp)
722-
ldt $f16, 192($sp)
723-
ldt $f17, 200($sp)
724-
ldt $f18, 208($sp)
725-
ldt $f19, 216($sp)
726-
ldt $f20, 224($sp)
727-
ldt $f21, 232($sp)
728-
ldt $f22, 240($sp)
729-
ldt $f23, 248($sp)
730-
ldt $f24, 256($sp)
731-
ldt $f25, 264($sp)
732-
ldt $f26, 272($sp)
733-
ldt $f27, 280($sp)
734-
ldt $f28, 288($sp)
735-
ldt $f29, 296($sp)
736-
ldt $f30, 304($sp)
737681
lda $sp, SWITCH_STACK_SIZE($sp)
738682
ret $31, ($1), 1
739683
.cfi_endproc
740684
.size undo_switch_stack, .-undo_switch_stack
685+
686+
#define FR(n) n * 8 + TI_FP($8)
687+
.align 4
688+
.globl __save_fpu
689+
.type __save_fpu, @function
690+
__save_fpu:
691+
#define V(n) stt $f##n, FR(n)
692+
V( 0); V( 1); V( 2); V( 3)
693+
V( 4); V( 5); V( 6); V( 7)
694+
V( 8); V( 9); V(10); V(11)
695+
V(12); V(13); V(14); V(15)
696+
V(16); V(17); V(18); V(19)
697+
V(20); V(21); V(22); V(23)
698+
V(24); V(25); V(26); V(27)
699+
mf_fpcr $f0 # get fpcr
700+
V(28); V(29); V(30)
701+
stt $f0, FR(31) # save fpcr in slot of $f31
702+
ldt $f0, FR(0) # don't let "__save_fpu" change fp state.
703+
ret
704+
#undef V
705+
.size __save_fpu, .-__save_fpu
706+
707+
.align 4
708+
restore_fpu:
709+
and $3, TS_RESTORE_FP, $3
710+
bic $2, TS_SAVED_FP | TS_RESTORE_FP, $2
711+
beq $3, 1f
712+
#define V(n) ldt $f##n, FR(n)
713+
ldt $f30, FR(31) # get saved fpcr
714+
V( 0); V( 1); V( 2); V( 3)
715+
mt_fpcr $f30 # install saved fpcr
716+
V( 4); V( 5); V( 6); V( 7)
717+
V( 8); V( 9); V(10); V(11)
718+
V(12); V(13); V(14); V(15)
719+
V(16); V(17); V(18); V(19)
720+
V(20); V(21); V(22); V(23)
721+
V(24); V(25); V(26); V(27)
722+
V(28); V(29); V(30)
723+
1: stl $2, TI_STATUS($8)
724+
br restore_other
725+
#undef V
726+
741727

742728
/*
743729
* The meat of the context switch code.
744730
*/
745-
746731
.align 4
747732
.globl alpha_switch_to
748733
.type alpha_switch_to, @function
749734
.cfi_startproc
750735
alpha_switch_to:
751736
DO_SWITCH_STACK
737+
ldl $1, TI_STATUS($8)
738+
and $1, TS_RESTORE_FP, $3
739+
bne $3, 1f
740+
or $1, TS_RESTORE_FP | TS_SAVED_FP, $2
741+
and $1, TS_SAVED_FP, $3
742+
stl $2, TI_STATUS($8)
743+
bne $3, 1f
744+
bsr $26, __save_fpu
745+
1:
752746
call_pal PAL_swpctx
753747
lda $8, 0x3fff
754748
UNDO_SWITCH_STACK
@@ -799,6 +793,14 @@ ret_from_kernel_thread:
799793
alpha_\name:
800794
.prologue 0
801795
bsr $1, do_switch_stack
796+
// NB: if anyone adds preemption, this block will need to be protected
797+
ldl $1, TI_STATUS($8)
798+
and $1, TS_SAVED_FP, $3
799+
or $1, TS_SAVED_FP, $2
800+
bne $3, 1f
801+
stl $2, TI_STATUS($8)
802+
bsr $26, __save_fpu
803+
1:
802804
jsr $26, sys_\name
803805
ldq $26, 56($sp)
804806
lda $sp, SWITCH_STACK_SIZE($sp)

arch/alpha/kernel/process.c

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -244,6 +244,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
244244
childstack = ((struct switch_stack *) childregs) - 1;
245245
childti->pcb.ksp = (unsigned long) childstack;
246246
childti->pcb.flags = 1; /* set FEN, clear everything else */
247+
childti->status |= TS_SAVED_FP | TS_RESTORE_FP;
247248

248249
if (unlikely(args->fn)) {
249250
/* kernel thread */
@@ -253,6 +254,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
253254
childstack->r9 = (unsigned long) args->fn;
254255
childstack->r10 = (unsigned long) args->fn_arg;
255256
childregs->hae = alpha_mv.hae_cache;
257+
memset(childti->fp, '\0', sizeof(childti->fp));
256258
childti->pcb.usp = 0;
257259
return 0;
258260
}
@@ -335,8 +337,7 @@ EXPORT_SYMBOL(dump_elf_task);
335337

336338
int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu)
337339
{
338-
struct switch_stack *sw = (struct switch_stack *)task_pt_regs(t) - 1;
339-
memcpy(fpu, sw->fp, 32 * 8);
340+
memcpy(fpu, task_thread_info(t)->fp, 32 * 8);
340341
return 1;
341342
}
342343

0 commit comments

Comments
 (0)