#define ASSEMBLER

#include "common.h"

/*
 * Maximum element of a strided vector of doubles (LoongArch64, LASX).
 *
 * In:    N    ($r4)  number of elements
 *        X    ($r5)  address of the first element
 *        INCX ($r6)  distance between consecutive elements, in elements
 * Out:   $f0         largest element; 0.0 when N <= 0 or INCX <= 0
 * Uses:  $r12-$r18, $xr8-$xr12, $xr19-$xr23; X and INCX are modified.
 *
 * SIZE and BASE_SHIFT come from common.h; this code assumes
 * SIZE == 1 << BASE_SHIFT == size of one double -- TODO confirm.
 *
 * Invariant: lane 0 of VM0 ($f22 is its low 64 bits) always holds a valid
 * running maximum. Lanes 1..3 are only meaningful inside the 8-wide loops
 * and are folded into lane 0 before any scalar code runs.
 *
 * No load ever touches memory outside x[0 .. N-1].
 */

/* Arguments */
#define N      $r4
#define X      $r5
#define INCX   $r6

/* Integer scratch */
#define I      $r12     /* loop / remainder counter            */
#define J      $r13     /* small constant for comparisons      */
#define t1     $r14     /* raw 64-bit element bits, lane 0     */
#define t2     $r18     /* raw 64-bit element bits, lane 1     */
#define t3     $r15     /* raw 64-bit element bits, lane 2     */
#define t4     $r17     /* raw 64-bit element bits, lane 3     */
#define TEMP   $r16     /* element size in bytes / seed cursor */

/* Vector scratch */
#define m0     $xr8
#define x1     $xr9
#define x2     $xr10
#define x3     $xr11
#define x4     $xr12
#define VX0    $xr20
#define VX1    $xr21
#define VM0    $xr22    /* running maximum                     */
#define VM1    $xr23
#define VM2    $xr19

    PROLOGUE

    xvxor.v     VM0, VM0, VM0           // result defaults to 0.0 for the early exits
    bge         $r0, N, .L999           // N <= 0
    bge         $r0, INCX, .L999        // INCX <= 0
    li.d        TEMP, 1
    slli.d      TEMP, TEMP, BASE_SHIFT  // TEMP = bytes per element
    slli.d      INCX, INCX, BASE_SHIFT  // INCX = stride in bytes
    bne         INCX, TEMP, .L20        // non-unit stride
    fld.d       $f22, X, 0              // seed lane 0 with x[0] (safe for any N >= 1)
    srai.d      I, N, 3                 // I = number of full 8-element groups
    bge         $r0, I, .L12
    xvld        VM0, X, 0               // N >= 8: seed all four lanes with x[0..3]
    .align 3

.L10:                                   // unit stride, 8 elements per iteration
    xvld        VX0, X, 0 * SIZE
    xvld        VX1, X, 4 * SIZE
    addi.d      I, I, -1
    xvfmax.d    VM1, VX1, VX0
    addi.d      X, X, 8 * SIZE
    xvfmax.d    VM0, VM0, VM1
    blt         $r0, I, .L10
    .align 3

.L11:                                   // fold the four lanes of VM0 into lane 0
    xvpickve.d  x1, VM0, 0
    xvpickve.d  x2, VM0, 1
    xvpickve.d  x3, VM0, 2
    xvpickve.d  x4, VM0, 3
    xvfmax.d    VM1, x1, x2
    xvfmax.d    VM2, x3, x4
    xvfmax.d    VM0, VM1, VM2
    .align 3

.L12:                                   // unit stride, I = N mod 8 elements left at X
    andi        I, N, 7
    li.d        J, 4
    bge         J, I, .L13              // 0..4 left: scalar loop
    // 5..7 left: two overlapping 4-wide loads cover exactly x[0 .. I-1].
    xvld        VX0, X, 0               // x[0 .. 3]
    addi.d      I, I, -4                // first index of the second window
    slli.d      I, I, BASE_SHIFT        // ... as a byte offset
    xvldx       VX1, X, I               // x[I-4 .. I-1], ends on the last element
    xvfmax.d    m0, VX0, VX1
    xvpickve.d  x1, m0, 0
    xvpickve.d  x2, m0, 1
    xvpickve.d  x3, m0, 2
    xvpickve.d  x4, m0, 3
    xvfmax.d    VM1, x1, x2
    xvfmax.d    m0, x3, x4
    xvfmax.d    m0, m0, VM1             // lane 0 of m0 = max of the tail
    fmax.d      $f22, $f8, $f22         // scalar merge: only lane 0 of VM0 is valid here
    fmov.d      $f0, $f22
    jirl        $r0, $r1, 0x0
    .align 3

.L13:                                   // unit stride, 0..4 elements left
    bge         $r0, I, .L15
    .align 3

.L14:                                   // one element per iteration
    fld.d       $f9, X, 0               // $f9 = low 64 bits of x1
    addi.d      I, I, -1
    fmax.d      $f22, $f22, $f9
    addi.d      X, X, SIZE
    blt         $r0, I, .L14
    .align 3

.L15:
    fmov.d      $f0, $f22
    jirl        $r0, $r1, 0x0
    .align 3

.L20:                                   // non-unit stride
    move        TEMP, X                 // TEMP walks the seed elements; X stays put
    ld.d        t1, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t1, 0             // lane 0 = x[0]
    srai.d      I, N, 3
    bge         $r0, I, .L23            // N < 8: lane 0 alone is enough
    ld.d        t2, TEMP, 0 * SIZE      // N >= 8: seed lanes 1..3 with the next elements
    add.d       TEMP, TEMP, INCX
    ld.d        t3, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    ld.d        t4, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t2, 1
    xvinsgr2vr.d VM0, t3, 2
    xvinsgr2vr.d VM0, t4, 3
    .align 3

.L21:                                   // strided, 8 elements per iteration
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    addi.d      I, I, -1
    xvfmax.d    VM1, VX1, VX0
    xvfmax.d    VM0, VM1, VM0
    blt         $r0, I, .L21
    .align 3

.L22:                                   // fold the four lanes of VM0 into lane 0
    xvpickve.d  x1, VM0, 0
    xvpickve.d  x2, VM0, 1
    xvpickve.d  x3, VM0, 2
    xvpickve.d  x4, VM0, 3
    xvfmax.d    VM1, x1, x2
    xvfmax.d    VM2, x3, x4
    xvfmax.d    VM0, VM1, VM2
    .align 3

.L23:                                   // strided, I = N mod 8 elements left at X
    andi        I, N, 7
    bge         $r0, I, .L999
    .align 3

.L24:                                   // one element per iteration
    fld.d       $f9, X, 0               // $f9 = low 64 bits of x1
    addi.d      I, I, -1
    fmax.d      $f22, $f22, $f9
    add.d       X, X, INCX
    blt         $r0, I, .L24
    .align 3

.L999:
    fmov.d      $f0, $f22               // $f22 = lane 0 of VM0
    jirl        $r0, $r1, 0x0
    .align 3

    EPILOGUE