/*
 * Maximum absolute value of a strided vector of 64-bit floats, LoongArch64
 * LASX (256-bit SIMD) implementation.
 *
 * Inputs (integer argument registers):
 *   N    ($r4) - element count
 *   X    ($r5) - address of the first element
 *   INCX ($r6) - stride between consecutive elements, in elements
 * Output:
 *   $f0        - max(|X[i * INCX]|) for 0 <= i < N, or 0.0 when N <= 0 or
 *                INCX <= 0
 *
 * Invariants kept by every path below:
 *   - lane 0 of VM0 (= $f22) holds the running result when .L999 is reached;
 *   - no load touches memory outside the N addressed elements.
 *
 * SIZE and BASE_SHIFT come from common.h; this code assumes SIZE is the
 * element size in bytes and BASE_SHIFT == log2(SIZE) (presumably 8 and 3 for
 * double precision - confirm against common.h).
 *
 * Only integer temporaries $r12-$r18 and vector registers $xr8-$xr23 are
 * written; no stack is used.
 */
#define ASSEMBLER

#include "common.h"

/* Arguments */
#define N      $r4
#define X      $r5
#define INCX   $r6

/* Integer scratch */
#define I      $r12        /* loop counter / tail byte offset */
#define J      $r13
#define t1     $r14        /* t1..t4: elements gathered in the strided loop */
#define t2     $r18
#define t3     $r15
#define t4     $r17
#define TEMP   $r16

/* Vector scratch (x5..x8 and VM3 are not referenced by the code below) */
#define m0     $xr8
#define x1     $xr9
#define x2     $xr10
#define x3     $xr11
#define x4     $xr12
#define x5     $xr13
#define x6     $xr14
#define x7     $xr15
#define x8     $xr16
#define VX0    $xr20
#define VX1    $xr21
#define VM0    $xr22       /* running maximum; lane 0 is read as $f22 */
#define VM1    $xr23
#define VM2    $xr18
#define VM3    $xr19

    PROLOGUE

    xvxor.v     VM0, VM0, VM0           // early exits below must return 0.0
    bge         $r0, N, .L999           // N <= 0
    bge         $r0, INCX, .L999        // INCX <= 0
    li.d        TEMP, 1
    slli.d      TEMP, TEMP, BASE_SHIFT  // TEMP = byte stride of a contiguous vector
    slli.d      INCX, INCX, BASE_SHIFT  // INCX is a byte stride from here on
    xvldrepl.d  VM0, X, 0 * SIZE        // seed all four lanes with X[0] (one-element read)
    bne         INCX, TEMP, .L20
    srai.d      I, N, 3                 // I = number of full 8-element blocks
    bge         $r0, I, .L12
    .align 3

.L10:                                   // INCX == 1: 8 elements per iteration
    xvld        VX0, X, 0 * SIZE
    xvld        VX1, X, 4 * SIZE
    addi.d      I, I, -1
    xvfmaxa.d   VM1, VX1, VX0
    addi.d      X, X, 8 * SIZE
    xvfmaxa.d   VM0, VM0, VM1
    blt         $r0, I, .L10
    .align 3

.L11:                                   // fold the four lanes of VM0 into lane 0
    xvpickve.d  x1, VM0, 0
    xvpickve.d  x2, VM0, 1
    xvpickve.d  x3, VM0, 2
    xvpickve.d  x4, VM0, 3
    xvfmaxa.d   VM1, x1, x2
    xvfmaxa.d   VM2, x3, x4
    xvfmaxa.d   VM0, VM1, VM2
    .align 3

.L12:                                   // INCX == 1: rem = N % 8 elements left at X
    andi        I, N, 7
    li.d        J, 4
    bge         J, I, .L13              // rem <= 4: handle one element at a time
    // 4 < rem < 8: cover the tail with two overlapping 4-element loads,
    // X[0..3] and X[rem-4..rem-1]; the overlap is harmless for a maximum.
    xvld        VX0, X, 0 * SIZE
    addi.d      I, I, -4                // rem - 4, in 1..3
    slli.d      I, I, BASE_SHIFT        // as a byte offset
    xvldx       VX1, X, I
    xvfmaxa.d   m0, VX0, VX1
    xvpickve.d  x1, m0, 0
    xvpickve.d  x2, m0, 1
    xvpickve.d  x3, m0, 2
    xvpickve.d  x4, m0, 3
    xvfmaxa.d   VM1, x1, x2
    xvfmaxa.d   m0, x3, x4
    xvfmaxa.d   m0, m0, VM1
    xvfmaxa.d   VM0, m0, VM0            // merge with the running maximum
    b           .L999
    .align 3

.L13:                                   // INCX == 1 and 0 <= rem <= 4
    bge         $r0, I, .L999
    .align 3

.L14:
    xvldrepl.d  x1, X, 0 * SIZE         // exactly one element per iteration
    addi.d      I, I, -1
    xvfmaxa.d   VM0, VM0, x1
    addi.d      X, X, SIZE
    blt         $r0, I, .L14
    b           .L999
    .align 3

.L20:                                   // INCX != 1
    srai.d      I, N, 3                 // I = number of full 8-element blocks
    bge         $r0, I, .L23
    .align 3

.L21:                                   // gather 8 strided elements into VX0 / VX1
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    addi.d      I, I, -1
    xvfmaxa.d   VM1, VX1, VX0
    xvfmaxa.d   VM0, VM1, VM0
    blt         $r0, I, .L21
    .align 3

.L22:                                   // fold the four lanes of VM0 into lane 0
    xvpickve.d  x1, VM0, 0
    xvpickve.d  x2, VM0, 1
    xvpickve.d  x3, VM0, 2
    xvpickve.d  x4, VM0, 3
    xvfmaxa.d   VM1, x1, x2
    xvfmaxa.d   VM2, x3, x4
    xvfmaxa.d   VM0, VM1, VM2
    .align 3

.L23:                                   // INCX != 1: rem = N % 8 elements left
    andi        I, N, 7
    bge         $r0, I, .L999
    .align 3

.L24:
    xvldrepl.d  x1, X, 0 * SIZE         // exactly one element per iteration
    addi.d      I, I, -1
    xvfmaxa.d   VM0, VM0, x1
    add.d       X, X, INCX
    blt         $r0, I, .L24
    .align 3

.L999:                                  // common exit: return |lane 0 of VM0|
    fabs.d      $f22, $f22              // xvfmaxa keeps the sign of the winner
    fmov.d      $f0, $f22
    jirl        $r0, $r1, 0x0
    .align 3

    EPILOGUE
0 commit comments