1+ #define ASSEMBLER
2+
3+ #include "common.h"
4+
// ---------------------------------------------------------------------
// Register aliases used by the kernel below.
// ---------------------------------------------------------------------

// Incoming arguments.
#define N       $r4     // element count
#define X       $r5     // address of the current element
#define INCX    $r6     // stride; the kernel rescales it to bytes

// General-purpose scratch registers.
#define I       $r12    // loop counter
#define J       $r13    // small constant / scratch
#define t1      $r14    // t1..t4: raw 64-bit loads on the strided path
#define t3      $r15
#define TEMP    $r16    // unit-stride byte size, then a walking pointer
#define t4      $r17
#define t2      $r18

// 256-bit LASX vector registers.
#define m0      $xr8    // accumulator for the 5..7 element tail
#define x1      $xr9    // x1..x4: single lanes extracted for reduction
#define x2      $xr10
#define x3      $xr11
#define x4      $xr12
#define VM2     $xr19   // reduction intermediate
#define VX0     $xr20   // VX0/VX1: the two 4-element halves of a block
#define VX1     $xr21
#define VM0     $xr22   // running result; the code also accesses it as $f22
#define VM1     $xr23   // reduction intermediate
25+
// ---------------------------------------------------------------------
// Smallest absolute value of a strided vector of doubles (LASX).
//
// In:   N    ($r4)  element count
//       X    ($r5)  address of the first element
//       INCX ($r6)  stride in elements
// Out:  $f0 = min(|x[i * INCX]|) for 0 <= i < N,
//       or 0.0 when N <= 0 or INCX <= 0.
//
// The running result lives in VM0 ($xr22); only its lane 0 ($f22) is
// meaningful outside the 8-element block loops. xvfmina.d / fmina.d
// keep the operand with the smaller magnitude, so the absolute value
// is taken once, at the single exit .L999.
// No memory beyond the N addressed elements is read.
// ---------------------------------------------------------------------
    PROLOGUE

    movgr2fr.d  $f22, $r0           // result is 0.0 for the degenerate cases
    bge     $r0, N, .L999           // N <= 0
    bge     $r0, INCX, .L999        // INCX <= 0
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, BASE_SHIFT  // TEMP = size of one element in bytes
    slli.d  INCX, INCX, BASE_SHIFT  // INCX = stride in bytes
    bne     INCX, TEMP, .L20        // non-unit stride
    fld.d   $f22, X, 0              // seed lane 0 with x[0] (N >= 1 here)
    srai.d  I, N, 3                 // I = number of full 8-element blocks
    bge     $r0, I, .L12
    xvld    VM0, X, 0               // N >= 8: a 4-element seed is in bounds
    .align 3

.L10:   // unit stride, 8 elements per iteration
    xvld    VX0, X, 0 * SIZE
    addi.d  I, I, -1
    xvld    VX1, X, 4 * SIZE
    xvfmina.d   VM1, VX1, VX0
    addi.d  X, X, 8 * SIZE
    xvfmina.d   VM0, VM0, VM1
    blt     $r0, I, .L10
    .align 3

.L11:   // fold the four lanes of VM0 into lane 0
    xvpickve.d  x1, VM0, 0
    xvpickve.d  x2, VM0, 1
    xvpickve.d  x3, VM0, 2
    xvpickve.d  x4, VM0, 3
    xvfmina.d   VM1, x1, x2
    xvfmina.d   VM2, x3, x4
    xvfmina.d   VM0, VM1, VM2
    .align 3

.L12:   // unit stride, leftover elements; X points at the first one
    andi    I, N, 7                 // I = leftover count, 0..7
    li.d    J, 4
    bge     J, I, .L13              // 0..4 leftovers: scalar loop
    // 5..7 leftovers: two overlapping 4-element loads cover all of them.
    xvld    VX0, X, 0               // leftovers 0..3
    addi.d  I, I, -4                // index of the first of the last four
    slli.d  I, I, BASE_SHIFT
    xvldx   VX1, X, I               // leftovers count-4 .. count-1
    xvfmina.d   m0, VX0, VX1        // overlapping lanes are compared twice
    xvpickve.d  x1, m0, 0
    xvpickve.d  x2, m0, 1
    xvpickve.d  x3, m0, 2
    xvpickve.d  x4, m0, 3
    xvfmina.d   VM1, x1, x2
    xvfmina.d   m0, x3, x4
    xvfmina.d   m0, m0, VM1         // lane 0 of m0 ($f8) = tail result
    fmina.d $f22, $f22, $f8         // merge with the running result
    b       .L999
    .align 3

.L13:   // unit stride, 0..4 leftovers
    bge     $r0, I, .L999
    .align 3

.L14:   // one element per iteration, scalar load only
    fld.d   $f9, X, 0
    addi.d  I, I, -1
    fmina.d $f22, $f22, $f9
    addi.d  X, X, SIZE
    blt     $r0, I, .L14
    b       .L999
    .align 3

.L20:   // non-unit stride
    move    TEMP, X                 // TEMP walks the seed elements, X stays put
    ld.d    t1, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    xvinsgr2vr.d    VM0, t1, 0      // lane 0 = x[0]
    srai.d  I, N, 3                 // I = number of full 8-element blocks
    bge     $r0, I, .L23
    // N >= 8: seed lanes 1..3 with the next three strided elements.
    ld.d    t2, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    ld.d    t3, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    ld.d    t4, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    xvinsgr2vr.d    VM0, t2, 1
    xvinsgr2vr.d    VM0, t3, 2
    xvinsgr2vr.d    VM0, t4, 3
    .align 3

.L21:   // non-unit stride, 8 elements per iteration
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d    VX0, t1, 0
    xvinsgr2vr.d    VX0, t2, 1
    xvinsgr2vr.d    VX0, t3, 2
    xvinsgr2vr.d    VX0, t4, 3
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d    VX1, t1, 0
    xvinsgr2vr.d    VX1, t2, 1
    xvinsgr2vr.d    VX1, t3, 2
    xvinsgr2vr.d    VX1, t4, 3
    addi.d  I, I, -1
    xvfmina.d   VM1, VX1, VX0
    xvfmina.d   VM0, VM1, VM0
    blt     $r0, I, .L21
    .align 3

.L22:   // fold the four lanes of VM0 into lane 0
    xvpickve.d  x1, VM0, 0
    xvpickve.d  x2, VM0, 1
    xvpickve.d  x3, VM0, 2
    xvpickve.d  x4, VM0, 3
    xvfmina.d   VM1, x1, x2
    xvfmina.d   VM2, x3, x4
    xvfmina.d   VM0, VM1, VM2
    .align 3

.L23:   // non-unit stride, leftover elements; X points at the first one
    andi    I, N, 7                 // I = leftover count, 0..7
    bge     $r0, I, .L999
    .align 3

.L24:   // one element per iteration, scalar load only
    fld.d   $f9, X, 0
    addi.d  I, I, -1
    fmina.d $f22, $f22, $f9
    add.d   X, X, INCX
    blt     $r0, I, .L24
    .align 3

.L999:  // single exit: return |result| in $f0
    fabs.d  $f22, $f22
    fmov.d  $f0, $f22
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE
0 commit comments