Skip to content

Commit 63d063c

Browse files
authored
Merge pull request #3769 from XiWeiGu/mips64-test
[WIP,Testing]: Add test for mips64
2 parents 47120f2 + edea1bc commit 63d063c

2 files changed

Lines changed: 265 additions & 0 deletions

File tree

.github/workflows/mips64.yml

Lines changed: 114 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,114 @@
# Cross-compiles OpenBLAS for several MIPS64 targets and runs the utest,
# ctest and test binaries under a qemu user-mode emulator built from a
# pinned qemu commit.
name: mips64 qemu test

on: [push, pull_request]

# Least privilege: the job only needs to read the repository for checkout.
permissions:
  contents: read

jobs:
  TEST:
    runs-on: ubuntu-latest
    strategy:
      # Let every target finish even when another matrix entry fails.
      fail-fast: false
      matrix:
        # Each entry selects the cross toolchain triple and the make options
        # used for that TARGET.
        include:
          - target: MIPS64_GENERIC
            triple: mips64el-linux-gnuabi64
            opts: NO_SHARED=1 TARGET=MIPS64_GENERIC
          - target: SICORTEX
            triple: mips64el-linux-gnuabi64
            opts: NO_SHARED=1 TARGET=SICORTEX
          - target: I6400
            triple: mipsisa64r6el-linux-gnuabi64
            opts: NO_SHARED=1 TARGET=I6400
          - target: P6600
            triple: mipsisa64r6el-linux-gnuabi64
            opts: NO_SHARED=1 TARGET=P6600
          - target: I6500
            triple: mipsisa64r6el-linux-gnuabi64
            opts: NO_SHARED=1 TARGET=I6500

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: install build deps
        # -y keeps the install non-interactive instead of relying on the
        # runner image's apt configuration to assume "yes".
        run: |
          sudo apt-get update
          sudo apt-get install -y autoconf automake autotools-dev ninja-build make ccache \
          gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross

      - name: checkout qemu
        uses: actions/checkout@v3
        with:
          repository: qemu/qemu
          path: qemu
          # Pinned commit; quoted so the SHA is always read as a string.
          ref: '79dfa177ae348bb5ab5f97c0915359b13d6186e2'

      - name: build qemu
        # Only the mips64el user-mode emulator is built (system emulation is
        # disabled); it is installed under the workspace for the test step.
        run: |
          cd qemu
          ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system
          make -j$(nproc)
          make install

      - name: Compilation cache
        uses: actions/cache@v3
        with:
          path: ~/.ccache
          # The key ends in the commit SHA; restore-keys fall back to the
          # newest cache for the same ref, then for the same target.
          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
          restore-keys: |
            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
            ccache-${{ runner.os }}-${{ matrix.target }}

      - name: Configure ccache
        run: |
          test -d ~/.ccache || mkdir -p ~/.ccache
          echo "max_size = 300M" > ~/.ccache/ccache.conf
          echo "compression = true" >> ~/.ccache/ccache.conf
          ccache -s

      - name: build OpenBLAS
        # Target compilers are passed -static; presumably so the test binaries
        # run under qemu user mode without a target sysroot (TODO confirm).
        run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

      - name: test
        # Runs utest, then the ctest and test level 1/2/3 binaries under
        # qemu-mips64el. The ./test binaries run with one thread and again
        # with two; the ?BLAT2/?BLAT3 .SUMM files are removed before each
        # level 2/3 pass (presumably stale summaries from a previous run).
        run: |
          export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
          qemu-mips64el ./utest/openblas_utest
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat2 < ./ctest/sin2
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat2 < ./ctest/din2
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat2 < ./ctest/cin2
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat2 < ./ctest/zin2
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat3 < ./ctest/sin3
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat3 < ./ctest/din3
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat3 < ./ctest/cin3
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat3 < ./ctest/zin3
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat1
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat

kernel/mips/sdot_msa.c

Lines changed: 151 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,10 +39,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
3939
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
4040
v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
4141
v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
42+
#if defined(DSDOT)
43+
v2f64 dvx0, dvx1, dvx2, dvx3, dvx4, dvx5, dvx6, dvx7;
44+
v2f64 dvy0, dvy1, dvy2, dvy3, dvy4, dvy5, dvy6, dvy7;
45+
v2f64 dot0 = {0, 0};
46+
v2f64 dot1 = {0, 0};
47+
v2f64 dot2 = {0, 0};
48+
v2f64 dot3 = {0, 0};
49+
#else
4250
v4f32 dot0 = {0, 0, 0, 0};
4351
v4f32 dot1 = {0, 0, 0, 0};
4452
v4f32 dot2 = {0, 0, 0, 0};
4553
v4f32 dot3 = {0, 0, 0, 0};
54+
#endif
4655

4756
if (n < 1) return (dot);
4857

@@ -83,6 +92,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
8392
x_pref += 32;
8493
y_pref += 32;
8594

95+
#if defined(DSDOT)
96+
/* Extend single precision to double precision */
97+
dvy0 = __msa_fexupr_d(vy0);
98+
dvy1 = __msa_fexupr_d(vy1);
99+
dvy2 = __msa_fexupr_d(vy2);
100+
dvy3 = __msa_fexupr_d(vy3);
101+
dvy4 = __msa_fexupr_d(vy4);
102+
dvy5 = __msa_fexupr_d(vy5);
103+
dvy6 = __msa_fexupr_d(vy6);
104+
dvy7 = __msa_fexupr_d(vy7);
105+
106+
vy0 = (v4f32)__msa_fexupl_d(vy0);
107+
vy1 = (v4f32)__msa_fexupl_d(vy1);
108+
vy2 = (v4f32)__msa_fexupl_d(vy2);
109+
vy3 = (v4f32)__msa_fexupl_d(vy3);
110+
vy4 = (v4f32)__msa_fexupl_d(vy4);
111+
vy5 = (v4f32)__msa_fexupl_d(vy5);
112+
vy6 = (v4f32)__msa_fexupl_d(vy6);
113+
vy7 = (v4f32)__msa_fexupl_d(vy7);
114+
115+
dvx0 = __msa_fexupr_d(vx0);
116+
dvx1 = __msa_fexupr_d(vx1);
117+
dvx2 = __msa_fexupr_d(vx2);
118+
dvx3 = __msa_fexupr_d(vx3);
119+
dvx4 = __msa_fexupr_d(vx4);
120+
dvx5 = __msa_fexupr_d(vx5);
121+
dvx6 = __msa_fexupr_d(vx6);
122+
dvx7 = __msa_fexupr_d(vx7);
123+
124+
vx0 = (v4f32)__msa_fexupl_d(vx0);
125+
vx1 = (v4f32)__msa_fexupl_d(vx1);
126+
vx2 = (v4f32)__msa_fexupl_d(vx2);
127+
vx3 = (v4f32)__msa_fexupl_d(vx3);
128+
vx4 = (v4f32)__msa_fexupl_d(vx4);
129+
vx5 = (v4f32)__msa_fexupl_d(vx5);
130+
vx6 = (v4f32)__msa_fexupl_d(vx6);
131+
vx7 = (v4f32)__msa_fexupl_d(vx7);
132+
133+
dot0 += (dvy0 * dvx0);
134+
dot1 += (dvy1 * dvx1);
135+
dot2 += (dvy2 * dvx2);
136+
dot3 += (dvy3 * dvx3);
137+
dot0 += (dvy4 * dvx4);
138+
dot1 += (dvy5 * dvx5);
139+
dot2 += (dvy6 * dvx6);
140+
dot3 += (dvy7 * dvx7);
141+
dot0 += ((v2f64)vy0 * (v2f64)vx0);
142+
dot1 += ((v2f64)vy1 * (v2f64)vx1);
143+
dot2 += ((v2f64)vy2 * (v2f64)vx2);
144+
dot3 += ((v2f64)vy3 * (v2f64)vx3);
145+
dot0 += ((v2f64)vy4 * (v2f64)vx4);
146+
dot1 += ((v2f64)vy5 * (v2f64)vx5);
147+
dot2 += ((v2f64)vy6 * (v2f64)vx6);
148+
dot3 += ((v2f64)vy7 * (v2f64)vx7);
149+
#else
86150
dot0 += (vy0 * vx0);
87151
dot1 += (vy1 * vx1);
88152
dot2 += (vy2 * vx2);
@@ -91,6 +155,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
91155
dot1 += (vy5 * vx5);
92156
dot2 += (vy6 * vx6);
93157
dot3 += (vy7 * vx7);
158+
#endif
94159
}
95160

96161
if (n & 31)
@@ -100,53 +165,123 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
100165
LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
101166
LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
102167

168+
#if defined(DSDOT)
169+
dvy0 = __msa_fexupr_d(vy0);
170+
dvy1 = __msa_fexupr_d(vy1);
171+
dvy2 = __msa_fexupr_d(vy2);
172+
dvy3 = __msa_fexupr_d(vy3);
173+
174+
vy0 = (v4f32)__msa_fexupl_d(vy0);
175+
vy1 = (v4f32)__msa_fexupl_d(vy1);
176+
vy2 = (v4f32)__msa_fexupl_d(vy2);
177+
vy3 = (v4f32)__msa_fexupl_d(vy3);
178+
179+
dvx0 = __msa_fexupr_d(vx0);
180+
dvx1 = __msa_fexupr_d(vx1);
181+
dvx2 = __msa_fexupr_d(vx2);
182+
dvx3 = __msa_fexupr_d(vx3);
183+
184+
vx0 = (v4f32)__msa_fexupl_d(vx0);
185+
vx1 = (v4f32)__msa_fexupl_d(vx1);
186+
vx2 = (v4f32)__msa_fexupl_d(vx2);
187+
vx3 = (v4f32)__msa_fexupl_d(vx3);
188+
189+
dot0 += (dvy0 * dvx0);
190+
dot1 += (dvy1 * dvx1);
191+
dot2 += (dvy2 * dvx2);
192+
dot3 += (dvy3 * dvx3);
193+
dot0 += ((v2f64)vy0 * (v2f64)vx0);
194+
dot1 += ((v2f64)vy1 * (v2f64)vx1);
195+
dot2 += ((v2f64)vy2 * (v2f64)vx2);
196+
dot3 += ((v2f64)vy3 * (v2f64)vx3);
197+
#else
103198
dot0 += (vy0 * vx0);
104199
dot1 += (vy1 * vx1);
105200
dot2 += (vy2 * vx2);
106201
dot3 += (vy3 * vx3);
202+
#endif
107203
}
108204

109205
if (n & 8)
110206
{
111207
LD_SP2_INC(x, 4, vx0, vx1);
112208
LD_SP2_INC(y, 4, vy0, vy1);
113209

210+
#if defined(DSDOT)
211+
dvy0 = __msa_fexupr_d(vy0);
212+
dvy1 = __msa_fexupr_d(vy1);
213+
214+
vy0 = (v4f32)__msa_fexupl_d(vy0);
215+
vy1 = (v4f32)__msa_fexupl_d(vy1);
216+
217+
dvx0 = __msa_fexupr_d(vx0);
218+
dvx1 = __msa_fexupr_d(vx1);
219+
220+
vx0 = (v4f32)__msa_fexupl_d(vx0);
221+
vx1 = (v4f32)__msa_fexupl_d(vx1);
222+
223+
dot0 += (dvy0 * dvx0);
224+
dot1 += (dvy1 * dvx1);
225+
dot0 += ((v2f64)vy0 * (v2f64)vx0);
226+
dot1 += ((v2f64)vy1 * (v2f64)vx1);
227+
#else
114228
dot0 += (vy0 * vx0);
115229
dot1 += (vy1 * vx1);
230+
#endif
116231
}
117232

118233
if (n & 4)
119234
{
120235
vx0 = LD_SP(x); x += 4;
121236
vy0 = LD_SP(y); y += 4;
122237

238+
#if defined(DSDOT)
239+
dvy0 = __msa_fexupr_d(vy0);
240+
vy0 = (v4f32)__msa_fexupl_d(vy0);
241+
dvx0 = __msa_fexupr_d(vx0);
242+
vx0 = (v4f32)__msa_fexupl_d(vx0);
243+
dot0 += (dvy0 * dvx0);
244+
dot0 += ((v2f64)vy0 * (v2f64)vx0);
245+
#else
123246
dot0 += (vy0 * vx0);
247+
#endif
124248
}
125249

126250
if (n & 2)
127251
{
128252
LD_GP2_INC(x, 1, x0, x1);
129253
LD_GP2_INC(y, 1, y0, y1);
130254

255+
#if defined(DSDOT)
256+
dot += ((double)y0 * (double)x0);
257+
dot += ((double)y1 * (double)x1);
258+
#else
131259
dot += (y0 * x0);
132260
dot += (y1 * x1);
261+
#endif
133262
}
134263

135264
if (n & 1)
136265
{
137266
x0 = *x;
138267
y0 = *y;
139268

269+
#if defined(DSDOT)
270+
dot += ((double)y0 * (double)x0);
271+
#else
140272
dot += (y0 * x0);
273+
#endif
141274
}
142275
}
143276

144277
dot0 += dot1 + dot2 + dot3;
145278

146279
dot += dot0[0];
147280
dot += dot0[1];
281+
#if !defined(DSDOT)
148282
dot += dot0[2];
149283
dot += dot0[3];
284+
#endif
150285
}
151286
else
152287
{
@@ -155,27 +290,43 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
155290
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
156291
LD_GP4_INC(y, inc_y, y0, y1, y2, y3);
157292

293+
#if defined(DSDOT)
294+
dot += ((double)y0 * (double)x0);
295+
dot += ((double)y1 * (double)x1);
296+
dot += ((double)y2 * (double)x2);
297+
dot += ((double)y3 * (double)x3);
298+
#else
158299
dot += (y0 * x0);
159300
dot += (y1 * x1);
160301
dot += (y2 * x2);
161302
dot += (y3 * x3);
303+
#endif
162304
}
163305

164306
if (n & 2)
165307
{
166308
LD_GP2_INC(x, inc_x, x0, x1);
167309
LD_GP2_INC(y, inc_y, y0, y1);
168310

311+
#if defined(DSDOT)
312+
dot += ((double)y0 * (double)x0);
313+
dot += ((double)y1 * (double)x1);
314+
#else
169315
dot += (y0 * x0);
170316
dot += (y1 * x1);
317+
#endif
171318
}
172319

173320
if (n & 1)
174321
{
175322
x0 = *x;
176323
y0 = *y;
177324

325+
#if defined(DSDOT)
326+
dot += ((double)y0 * (double)x0);
327+
#else
178328
dot += (y0 * x0);
329+
#endif
179330
}
180331
}
181332

0 commit comments

Comments
 (0)