@@ -68,25 +68,16 @@ double sqrt(double);
6868#define GETRF_FACTOR 1.00
6969
7070
71- #if defined(USE_PTHREAD_LOCK )
72- static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER ;
73- #elif defined(USE_PTHREAD_SPINLOCK )
74- static pthread_spinlock_t getrf_lock = 0 ;
71+ #if (__STDC_VERSION__ >= 201112L) && (defined(__GNUC__) || defined(__clang__)) /* __atomic_* are GCC/Clang builtins, not ISO C11, so also require such a compiler */
72+ #define atomic_load_long(p) __atomic_load_n((p), __ATOMIC_RELAXED) /* relaxed load of *p; gives no ordering by itself, call sites in this file add MB after their spin loops */
73+ #define atomic_store_long(p, v) __atomic_store_n((p), (v), __ATOMIC_RELAXED) /* relaxed store of v to *p; call sites in this file issue MB/WMB before publishing */
7574#else
76- static BLASULONG getrf_lock = 0UL ;
77- #endif
78-
79- #if defined(USE_PTHREAD_LOCK )
80- static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER ;
81- #elif defined(USE_PTHREAD_SPINLOCK )
82- static pthread_spinlock_t getrf_flag_lock = 0 ;
83- #else
84- static BLASULONG getrf_flag_lock = 0UL ;
75+ #define atomic_load_long(p) ((BLASLONG)(*(volatile BLASLONG *)(p))) /* fallback: volatile read, fully parenthesized so it is safe inside larger expressions */
76+ #define atomic_store_long(p, v) ((*(volatile BLASLONG *)(p)) = (v)) /* fallback: volatile write, parenthesized so the expansion is a single expression */
8577#endif
8678
8779
8880
89-
9081static __inline BLASLONG FORMULA1 (BLASLONG M , BLASLONG N , BLASLONG IS , BLASLONG BK , BLASLONG T ) {
9182
9283 double m = (double )(M - IS - BK );
@@ -119,11 +110,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
119110 FLOAT * d = (FLOAT * )args -> b + (k + k * lda ) * COMPSIZE ;
120111 FLOAT * sbb = sb ;
121112
122- #if __STDC_VERSION__ >= 201112L
123- _Atomic BLASLONG * flag = (_Atomic BLASLONG * )args -> d ;
124- #else
125113 volatile BLASLONG * flag = (volatile BLASLONG * )args -> d ;
126- #endif
127114
128115 blasint * ipiv = (blasint * )args -> c ;
129116
@@ -180,7 +167,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
180167 }
181168 }
182169
183- if ((js + REAL_GEMM_R >= n ) && (mypos >= 0 )) flag [mypos * CACHE_LINE_SIZE ] = 0 ;
170+ if ((js + REAL_GEMM_R >= n ) && (mypos >= 0 )) {
171+ MB ;
172+ atomic_store_long (& flag [mypos * CACHE_LINE_SIZE ], 0 );
173+ }
184174
185175 for (is = 0 ; is < m ; is += GEMM_P ){
186176 min_i = m - is ;
@@ -201,14 +191,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
201191/* Non blocking implementation */
202192
203193typedef struct { /* Hand-off slots; presumably the element type of the `job` array used by inner_advanced_thread (its declaration is not visible here, confirm). */
204- #if __STDC_VERSION__ >= 201112L
205- _Atomic
206- #else
207- volatile
208- #endif
209- BLASLONG working [MAX_CPU_NUMBER ][CACHE_LINE_SIZE * DIVIDE_RATE ];
194+ volatile BLASLONG working [MAX_CPU_NUMBER ][CACHE_LINE_SIZE * DIVIDE_RATE ]; /* visible code touches only working[t][CACHE_LINE_SIZE * s], via atomic_load_long/atomic_store_long; a slot holds 0 or a buffer address cast to BLASLONG */
210195} job_t ;
211196
197+
212198#define ICOPY_OPERATION (M , N , A , LDA , X , Y , BUFFER ) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); /* forwards to GEMM_ITCOPY with A advanced by (Y + X*LDA)*COMPSIZE elements; note the expansion already ends in ';' */
213199#define OCOPY_OPERATION (M , N , A , LDA , X , Y , BUFFER ) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); /* forwards to GEMM_ONCOPY with A advanced by (X + Y*LDA)*COMPSIZE elements (X/Y roles swapped vs. ICOPY_OPERATION); expansion already ends in ';' */
214200
@@ -246,11 +232,8 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
246232
247233 blasint * ipiv = (blasint * )args -> c ;
248234 BLASLONG jw ;
249- #if __STDC_VERSION__ >= 201112L
250- _Atomic BLASLONG * flag = (_Atomic BLASLONG * )args -> d ;
251- #else
252235 volatile BLASLONG * flag = (volatile BLASLONG * )args -> d ;
253- #endif
236+
254237 if (args -> a == NULL ) {
255238 TRSM_ILTCOPY (k , k , (FLOAT * )args -> b , lda , 0 , sb );
256239 sbb = (FLOAT * )((((BLASULONG )(sb + k * k * COMPSIZE ) + GEMM_ALIGN ) & ~GEMM_ALIGN ) + GEMM_OFFSET_B );
@@ -280,10 +263,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
280263#if 1
281264 {
282265 do {
283- LOCK_COMMAND (& getrf_lock );
284- jw = job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ];
285- UNLOCK_COMMAND (& getrf_lock );
266+ jw = atomic_load_long (& job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]);
286267 } while (jw );
268+ MB ;
287269 }
288270#else
289271 while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {};
@@ -326,21 +308,17 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
326308 }
327309 MB ;
328310 for (i = 0 ; i < args -> nthreads ; i ++ ) {
329- LOCK_COMMAND (& getrf_lock );
330- job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
331- UNLOCK_COMMAND (& getrf_lock );
311+ atomic_store_long (& job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ], (BLASLONG )buffer [bufferside ]);
332312 }
333313 }
334314
335- LOCK_COMMAND (& getrf_flag_lock );
336- flag [mypos * CACHE_LINE_SIZE ] = 0 ;
337- UNLOCK_COMMAND (& getrf_flag_lock );
315+ MB ;
316+ atomic_store_long (& flag [mypos * CACHE_LINE_SIZE ], 0 );
338317
339318 if (m == 0 ) {
319+ MB ;
340320 for (xxx = 0 ; xxx < DIVIDE_RATE ; xxx ++ ) {
341- LOCK_COMMAND (& getrf_lock );
342- job [mypos ].working [mypos ][CACHE_LINE_SIZE * xxx ] = 0 ;
343- UNLOCK_COMMAND (& getrf_lock );
321+ atomic_store_long (& job [mypos ].working [mypos ][CACHE_LINE_SIZE * xxx ], 0 );
344322 }
345323 }
346324
@@ -366,10 +344,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
366344 if ((current != mypos ) && (!is )) {
367345#if 1
368346 do {
369- LOCK_COMMAND (& getrf_lock );
370- jw = job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ];
371- UNLOCK_COMMAND (& getrf_lock );
372- } while (jw == 0 );
347+ jw = atomic_load_long (& job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ]);
348+ } while (jw == 0 );
349+ MB ;
373350#else
374351 while (job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] == 0 ) {};
375352#endif
@@ -381,9 +358,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
381358
382359 MB ;
383360 if (is + min_i >= m ) {
384- LOCK_COMMAND (& getrf_lock );
385- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
386- UNLOCK_COMMAND (& getrf_lock );
361+ atomic_store_long (& job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ], 0 );
387362 }
388363 }
389364
@@ -397,10 +372,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
397372 for (xxx = 0 ; xxx < DIVIDE_RATE ; xxx ++ ) {
398373#if 1
399374 do {
400- LOCK_COMMAND (& getrf_lock );
401- jw = job [mypos ].working [i ][CACHE_LINE_SIZE * xxx ];
402- UNLOCK_COMMAND (& getrf_lock );
375+ jw = atomic_load_long (& job [mypos ].working [i ][CACHE_LINE_SIZE * xxx ]);
403376 } while (jw != 0 );
377+ MB ;
404378#else
405379 while (job [mypos ].working [i ][CACHE_LINE_SIZE * xxx ] ) {};
406380#endif
@@ -443,12 +417,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
443417#ifdef _MSC_VER
444418 BLASLONG flag [MAX_CPU_NUMBER * CACHE_LINE_SIZE ];
445419#else
446- #if __STDC_VERSION__ >= 201112L
447- _Atomic
448- #else
449- volatile
450- #endif
451- BLASLONG flag [MAX_CPU_NUMBER * CACHE_LINE_SIZE ] __attribute__((aligned (128 )));
420+ volatile BLASLONG flag [MAX_CPU_NUMBER * CACHE_LINE_SIZE ] __attribute__((aligned (128 )));
452421#endif
453422
454423#ifndef COMPLEX
@@ -543,7 +512,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
543512 if (width > mn - is - bk ) width = mn - is - bk ;
544513 }
545514
546- if (num_cpu > 0 ) exec_blas_async_wait (num_cpu , & queue [0 ]);
515+
516+ if (num_cpu > 0 ) {
517+ WMB ;
518+ exec_blas_async_wait (num_cpu , & queue [0 ]);
519+ }
547520
548521 mm = m - bk - is ;
549522 nn = n - bk - is ;
@@ -608,7 +581,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
608581 queue [num_cpu ].sa = NULL ;
609582 queue [num_cpu ].sb = NULL ;
610583 queue [num_cpu ].next = & queue [num_cpu + 1 ];
611- flag [num_cpu * CACHE_LINE_SIZE ] = 1 ;
584+ atomic_store_long ( & flag [num_cpu * CACHE_LINE_SIZE ], 1 ) ;
612585
613586 num_cpu ++ ;
614587
@@ -637,6 +610,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
637610 if (num_cpu > 0 ) {
638611 queue [num_cpu - 1 ].next = NULL ;
639612
613+ WMB ;
614+
640615 exec_blas_async (0 , & queue [0 ]);
641616
642617 inner_basic_thread (& newarg , NULL , range_n_mine , sa , sbb , -1 );
@@ -647,14 +622,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
647622
648623 for (i = 0 ; i < num_cpu ; i ++ ) {
649624#if 1
650- LOCK_COMMAND (& getrf_flag_lock );
651- f = flag [i * CACHE_LINE_SIZE ];
652- UNLOCK_COMMAND (& getrf_flag_lock );
653- while (f != 0 ) {
654- LOCK_COMMAND (& getrf_flag_lock );
655- f = flag [i * CACHE_LINE_SIZE ];
656- UNLOCK_COMMAND (& getrf_flag_lock );
657- };
625+ do {
626+ f = atomic_load_long (& flag [i * CACHE_LINE_SIZE ]);
627+ } while (f != 0 );
628+ MB ;
658629#else
659630 while (flag [i * CACHE_LINE_SIZE ]) {};
660631#endif
@@ -719,12 +690,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
719690 BLASLONG range [MAX_CPU_NUMBER + 1 ];
720691
721692 BLASLONG width , nn , num_cpu ;
722- #if __STDC_VERSION__ >= 201112L
723- _Atomic
724- #else
725- volatile
726- #endif
727- BLASLONG flag [MAX_CPU_NUMBER * CACHE_LINE_SIZE ] __attribute__((aligned (128 )));
693+ volatile BLASLONG flag [MAX_CPU_NUMBER * CACHE_LINE_SIZE ] __attribute__((aligned (128 )));
728694
729695#ifndef COMPLEX
730696#ifdef XDOUBLE
@@ -833,6 +799,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
833799 nn = n - bk - is ;
834800 if (width > nn ) width = nn ;
835801
802+ WMB ;
803+
836804 if (num_cpu > 1 ) exec_blas_async_wait (num_cpu - 1 , & queue [1 ]);
837805
838806 range [0 ] = 0 ;
@@ -867,7 +835,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
867835 queue [num_cpu ].sa = NULL ;
868836 queue [num_cpu ].sb = NULL ;
869837 queue [num_cpu ].next = & queue [num_cpu + 1 ];
870- flag [num_cpu * CACHE_LINE_SIZE ] = 1 ;
838+ atomic_store_long ( & flag [num_cpu * CACHE_LINE_SIZE ], 1 ) ;
871839
872840 num_cpu ++ ;
873841 }
@@ -882,6 +850,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
882850 range_n_new [0 ] = offset + is ;
883851 range_n_new [1 ] = offset + is + bk ;
884852
853+ WMB ;
885854 if (num_cpu > 1 ) {
886855
887856 exec_blas_async (1 , & queue [1 ]);
@@ -917,7 +886,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
917886
918887#endif
919888
920- for (i = 1 ; i < num_cpu ; i ++ ) while (flag [i * CACHE_LINE_SIZE ]) {};
889+ for (i = 1 ; i < num_cpu ; i ++ ) while (atomic_load_long ( & flag [i * CACHE_LINE_SIZE ]) ) {};
921890
922891 TRSM_ILTCOPY (bk , bk , a + (is + is * lda ) * COMPSIZE , lda , 0 , sb );
923892
0 commit comments