@@ -140,6 +140,16 @@ typedef struct {
140140
141141} thread_status_t ;
142142
143+ #if (__STDC_VERSION__ >= 201112L )
144+ #define atomic_load_queue (p ) __atomic_load_n(p, __ATOMIC_RELAXED)
145+ #define atomic_store_queue (p , v ) __atomic_store_n(p, v, __ATOMIC_RELAXED)
146+ #else
147+ #define atomic_load_queue (p ) (blas_queue_t*)(*(volatile blas_queue_t**)(p))
148+ #define atomic_store_queue (p , v ) (*(volatile blas_queue_t* volatile*)(p) = (v))
149+ #endif
150+
151+
152+
143153static thread_status_t thread_status [MAX_CPU_NUMBER ] __attribute__((aligned (ATTRIBUTE_SIZE )));
144154
145155#ifndef THREAD_TIMEOUT
@@ -312,40 +322,38 @@ blas_queue_t *tscq;
312322
313323 last_tick = (unsigned int )rpcc ();
314324
315- pthread_mutex_lock (& thread_status [cpu ].lock );
316- tscq = thread_status [cpu ].queue ;
317- pthread_mutex_unlock (& thread_status [cpu ].lock );
325+ tscq = atomic_load_queue (& thread_status [cpu ].queue );
318326
319327 while (!tscq ) {
320328 YIELDING ;
321329
322330 if ((unsigned int )rpcc () - last_tick > thread_timeout ) {
323331
324- pthread_mutex_lock (& thread_status [cpu ].lock );
325332
326- if (!thread_status [cpu ].queue ) {
333+ if (!atomic_load_queue (& thread_status [cpu ].queue )) {
334+ pthread_mutex_lock (& thread_status [cpu ].lock );
327335 thread_status [cpu ].status = THREAD_STATUS_SLEEP ;
328- while (thread_status [cpu ].status == THREAD_STATUS_SLEEP ) {
336+ while (thread_status [cpu ].status == THREAD_STATUS_SLEEP &&
337+ !atomic_load_queue (& thread_status [cpu ].queue )) {
329338
330339#ifdef MONITOR
331340 main_status [cpu ] = MAIN_SLEEPING ;
332341#endif
333342
334343 pthread_cond_wait (& thread_status [cpu ].wakeup , & thread_status [cpu ].lock );
335344 }
345+ pthread_mutex_unlock (& thread_status [cpu ].lock );
336346 }
337347
338- pthread_mutex_unlock (& thread_status [cpu ].lock );
339-
340348 last_tick = (unsigned int )rpcc ();
341349 }
342- pthread_mutex_lock (& thread_status [cpu ].lock );
343- tscq = thread_status [cpu ].queue ;
344- pthread_mutex_unlock (& thread_status [cpu ].lock );
350+
351+ tscq = atomic_load_queue (& thread_status [cpu ].queue );
345352
346353 }
347354
348- queue = thread_status [cpu ].queue ;
355+ queue = atomic_load_queue (& thread_status [cpu ].queue );
356+ MB ;
349357
350358 if ((long )queue == -1 ) break ;
351359
@@ -360,9 +368,7 @@ blas_queue_t *tscq;
360368 if (queue ) {
361369 int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = queue -> routine ;
362370
363- pthread_mutex_lock (& thread_status [cpu ].lock );
364- thread_status [cpu ].queue = (blas_queue_t * )1 ;
365- pthread_mutex_unlock (& thread_status [cpu ].lock );
371+ atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )1 );
366372
367373 sa = queue -> sa ;
368374 sb = queue -> sb ;
@@ -442,13 +448,9 @@ blas_queue_t *tscq;
442448
443449 // arm: make sure all results are written out _before_
444450 // thread is marked as done and other threads use them
445- WMB ;
451+ MB ;
452+ atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )0 );
446453
447- pthread_mutex_lock (& thread_status [cpu ].lock );
448- thread_status [cpu ].queue = (blas_queue_t * volatile ) ((long )thread_status [cpu ].queue & 0 ); /* Need a trick */
449- pthread_mutex_unlock (& thread_status [cpu ].lock );
450-
451- WMB ;
452454
453455 }
454456
@@ -566,7 +568,7 @@ int blas_thread_init(void){
566568
567569 for (i = 0 ; i < blas_num_threads - 1 ; i ++ ){
568570
569- thread_status [i ].queue = (blas_queue_t * )NULL ;
571+ atomic_store_queue ( & thread_status [i ].queue , (blas_queue_t * )0 ) ;
570572 thread_status [i ].status = THREAD_STATUS_WAKEUP ;
571573
572574 pthread_mutex_init (& thread_status [i ].lock , NULL );
@@ -655,7 +657,8 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
655657 if (queue -> mode & BLAS_NODE ) {
656658
657659 do {
658- while ((thread_status [i ].node != node || thread_status [i ].queue ) && (i < blas_num_threads - 1 )) i ++ ;
660+
661+ while ((thread_status [i ].node != node || atomic_load_queue (& thread_status [i ].queue )) && (i < blas_num_threads - 1 )) i ++ ;
659662
660663 if (i < blas_num_threads - 1 ) break ;
661664
@@ -669,36 +672,26 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
669672 } while (1 );
670673
671674 } else {
672- pthread_mutex_lock (& thread_status [i ].lock );
673- tsiq = thread_status [i ].queue ;
674- pthread_mutex_unlock (& thread_status [i ].lock );
675+ tsiq = atomic_load_queue (& thread_status [i ].queue );
675676 while (tsiq ) {
676677 i ++ ;
677678 if (i >= blas_num_threads - 1 ) i = 0 ;
678- pthread_mutex_lock (& thread_status [i ].lock );
679- tsiq = thread_status [i ].queue ;
680- pthread_mutex_unlock (& thread_status [i ].lock );
679+ tsiq = atomic_load_queue (& thread_status [i ].queue );
681680 }
682681 }
683682#else
684- pthread_mutex_lock (& thread_status [i ].lock );
685- tsiq = thread_status [i ].queue ;
686- pthread_mutex_unlock (& thread_status [i ].lock );
683+ tsiq = atomic_load_queue (& thread_status [i ].queue );
687684 while (tsiq ) {
688685 i ++ ;
689686 if (i >= blas_num_threads - 1 ) i = 0 ;
690- pthread_mutex_lock (& thread_status [i ].lock );
691- tsiq = thread_status [i ].queue ;
692- pthread_mutex_unlock (& thread_status [i ].lock );
687+ tsiq = atomic_load_queue (& thread_status [i ].queue );
693688 }
694689#endif
695690
696691 queue -> assigned = i ;
697- WMB ;
698- pthread_mutex_lock (& thread_status [i ].lock );
699- thread_status [i ].queue = queue ;
700- pthread_mutex_unlock (& thread_status [i ].lock );
701- WMB ;
692+ MB ;
693+
694+ atomic_store_queue (& thread_status [i ].queue , queue );
702695
703696 queue = queue -> next ;
704697 pos ++ ;
@@ -718,9 +711,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
718711
719712 pos = current -> assigned ;
720713
721- pthread_mutex_lock (& thread_status [pos ].lock );
722- tspq = thread_status [pos ].queue ;
723- pthread_mutex_unlock (& thread_status [pos ].lock );
714+ tspq = atomic_load_queue (& thread_status [pos ].queue );
724715
725716 if ((BLASULONG )tspq > 1 ) {
726717 pthread_mutex_lock (& thread_status [pos ].lock );
@@ -752,24 +743,20 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
752743
753744 while ((num > 0 ) && queue ) {
754745
755- pthread_mutex_lock (& thread_status [queue -> assigned ].lock );
756- tsqq = thread_status [queue -> assigned ].queue ;
757- pthread_mutex_unlock (& thread_status [queue -> assigned ].lock );
746+ tsqq = atomic_load_queue (& thread_status [queue -> assigned ].queue );
758747
759748
760749 while (tsqq ) {
761750 YIELDING ;
762- pthread_mutex_lock (& thread_status [queue -> assigned ].lock );
763- tsqq = thread_status [queue -> assigned ].queue ;
764- pthread_mutex_unlock (& thread_status [queue -> assigned ].lock );
765-
766-
751+ tsqq = atomic_load_queue (& thread_status [queue -> assigned ].queue );
767752 };
768753
769754 queue = queue -> next ;
770755 num -- ;
771756 }
772757
758+ MB ;
759+
773760#ifdef SMP_DEBUG
774761 fprintf (STDERR , "Done.\n\n" );
775762#endif
@@ -880,7 +867,7 @@ void goto_set_num_threads(int num_threads) {
880867
881868 for (i = blas_num_threads - 1 ; i < num_threads - 1 ; i ++ ){
882869
883- thread_status [i ].queue = (blas_queue_t * )NULL ;
870+ atomic_store_queue ( & thread_status [i ].queue , (blas_queue_t * )0 ) ;
884871 thread_status [i ].status = THREAD_STATUS_WAKEUP ;
885872
886873 pthread_mutex_init (& thread_status [i ].lock , NULL );
@@ -971,12 +958,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
971958
972959 for (i = 0 ; i < blas_num_threads - 1 ; i ++ ) {
973960
974- pthread_mutex_lock (& thread_status [i ].lock );
975961
976- thread_status [i ].queue = ( blas_queue_t * ) -1 ;
962+ pthread_mutex_lock ( & thread_status [i ].lock ) ;
977963
964+ atomic_store_queue (& thread_status [i ].queue , (blas_queue_t * )-1 );
978965 thread_status [i ].status = THREAD_STATUS_WAKEUP ;
979-
980966 pthread_cond_signal (& thread_status [i ].wakeup );
981967
982968 pthread_mutex_unlock (& thread_status [i ].lock );