Skip to content

Commit e6edb74

Browse files
authored
Merge pull request #2466 from AGSaidi/acq-rel-1
Switch blas_server to use acq/rel semantics
2 parents 59243d4 + d68e4ba commit e6edb74

1 file changed

Lines changed: 41 additions & 55 deletions

File tree

driver/others/blas_server.c

Lines changed: 41 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,16 @@ typedef struct {
140140

141141
} thread_status_t;
142142

143+
#if (__STDC_VERSION__ >= 201112L)
144+
#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED)
145+
#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
146+
#else
147+
/* Fallback load of a queue slot for compilers that do not take the C11 branch.
 * The volatile qualifier must apply to the slot being read
 * (blas_queue_t * volatile), not to the pointed-to struct; otherwise the read
 * is an ordinary access that the compiler may cache across polling loops.
 * p: address of the queue slot; yields the blas_queue_t * currently stored. */
#define atomic_load_queue(p) ((blas_queue_t *)(*(blas_queue_t * volatile *)(p)))
148+
#define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v))
149+
#endif
150+
151+
152+
143153
static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE)));
144154

145155
#ifndef THREAD_TIMEOUT
@@ -312,40 +322,38 @@ blas_queue_t *tscq;
312322

313323
last_tick = (unsigned int)rpcc();
314324

315-
pthread_mutex_lock (&thread_status[cpu].lock);
316-
tscq=thread_status[cpu].queue;
317-
pthread_mutex_unlock (&thread_status[cpu].lock);
325+
tscq = atomic_load_queue(&thread_status[cpu].queue);
318326

319327
while(!tscq) {
320328
YIELDING;
321329

322330
if ((unsigned int)rpcc() - last_tick > thread_timeout) {
323331

324-
pthread_mutex_lock (&thread_status[cpu].lock);
325332

326-
if (!thread_status[cpu].queue) {
333+
if (!atomic_load_queue(&thread_status[cpu].queue)) {
334+
pthread_mutex_lock (&thread_status[cpu].lock);
327335
thread_status[cpu].status = THREAD_STATUS_SLEEP;
328-
while (thread_status[cpu].status == THREAD_STATUS_SLEEP) {
336+
while (thread_status[cpu].status == THREAD_STATUS_SLEEP &&
337+
!atomic_load_queue(&thread_status[cpu].queue)) {
329338

330339
#ifdef MONITOR
331340
main_status[cpu] = MAIN_SLEEPING;
332341
#endif
333342

334343
pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock);
335344
}
345+
pthread_mutex_unlock(&thread_status[cpu].lock);
336346
}
337347

338-
pthread_mutex_unlock(&thread_status[cpu].lock);
339-
340348
last_tick = (unsigned int)rpcc();
341349
}
342-
pthread_mutex_lock (&thread_status[cpu].lock);
343-
tscq=thread_status[cpu].queue;
344-
pthread_mutex_unlock (&thread_status[cpu].lock);
350+
351+
tscq = atomic_load_queue(&thread_status[cpu].queue);
345352

346353
}
347354

348-
queue = thread_status[cpu].queue;
355+
queue = atomic_load_queue(&thread_status[cpu].queue);
356+
MB;
349357

350358
if ((long)queue == -1) break;
351359

@@ -360,9 +368,7 @@ blas_queue_t *tscq;
360368
if (queue) {
361369
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
362370

363-
pthread_mutex_lock (&thread_status[cpu].lock);
364-
thread_status[cpu].queue = (blas_queue_t *)1;
365-
pthread_mutex_unlock (&thread_status[cpu].lock);
371+
atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1);
366372

367373
sa = queue -> sa;
368374
sb = queue -> sb;
@@ -442,13 +448,9 @@ blas_queue_t *tscq;
442448

443449
// arm: make sure all results are written out _before_
444450
// thread is marked as done and other threads use them
445-
WMB;
451+
MB;
452+
atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0);
446453

447-
pthread_mutex_lock (&thread_status[cpu].lock);
448-
thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */
449-
pthread_mutex_unlock (&thread_status[cpu].lock);
450-
451-
WMB;
452454

453455
}
454456

@@ -566,7 +568,7 @@ int blas_thread_init(void){
566568

567569
for(i = 0; i < blas_num_threads - 1; i++){
568570

569-
thread_status[i].queue = (blas_queue_t *)NULL;
571+
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
570572
thread_status[i].status = THREAD_STATUS_WAKEUP;
571573

572574
pthread_mutex_init(&thread_status[i].lock, NULL);
@@ -655,7 +657,8 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
655657
if (queue -> mode & BLAS_NODE) {
656658

657659
do {
658-
while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++;
660+
661+
while((thread_status[i].node != node || atomic_load_queue(&thread_status[i].queue)) && (i < blas_num_threads - 1)) i ++;
659662

660663
if (i < blas_num_threads - 1) break;
661664

@@ -669,36 +672,26 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
669672
} while (1);
670673

671674
} else {
672-
pthread_mutex_lock (&thread_status[i].lock);
673-
tsiq = thread_status[i].queue;
674-
pthread_mutex_unlock (&thread_status[i].lock);
675+
tsiq = atomic_load_queue(&thread_status[i].queue);
675676
while(tsiq) {
676677
i ++;
677678
if (i >= blas_num_threads - 1) i = 0;
678-
pthread_mutex_lock (&thread_status[i].lock);
679-
tsiq = thread_status[i].queue;
680-
pthread_mutex_unlock (&thread_status[i].lock);
679+
tsiq = atomic_load_queue(&thread_status[i].queue);
681680
}
682681
}
683682
#else
684-
pthread_mutex_lock (&thread_status[i].lock);
685-
tsiq=thread_status[i].queue ;
686-
pthread_mutex_unlock (&thread_status[i].lock);
683+
tsiq = atomic_load_queue(&thread_status[i].queue);
687684
while(tsiq) {
688685
i ++;
689686
if (i >= blas_num_threads - 1) i = 0;
690-
pthread_mutex_lock (&thread_status[i].lock);
691-
tsiq=thread_status[i].queue ;
692-
pthread_mutex_unlock (&thread_status[i].lock);
687+
tsiq = atomic_load_queue(&thread_status[i].queue);
693688
}
694689
#endif
695690

696691
queue -> assigned = i;
697-
WMB;
698-
pthread_mutex_lock (&thread_status[i].lock);
699-
thread_status[i].queue = queue;
700-
pthread_mutex_unlock (&thread_status[i].lock);
701-
WMB;
692+
MB;
693+
694+
atomic_store_queue(&thread_status[i].queue, queue);
702695

703696
queue = queue -> next;
704697
pos ++;
@@ -718,9 +711,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
718711

719712
pos = current -> assigned;
720713

721-
pthread_mutex_lock (&thread_status[pos].lock);
722-
tspq=thread_status[pos].queue;
723-
pthread_mutex_unlock (&thread_status[pos].lock);
714+
tspq = atomic_load_queue(&thread_status[pos].queue);
724715

725716
if ((BLASULONG)tspq > 1) {
726717
pthread_mutex_lock (&thread_status[pos].lock);
@@ -752,24 +743,20 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
752743

753744
while ((num > 0) && queue) {
754745

755-
pthread_mutex_lock(&thread_status[queue->assigned].lock);
756-
tsqq=thread_status[queue -> assigned].queue;
757-
pthread_mutex_unlock(&thread_status[queue->assigned].lock);
746+
tsqq = atomic_load_queue(&thread_status[queue->assigned].queue);
758747

759748

760749
while(tsqq) {
761750
YIELDING;
762-
pthread_mutex_lock(&thread_status[queue->assigned].lock);
763-
tsqq=thread_status[queue -> assigned].queue;
764-
pthread_mutex_unlock(&thread_status[queue->assigned].lock);
765-
766-
751+
tsqq = atomic_load_queue(&thread_status[queue->assigned].queue);
767752
};
768753

769754
queue = queue -> next;
770755
num --;
771756
}
772757

758+
MB;
759+
773760
#ifdef SMP_DEBUG
774761
fprintf(STDERR, "Done.\n\n");
775762
#endif
@@ -880,7 +867,7 @@ void goto_set_num_threads(int num_threads) {
880867

881868
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
882869

883-
thread_status[i].queue = (blas_queue_t *)NULL;
870+
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
884871
thread_status[i].status = THREAD_STATUS_WAKEUP;
885872

886873
pthread_mutex_init(&thread_status[i].lock, NULL);
@@ -971,12 +958,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
971958

972959
for (i = 0; i < blas_num_threads - 1; i++) {
973960

974-
pthread_mutex_lock (&thread_status[i].lock);
975961

976-
thread_status[i].queue = (blas_queue_t *)-1;
962+
pthread_mutex_lock (&thread_status[i].lock);
977963

964+
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
978965
thread_status[i].status = THREAD_STATUS_WAKEUP;
979-
980966
pthread_cond_signal (&thread_status[i].wakeup);
981967

982968
pthread_mutex_unlock(&thread_status[i].lock);

0 commit comments

Comments
 (0)