Skip to content

Commit dbef479

Browse files
authored
Merge pull request #2469 from AGSaidi/acq-rel-2
Use acq/rel semantics to pass flags/pointers in getrf_parallel.
2 parents ad9e531 + 208c7e7 commit dbef479

1 file changed

Lines changed: 44 additions & 75 deletions

File tree

lapack/getrf/getrf_parallel.c

Lines changed: 44 additions & 75 deletions
Original file line number | Diff line number | Diff line change
@@ -68,25 +68,16 @@ double sqrt(double);
6868
#define GETRF_FACTOR 1.00
6969

7070

71-
#if defined(USE_PTHREAD_LOCK)
72-
static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER;
73-
#elif defined(USE_PTHREAD_SPINLOCK)
74-
static pthread_spinlock_t getrf_lock = 0;
71+
#if (__STDC_VERSION__ >= 201112L)
72+
/* Atomic load of *p with __ATOMIC_RELAXED: the read itself is atomic, but it
   imposes no ordering on surrounding memory accesses. Callers that need
   ordering add an explicit MB after their wait loop.
   NOTE(review): this branch is selected on __STDC_VERSION__ alone, while
   __atomic_load_n is a compiler builtin - confirm every C11 toolchain in use
   provides it. */
#define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED)
73+
/* Atomic store of v to *p with __ATOMIC_RELAXED: the write itself is atomic,
   but it imposes no ordering on surrounding memory accesses. Call sites that
   publish data issue their own MB / WMB around the store. */
#define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
7574
#else
76-
static BLASULONG getrf_lock = 0UL;
77-
#endif
78-
79-
#if defined(USE_PTHREAD_LOCK)
80-
static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER;
81-
#elif defined(USE_PTHREAD_SPINLOCK)
82-
static pthread_spinlock_t getrf_flag_lock = 0;
83-
#else
84-
static BLASULONG getrf_flag_lock = 0UL;
75+
/* Fallback used when C11 is not detected: reads *p through a volatile
   BLASLONG lvalue and yields the value as a BLASLONG. volatile only forces the
   access to be emitted; any ordering must come from explicit MB / WMB at the
   call site. The whole expansion is parenthesized so it acts as one operand
   in any surrounding expression (e.g. as the operand of sizeof). */
#define atomic_load_long(p) ((BLASLONG)(*(volatile BLASLONG *)(p)))
76+
/* Fallback used when C11 is not detected: writes v to *p through a volatile
   BLASLONG lvalue. volatile only forces the access to be emitted; any
   ordering must come from explicit MB / WMB at the call site. The assignment
   is parenthesized as a whole so neighbouring operators (?:, !, +, ...) cannot
   split it. Do not use the expression's value: the __atomic_store_n variant
   of this macro yields void. */
#define atomic_store_long(p, v) ((*(volatile BLASLONG *)(p)) = (v))
8577
#endif
8678

8779

8880

89-
9081
static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
9182

9283
double m = (double)(M - IS - BK);
@@ -119,11 +110,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
119110
FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
120111
FLOAT *sbb = sb;
121112

122-
#if __STDC_VERSION__ >= 201112L
123-
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
124-
#else
125113
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
126-
#endif
127114

128115
blasint *ipiv = (blasint *)args -> c;
129116

@@ -180,7 +167,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
180167
}
181168
}
182169

183-
if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) flag[mypos * CACHE_LINE_SIZE] = 0;
170+
if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) {
171+
MB;
172+
atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0);
173+
}
184174

185175
for (is = 0; is < m; is += GEMM_P){
186176
min_i = m - is;
@@ -201,14 +191,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
201191
/* Non blocking implementation */
202192

203193
typedef struct {
204-
#if __STDC_VERSION__ >= 201112L
205-
_Atomic
206-
#else
207-
volatile
208-
#endif
209-
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
194+
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
210195
} job_t;
211196

197+
212198
/* Forwards M, N, LDA and BUFFER to GEMM_ITCOPY, with the source pointer
   advanced ((Y) + (X) * (LDA)) * COMPSIZE FLOAT elements past A.
   The expansion already ends in ';'. */
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
213199
/* Forwards M, N, LDA and BUFFER to GEMM_ONCOPY, with the source pointer
   advanced ((X) + (Y) * (LDA)) * COMPSIZE FLOAT elements past A (X and Y are
   swapped relative to ICOPY_OPERATION). The expansion already ends in ';'. */
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
214200

@@ -246,11 +232,8 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
246232

247233
blasint *ipiv = (blasint *)args -> c;
248234
BLASLONG jw;
249-
#if __STDC_VERSION__ >= 201112L
250-
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
251-
#else
252235
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
253-
#endif
236+
254237
if (args -> a == NULL) {
255238
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
256239
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
@@ -280,10 +263,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
280263
#if 1
281264
{
282265
do {
283-
LOCK_COMMAND(&getrf_lock);
284-
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
285-
UNLOCK_COMMAND(&getrf_lock);
266+
jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside]);
286267
} while (jw);
268+
MB;
287269
}
288270
#else
289271
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
@@ -326,21 +308,17 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
326308
}
327309
MB;
328310
for (i = 0; i < args -> nthreads; i++) {
329-
LOCK_COMMAND(&getrf_lock);
330-
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
331-
UNLOCK_COMMAND(&getrf_lock);
311+
atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
332312
}
333313
}
334314

335-
LOCK_COMMAND(&getrf_flag_lock);
336-
flag[mypos * CACHE_LINE_SIZE] = 0;
337-
UNLOCK_COMMAND(&getrf_flag_lock);
315+
MB;
316+
atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0);
338317

339318
if (m == 0) {
319+
MB;
340320
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
341-
LOCK_COMMAND(&getrf_lock);
342-
job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
343-
UNLOCK_COMMAND(&getrf_lock);
321+
atomic_store_long(&job[mypos].working[mypos][CACHE_LINE_SIZE * xxx], 0);
344322
}
345323
}
346324

@@ -366,10 +344,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
366344
if ((current != mypos) && (!is)) {
367345
#if 1
368346
do {
369-
LOCK_COMMAND(&getrf_lock);
370-
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
371-
UNLOCK_COMMAND(&getrf_lock);
372-
} while (jw == 0);
347+
jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]);
348+
} while (jw == 0);
349+
MB;
373350
#else
374351
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
375352
#endif
@@ -381,9 +358,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
381358

382359
MB;
383360
if (is + min_i >= m) {
384-
LOCK_COMMAND(&getrf_lock);
385-
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
386-
UNLOCK_COMMAND(&getrf_lock);
361+
atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], 0);
387362
}
388363
}
389364

@@ -397,10 +372,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
397372
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
398373
#if 1
399374
do {
400-
LOCK_COMMAND(&getrf_lock);
401-
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
402-
UNLOCK_COMMAND(&getrf_lock);
375+
jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE *xxx]);
403376
} while(jw != 0);
377+
MB;
404378
#else
405379
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
406380
#endif
@@ -443,12 +417,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
443417
#ifdef _MSC_VER
444418
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
445419
#else
446-
#if __STDC_VERSION__ >= 201112L
447-
_Atomic
448-
#else
449-
volatile
450-
#endif
451-
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
420+
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
452421
#endif
453422

454423
#ifndef COMPLEX
@@ -543,7 +512,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
543512
if (width > mn - is - bk) width = mn - is - bk;
544513
}
545514

546-
if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]);
515+
516+
if (num_cpu > 0) {
517+
WMB;
518+
exec_blas_async_wait(num_cpu, &queue[0]);
519+
}
547520

548521
mm = m - bk - is;
549522
nn = n - bk - is;
@@ -608,7 +581,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
608581
queue[num_cpu].sa = NULL;
609582
queue[num_cpu].sb = NULL;
610583
queue[num_cpu].next = &queue[num_cpu + 1];
611-
flag[num_cpu * CACHE_LINE_SIZE] = 1;
584+
atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1);
612585

613586
num_cpu ++;
614587

@@ -637,6 +610,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
637610
if (num_cpu > 0) {
638611
queue[num_cpu - 1].next = NULL;
639612

613+
WMB;
614+
640615
exec_blas_async(0, &queue[0]);
641616

642617
inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);
@@ -647,14 +622,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
647622

648623
for (i = 0; i < num_cpu; i ++) {
649624
#if 1
650-
LOCK_COMMAND(&getrf_flag_lock);
651-
f=flag[i*CACHE_LINE_SIZE];
652-
UNLOCK_COMMAND(&getrf_flag_lock);
653-
while (f!=0) {
654-
LOCK_COMMAND(&getrf_flag_lock);
655-
f=flag[i*CACHE_LINE_SIZE];
656-
UNLOCK_COMMAND(&getrf_flag_lock);
657-
};
625+
do {
626+
f = atomic_load_long(&flag[i*CACHE_LINE_SIZE]);
627+
} while (f != 0);
628+
MB;
658629
#else
659630
while (flag[i*CACHE_LINE_SIZE]) {};
660631
#endif
@@ -719,12 +690,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
719690
BLASLONG range[MAX_CPU_NUMBER + 1];
720691

721692
BLASLONG width, nn, num_cpu;
722-
#if __STDC_VERSION__ >= 201112L
723-
_Atomic
724-
#else
725-
volatile
726-
#endif
727-
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
693+
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
728694

729695
#ifndef COMPLEX
730696
#ifdef XDOUBLE
@@ -833,6 +799,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
833799
nn = n - bk - is;
834800
if (width > nn) width = nn;
835801

802+
WMB;
803+
836804
if (num_cpu > 1) exec_blas_async_wait(num_cpu - 1, &queue[1]);
837805

838806
range[0] = 0;
@@ -867,7 +835,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
867835
queue[num_cpu].sa = NULL;
868836
queue[num_cpu].sb = NULL;
869837
queue[num_cpu].next = &queue[num_cpu + 1];
870-
flag[num_cpu * CACHE_LINE_SIZE] = 1;
838+
atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1);
871839

872840
num_cpu ++;
873841
}
@@ -882,6 +850,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
882850
range_n_new[0] = offset + is;
883851
range_n_new[1] = offset + is + bk;
884852

853+
WMB;
885854
if (num_cpu > 1) {
886855

887856
exec_blas_async(1, &queue[1]);
@@ -917,7 +886,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
917886

918887
#endif
919888

920-
for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
889+
for (i = 1; i < num_cpu; i ++) while (atomic_load_long(&flag[i * CACHE_LINE_SIZE])) {};
921890

922891
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);
923892

0 commit comments

Comments
 (0)