Skip to content

Commit 208c7e7

Browse files
committed
Use acq/rel semantics to pass flags/pointers in getrf_parallel.
The current implementation has locks, but each lock protects a critical section of only one variable, so atomic reads/writes with barriers can be used to achieve the same behavior. As with the previous patch, pthread_mutex_lock isn't fair: in a tight loop, the thread that last held the lock can keep reacquiring it, starving another thread — even one that is about to write the data that would stop the current thread from spinning. On a 64-core Arm system this improves performance by 20x on sgesv.goto.
1 parent 014fc13 commit 208c7e7

1 file changed

Lines changed: 44 additions & 75 deletions

File tree

lapack/getrf/getrf_parallel.c

Lines changed: 44 additions & 75 deletions
Original file line number | Diff line number | Diff line change
@@ -68,25 +68,16 @@ double sqrt(double);
6868
#define GETRF_FACTOR 1.00
6969

7070

71-
#if defined(USE_PTHREAD_LOCK)
72-
static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER;
73-
#elif defined(USE_PTHREAD_SPINLOCK)
74-
static pthread_spinlock_t getrf_lock = 0;
71+
#if (__STDC_VERSION__ >= 201112L)
72+
#define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED)
73+
#define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
7574
#else
76-
static BLASULONG getrf_lock = 0UL;
77-
#endif
78-
79-
#if defined(USE_PTHREAD_LOCK)
80-
static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER;
81-
#elif defined(USE_PTHREAD_SPINLOCK)
82-
static pthread_spinlock_t getrf_flag_lock = 0;
83-
#else
84-
static BLASULONG getrf_flag_lock = 0UL;
75+
#define atomic_load_long(p) (BLASLONG)(*(volatile BLASLONG*)(p))
76+
#define atomic_store_long(p, v) (*(volatile BLASLONG *)(p)) = (v)
8577
#endif
8678

8779

8880

89-
9081
static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
9182

9283
double m = (double)(M - IS - BK);
@@ -119,11 +110,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
119110
FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
120111
FLOAT *sbb = sb;
121112

122-
#if __STDC_VERSION__ >= 201112L
123-
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
124-
#else
125113
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
126-
#endif
127114

128115
blasint *ipiv = (blasint *)args -> c;
129116

@@ -180,7 +167,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
180167
}
181168
}
182169

183-
if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) flag[mypos * CACHE_LINE_SIZE] = 0;
170+
if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) {
171+
MB;
172+
atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0);
173+
}
184174

185175
for (is = 0; is < m; is += GEMM_P){
186176
min_i = m - is;
@@ -201,14 +191,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
201191
/* Non blocking implementation */
202192

203193
typedef struct {
204-
#if __STDC_VERSION__ >= 201112L
205-
_Atomic
206-
#else
207-
volatile
208-
#endif
209-
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
194+
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
210195
} job_t;
211196

197+
212198
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
213199
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
214200

@@ -246,11 +232,8 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
246232

247233
blasint *ipiv = (blasint *)args -> c;
248234
BLASLONG jw;
249-
#if __STDC_VERSION__ >= 201112L
250-
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
251-
#else
252235
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
253-
#endif
236+
254237
if (args -> a == NULL) {
255238
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
256239
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
@@ -280,10 +263,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
280263
#if 1
281264
{
282265
do {
283-
LOCK_COMMAND(&getrf_lock);
284-
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
285-
UNLOCK_COMMAND(&getrf_lock);
266+
jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside]);
286267
} while (jw);
268+
MB;
287269
}
288270
#else
289271
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
@@ -326,21 +308,17 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
326308
}
327309
MB;
328310
for (i = 0; i < args -> nthreads; i++) {
329-
LOCK_COMMAND(&getrf_lock);
330-
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
331-
UNLOCK_COMMAND(&getrf_lock);
311+
atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
332312
}
333313
}
334314

335-
LOCK_COMMAND(&getrf_flag_lock);
336-
flag[mypos * CACHE_LINE_SIZE] = 0;
337-
UNLOCK_COMMAND(&getrf_flag_lock);
315+
MB;
316+
atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0);
338317

339318
if (m == 0) {
319+
MB;
340320
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
341-
LOCK_COMMAND(&getrf_lock);
342-
job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
343-
UNLOCK_COMMAND(&getrf_lock);
321+
atomic_store_long(&job[mypos].working[mypos][CACHE_LINE_SIZE * xxx], 0);
344322
}
345323
}
346324

@@ -366,10 +344,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
366344
if ((current != mypos) && (!is)) {
367345
#if 1
368346
do {
369-
LOCK_COMMAND(&getrf_lock);
370-
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
371-
UNLOCK_COMMAND(&getrf_lock);
372-
} while (jw == 0);
347+
jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]);
348+
} while (jw == 0);
349+
MB;
373350
#else
374351
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
375352
#endif
@@ -381,9 +358,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
381358

382359
MB;
383360
if (is + min_i >= m) {
384-
LOCK_COMMAND(&getrf_lock);
385-
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
386-
UNLOCK_COMMAND(&getrf_lock);
361+
atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], 0);
387362
}
388363
}
389364

@@ -397,10 +372,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
397372
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
398373
#if 1
399374
do {
400-
LOCK_COMMAND(&getrf_lock);
401-
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
402-
UNLOCK_COMMAND(&getrf_lock);
375+
jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE *xxx]);
403376
} while(jw != 0);
377+
MB;
404378
#else
405379
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
406380
#endif
@@ -443,12 +417,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
443417
#ifdef _MSC_VER
444418
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
445419
#else
446-
#if __STDC_VERSION__ >= 201112L
447-
_Atomic
448-
#else
449-
volatile
450-
#endif
451-
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
420+
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
452421
#endif
453422

454423
#ifndef COMPLEX
@@ -543,7 +512,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
543512
if (width > mn - is - bk) width = mn - is - bk;
544513
}
545514

546-
if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]);
515+
516+
if (num_cpu > 0) {
517+
WMB;
518+
exec_blas_async_wait(num_cpu, &queue[0]);
519+
}
547520

548521
mm = m - bk - is;
549522
nn = n - bk - is;
@@ -608,7 +581,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
608581
queue[num_cpu].sa = NULL;
609582
queue[num_cpu].sb = NULL;
610583
queue[num_cpu].next = &queue[num_cpu + 1];
611-
flag[num_cpu * CACHE_LINE_SIZE] = 1;
584+
atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1);
612585

613586
num_cpu ++;
614587

@@ -637,6 +610,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
637610
if (num_cpu > 0) {
638611
queue[num_cpu - 1].next = NULL;
639612

613+
WMB;
614+
640615
exec_blas_async(0, &queue[0]);
641616

642617
inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);
@@ -647,14 +622,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
647622

648623
for (i = 0; i < num_cpu; i ++) {
649624
#if 1
650-
LOCK_COMMAND(&getrf_flag_lock);
651-
f=flag[i*CACHE_LINE_SIZE];
652-
UNLOCK_COMMAND(&getrf_flag_lock);
653-
while (f!=0) {
654-
LOCK_COMMAND(&getrf_flag_lock);
655-
f=flag[i*CACHE_LINE_SIZE];
656-
UNLOCK_COMMAND(&getrf_flag_lock);
657-
};
625+
do {
626+
f = atomic_load_long(&flag[i*CACHE_LINE_SIZE]);
627+
} while (f != 0);
628+
MB;
658629
#else
659630
while (flag[i*CACHE_LINE_SIZE]) {};
660631
#endif
@@ -719,12 +690,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
719690
BLASLONG range[MAX_CPU_NUMBER + 1];
720691

721692
BLASLONG width, nn, num_cpu;
722-
#if __STDC_VERSION__ >= 201112L
723-
_Atomic
724-
#else
725-
volatile
726-
#endif
727-
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
693+
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
728694

729695
#ifndef COMPLEX
730696
#ifdef XDOUBLE
@@ -833,6 +799,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
833799
nn = n - bk - is;
834800
if (width > nn) width = nn;
835801

802+
WMB;
803+
836804
if (num_cpu > 1) exec_blas_async_wait(num_cpu - 1, &queue[1]);
837805

838806
range[0] = 0;
@@ -867,7 +835,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
867835
queue[num_cpu].sa = NULL;
868836
queue[num_cpu].sb = NULL;
869837
queue[num_cpu].next = &queue[num_cpu + 1];
870-
flag[num_cpu * CACHE_LINE_SIZE] = 1;
838+
atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1);
871839

872840
num_cpu ++;
873841
}
@@ -882,6 +850,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
882850
range_n_new[0] = offset + is;
883851
range_n_new[1] = offset + is + bk;
884852

853+
WMB;
885854
if (num_cpu > 1) {
886855

887856
exec_blas_async(1, &queue[1]);
@@ -917,7 +886,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
917886

918887
#endif
919888

920-
for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
889+
for (i = 1; i < num_cpu; i ++) while (atomic_load_long(&flag[i * CACHE_LINE_SIZE])) {};
921890

922891
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);
923892

0 commit comments

Comments (0)