@@ -105,6 +105,14 @@ typedef struct {
105105 BLASLONG working [MAX_CPU_NUMBER ][CACHE_LINE_SIZE * DIVIDE_RATE ];
106106} job_t ;
107107
/*
 * Accessors for the BLASLONG hand-off slots that worker threads share
 * (the job[].working[][] entries).
 *
 *   atomic_load_long(p)      read the BLASLONG that p points to
 *   atomic_store_long(p, v)  write v to the BLASLONG that p points to
 *
 * When HAVE_C11 is defined these map onto the compiler's __atomic builtins
 * with relaxed ordering; otherwise they fall back to a plain access through
 * a volatile-qualified pointer.  Neither form is relied on for ordering
 * here: the call sites issue explicit barriers (MB / WMB) around them.
 *
 * Every macro argument and every expansion is parenthesized so the macros
 * behave like ordinary function calls inside larger expressions.  The store
 * deliberately yields no value in both variants (__atomic_store_n returns
 * void), so code cannot start depending on the fallback's assignment result.
 */
#ifdef HAVE_C11
#define atomic_load_long(p)     __atomic_load_n((p), __ATOMIC_RELAXED)
#define atomic_store_long(p, v) __atomic_store_n((p), (v), __ATOMIC_RELAXED)
#else
#define atomic_load_long(p)     ((BLASLONG)(*(volatile BLASLONG *)(p)))
#define atomic_store_long(p, v) ((void)(*(volatile BLASLONG *)(p) = (v)))
#endif
115+
108116
109117#ifndef KERNEL_OPERATION
110118#ifndef COMPLEX
@@ -233,14 +241,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
233241 }
234242
235243#ifndef LOWER
244+ MB ;
236245 for (i = 0 ; i <= mypos ; i ++ )
237- job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
246+ atomic_store_long (& job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ], (BLASLONG )buffer [bufferside ]);
247+ // job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
238248#else
249+ MB
239250 for (i = mypos ; i < args -> nthreads ; i ++ )
240- job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
251+ atomic_store_long (& job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ], (BLASLONG )buffer [bufferside ]);
252+ // job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
241253#endif
242254
243- WMB ;
255+ // WMB;
244256 }
245257
246258 min_i = m_to - m_from ;
@@ -271,14 +283,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
271283 for (xxx = range_n [current ], bufferside = 0 ; xxx < range_n [current + 1 ]; xxx += div_n , bufferside ++ ) {
272284
273285 /* thread has to wait */
274- if (current != mypos ) while (job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] == 0 ) {YIELDING ;};
286+ if (current != mypos )
287+ do {
288+ jw = atomic_load_long (& job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ]);
289+ } while (jw == 0 );
290+ MB ;
291+
292+ //while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
275293
276294 KERNEL_OPERATION (min_i , MIN (range_n [current + 1 ] - xxx , div_n ), k , alpha ,
277295 sa , (FLOAT * )job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ],
278296 c , lda , m_from , xxx );
279297
280298 if (m_from + min_i >= m_to ) {
281- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
299+ atomic_store_long (& job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ], job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 );
300+ // job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
282301 WMB ;
283302 }
284303 }
@@ -323,7 +342,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
323342 c , lda , is , xxx );
324343
325344 if (is + min_i >= m_to ) {
326- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
345+ atomic_store_long (& job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ], job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 );
346+ // job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
327347 WMB ;
328348 }
329349 }
@@ -337,9 +357,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
337357
338358 for (i = 0 ; i < args -> nthreads ; i ++ ) {
339359 if (i != mypos ) {
340- for (xxx = 0 ; xxx < DIVIDE_RATE ; xxx ++ ) {
360+ for (xxx = 0 ; xxx < DIVIDE_RATE ; xxx ++ )
361+ #if 1
362+ {
363+ do {
364+ jw = atomic_load_long (& job [mypos ].working [i ][CACHE_LINE_SIZE * xxx ]);
365+ } while (jw );
366+ MB ;
367+ }
368+ #else
341369 while (job [mypos ].working [i ][CACHE_LINE_SIZE * xxx ] ) {YIELDING ;};
342- }
370+ #endif
371+ // }
343372 }
344373 }
345374
0 commit comments