@@ -351,8 +351,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
351351 /* Make sure if no one is using workspace */
352352 START_RPCC ();
353353 for (i = 0 ; i < args -> nthreads ; i ++ )
354- while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;MB ; };
354+ while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;};
355355 STOP_RPCC (waiting1 );
356+ MB ;
356357
357358#if defined(FUSED_GEMM ) && !defined(TIMING )
358359
@@ -395,10 +396,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
395396 }
396397#endif
397398
399+ WMB ;
398400 /* Set flag so other threads can access local region of B */
399401 for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ )
400402 job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
401- WMB ;
402403 }
403404
404405 /* Get regions of B from other threads and apply kernel */
@@ -417,8 +418,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
417418
418419 /* Wait until other region of B is initialized */
419420 START_RPCC ();
420- while (job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] == 0 ) {YIELDING ;MB ; };
421+ while (job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] == 0 ) {YIELDING ;};
421422 STOP_RPCC (waiting2 );
423+ MB ;
422424
423425 /* Apply kernel with local region of A and part of other region of B */
424426 START_RPCC ();
@@ -434,8 +436,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
434436
435437 /* Clear synchronization flag if this thread is done with other region of B */
436438 if (m_to - m_from == min_i ) {
437- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
438439 WMB ;
440+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
439441 }
440442 }
441443 } while (current != mypos );
@@ -477,8 +479,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
477479
478480 /* Clear synchronization flag if this thread is done with region of B */
479481 if (is + min_i >= m_to ) {
480- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
481482 WMB ;
483+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
482484 }
483485 }
484486
@@ -497,10 +499,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
497499 START_RPCC ();
498500 for (i = 0 ; i < args -> nthreads ; i ++ ) {
499501 for (js = 0 ; js < DIVIDE_RATE ; js ++ ) {
500- while (job [mypos ].working [i ][CACHE_LINE_SIZE * js ] ) {YIELDING ;MB ; };
502+ while (job [mypos ].working [i ][CACHE_LINE_SIZE * js ] ) {YIELDING ;};
501503 }
502504 }
503505 STOP_RPCC (waiting3 );
506+ MB ;
504507
505508#ifdef TIMING
506509 BLASLONG waiting = waiting1 + waiting2 + waiting3 ;
@@ -705,7 +708,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
705708 }
706709 }
707710 }
708-
711+ WMB ;
709712 /* Execute parallel computation */
710713 exec_blas (nthreads , queue );
711714 }
0 commit comments