5252
5353/* Thread server common information */
5454typedef struct {
55- CRITICAL_SECTION lock ;
56- HANDLE filled ;
57- HANDLE killed ;
55+ HANDLE taskSemaphore ;
5856
5957 blas_queue_t * queue ; /* Parameter Pointer */
6058 int shutdown ; /* server shutdown flag */
@@ -68,6 +66,7 @@ int blas_server_avail = 0;
6866static BLASULONG server_lock = 0 ;
6967
7068static blas_pool_t pool ;
69+ static BLASULONG pool_lock = 0 ;
7170static HANDLE blas_threads [MAX_CPU_NUMBER ];
7271static DWORD blas_threads_id [MAX_CPU_NUMBER ];
7372
@@ -198,7 +197,6 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
198197
199198/* This is the main routine of the worker threads. Each thread waits */
200199/* until a job is queued. */
201-
202200static DWORD WINAPI blas_thread_server (void * arg ){
203201
204202 /* Thread identifier */
@@ -207,9 +205,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
207205#endif
208206
209207 void * buffer , * sa , * sb ;
210- blas_queue_t * queue ;
211- DWORD action ;
212- HANDLE handles [] = {pool .filled , pool .killed };
208+ volatile blas_queue_t * queue ;
213209
214210 /* Each server thread needs its own buffer */
215211 buffer = blas_memory_alloc (2 );
@@ -226,28 +222,32 @@ static DWORD WINAPI blas_thread_server(void *arg){
226222 fprintf (STDERR , "Server[%2ld] Waiting for Queue.\n" , cpu );
227223#endif
228224
229- do {
230- action = WaitForMultipleObjects (2 , handles , FALSE, INFINITE );
231- } while ((action != WAIT_OBJECT_0 ) && (action != WAIT_OBJECT_0 + 1 ));
232-
233- if (action == WAIT_OBJECT_0 + 1 ) break ;
225+ // all worker threads wait on the semaphore
226+ WaitForSingleObject (pool .taskSemaphore , INFINITE );
234227
228+ // kill the thread if we are shutting down the server
229+ if (pool .shutdown )
230+ break ;
231+
235232#ifdef SMP_DEBUG
236233 fprintf (STDERR , "Server[%2ld] Got it.\n" , cpu );
237234#endif
238235
239- EnterCriticalSection (& pool .lock );
236+ // grab a queued task and update the list
237+ volatile blas_queue_t * queue_next ;
238+ LONG64 prev_value ;
239+ do {
240+ queue = (volatile blas_queue_t * )pool .queue ;
241+ if (!queue )
242+ break ;
240243
241- queue = pool .queue ;
242- if (queue ) pool .queue = queue -> next ;
243-
244- LeaveCriticalSection (& pool .lock );
244+ queue_next = (volatile blas_queue_t * )queue -> next ;
245+ prev_value = InterlockedCompareExchange64 ((PLONG64 )& pool .queue , (LONG64 )queue_next , (LONG64 )queue );
246+ } while (prev_value != queue );
245247
246248 if (queue ) {
247249 int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = queue -> routine ;
248250
249- if (pool .queue ) SetEvent (pool .filled );
250-
251251 sa = queue -> sa ;
252252 sb = queue -> sb ;
253253
@@ -332,13 +332,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
332332 fprintf (STDERR , "Server[%2ld] Finished!\n" , cpu );
333333#endif
334334
335- EnterCriticalSection (& queue -> lock );
336-
337- queue -> status = BLAS_STATUS_FINISHED ;
338-
339- LeaveCriticalSection (& queue -> lock );
340-
341- SetEvent (queue -> finish );
335+ // mark our sub-task as complete
336+ InterlockedDecrement (& queue -> status );
342337 }
343338
344339 /* Shutdown procedure */
@@ -353,7 +348,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
353348 }
354349
355350/* Initializing routine */
356- int blas_thread_init (void ){
351+ int blas_thread_init (void ){
357352 BLASLONG i ;
358353
359354 if (blas_server_avail || (blas_cpu_number <= 1 )) return 0 ;
@@ -367,9 +362,7 @@ int blas_thread_init(void){
367362
368363 if (!blas_server_avail ){
369364
370- InitializeCriticalSection (& pool .lock );
371- pool .filled = CreateEvent (NULL , FALSE, FALSE, NULL );
372- pool .killed = CreateEvent (NULL , TRUE, FALSE, NULL );
365+ pool .taskSemaphore = CreateSemaphore (NULL , 0 , blas_cpu_number - 1 , NULL );
373366
374367 pool .shutdown = 0 ;
375368 pool .queue = NULL ;
@@ -391,11 +384,10 @@ int blas_thread_init(void){
391384/*
392385 User can call one of two routines.
393386
394- exec_blas_async ... immediately returns after jobs are queued.
387+ exec_blas_async ... immediately returns after jobs are queued.
395388
396- exec_blas ... returns after jobs are finished.
389+ exec_blas ... returns after jobs are finished.
397390*/
398-
399391int exec_blas_async (BLASLONG pos , blas_queue_t * queue ){
400392
401393#if defined(SMP_SERVER )
@@ -409,8 +401,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
409401 current = queue ;
410402
411403 while (current ) {
412- InitializeCriticalSection (& current -> lock );
413- current -> finish = CreateEvent (NULL , FALSE, FALSE, NULL );
404+ current -> status = 1 ;
414405 current -> position = pos ;
415406
416407#ifdef CONSISTENT_FPCSR
@@ -422,19 +413,10 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
422413 pos ++ ;
423414 }
424415
425- EnterCriticalSection ( & pool .lock ) ;
416+ pool .queue = queue ;
426417
427- if (pool .queue ) {
428- current = pool .queue ;
429- while (current -> next ) current = current -> next ;
430- current -> next = queue ;
431- } else {
432- pool .queue = queue ;
433- }
434-
435- LeaveCriticalSection (& pool .lock );
436-
437- SetEvent (pool .filled );
418+ // start up worker threads
419+ ReleaseSemaphore (pool .taskSemaphore , pos - 1 , NULL );
438420
439421 return 0 ;
440422}
@@ -450,10 +432,9 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
450432 fprintf (STDERR , "Waiting Queue ..\n" );
451433#endif
452434
453- WaitForSingleObject (queue -> finish , INFINITE );
454-
455- CloseHandle (queue -> finish );
456- DeleteCriticalSection (& queue -> lock );
435+ // spin-wait on each sub-task to finish
436+ while (* ((volatile int * )& queue -> status ))
437+ YIELDING ;
457438
458439 queue = queue -> next ;
459440 num -- ;
@@ -501,18 +482,21 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
501482
502483/* Shutdown procedure. The user doesn't have to call this routine; the */
503484/* kernel automatically kills the threads. */
504-
505485int BLASFUNC (blas_thread_shutdown )(void ){
506486
507487 int i ;
508488
489+ #ifdef SMP_DEBUG
490+ fprintf (STDERR , "blas_thread_shutdown..\n" );
491+ #endif
492+
509493 if (!blas_server_avail ) return 0 ;
510494
511495 LOCK_COMMAND (& server_lock );
512496
513497 if (blas_server_avail ){
514498
515- SetEvent ( pool .killed ) ;
499+ pool .shutdown = 1 ;
516500
517501 for (i = 0 ; i < blas_num_threads - 1 ; i ++ ){
518502 // Could also just use WaitForMultipleObjects
@@ -528,8 +512,7 @@ int BLASFUNC(blas_thread_shutdown)(void){
528512 CloseHandle (blas_threads [i ]);
529513 }
530514
531- CloseHandle (pool .filled );
532- CloseHandle (pool .killed );
515+ CloseHandle (pool .taskSemaphore );
533516
534517 blas_server_avail = 0 ;
535518 }
@@ -559,16 +542,14 @@ void goto_set_num_threads(int num_threads)
559542 //increased_threads = 1;
560543 if (!blas_server_avail ){
561544
562- InitializeCriticalSection (& pool .lock );
563- pool .filled = CreateEvent (NULL , FALSE, FALSE, NULL );
564- pool .killed = CreateEvent (NULL , TRUE, FALSE, NULL );
545+ pool .taskSemaphore = CreateSemaphore (NULL , 0 , blas_cpu_number - 1 , NULL );
565546
566547 pool .shutdown = 0 ;
567548 pool .queue = NULL ;
568549 blas_server_avail = 1 ;
569550 }
570551
571- for (i = blas_num_threads - 1 ; i < num_threads - 1 ; i ++ ){
552+ for (i = blas_num_threads ; i < num_threads - 1 ; i ++ ){
572553
573554 blas_threads [i ] = CreateThread (NULL , 0 ,
574555 blas_thread_server , (void * )i ,
0 commit comments