5050
5151/* This is a thread implementation for Win32 lazy implementation */
5252
53+ #if defined (__GNUC__ ) && (__GNUC__ < 6 )
54+ #define WIN_CAS (dest , exch , comp ) __sync_val_compare_and_swap(dest, comp, exch)
55+ #else
56+ #if defined(_WIN64 )
57+ #define WIN_CAS (dest , exch , comp ) InterlockedCompareExchange64(dest, exch, comp)
58+ #else
59+ #define WIN_CAS (dest , exch , comp ) InterlockedCompareExchange(dest, exch, comp)
60+ #endif
61+ #endif
62+
5363/* Thread server common information */
5464typedef struct {
55- CRITICAL_SECTION lock ;
56- HANDLE filled ;
57- HANDLE killed ;
65+ HANDLE taskSemaphore ;
5866
5967 blas_queue_t * queue ; /* Parameter Pointer */
6068 int shutdown ; /* server shutdown flag */
@@ -71,8 +79,6 @@ static blas_pool_t pool;
7179static HANDLE blas_threads [MAX_CPU_NUMBER ];
7280static DWORD blas_threads_id [MAX_CPU_NUMBER ];
7381
74-
75-
7682static void legacy_exec (void * func , int mode , blas_arg_t * args , void * sb ){
7783
7884 if (!(mode & BLAS_COMPLEX )){
@@ -198,7 +204,6 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
198204
199205/* This is a main routine of threads. Each thread waits until job is */
200206/* queued. */
201-
202207static DWORD WINAPI blas_thread_server (void * arg ){
203208
204209 /* Thread identifier */
@@ -207,9 +212,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
207212#endif
208213
209214 void * buffer , * sa , * sb ;
210- blas_queue_t * queue ;
211- DWORD action ;
212- HANDLE handles [] = {pool .filled , pool .killed };
215+ volatile blas_queue_t * queue ;
213216
214217 /* Each server needs each buffer */
215218 buffer = blas_memory_alloc (2 );
@@ -226,28 +229,32 @@ static DWORD WINAPI blas_thread_server(void *arg){
226229 fprintf (STDERR , "Server[%2ld] Waiting for Queue.\n" , cpu );
227230#endif
228231
229- do {
230- action = WaitForMultipleObjects (2 , handles , FALSE, INFINITE );
231- } while ((action != WAIT_OBJECT_0 ) && (action != WAIT_OBJECT_0 + 1 ));
232-
233- if (action == WAIT_OBJECT_0 + 1 ) break ;
232+ // all worker threads wait on the semaphore
233+ WaitForSingleObject (pool .taskSemaphore , INFINITE );
234234
235+ // kill the thread if we are shutting down the server
236+ if (pool .shutdown )
237+ break ;
238+
235239#ifdef SMP_DEBUG
236240 fprintf (STDERR , "Server[%2ld] Got it.\n" , cpu );
237241#endif
238242
239- EnterCriticalSection (& pool .lock );
240-
241- queue = pool .queue ;
242- if (queue ) pool .queue = queue -> next ;
243+ // grab a queued task and update the list
244+ volatile blas_queue_t * queue_next ;
245+ INT_PTR prev_value ;
246+ do {
247+ queue = (volatile blas_queue_t * )pool .queue ;
248+ if (!queue )
249+ break ;
243250
244- LeaveCriticalSection (& pool .lock );
251+ queue_next = (volatile blas_queue_t * )queue -> next ;
252+ prev_value = WIN_CAS ((INT_PTR * )& pool .queue , (INT_PTR )queue_next , (INT_PTR )queue );
253+ } while (prev_value != queue );
245254
246255 if (queue ) {
247256 int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = queue -> routine ;
248257
249- if (pool .queue ) SetEvent (pool .filled );
250-
251258 sa = queue -> sa ;
252259 sb = queue -> sb ;
253260
@@ -332,13 +339,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
332339 fprintf (STDERR , "Server[%2ld] Finished!\n" , cpu );
333340#endif
334341
335- EnterCriticalSection (& queue -> lock );
336-
337- queue -> status = BLAS_STATUS_FINISHED ;
338-
339- LeaveCriticalSection (& queue -> lock );
340-
341- SetEvent (queue -> finish );
342+ // mark our sub-task as complete
343+ InterlockedDecrement (& queue -> status );
342344 }
343345
344346 /* Shutdown procedure */
@@ -353,7 +355,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
353355 }
354356
355357/* Initializing routine */
356- int blas_thread_init (void ){
358+ int blas_thread_init (void ){
357359 BLASLONG i ;
358360
359361 if (blas_server_avail || (blas_cpu_number <= 1 )) return 0 ;
@@ -367,9 +369,7 @@ int blas_thread_init(void){
367369
368370 if (!blas_server_avail ){
369371
370- InitializeCriticalSection (& pool .lock );
371- pool .filled = CreateEvent (NULL , FALSE, FALSE, NULL );
372- pool .killed = CreateEvent (NULL , TRUE, FALSE, NULL );
372+ pool .taskSemaphore = CreateSemaphore (NULL , 0 , blas_cpu_number - 1 , NULL );
373373
374374 pool .shutdown = 0 ;
375375 pool .queue = NULL ;
@@ -391,11 +391,10 @@ int blas_thread_init(void){
391391/*
392392 User can call one of two routines.
393393
394- exec_blas_async ... immediately returns after jobs are queued.
394+ exec_blas_async ... immediately returns after jobs are queued.
395395
396- exec_blas ... returns after jobs are finished.
396+ exec_blas ... returns after jobs are finished.
397397*/
398-
399398int exec_blas_async (BLASLONG pos , blas_queue_t * queue ){
400399
401400#if defined(SMP_SERVER )
@@ -409,8 +408,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
409408 current = queue ;
410409
411410 while (current ) {
412- InitializeCriticalSection (& current -> lock );
413- current -> finish = CreateEvent (NULL , FALSE, FALSE, NULL );
411+ current -> status = 1 ;
414412 current -> position = pos ;
415413
416414#ifdef CONSISTENT_FPCSR
@@ -422,19 +420,10 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
422420 pos ++ ;
423421 }
424422
425- EnterCriticalSection (& pool .lock );
426-
427- if (pool .queue ) {
428- current = pool .queue ;
429- while (current -> next ) current = current -> next ;
430- current -> next = queue ;
431- } else {
432- pool .queue = queue ;
433- }
434-
435- LeaveCriticalSection (& pool .lock );
423+ pool .queue = queue ;
436424
437- SetEvent (pool .filled );
425+ // start up worker threads
426+ ReleaseSemaphore (pool .taskSemaphore , pos - 1 , NULL );
438427
439428 return 0 ;
440429}
@@ -450,10 +439,9 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
450439 fprintf (STDERR , "Waiting Queue ..\n" );
451440#endif
452441
453- WaitForSingleObject (queue -> finish , INFINITE );
454-
455- CloseHandle (queue -> finish );
456- DeleteCriticalSection (& queue -> lock );
442+ // spin-wait on each sub-task to finish
443+ while (* ((volatile int * )& queue -> status ))
444+ YIELDING ;
457445
458446 queue = queue -> next ;
459447 num -- ;
@@ -501,18 +489,21 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
501489
502490/* Shutdown procedure, but user don't have to call this routine. The */
503491/* kernel automatically kill threads. */
504-
505492int BLASFUNC (blas_thread_shutdown )(void ){
506493
507494 int i ;
508495
496+ #ifdef SMP_DEBUG
497+ fprintf (STDERR , "blas_thread_shutdown..\n" );
498+ #endif
499+
509500 if (!blas_server_avail ) return 0 ;
510501
511502 LOCK_COMMAND (& server_lock );
512503
513504 if (blas_server_avail ){
514505
515- SetEvent ( pool .killed ) ;
506+ pool .shutdown = 1 ;
516507
517508 for (i = 0 ; i < blas_num_threads - 1 ; i ++ ){
518509 // Could also just use WaitForMultipleObjects
@@ -528,8 +519,7 @@ int BLASFUNC(blas_thread_shutdown)(void){
528519 CloseHandle (blas_threads [i ]);
529520 }
530521
531- CloseHandle (pool .filled );
532- CloseHandle (pool .killed );
522+ CloseHandle (pool .taskSemaphore );
533523
534524 blas_server_avail = 0 ;
535525 }
@@ -559,16 +549,14 @@ void goto_set_num_threads(int num_threads)
559549 //increased_threads = 1;
560550 if (!blas_server_avail ){
561551
562- InitializeCriticalSection (& pool .lock );
563- pool .filled = CreateEvent (NULL , FALSE, FALSE, NULL );
564- pool .killed = CreateEvent (NULL , TRUE, FALSE, NULL );
552+ pool .taskSemaphore = CreateSemaphore (NULL , 0 , blas_cpu_number - 1 , NULL );
565553
566554 pool .shutdown = 0 ;
567555 pool .queue = NULL ;
568556 blas_server_avail = 1 ;
569557 }
570558
571- for (i = blas_num_threads - 1 ; i < num_threads - 1 ; i ++ ){
559+ for (i = blas_num_threads ; i < num_threads - 1 ; i ++ ){
572560
573561 blas_threads [i ] = CreateThread (NULL , 0 ,
574562 blas_thread_server , (void * )i ,
0 commit comments