5050
5151/* This is a lazy thread implementation for Win32 */
5252
53- #if defined (__GNUC__ ) && (__GNUC__ < 6 )
54- #define WIN_CAS (dest , exch , comp ) __sync_val_compare_and_swap(dest, comp, exch)
55- #else
56- #if defined(_WIN64 )
57- #define WIN_CAS (dest , exch , comp ) InterlockedCompareExchange64(dest, exch, comp)
58- #else
59- #define WIN_CAS (dest , exch , comp ) InterlockedCompareExchange(dest, exch, comp)
60- #endif
61- #endif
62-
6353/* Thread server common information */
6454typedef struct {
65- HANDLE taskSemaphore ;
55+ CRITICAL_SECTION lock ;
56+ HANDLE filled ;
57+ HANDLE killed ;
6658
6759 blas_queue_t * queue ; /* Parameter Pointer */
6860 int shutdown ; /* server shutdown flag */
@@ -79,6 +71,8 @@ static blas_pool_t pool;
7971static HANDLE blas_threads [MAX_CPU_NUMBER ];
8072static DWORD blas_threads_id [MAX_CPU_NUMBER ];
8173
74+
75+
8276static void legacy_exec (void * func , int mode , blas_arg_t * args , void * sb ){
8377
8478 if (!(mode & BLAS_COMPLEX )){
@@ -204,6 +198,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
204198
205199/* This is the main routine of each thread. Each thread waits until a */
206200/* job is queued. */
201+
207202static DWORD WINAPI blas_thread_server (void * arg ){
208203
209204 /* Thread identifier */
@@ -212,7 +207,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
212207#endif
213208
214209 void * buffer , * sa , * sb ;
215- volatile blas_queue_t * queue ;
210+ blas_queue_t * queue ;
211+ DWORD action ;
212+ HANDLE handles [] = {pool .filled , pool .killed };
216213
217214 /* Each server needs each buffer */
218215 buffer = blas_memory_alloc (2 );
@@ -229,32 +226,28 @@ static DWORD WINAPI blas_thread_server(void *arg){
229226 fprintf (STDERR , "Server[%2ld] Waiting for Queue.\n" , cpu );
230227#endif
231228
232- // all worker threads wait on the semaphore
233- WaitForSingleObject (pool .taskSemaphore , INFINITE );
229+ do {
230+ action = WaitForMultipleObjects (2 , handles , FALSE, INFINITE );
231+ } while ((action != WAIT_OBJECT_0 ) && (action != WAIT_OBJECT_0 + 1 ));
232+
233+ if (action == WAIT_OBJECT_0 + 1 ) break ;
234234
235- // kill the thread if we are shutting down the server
236- if (pool .shutdown )
237- break ;
238-
239235#ifdef SMP_DEBUG
240236 fprintf (STDERR , "Server[%2ld] Got it.\n" , cpu );
241237#endif
242238
243- // grab a queued task and update the list
244- volatile blas_queue_t * queue_next ;
245- INT_PTR prev_value ;
246- do {
247- queue = (volatile blas_queue_t * )pool .queue ;
248- if (!queue )
249- break ;
239+ EnterCriticalSection (& pool .lock );
240+
241+ queue = pool .queue ;
242+ if (queue ) pool .queue = queue -> next ;
250243
251- queue_next = (volatile blas_queue_t * )queue -> next ;
252- prev_value = WIN_CAS ((INT_PTR * )& pool .queue , (INT_PTR )queue_next , (INT_PTR )queue );
253- } while (prev_value != queue );
244+ LeaveCriticalSection (& pool .lock );
254245
255246 if (queue ) {
256247 int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = queue -> routine ;
257248
249+ if (pool .queue ) SetEvent (pool .filled );
250+
258251 sa = queue -> sa ;
259252 sb = queue -> sb ;
260253
@@ -339,8 +332,13 @@ static DWORD WINAPI blas_thread_server(void *arg){
339332 fprintf (STDERR , "Server[%2ld] Finished!\n" , cpu );
340333#endif
341334
342- // mark our sub-task as complete
343- InterlockedDecrement (& queue -> status );
335+ EnterCriticalSection (& queue -> lock );
336+
337+ queue -> status = BLAS_STATUS_FINISHED ;
338+
339+ LeaveCriticalSection (& queue -> lock );
340+
341+ SetEvent (queue -> finish );
344342 }
345343
346344 /* Shutdown procedure */
@@ -355,7 +353,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
355353 }
356354
357355/* Initializing routine */
358- int blas_thread_init (void ){
356+ int blas_thread_init (void ){
359357 BLASLONG i ;
360358
361359 if (blas_server_avail || (blas_cpu_number <= 1 )) return 0 ;
@@ -369,7 +367,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
369367
370368 if (!blas_server_avail ){
371369
372- pool .taskSemaphore = CreateSemaphore (NULL , 0 , blas_cpu_number - 1 , NULL );
370+ InitializeCriticalSection (& pool .lock );
371+ pool .filled = CreateEvent (NULL , FALSE, FALSE, NULL );
372+ pool .killed = CreateEvent (NULL , TRUE, FALSE, NULL );
373373
374374 pool .shutdown = 0 ;
375375 pool .queue = NULL ;
@@ -391,10 +391,11 @@ static DWORD WINAPI blas_thread_server(void *arg){
391391/*
392392 The user can call one of two routines.
393393
394- exec_blas_async ... immediately returns after jobs are queued.
394+ exec_blas_async ... immediately returns after jobs are queued.
395395
396- exec_blas ... returns after jobs are finished.
396+ exec_blas ... returns after jobs are finished.
397397*/
398+
398399int exec_blas_async (BLASLONG pos , blas_queue_t * queue ){
399400
400401#if defined(SMP_SERVER )
@@ -408,7 +409,8 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
408409 current = queue ;
409410
410411 while (current ) {
411- current -> status = 1 ;
412+ InitializeCriticalSection (& current -> lock );
413+ current -> finish = CreateEvent (NULL , FALSE, FALSE, NULL );
412414 current -> position = pos ;
413415
414416#ifdef CONSISTENT_FPCSR
@@ -420,10 +422,19 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
420422 pos ++ ;
421423 }
422424
423- pool .queue = queue ;
425+ EnterCriticalSection (& pool .lock );
426+
427+ if (pool .queue ) {
428+ current = pool .queue ;
429+ while (current -> next ) current = current -> next ;
430+ current -> next = queue ;
431+ } else {
432+ pool .queue = queue ;
433+ }
434+
435+ LeaveCriticalSection (& pool .lock );
424436
425- // start up worker threads
426- ReleaseSemaphore (pool .taskSemaphore , pos - 1 , NULL );
437+ SetEvent (pool .filled );
427438
428439 return 0 ;
429440}
@@ -439,9 +450,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
439450 fprintf (STDERR , "Waiting Queue ..\n" );
440451#endif
441452
442- // spin-wait on each sub-task to finish
443- while (* ((volatile int * )& queue -> status ))
444- YIELDING ;
453+ WaitForSingleObject (queue -> finish , INFINITE );
454+
455+ CloseHandle (queue -> finish );
456+ DeleteCriticalSection (& queue -> lock );
445457
446458 queue = queue -> next ;
447459 num -- ;
@@ -489,21 +501,18 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
489501
490502/* Shutdown procedure; the user does not have to call this routine. The */
491503/* kernel automatically kills the threads. */
504+
492505int BLASFUNC (blas_thread_shutdown )(void ){
493506
494507 int i ;
495508
496- #ifdef SMP_DEBUG
497- fprintf (STDERR , "blas_thread_shutdown..\n" );
498- #endif
499-
500509 if (!blas_server_avail ) return 0 ;
501510
502511 LOCK_COMMAND (& server_lock );
503512
504513 if (blas_server_avail ){
505514
506- pool . shutdown = 1 ;
515+ SetEvent ( pool . killed ) ;
507516
508517 for (i = 0 ; i < blas_num_threads - 1 ; i ++ ){
509518 // Could also just use WaitForMultipleObjects
@@ -519,7 +528,8 @@ int BLASFUNC(blas_thread_shutdown)(void){
519528 CloseHandle (blas_threads [i ]);
520529 }
521530
522- CloseHandle (pool .taskSemaphore );
531+ CloseHandle (pool .filled );
532+ CloseHandle (pool .killed );
523533
524534 blas_server_avail = 0 ;
525535 }
@@ -549,14 +559,16 @@ void goto_set_num_threads(int num_threads)
549559 //increased_threads = 1;
550560 if (!blas_server_avail ){
551561
552- pool .taskSemaphore = CreateSemaphore (NULL , 0 , blas_cpu_number - 1 , NULL );
562+ InitializeCriticalSection (& pool .lock );
563+ pool .filled = CreateEvent (NULL , FALSE, FALSE, NULL );
564+ pool .killed = CreateEvent (NULL , TRUE, FALSE, NULL );
553565
554566 pool .shutdown = 0 ;
555567 pool .queue = NULL ;
556568 blas_server_avail = 1 ;
557569 }
558570
559- for (i = blas_num_threads ; i < num_threads - 1 ; i ++ ){
571+ for (i = blas_num_threads - 1 ; i < num_threads - 1 ; i ++ ){
560572
561573 blas_threads [i ] = CreateThread (NULL , 0 ,
562574 blas_thread_server , (void * )i ,
0 commit comments