@@ -545,13 +545,31 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
545545 * range_n , IFLOAT * sa , IFLOAT * sb ,
546546 BLASLONG nthreads_m , BLASLONG nthreads_n ) {
547547
548- #ifndef USE_OPENMP
549- #ifndef OS_WINDOWS
550- static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER ;
548+ #ifdef USE_OPENMP
549+ static omp_lock_t level3_lock , critical_section_lock ;
550+ static volatile BLASLONG init_lock = 0 , omp_lock_initialized = 0 ,
551+ parallel_section_left = MAX_PARALLEL_NUMBER ;
552+
553+ // Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c
554+ while (omp_lock_initialized == 0 )
555+ {
556+ blas_lock (& init_lock );
557+ {
558+ if (omp_lock_initialized == 0 )
559+ {
560+ omp_init_lock (& level3_lock );
561+ omp_init_lock (& critical_section_lock );
562+ omp_lock_initialized = 1 ;
563+ WMB ;
564+ }
565+ blas_unlock (& init_lock );
566+ }
567+ }
568+ #elif defined(OS_WINDOWS )
569+ CRITICAL_SECTION level3_lock ;
570+ InitializeCriticalSection ((PCRITICAL_SECTION )& level3_lock );
551571#else
552- CRITICAL_SECTION level3_lock ;
553- InitializeCriticalSection ((PCRITICAL_SECTION )& level3_lock );
554- #endif
572+ static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER ;
555573#endif
556574
557575 blas_arg_t newarg ;
@@ -599,12 +617,28 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
599617#endif
600618#endif
601619
602- #ifndef USE_OPENMP
603- #ifndef OS_WINDOWS
604- pthread_mutex_lock (& level3_lock );
620+ #ifdef USE_OPENMP
621+ omp_set_lock (& level3_lock );
622+ omp_set_lock (& critical_section_lock );
623+
624+ parallel_section_left -- ;
625+
626+ /*
627+ How OpenMP locks works with NUM_PARALLEL
628+ 1) parallel_section_left = Number of available concurrent executions of OpenBLAS - Number of currently executing OpenBLAS executions
629+ 2) level3_lock is acting like a master lock or barrier which stops OpenBLAS calls when all the parallel_section are currently busy executing other OpenBLAS calls
630+ 3) critical_section_lock is used for updating variables shared between threads executing OpenBLAS calls concurrently and for unlocking of master lock whenever required
631+ 4) Unlock master lock only when we have not already exhausted all the parallel_sections and allow another thread with a OpenBLAS call to enter
632+ */
633+ if (parallel_section_left != 0 )
634+ omp_unset_lock (& level3_lock );
635+
636+ omp_unset_lock (& critical_section_lock );
637+
638+ #elif defined(OS_WINDOWS )
639+ EnterCriticalSection ((PCRITICAL_SECTION )& level3_lock );
605640#else
606- EnterCriticalSection ((PCRITICAL_SECTION )& level3_lock );
607- #endif
641+ pthread_mutex_lock (& level3_lock );
608642#endif
609643
610644#ifdef USE_ALLOC_HEAP
@@ -732,12 +766,24 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
732766 free (job );
733767#endif
734768
735- #ifndef USE_OPENMP
736- #ifndef OS_WINDOWS
737- pthread_mutex_unlock (& level3_lock );
738- #else
769+ #ifdef USE_OPENMP
770+ omp_set_lock (& critical_section_lock );
771+ parallel_section_left ++ ;
772+
773+ /*
774+ Unlock master lock only when all the parallel_sections are already exhausted and one of the thread has completed its OpenBLAS call
775+ otherwise just increment the parallel_section_left
776+ The master lock is only locked when we have exhausted all the parallel_sections, So only unlock it then and otherwise just increment the count
777+ */
778+ if (parallel_section_left == 1 )
779+ omp_unset_lock (& level3_lock );
780+
781+ omp_unset_lock (& critical_section_lock );
782+
783+ #elif defined(OS_WINDOWS )
739784 LeaveCriticalSection ((PCRITICAL_SECTION )& level3_lock );
740- #endif
785+ #else
786+ pthread_mutex_unlock (& level3_lock );
741787#endif
742788
743789 return 0 ;
0 commit comments