@@ -548,13 +548,31 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
548548 * range_n , IFLOAT * sa , IFLOAT * sb ,
549549 BLASLONG nthreads_m , BLASLONG nthreads_n ) {
550550
551- #ifndef USE_OPENMP
552- #ifndef OS_WINDOWS
553- static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER ;
551+ #ifdef USE_OPENMP
552+ static omp_lock_t level3_lock , critical_section_lock ;
553+ static volatile BLASLONG init_lock = 0 , omp_lock_initialized = 0 ,
554+ parallel_section_left = MAX_PARALLEL_NUMBER ;
555+
556+ // Lock initialization. TODO: maybe this part can be moved to blas_init() in blas_server_omp.c
557+ while (omp_lock_initialized == 0 )
558+ {
559+ blas_lock (& init_lock );
560+ {
561+ if (omp_lock_initialized == 0 )
562+ {
563+ omp_init_lock (& level3_lock );
564+ omp_init_lock (& critical_section_lock );
565+ omp_lock_initialized = 1 ;
566+ WMB ;
567+ }
568+ blas_unlock (& init_lock );
569+ }
570+ }
571+ #elif defined(OS_WINDOWS )
572+ CRITICAL_SECTION level3_lock ;
573+ InitializeCriticalSection ((PCRITICAL_SECTION )& level3_lock );
554574#else
555- CRITICAL_SECTION level3_lock ;
556- InitializeCriticalSection ((PCRITICAL_SECTION )& level3_lock );
557- #endif
575+ static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER ;
558576#endif
559577
560578 blas_arg_t newarg ;
@@ -597,12 +615,28 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
597615#endif
598616#endif
599617
600- #ifndef USE_OPENMP
601- #ifndef OS_WINDOWS
602- pthread_mutex_lock (& level3_lock );
618+ #ifdef USE_OPENMP
619+ omp_set_lock (& level3_lock );
620+ omp_set_lock (& critical_section_lock );
621+
622+ parallel_section_left -- ;
623+
624+ /*
625+ How the OpenMP locks work with NUM_PARALLEL
626+ 1) parallel_section_left = maximum number of concurrent OpenBLAS executions - number of OpenBLAS executions currently in progress
627+ 2) level3_lock acts as a master lock or barrier that blocks new OpenBLAS calls while all the parallel sections are busy executing other OpenBLAS calls
628+ 3) critical_section_lock protects updates to the variables shared between threads executing OpenBLAS calls concurrently, and the unlocking of the master lock whenever required
629+ 4) Unlock the master lock only when the parallel sections are not yet exhausted, so that another thread with an OpenBLAS call can enter
630+ */
631+ if (parallel_section_left != 0 )
632+ omp_unset_lock (& level3_lock );
633+
634+ omp_unset_lock (& critical_section_lock );
635+
636+ #elif defined(OS_WINDOWS )
637+ EnterCriticalSection ((PCRITICAL_SECTION )& level3_lock );
603638#else
604- EnterCriticalSection ((PCRITICAL_SECTION )& level3_lock );
605- #endif
639+ pthread_mutex_lock (& level3_lock );
606640#endif
607641
608642#ifdef USE_ALLOC_HEAP
@@ -730,12 +764,24 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
730764 free (job );
731765#endif
732766
733- #ifndef USE_OPENMP
734- #ifndef OS_WINDOWS
735- pthread_mutex_unlock (& level3_lock );
736- #else
767+ #ifdef USE_OPENMP
768+ omp_set_lock (& critical_section_lock );
769+ parallel_section_left ++ ;
770+
771+ /*
772+ Unlock the master lock only when all the parallel sections were exhausted and one of the threads has just completed its OpenBLAS call;
773+ otherwise just increment parallel_section_left.
774+ The master lock is held only while all the parallel sections are exhausted, so unlock it only in that case and otherwise just increment the count.
775+ */
776+ if (parallel_section_left == 1 )
777+ omp_unset_lock (& level3_lock );
778+
779+ omp_unset_lock (& critical_section_lock );
780+
781+ #elif defined(OS_WINDOWS )
737782 LeaveCriticalSection ((PCRITICAL_SECTION )& level3_lock );
738- #endif
783+ #else
784+ pthread_mutex_unlock (& level3_lock );
739785#endif
740786
741787 return 0 ;
0 commit comments