@@ -48,6 +48,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4848extern int blas_level1_thread_with_return_value (int mode , BLASLONG m , BLASLONG n ,
4949 BLASLONG k , void * alpha , void * a , BLASLONG lda , void * b , BLASLONG ldb ,
5050 void * c , BLASLONG ldc , int (* function )(), int nthreads );
51+
52+ #ifdef DYNAMIC_ARCH
53+ extern char * gotoblas_corename (void );
54+ #endif
55+
56+ #if defined(DYNAMIC_ARCH ) || defined(NEOVERSEV1 )
57+ static inline int get_dot_optimal_nthreads_neoversev1 (BLASLONG N , int ncpu ) {
58+ #ifdef DOUBLE
59+ return (N <= 10000L ) ? 1
60+ : (N <= 64500L ) ? 1
61+ : (N <= 100000L ) ? MIN (ncpu , 2 )
62+ : (N <= 150000L ) ? MIN (ncpu , 4 )
63+ : (N <= 260000L ) ? MIN (ncpu , 8 )
64+ : (N <= 360000L ) ? MIN (ncpu , 16 )
65+ : (N <= 520000L ) ? MIN (ncpu , 24 )
66+ : (N <= 1010000L ) ? MIN (ncpu , 56 )
67+ : ncpu ;
68+ #else
69+ return (N <= 10000L ) ? 1
70+ : (N <= 110000L ) ? 1
71+ : (N <= 200000L ) ? MIN (ncpu , 2 )
72+ : (N <= 280000L ) ? MIN (ncpu , 4 )
73+ : (N <= 520000L ) ? MIN (ncpu , 8 )
74+ : (N <= 830000L ) ? MIN (ncpu , 16 )
75+ : (N <= 1010000L ) ? MIN (ncpu , 24 )
76+ : ncpu ;
77+ #endif
78+ }
79+ #endif
80+
81+ static inline int get_dot_optimal_nthreads (BLASLONG n ) {
82+ int ncpu = num_cpu_avail (1 );
83+
84+ #if defined(NEOVERSEV1 ) && !defined(COMPLEX ) && !defined(BFLOAT16 )
85+ return get_dot_optimal_nthreads_neoversev1 (n , ncpu );
86+ #elif defined(DYNAMIC_ARCH ) && !defined(COMPLEX ) && !defined(BFLOAT16 )
87+ if (strcmp (gotoblas_corename (), "neoversev1" ) == 0 ) {
88+ return get_dot_optimal_nthreads_neoversev1 (n , ncpu );
89+ }
90+ #endif
91+
92+ // Default case
93+ if (n <= 10000L )
94+ return 1 ;
95+ else
96+ return num_cpu_avail (1 );
97+ }
5198#endif
5299
53100static RETURN_TYPE dot_compute (BLASLONG n , FLOAT * x , BLASLONG inc_x , FLOAT * y , BLASLONG inc_y )
@@ -85,10 +132,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y
85132 RETURN_TYPE dot = 0.0 ;
86133
87134#if defined(SMP )
88- if (inc_x == 0 || inc_y == 0 || n <= 10000 )
135+ if (inc_x == 0 || inc_y == 0 )
89136 nthreads = 1 ;
90137 else
91- nthreads = num_cpu_avail ( 1 );
138+ nthreads = get_dot_optimal_nthreads ( n );
92139
93140 if (nthreads == 1 ) {
94141 dot = dot_compute (n , x , inc_x , y , inc_y );
@@ -105,7 +152,7 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y
105152
106153 blas_level1_thread_with_return_value (mode , n , 0 , 0 , & dummy_alpha ,
107154 x , inc_x , y , inc_y , result , 0 ,
108- ( void * )dot_thread_function , nthreads );
155+ (void * )dot_thread_function , nthreads );
109156
110157 ptr = (RETURN_TYPE * )result ;
111158 for (i = 0 ; i < nthreads ; i ++ ) {
0 commit comments