@@ -73,6 +73,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
7373
7474#include "common.h"
7575
/*
 * Branch-prediction hints.
 *
 * likely(x)   - evaluates to 1 if x is non-zero, 0 otherwise, and tells the
 *               compiler the condition is expected to be true.
 * unlikely(x) - same value, but the condition is expected to be false.
 *
 * The macros MUST be function-like: there may be no whitespace between the
 * macro name and the opening parenthesis in the #define, otherwise the
 * preprocessor creates an object-like macro and every `likely(cond)` use
 * expands to garbage.
 *
 * Each macro is guarded on its own so that a header which already provides
 * only one of the two neither gets redefined nor leaves the other missing.
 * On compilers without __builtin_expect the fallback still normalises the
 * argument with `!!` so both variants yield the same 0/1 value.
 */
#ifndef likely
#ifdef __GNUC__
#define likely(x) __builtin_expect(!!(x), 1)
#else
#define likely(x) (!!(x))
#endif
#endif

#ifndef unlikely
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define unlikely(x) (!!(x))
#endif
#endif
85+
7686#if defined(USE_TLS ) && defined(SMP )
7787#define COMPILE_TLS
7888
@@ -2060,6 +2070,7 @@ struct release_t {
/* Set to 1 by the allocation loop once alloc_hugetlb returns a usable
 * mapping (only when that code path is compiled in). */
20602070int hugetlb_allocated = 0 ;
20612071
/* Primary table of release records. While release_pos < NUM_BUFFERS, each
 * alloc_* routine stores the mapped address, the matching *_free callback
 * and, for some allocators, an attr value (fd or shmid) at index
 * release_pos. */
20622072static struct release_t release_info [NUM_BUFFERS ];
/* Overflow table used for record indices >= NUM_BUFFERS, addressed as
 * new_release_info[release_pos - NUM_BUFFERS]. It is malloc'ed with 512
 * entries on the first overflow in blas_memory_alloc.
 * NOTE(review): the alloc_* writers do not check the index against that
 * 512-entry capacity, nor that the pointer has been allocated - confirm. */
2073+ static struct release_t * new_release_info ;
/* Number of release records stored so far across both tables; incremented
 * after every successful registration and used by blas_shutdown as the
 * upper bound when invoking each record's func callback. */
20632074static int release_pos = 0 ;
20642075
20652076#if defined(OS_LINUX ) && !defined(NO_WARMUP )
@@ -2110,8 +2121,13 @@ static void *alloc_mmap(void *address){
21102121#if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
21112122 LOCK_COMMAND (& alloc_lock );
21122123#endif
2124+ if (likely (release_pos < NUM_BUFFERS )) {
21132125 release_info [release_pos ].address = map_address ;
21142126 release_info [release_pos ].func = alloc_mmap_free ;
2127+ } else {
2128+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2129+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_mmap_free ;
2130+ }
21152131 release_pos ++ ;
21162132#if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
21172133 UNLOCK_COMMAND (& alloc_lock );
@@ -2274,8 +2290,13 @@ static void *alloc_mmap(void *address){
22742290#if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
22752291 LOCK_COMMAND (& alloc_lock );
22762292#endif
2293+ if (likely (release_pos < NUM_BUFFERS )) {
22772294 release_info [release_pos ].address = map_address ;
22782295 release_info [release_pos ].func = alloc_mmap_free ;
2296+ } else {
2297+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2298+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_mmap_free ;
2299+ }
22792300 release_pos ++ ;
22802301#if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
22812302 UNLOCK_COMMAND (& alloc_lock );
@@ -2307,8 +2328,13 @@ static void *alloc_malloc(void *address){
23072328 if (map_address == (void * )NULL ) map_address = (void * )-1 ;
23082329
23092330 if (map_address != (void * )-1 ) {
2331+ if (likely (release_pos < NUM_BUFFERS )) {
23102332 release_info [release_pos ].address = map_address ;
23112333 release_info [release_pos ].func = alloc_malloc_free ;
2334+ } else {
2335+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2336+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_malloc_free ;
2337+ }
23122338 release_pos ++ ;
23132339 }
23142340
@@ -2341,8 +2367,13 @@ static void *alloc_qalloc(void *address){
23412367 if (map_address == (void * )NULL ) map_address = (void * )-1 ;
23422368
23432369 if (map_address != (void * )-1 ) {
2370+ if (likely (release_pos < NUM_BUFFERS )) {
23442371 release_info [release_pos ].address = map_address ;
23452372 release_info [release_pos ].func = alloc_qalloc_free ;
2373+ } else {
2374+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2375+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_qalloc_free ;
2376+ }
23462377 release_pos ++ ;
23472378 }
23482379
@@ -2370,8 +2401,13 @@ static void *alloc_windows(void *address){
23702401 if (map_address == (void * )NULL ) map_address = (void * )-1 ;
23712402
23722403 if (map_address != (void * )-1 ) {
2404+ if (likely (release_pos < NUM_BUFFERS )) {
23732405 release_info [release_pos ].address = map_address ;
23742406 release_info [release_pos ].func = alloc_windows_free ;
2407+ } else {
2408+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2409+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_windows_free ;
2410+ }
23752411 release_pos ++ ;
23762412 }
23772413
@@ -2414,9 +2450,15 @@ static void *alloc_devicedirver(void *address){
24142450 fd , 0 );
24152451
24162452 if (map_address != (void * )-1 ) {
2453+ if (likely (release_pos < NUM_BUFFERS )) {
24172454 release_info [release_pos ].address = map_address ;
24182455 release_info [release_pos ].attr = fd ;
24192456 release_info [release_pos ].func = alloc_devicedirver_free ;
2457+ } else {
2458+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2459+ new_release_info [release_pos - NUM_BUFFERS ].attr = fd ;
2460+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_devicedirver_free ;
2461+ }
24202462 release_pos ++ ;
24212463 }
24222464
@@ -2450,9 +2492,15 @@ static void *alloc_shm(void *address){
24502492
24512493 shmctl (shmid , IPC_RMID , 0 );
24522494
2495+ if (likely (release_pos < NUM_BUFFERS )) {
24532496 release_info [release_pos ].address = map_address ;
24542497 release_info [release_pos ].attr = shmid ;
24552498 release_info [release_pos ].func = alloc_shm_free ;
2499+ } else {
2500+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2501+ new_release_info [release_pos - NUM_BUFFERS ].attr = shmid ;
2502+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_shm_free ;
2503+ }
24562504 release_pos ++ ;
24572505 }
24582506
@@ -2556,8 +2604,13 @@ static void *alloc_hugetlb(void *address){
25562604#endif
25572605
25582606 if (map_address != (void * )-1 ){
2607+ if (likely (release_pos < NUM_BUFFERS )) {
25592608 release_info [release_pos ].address = map_address ;
25602609 release_info [release_pos ].func = alloc_hugetlb_free ;
2610+ } else {
2611+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2612+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_hugetlb_free ;
2613+ }
25612614 release_pos ++ ;
25622615 }
25632616
@@ -2604,9 +2657,15 @@ static void *alloc_hugetlbfile(void *address){
26042657 fd , 0 );
26052658
26062659 if (map_address != (void * )-1 ) {
2660+ if (likely (release_pos < NUM_BUFFERS )) {
26072661 release_info [release_pos ].address = map_address ;
26082662 release_info [release_pos ].attr = fd ;
26092663 release_info [release_pos ].func = alloc_hugetlbfile_free ;
2664+ } else {
2665+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2666+ new_release_info [release_pos - NUM_BUFFERS ].attr = fd ;
2667+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_hugetlbfile_free ;
2668+ }
26102669 release_pos ++ ;
26112670 }
26122671
@@ -2636,8 +2695,25 @@ static volatile struct {
26362695
26372696} memory [NUM_BUFFERS ];
26382697
2639- static int memory_initialized = 0 ;
2698+ static volatile struct newmemstruct
2699+ {
2700+ BLASULONG lock ;
2701+ void * addr ;
2702+ #if defined(WHEREAMI ) && !defined(USE_OPENMP )
2703+ int pos ;
2704+ #endif
2705+ int used ;
2706+ #ifndef __64BIT__
2707+ char dummy [48 ];
2708+ #else
2709+ char dummy [40 ];
2710+ #endif
2711+
2712+ } ;
2713+ static volatile struct newmemstruct * newmemory ;
26402714
2715+ static int memory_initialized = 0 ;
2716+ static int memory_overflowed = 0 ;
26412717/* Memory allocation routine */
26422718/* procpos ... indicates where it comes from */
26432719/* 0 : Level 3 functions */
@@ -2646,6 +2722,8 @@ static int memory_initialized = 0;
26462722
26472723void * blas_memory_alloc (int procpos ){
26482724
2725+ int i ;
2726+
26492727 int position ;
26502728#if defined(WHEREAMI ) && !defined(USE_OPENMP )
26512729 int mypos = 0 ;
@@ -2779,6 +2857,29 @@ void *blas_memory_alloc(int procpos){
27792857#if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
27802858 UNLOCK_COMMAND (& alloc_lock );
27812859#endif
2860+ if (memory_overflowed ) {
2861+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
2862+ LOCK_COMMAND (& alloc_lock );
2863+ #endif
2864+ do {
2865+ RMB ;
2866+ #if defined(USE_OPENMP )
2867+ if (!newmemory [position - NUM_BUFFERS ].used ) {
2868+ blas_lock (& newmemory [position - NUM_BUFFERS ].lock );
2869+ #endif
2870+ if (!newmemory [position - NUM_BUFFERS ].used ) goto allocation2 ;
2871+
2872+ #if defined(USE_OPENMP )
2873+ blas_unlock (& newmemory [position - NUM_BUFFERS ].lock );
2874+ }
2875+ #endif
2876+ position ++ ;
2877+
2878+ } while (position < 512 + NUM_BUFFERS );
2879+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
2880+ UNLOCK_COMMAND (& alloc_lock );
2881+ #endif
2882+ }
27822883 goto error ;
27832884
27842885 allocation :
@@ -2883,6 +2984,91 @@ void *blas_memory_alloc(int procpos){
28832984 return (void * )memory [position ].addr ;
28842985
28852986 error :
2987+ if (memory_overflowed ) goto terminate ;
2988+ fprintf (stderr ,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n" );
2989+ memory_overflowed = 1 ;
2990+ new_release_info = (struct release_t * ) malloc (512 * sizeof (struct release_t ));
2991+ newmemory = (struct newmemstruct * ) malloc (512 * sizeof (struct newmemstruct ));
2992+ for (i = 0 ; i < 512 ; i ++ ) {
2993+ newmemory [i ].addr = (void * )0 ;
2994+ #if defined(WHEREAMI ) && !defined(USE_OPENMP )
2995+ newmemory [i ].pos = -1 ;
2996+ #endif
2997+ newmemory [i ].used = 0 ;
2998+ newmemory [i ].lock = 0 ;
2999+ }
3000+ newmemory [position - NUM_BUFFERS ].used = 1 ;
3001+
3002+ allocation2 :
3003+ newmemory [position - NUM_BUFFERS ].used = 1 ;
3004+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
3005+ UNLOCK_COMMAND (& alloc_lock );
3006+ #else
3007+ blas_unlock (& newmemory [position - NUM_BUFFERS ].lock );
3008+ #endif
3009+ do {
3010+ #ifdef DEBUG
3011+ printf ("Allocation Start : %lx\n" , base_address );
3012+ #endif
3013+
3014+ map_address = (void * )-1 ;
3015+
3016+ func = & memoryalloc [0 ];
3017+
3018+ while ((func != NULL ) && (map_address == (void * ) -1 )) {
3019+
3020+ map_address = (* func )((void * )base_address );
3021+
3022+ #ifdef ALLOC_DEVICEDRIVER
3023+ if ((* func == alloc_devicedirver ) && (map_address == (void * )-1 )) {
3024+ fprintf (stderr , "OpenBLAS Warning ... Physically contiguous allocation was failed.\n" );
3025+ }
3026+ #endif
3027+
3028+ #ifdef ALLOC_HUGETLBFILE
3029+ if ((* func == alloc_hugetlbfile ) && (map_address == (void * )-1 )) {
3030+ #ifndef OS_WINDOWS
3031+ fprintf (stderr , "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n" );
3032+ #endif
3033+ }
3034+ #endif
3035+
3036+ #if (defined ALLOC_SHM ) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS )
3037+ if ((* func == alloc_hugetlb ) && (map_address != (void * )-1 )) hugetlb_allocated = 1 ;
3038+ #endif
3039+
3040+ func ++ ;
3041+ }
3042+
3043+ #ifdef DEBUG
3044+ printf (" Success -> %08lx\n" , map_address );
3045+ #endif
3046+ if (((BLASLONG ) map_address ) == -1 ) base_address = 0UL ;
3047+
3048+ if (base_address ) base_address += BUFFER_SIZE + FIXED_PAGESIZE ;
3049+
3050+ } while ((BLASLONG )map_address == -1 );
3051+
3052+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
3053+ LOCK_COMMAND (& alloc_lock );
3054+ #endif
3055+ newmemory [position - NUM_BUFFERS ].addr = map_address ;
3056+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
3057+ UNLOCK_COMMAND (& alloc_lock );
3058+ #endif
3059+
3060+ #ifdef DEBUG
3061+ printf (" Mapping Succeeded. %p(%d)\n" , (void * )newmemory [position - NUM_BUFFERS ].addr , position );
3062+ #endif
3063+
3064+ #if defined(WHEREAMI ) && !defined(USE_OPENMP )
3065+
3066+ if (newmemory [position - NUM_BUFFERS ].pos == -1 ) newmemory [position - NUM_BUFFERS ].pos = mypos ;
3067+
3068+ #endif
3069+ return (void * )newmemory [position - NUM_BUFFERS ].addr ;
3070+
3071+ terminate :
28863072 printf ("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n" );
28873073 printf ("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n" , NUM_BUFFERS );
28883074 printf ("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n" );
@@ -2907,13 +3093,28 @@ void blas_memory_free(void *free_area){
29073093 while ((position < NUM_BUFFERS ) && (memory [position ].addr != free_area ))
29083094 position ++ ;
29093095
2910- if (position >= NUM_BUFFERS ) goto error ;
3096+ if (position >= NUM_BUFFERS && ! memory_overflowed ) goto error ;
29113097
29123098#ifdef DEBUG
29133099 if (memory [position ].addr != free_area ) goto error ;
29143100 printf (" Position : %d\n" , position );
29153101#endif
3102+ if (unlikely (memory_overflowed && position >= NUM_BUFFERS )) {
3103+ while ((position < NUM_BUFFERS + 512 ) && (newmemory [position - NUM_BUFFERS ].addr != free_area ))
3104+ position ++ ;
3105+ // arm: ensure all writes are finished before other thread takes this memory
3106+ WMB ;
3107+
3108+ newmemory [position ].used = 0 ;
3109+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
3110+ UNLOCK_COMMAND (& alloc_lock );
3111+ #endif
29163112
3113+ #ifdef DEBUG
3114+ printf ("Unmap from overflow area succeeded.\n\n" );
3115+ #endif
3116+ return ;
3117+ } else {
29173118 // arm: ensure all writes are finished before other thread takes this memory
29183119 WMB ;
29193120
@@ -2927,7 +3128,7 @@ void blas_memory_free(void *free_area){
29273128#endif
29283129
29293130 return ;
2930-
3131+ }
29313132 error :
29323133 printf ("BLAS : Bad memory unallocation! : %4d %p\n" , position , free_area );
29333134
@@ -2962,7 +3163,10 @@ void blas_shutdown(void){
29623163 LOCK_COMMAND (& alloc_lock );
29633164
29643165 for (pos = 0 ; pos < release_pos ; pos ++ ) {
3166+ if (likely (pos < NUM_BUFFERS ))
29653167 release_info [pos ].func (& release_info [pos ]);
3168+ else
3169+ new_release_info [pos - NUM_BUFFERS ].func (& new_release_info [pos - NUM_BUFFERS ]);
29663170 }
29673171
29683172#ifdef SEEK_ADDRESS
@@ -2979,6 +3183,15 @@ void blas_shutdown(void){
29793183#endif
29803184 memory [pos ].lock = 0 ;
29813185 }
3186+ if (memory_overflowed )
3187+ for (pos = 0 ; pos < 512 ; pos ++ ){
3188+ newmemory [pos ].addr = (void * )0 ;
3189+ newmemory [pos ].used = 0 ;
3190+ #if defined(WHEREAMI ) && !defined(USE_OPENMP )
3191+ newmemory [pos ].pos = -1 ;
3192+ #endif
3193+ newmemory [pos ].lock = 0 ;
3194+ }
29823195
29833196 UNLOCK_COMMAND (& alloc_lock );
29843197
0 commit comments