1- 
21/*********************************************************************/ 
32/* Copyright 2009, 2010 The University of Texas at Austin.           */ 
43/* All rights reserved.                                              */ 
@@ -68,19 +67,13 @@ int blas_server_avail = 0;
6867
6968int  blas_omp_threads_local  =  1 ;
7069
71- static  void  *  blas_thread_buffer [MAX_CPU_NUMBER ];
72- 
7370/* Local Variables */ 
7471static  BLASULONG  server_lock        =  0 ;
7572
7673static  HANDLE 	    blas_threads    [MAX_CPU_NUMBER ];
7774static  DWORD 	    blas_threads_id [MAX_CPU_NUMBER ];
7875static  volatile  int  thread_target ;	// target num of live threads, volatile for cross-thread reads 
7976
80- //Prototypes 
81- static  void  exec_threads (int  , blas_queue_t  * , int );
82- static  void  adjust_thread_buffers ();
83- 
8477// 
8578// Legacy code path 
8679// 
@@ -215,8 +208,12 @@ static DWORD WINAPI blas_thread_server(void *arg) {
215208  /* Thread identifier */ 
216209  BLASLONG   cpu  =  (BLASLONG )arg ;
217210
211+   void  * buffer , * sa , * sb ;
218212  blas_queue_t 	* queue ;
219213
214+   /* Each server thread needs its own buffer */ 
215+   buffer    =  blas_memory_alloc (2 );
216+ 
220217  MT_TRACE ("Server[%2ld] Thread is started!\n" , cpu );
221218
222219  while  (1 ) {
@@ -244,8 +241,84 @@ static DWORD WINAPI blas_thread_server(void *arg) {
244241    LeaveCriticalSection (& queue_lock );
245242
246243    if  (queue ) {
244+       int  (* routine )(blas_arg_t  * , void  * , void  * , void  * , void  * , BLASLONG ) =  queue  ->  routine ;
245+ 
246+       sa  =  queue  ->  sa ;
247+       sb  =  queue  ->  sb ;
248+ 
249+       #ifdef  CONSISTENT_FPCSR 
250+         __asm__ __volatile__ ("ldmxcsr %0"  : : "m"  (queue  ->  sse_mode ));
251+         __asm__ __volatile__ ("fldcw %0"    : : "m"  (queue  ->  x87_mode ));
252+       #endif 
253+ 
254+       MT_TRACE ("Server[%2ld] Started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n" ,
255+ 	      cpu , queue -> mode , queue ->  args  -> m , queue -> args -> n , queue -> args -> k );
256+ 
257+       // fprintf(stderr, "queue start[%ld]!!!\n", cpu); 
258+ 
259+       #ifdef  MONITOR 
260+         main_status [cpu ] =  MAIN_RUNNING1 ;
261+       #endif 
262+ 
263+       if  (sa  ==  NULL ) 
264+         sa  =  (void  * )((BLASLONG )buffer  +  GEMM_OFFSET_A );
265+ 
266+       if  (sb  ==  NULL ) {
267+         if  (!(queue  ->  mode  &  BLAS_COMPLEX )) {
268+ #ifdef  EXPRECISION 
269+ 	  if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_XDOUBLE ) {
270+ 	    sb  =  (void  * )(((BLASLONG )sa  +  ((XGEMM_P  *  XGEMM_Q  *  sizeof (xdouble )
271+ 					+  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
272+ 	  } else 
273+ #endif 
274+ 	    if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_DOUBLE ) {
275+ #ifdef  BUILD_DOUBLE 
276+ 	      sb  =  (void  * )(((BLASLONG )sa  +  ((DGEMM_P  *  DGEMM_Q  *  sizeof (double )
277+ 					  +  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
278+ #endif 
279+ 	    } else  if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_SINGLE ) {
280+ #ifdef  BUILD_SINGLE 
281+ 	      sb  =  (void  * )(((BLASLONG )sa  +  ((SGEMM_P  *  SGEMM_Q  *  sizeof (float )
282+ 					  +  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
283+ #endif 
284+ 	    } else  {
285+             /* Other types in future */ 
286+ 	    }
287+ 	} else  {
288+ #ifdef  EXPRECISION 
289+ 	  if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_XDOUBLE ){
290+ 	    sb  =  (void  * )(((BLASLONG )sa  +  ((XGEMM_P  *  XGEMM_Q  *  2  *  sizeof (xdouble )
291+ 					+  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
292+ 	  } else 
293+ #endif 
294+ 	    if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_DOUBLE ){
295+ #ifdef  BUILD_COMPLEX16 
296+ 	      sb  =  (void  * )(((BLASLONG )sa  +  ((ZGEMM_P  *  ZGEMM_Q  *  2  *  sizeof (double )
297+ 					  +  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
298+ #endif 
299+ 	    } else  if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_SINGLE ) {
300+ #ifdef  BUILD_COMPLEX 
301+ 	      sb  =  (void  * )(((BLASLONG )sa  +  ((CGEMM_P  *  CGEMM_Q  *  2  *  sizeof (float )
302+ 					  +  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
303+ #endif 
304+ 	    } else  {
305+             /* Other types in future */ 
306+ 	    }
307+ 	}
308+       	queue -> sb = sb ;
309+       }
310+ 
311+       #ifdef  MONITOR 
312+         main_status [cpu ] =  MAIN_RUNNING2 ;
313+       #endif 
247314
248-       exec_threads (cpu , queue , 0 );
315+       if  (!(queue  ->  mode  &  BLAS_LEGACY )) {
316+       	(routine )(queue  ->  args , queue  ->  range_m , queue  ->  range_n , sa , sb , queue  ->  position );
317+       } else  {
318+   	    legacy_exec (routine , queue  ->  mode , queue  ->  args , sb );
319+       }
320+     } else  {
321+   		continue ; //if queue == NULL 
249322	  }
250323
251324    MT_TRACE ("Server[%2ld] Finished!\n" , cpu );
@@ -257,6 +330,8 @@ static DWORD WINAPI blas_thread_server(void *arg) {
257330
258331  MT_TRACE ("Server[%2ld] Shutdown!\n" ,  cpu );
259332
333+   blas_memory_free (buffer );
334+ 
260335  return  0 ;
261336}
262337
@@ -270,8 +345,6 @@ int blas_thread_init(void) {
270345
271346  LOCK_COMMAND (& server_lock );
272347
273-   adjust_thread_buffers ();
274- 
275348  MT_TRACE ("Initializing Thread(Num. threads = %d)\n" , blas_cpu_number );
276349
277350  if  (!blas_server_avail ) {
@@ -336,14 +409,14 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
336409  }
337410  else 
338411  {
339- 	  blas_queue_t  * next_item  =  work_queue ;
412+ 	  blas_queue_t  * queue_item  =  work_queue ;
340413
341414    // find the end of the work queue 
342-     while  (next_item )
343-         next_item  =  next_item -> next ;
415+     while  (queue_item -> next )
416+         queue_item  =  queue_item -> next ;
344417
345418    // add new work to the end 
346-     next_item  =  queue ;
419+     queue_item -> next  =  queue ;
347420  }
348421
349422  LeaveCriticalSection (& queue_lock );
@@ -400,17 +473,6 @@ int exec_blas(BLASLONG num, blas_queue_t *queue) {
400473
401474  if  ((num  <= 0 ) ||  (queue  ==  NULL )) return  0 ;
402475
403-   //Redirect to caller's callback routine 
404-   if  (openblas_threads_callback_ ) {
405-   int  buf_index  =  0 ;
406- #ifndef  USE_SIMPLE_THREADED_LEVEL3 
407-     for  (int  i  =  0 ; i  <  num ; i  ++ )
408-       queue [i ].position  =  i ;
409- #endif 
410-     openblas_threads_callback_ (1 , (openblas_dojob_callback ) exec_threads , num , sizeof (blas_queue_t ), (void * ) queue , buf_index );
411-     return  0 ;
412-   }
413- 
414476  if  ((num  >  1 ) &&  queue  ->  next ) 
415477    exec_blas_async (1 , queue  ->  next );
416478
@@ -445,14 +507,6 @@ int BLASFUNC(blas_thread_shutdown)(void) {
445507
446508  LOCK_COMMAND (& server_lock );
447509
448-   //Free buffers allocated for threads 
449-   for (i = 0 ; i < MAX_CPU_NUMBER ; i ++ ){
450-     if (blas_thread_buffer [i ]!= NULL ){
451-       blas_memory_free (blas_thread_buffer [i ]);
452-       blas_thread_buffer [i ]= NULL ;
453-     }
454-   }
455- 
456510  if  (blas_server_avail ) {
457511
458512    for  (i  =  0 ; i  <  blas_num_threads  -  1 ; i ++ ) {
@@ -555,108 +609,4 @@ void goto_set_num_threads(int num_threads)
555609void  openblas_set_num_threads (int  num )
556610{
557611	goto_set_num_threads (num );
558- }
559- 
560- static  void  adjust_thread_buffers () {
561- 
562-   int  i = 0 ;
563- 
564-   //adjust buffer for each thread 
565-   for (i = 0 ; i  <  blas_cpu_number ; i ++ ){
566-     if (blas_thread_buffer [i ] ==  NULL ){
567-       blas_thread_buffer [i ] =  blas_memory_alloc (2 );
568-     }
569-   }
570-   for (; i  <  MAX_CPU_NUMBER ; i ++ ){
571-     if (blas_thread_buffer [i ] !=  NULL ){
572-       blas_memory_free (blas_thread_buffer [i ]);
573-       blas_thread_buffer [i ] =  NULL ;
574-     }
575-   }
576- }
577- 
578- //Indivitual threads work executor, Helps in setting by synchronization environment and calling inner_threads routine 
579- static  void  exec_threads (int  cpu , blas_queue_t  * queue , int  buf_index )
580- {
581-   
582-   void  * buffer , * sa , * sb ;
583- 
584-   buffer  =  blas_thread_buffer [cpu ];
585-   sa  =  queue  ->  sa ;
586-   sb  =  queue  ->  sb ;
587- 
588-   int  (* routine )(blas_arg_t  * , void  * , void  * , void  * , void  * , BLASLONG ) =  queue  ->  routine ;
589- 
590-   #ifdef  CONSISTENT_FPCSR 
591-     __asm__ __volatile__ ("ldmxcsr %0"  : : "m"  (queue  ->  sse_mode ));
592-     __asm__ __volatile__ ("fldcw %0"    : : "m"  (queue  ->  x87_mode ));
593-   #endif 
594- 
595-   MT_TRACE ("Server[%2ld] Started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n" ,
596-     cpu , queue -> mode , queue ->  args  -> m , queue -> args -> n , queue -> args -> k );
597- 
598-   // fprintf(stderr, "queue start[%ld]!!!\n", cpu); 
599- 
600-   #ifdef  MONITOR 
601-     main_status [cpu ] =  MAIN_RUNNING1 ;
602-   #endif 
603- 
604-   if  (sa  ==  NULL ) 
605-     sa  =  (void  * )((BLASLONG )buffer  +  GEMM_OFFSET_A );
606- 
607-   if  (sb  ==  NULL ) {
608-     if  (!(queue  ->  mode  &  BLAS_COMPLEX )) {
609- #ifdef  EXPRECISION 
610- if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_XDOUBLE ) {
611-   sb  =  (void  * )(((BLASLONG )sa  +  ((XGEMM_P  *  XGEMM_Q  *  sizeof (xdouble )
612-       +  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
613- } else 
614- #endif 
615-   if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_DOUBLE ) {
616- #ifdef  BUILD_DOUBLE 
617-     sb  =  (void  * )(((BLASLONG )sa  +  ((DGEMM_P  *  DGEMM_Q  *  sizeof (double )
618-         +  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
619- #endif 
620-   } else  if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_SINGLE ) {
621- #ifdef  BUILD_SINGLE 
622-     sb  =  (void  * )(((BLASLONG )sa  +  ((SGEMM_P  *  SGEMM_Q  *  sizeof (float )
623-         +  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
624- #endif 
625-   } else  {
626-         /* Other types in future */ 
627-   }
628- } else  {
629- #ifdef  EXPRECISION 
630- if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_XDOUBLE ){
631-   sb  =  (void  * )(((BLASLONG )sa  +  ((XGEMM_P  *  XGEMM_Q  *  2  *  sizeof (xdouble )
632-       +  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
633- } else 
634- #endif 
635-   if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_DOUBLE ){
636- #ifdef  BUILD_COMPLEX16 
637-     sb  =  (void  * )(((BLASLONG )sa  +  ((ZGEMM_P  *  ZGEMM_Q  *  2  *  sizeof (double )
638-         +  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
639- #endif 
640-   } else  if  ((queue  ->  mode  &  BLAS_PREC ) ==  BLAS_SINGLE ) {
641- #ifdef  BUILD_COMPLEX 
642-     sb  =  (void  * )(((BLASLONG )sa  +  ((CGEMM_P  *  CGEMM_Q  *  2  *  sizeof (float )
643-         +  GEMM_ALIGN ) &  ~GEMM_ALIGN )) +  GEMM_OFFSET_B );
644- #endif 
645-   } else  {
646-         /* Other types in future */ 
647-   }
648- }
649-     queue -> sb = sb ;
650-   }
651- 
652-   #ifdef  MONITOR 
653-     main_status [cpu ] =  MAIN_RUNNING2 ;
654-   #endif 
655- 
656-   if  (!(queue  ->  mode  &  BLAS_LEGACY )) {
657-     (routine )(queue  ->  args , queue  ->  range_m , queue  ->  range_n , sa , sb , queue  ->  position );
658-   } else  {
659-     legacy_exec (routine , queue  ->  mode , queue  ->  args , sb );
660-   }
661- 
662612}
0 commit comments