@@ -73,7 +73,6 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
7373 std::vector< cl_device_id > device_id;
7474 cl_context context;
7575 cl_command_queue queue;
76- cl_event outEvent = NULL ;
7776 clfftPlanHandle plan_handle;
7877
7978 for (unsigned u = 0 ; u < max_dimensions; ++u) {
@@ -204,7 +203,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
204203
205204
206205 OPENCL_V_THROW ( clEnqueueWriteBuffer ( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0 , size_of_input_buffers_in_bytes, &input[ 0 ],
207- 0 , NULL , &outEvent ),
206+ 0 , NULL , NULL ),
208207 " clEnqueueWriteBuffer failed" );
209208
210209 }
@@ -252,10 +251,10 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
252251
253252
254253 OPENCL_V_THROW ( clEnqueueWriteBuffer ( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0 , size_of_input_buffers_in_bytes, &real[ 0 ],
255- 0 , NULL , &outEvent ),
254+ 0 , NULL , NULL ),
256255 " clEnqueueWriteBuffer failed" );
257256 OPENCL_V_THROW ( clEnqueueWriteBuffer ( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0 , size_of_input_buffers_in_bytes, &imag[ 0 ],
258- 0 , NULL , &outEvent ),
257+ 0 , NULL , NULL ),
259258 " clEnqueueWriteBuffer failed" );
260259 }
261260 break ;
@@ -289,7 +288,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
289288
290289
291290 OPENCL_V_THROW ( clEnqueueWriteBuffer ( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0 , size_of_input_buffers_in_bytes, &input[ 0 ],
292- 0 , NULL , &outEvent ),
291+ 0 , NULL , NULL ),
293292 " clEnqueueWriteBuffer failed" );
294293 }
295294 break ;
@@ -325,10 +324,10 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
325324
326325
327326 OPENCL_V_THROW ( clEnqueueWriteBuffer ( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0 , size_of_input_buffers_in_bytes, &real[ 0 ],
328- 0 , NULL , &outEvent ),
327+ 0 , NULL , NULL ),
329328 " clEnqueueWriteBuffer failed" );
330329 OPENCL_V_THROW ( clEnqueueWriteBuffer ( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0 , size_of_input_buffers_in_bytes, &imag[ 0 ],
331- 0 , NULL , &outEvent ),
330+ 0 , NULL , NULL ),
332331 " clEnqueueWriteBuffer failed" );
333332 }
334333 break ;
@@ -373,7 +372,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
373372
374373
375374 OPENCL_V_THROW ( clEnqueueWriteBuffer ( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0 , size_of_input_buffers_in_bytes, &real[ 0 ],
376- 0 , NULL , &outEvent ),
375+ 0 , NULL , NULL ),
377376 " clEnqueueWriteBuffer failed" );
378377 }
379378 break ;
@@ -391,22 +390,10 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
391390 terr << _T ( " Could not find the external timing library; timings disabled" ) << std::endl;
392391 }
393392
394-
395393 // Timer module discovered and loaded successfully
396394 // Initialize function pointers to call into the shared module
397395 PFGETSTATTIMER get_timer = reinterpret_cast < PFGETSTATTIMER > ( LoadFunctionAddr ( timerLibHandle, " getStatTimer" ) );
398396
399- // Create and initialize our timer class, if the external timer shared library loaded
400- baseStatTimer* timer = NULL ;
401- size_t clFFTID = 0 ;
402- if ( get_timer )
403- {
404- timer = get_timer ( CLFFT_GPU );
405- timer->Reserve ( 1 , profile_count );
406- timer->setNormalize ( true );
407-
408- clFFTID = timer->getUniqueID ( " clFFT" , 0 );
409- }
410397
411398 OPENCL_V_THROW ( clfftSetup ( setupData.get ( ) ), " clfftSetup failed" );
412399 OPENCL_V_THROW ( clfftCreateDefaultPlan ( &plan_handle, context, dim, lengths ), " clfftCreateDefaultPlan failed" );
@@ -511,37 +498,64 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
511498 }
512499 }
513500
514- // Loop as many times as the user specifies to average out the timings
515- //
501+
516502 cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ];
517503
518- Timer tr;
519- tr.Start ();
504+ // Execute once for basic functional test
505+ OPENCL_V_THROW ( clfftEnqueueTransform ( plan_handle, dir, 1 , &queue, 0 , NULL , NULL ,
506+ &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ),
507+ " clfftEnqueueTransform failed" );
520508
521- for ( cl_uint i = 0 ; i < profile_count; ++i )
522- {
523- if ( timer ) timer->Start ( clFFTID );
509+ OPENCL_V_THROW ( clFinish ( queue ), " clFinish failed" );
510+
524511
525- OPENCL_V_THROW ( clfftEnqueueTransform ( plan_handle, dir, 1 , &queue, 0 , NULL , &outEvent,
526- &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ),
527- " clfftEnqueueTransform failed" );
512+ // Create and initialize our timer class, if the external timer shared library loaded
513+ baseStatTimer* timer = NULL ;
514+ size_t clFFTID = 0 ;
515+ if ( get_timer )
516+ {
517+ timer = get_timer ( CLFFT_GPU );
518+ timer->Reserve ( 1 , profile_count );
519+ timer->setNormalize ( true );
528520
529- if ( timer ) timer-> Stop ( clFFTID );
521+ clFFTID = timer-> getUniqueID ( " clFFT " , 0 );
530522 }
531- OPENCL_V_THROW ( clFinish ( queue ), " clFinish failed" );
532- if (clMedBuffer) clReleaseMemObject (clMedBuffer);
533523
534- double wtime = tr.Sample ()/((double )profile_count);
535- size_t totalLen = 1 ;
536- for (int i=0 ; i<dim; i++) totalLen *= lengths[i];
537- double opsconst = 5.0 * (double )totalLen * log ((double )totalLen) / log (2.0 );
524+ cl_event *outEvent = new cl_event[profile_count];
525+ for ( cl_uint i = 0 ; i < profile_count; ++i ) outEvent[i] = 0 ;
538526
539527 if (profile_count > 1 )
540528 {
529+ Timer tr;
530+ tr.Start ();
531+ for ( cl_uint i = 0 ; i < profile_count; ++i )
532+ {
533+ if ( timer ) timer->Start ( clFFTID );
534+
535+ OPENCL_V_THROW ( clfftEnqueueTransform ( plan_handle, dir, 1 , &queue, 0 , NULL , &outEvent[i],
536+ &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ),
537+ " clfftEnqueueTransform failed" );
538+
539+ if ( timer ) timer->Stop ( clFFTID );
540+ }
541+ OPENCL_V_THROW ( clWaitForEvents ( profile_count, outEvent ), " clWaitForEvents failed" );
542+
543+ double wtime = tr.Sample ()/((double )profile_count);
544+
545+ OPENCL_V_THROW ( clFinish ( queue ), " clFinish failed" );
546+
547+ size_t totalLen = 1 ;
548+ for (int i=0 ; i<dim; i++) totalLen *= lengths[i];
549+ double opsconst = 5.0 * (double )totalLen * log ((double )totalLen) / log (2.0 );
550+
551+
541552 tout << " \n Execution wall time: " << 1000.0 *wtime << " ms" << std::endl;
542553 tout << " Execution gflops: " << ((double )batch_size * opsconst)/(1000000000.0 *wtime) << std::endl;
554+
543555 }
544556
557+ if (clMedBuffer) clReleaseMemObject (clMedBuffer);
558+
545559 if ( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) )
546560 {
547561 // Remove all timings that are outside of 2 stddev (keep 65% of samples); we ignore outliers to get a more consistent result
@@ -553,6 +567,14 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
553567 /* ****************/
554568 FreeSharedLibrary ( timerLibHandle );
555569
570+ for ( cl_uint i = 0 ; i < profile_count; ++i )
571+ {
572+ if (outEvent[i])
573+ clReleaseEvent (outEvent[i]);
574+ }
575+
576+ delete[] outEvent;
577+
556578 // Read and check output data
557579 // This check is not valid if the FFT is executed multiple times inplace.
558580 //
@@ -725,7 +747,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
725747 OPENCL_V_THROW ( clfftDestroyPlan ( &plan_handle ), " clfftDestroyPlan failed" );
726748 OPENCL_V_THROW ( clfftTeardown ( ), " clfftTeardown failed" );
727749
728- cleanupCL ( &context, &queue, countOf ( input_cl_mem_buffers ), input_cl_mem_buffers, countOf ( output_cl_mem_buffers ), output_cl_mem_buffers, &outEvent );
750+ cleanupCL ( &context, &queue, countOf ( input_cl_mem_buffers ), input_cl_mem_buffers, countOf ( output_cl_mem_buffers ), output_cl_mem_buffers, NULL );
729751 return 0 ;
730752}
731753
0 commit comments