@@ -226,14 +226,13 @@ class OpenClContext : public ComputeContext {
226226
227227 // 1b variates (shared pool of mersenne twister based normal variates)
228228
229- std::size_t variatesPoolSize_ = 0 ;
229+ std::size_t variatesPoolSize_ = 0 ; // count of single random numbers
230230 cl_mem variatesPool_;
231231 cl_mem variatesMtStateBuffer_;
232232 cl_program variatesProgram_;
233233 cl_kernel variatesKernelSeedInit_;
234234 cl_kernel variatesKernelTwist_;
235235 cl_kernel variatesKernelGenerate_;
236- std::vector<cl_event> variatesWaitEvents_;
237236
238237 // 2 curent calc
239238
@@ -519,6 +518,7 @@ void OpenClContext::updateVariatesPool() {
519518
520519 std::size_t fpSize = settings_.useDoublePrecision ? sizeof (double ) : sizeof (float );
521520
521+ cl_event initEvent;
522522 if (variatesPoolSize_ == 0 ) {
523523
524524 // build the kernels to fill the variates pool
@@ -581,24 +581,24 @@ void OpenClContext::updateVariatesPool() {
581581
582582 // from QuantLib::MersenneTwisterUniformRng
583583
584- std::string kernelSourceSeedInit = " __kernel void ore_seedInitialization(const uint s, __global uint * mt) {\n "
585- " const uint N = 624;\n "
584+ std::string kernelSourceSeedInit = " __kernel void ore_seedInitialization(const ulong s, __global ulong * mt) {\n "
585+ " const ulong N = 624;\n "
586586 " mt[0]= s & 0xffffffffU;\n "
587- " for (uint mti=1; mti<N; ++mti) {\n "
587+ " for (ulong mti=1; mti<N; ++mti) {\n "
588588 " mt[mti] = (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);\n "
589- " mt[mti] &= 0xffffffffU ;\n "
589+ " mt[mti] &= 0xffffffffUL ;\n "
590590 " }\n "
591591 " }\n\n " ;
592592
593- std::string kernelSourceTwist = " __kernel void ore_twist(__global uint * mt) {\n "
594- " const uint N = 624;\n "
595- " const uint M = 397;\n "
596- " const uint MATRIX_A = 0x9908b0dfUL;\n "
597- " const uint UPPER_MASK=0x80000000UL;\n "
598- " const uint LOWER_MASK=0x7fffffffUL;\n "
599- " const uint mag01[2]={0x0UL, MATRIX_A};\n "
600- " uint kk;\n "
601- " uint y;\n "
593+ std::string kernelSourceTwist = " __kernel void ore_twist(__global ulong * mt) {\n "
594+ " const ulong N = 624;\n "
595+ " const ulong M = 397;\n "
596+ " const ulong MATRIX_A = 0x9908b0dfUL;\n "
597+ " const ulong UPPER_MASK=0x80000000UL;\n "
598+ " const ulong LOWER_MASK=0x7fffffffUL;\n "
599+ " const ulong mag01[2]={0x0UL, MATRIX_A};\n "
600+ " ulong kk;\n "
601+ " ulong y;\n "
602602 " for (kk=0;kk<N-M;++kk) {\n "
603603 " y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);\n "
604604 " mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];\n "
@@ -612,14 +612,14 @@ void OpenClContext::updateVariatesPool() {
612612 " }\n\n " ;
613613
614614 std::string kernelSourceGenerate =
615- " __kernel void ore_generate(const uint offset, __global uint * mt, __global " + fpTypeStr + " * output) {\n "
616- " uint mti = get_global_id(0);\n "
617- " uint y = mt[mti];\n "
615+ " __kernel void ore_generate(const ulong offset, __global ulong * mt, __global " + fpTypeStr + " * output) {\n "
616+ " ulong mti = get_global_id(0);\n "
617+ " ulong y = mt[mti];\n "
618618 " y ^= (y >> 11);\n "
619619 " y ^= (y << 7) & 0x9d2c5680U;\n "
620620 " y ^= (y << 15) & 0xefc60000U;\n "
621621 " y ^= (y >> 18);\n "
622- " output[offset + mti] = ore_invCumN(y);\n "
622+ " output[offset + mti] = ore_invCumN((uint) y);\n "
623623 " }\n\n " ;
624624 // clang-format on
625625
@@ -653,79 +653,98 @@ void OpenClContext::updateVariatesPool() {
653653 QL_REQUIRE (err == CL_SUCCESS,
654654 " OpenClContext::updateVariatesPool(): error creating kernel generate: " << errorText (err));
655655
656- variatesMtStateBuffer_ = clCreateBuffer (context_, CL_MEM_READ_WRITE, fpSize * mt_N, NULL , &err);
656+ variatesMtStateBuffer_ = clCreateBuffer (context_, CL_MEM_READ_WRITE, sizeof (cl_ulong) * mt_N, NULL , &err);
657657 QL_REQUIRE (err == CL_SUCCESS,
658658 " OpenClContext::updateVariatesPool(): error creating mt state buffer: " << errorText (err));
659659
660- cl_uint tmpSeed = (cl_uint )settings_.rngSeed ;
661- err = clSetKernelArg (variatesKernelSeedInit_, 0 , sizeof (cl_uint ), &tmpSeed);
660+ cl_ulong tmpSeed = (cl_ulong )settings_.rngSeed ;
661+ err = clSetKernelArg (variatesKernelSeedInit_, 0 , sizeof (cl_ulong ), &tmpSeed);
662662 err |= clSetKernelArg (variatesKernelSeedInit_, 1 , sizeof (cl_mem), &variatesMtStateBuffer_);
663663 QL_REQUIRE (err == CL_SUCCESS,
664664 " OpenClContext::updateVariatesPool(): error setting kernel args seed init: " << errorText (err));
665665
666- QL_REQUIRE (variatesWaitEvents_.empty (),
667- " OpenClContext::updateVariatesPool(): internal error, expected empty variatesWaitEvents_ ("
668- << variatesWaitEvents_.size () << " )" );
669- variatesWaitEvents_.push_back (cl_event ());
670666 constexpr std::size_t sizeOne = 1 ;
671- err = clEnqueueNDRangeKernel (queue_, variatesKernelSeedInit_, 1 , NULL , &sizeOne, NULL , 0 , NULL ,
672- &variatesWaitEvents_.back ());
667+ err = clEnqueueNDRangeKernel (queue_, variatesKernelSeedInit_, 1 , NULL , &sizeOne, NULL , 0 , NULL , &initEvent);
673668 QL_REQUIRE (err == CL_SUCCESS,
674669 " OpenClContext::updateVariatesPool(): error running kernel seed init: " << errorText (err));
675- } else {
676- QL_REQUIRE (!variatesWaitEvents_.empty (),
677- " OpenClContext::updateVariatesPool(): internal error, expected non-empty variatesWaitEvents_ list." );
670+ }
671+
672+ // if the variates pool is big enough, we exit early
673+
674+ if (variatesPoolSize_ >= nVariates_ * size_[currentId_ - 1 ]) {
675+ if (variatesPoolSize_ == 0 )
676+ clWaitForEvents (1 , &initEvent);
677+ return ;
678678 }
679679
680680 // create new buffer to hold the variates and copy the current buffer contents to the new buffer
681681
682- Size alignedSize = 624 * (size_[currentId_ - 1 ] / 624 + (size_[currentId_ - 1 ] % 624 == 0 ? 0 : 1 ));
682+ Size alignedSize =
683+ 624 * (nVariates_ * size_[currentId_ - 1 ] / 624 + (nVariates_ * size_[currentId_ - 1 ] % 624 == 0 ? 0 : 1 ));
683684
684685 cl_int err;
685686 cl_mem oldBuffer = variatesPool_;
686- variatesPool_ = clCreateBuffer (context_, CL_MEM_READ_WRITE, fpSize * nVariates_ * alignedSize, NULL , &err);
687+ variatesPool_ = clCreateBuffer (context_, CL_MEM_READ_WRITE, fpSize * alignedSize, NULL , &err);
687688 QL_REQUIRE (err == CL_SUCCESS, " OpenClContext::updateVariatesPool(): error creating variates buffer with size "
688- << fpSize * nVariates_ * alignedSize << " bytes: " << errorText (err));
689+ << fpSize * alignedSize << " bytes: " << errorText (err));
690+ cl_event copyEvent;
689691 if (variatesPoolSize_ > 0 ) {
690- cl_event copyEvent;
691- err = clEnqueueCopyBuffer (queue_, oldBuffer, variatesPool_, 0 , 0 , fpSize * variatesPoolSize_ * alignedSize, 1 ,
692- &variatesWaitEvents_.back (), ©Event);
693- variatesWaitEvents_.push_back (copyEvent);
692+ err = clEnqueueCopyBuffer (queue_, oldBuffer, variatesPool_, 0 , 0 , fpSize * variatesPoolSize_, 0 , NULL ,
693+ ©Event);
694694 QL_REQUIRE (err == CL_SUCCESS,
695695 " OpenClContext::updateVariatesPool(): error copying existing variates buffer to new buffer: "
696696 << errorText (err));
697697 }
698698
699699 // fill in the new variates
700700
701- for (std::size_t currentPoolSize = variatesPoolSize_ * alignedSize; currentPoolSize < nVariates_ * alignedSize;
701+ std::size_t currentPoolSize;
702+ cl_event generateEvent;
703+ bool haveGenerated = false ;
704+ for (currentPoolSize = variatesPoolSize_; currentPoolSize < nVariates_ * size_[currentId_ - 1 ];
702705 currentPoolSize += mt_N) {
703706 err = clSetKernelArg (variatesKernelTwist_, 0 , sizeof (cl_mem), &variatesMtStateBuffer_);
704707 QL_REQUIRE (err == CL_SUCCESS,
705708 " OpenClContext::updateVariatesPool(): error setting args for kernel twist: " << errorText (err));
706709 cl_event twistEvent;
707- err = clEnqueueNDRangeKernel (queue_, variatesKernelTwist_, 1 , NULL , &mt_N, NULL , 1 , &variatesWaitEvents_. back (),
708- &twistEvent);
709- variatesWaitEvents_. push_back ( twistEvent);
710+ err = clEnqueueNDRangeKernel (
711+ queue_, variatesKernelTwist_, 1 , NULL , &mt_N, NULL , variatesPoolSize_ == 0 || haveGenerated ? 1 : 0 ,
712+ variatesPoolSize_ == 0 ? &initEvent : (haveGenerated ? &generateEvent : NULL ), & twistEvent);
710713 QL_REQUIRE (err == CL_SUCCESS,
711714 " OpenClContext::updateVariatesPool(): error running kernel twist: " << errorText (err));
712715
713- err = clSetKernelArg (variatesKernelGenerate_, 0 , sizeof (cl_uint ), ¤tPoolSize);
716+ err = clSetKernelArg (variatesKernelGenerate_, 0 , sizeof (cl_ulong ), ¤tPoolSize);
714717 err |= clSetKernelArg (variatesKernelGenerate_, 1 , sizeof (cl_mem), &variatesMtStateBuffer_);
715718 err |= clSetKernelArg (variatesKernelGenerate_, 2 , sizeof (cl_mem), &variatesPool_);
716719 QL_REQUIRE (err == CL_SUCCESS,
717720 " OpenClContext::updateVariatesPool(): error settings args for kernel generate: " << errorText (err));
718- cl_event generateEvent;
719- err = clEnqueueNDRangeKernel (queue_, variatesKernelGenerate_, 1 , NULL , &mt_N, NULL , 1 ,
720- &variatesWaitEvents_.back (), &generateEvent);
721- variatesWaitEvents_.push_back (generateEvent);
721+ err = clEnqueueNDRangeKernel (queue_, variatesKernelGenerate_, 1 , NULL , &mt_N, NULL , 1 , &twistEvent,
722+ &generateEvent);
722723 QL_REQUIRE (err == CL_SUCCESS,
723724 " OpenClContext::updateVariatesPool(): error running kernel generate: " << errorText (err));
725+ haveGenerated = true ;
724726 }
725727
728+ // wait for events to finish
729+
730+ std::vector<cl_event> waitList;
731+ if (variatesPoolSize_ > 0 )
732+ waitList.push_back (copyEvent);
733+ if (haveGenerated)
734+ waitList.push_back (generateEvent);
735+ if (!waitList.empty ())
736+ clWaitForEvents (waitList.size (), &waitList[0 ]);
737+
738+ // release old buffer
739+
740+ if (variatesPoolSize_ > 0 )
741+ releaseMem (oldBuffer);
742+
726743 // update current variates pool size
727744
728- variatesPoolSize_ = nVariates_;
745+ QL_REQUIRE (currentPoolSize == alignedSize, " OpenClContext::updateVariatesPool(): internal error, currentPoolSize = "
746+ << currentPoolSize << " does not match alignedSize " << alignedSize);
747+ variatesPoolSize_ = currentPoolSize;
729748}
730749
731750std::vector<std::vector<std::size_t >> OpenClContext::createInputVariates (const std::size_t dim,
@@ -745,8 +764,7 @@ std::vector<std::vector<std::size_t>> OpenClContext::createInputVariates(const s
745764 }
746765 }
747766 nVariates_ += dim * steps;
748- if (nVariates_ > variatesPoolSize_)
749- updateVariatesPool ();
767+ updateVariatesPool ();
750768 return resultIds;
751769}
752770
@@ -780,10 +798,10 @@ std::size_t OpenClContext::applyOperation(const std::size_t randomVariableOpCode
780798 std::vector<std::string> argStr (args.size ());
781799 for (std::size_t i = 0 ; i < args.size (); ++i) {
782800 if (args[i] < inputVarOffset_.size ()) {
783- argStr[i] = " input[" + std::to_string (inputVarOffset_[args[i]]) + " UL " +
801+ argStr[i] = " input[" + std::to_string (inputVarOffset_[args[i]]) + " U " +
784802 (inputVarIsScalar_[args[i]] ? " ]" : " + i]" );
785803 } else if (args[i] < inputVarOffset_.size () + nVariates_) {
786- argStr[i] = " rn[" + std::to_string ((args[i] - inputVarOffset_.size ()) * size_[currentId_ - 1 ]) + " + i]" ;
804+ argStr[i] = " rn[" + std::to_string ((args[i] - inputVarOffset_.size ()) * size_[currentId_ - 1 ]) + " U + i]" ;
787805 } else {
788806 // variable is an (intermediate) result
789807 argStr[i] = " v" + std::to_string (args[i]);
@@ -994,11 +1012,11 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
9941012 " ore_kernel_" + std::to_string (currentId_) + " _" + std::to_string (version_[currentId_ - 1 ]);
9951013
9961014 std::vector<std::string> inputArgs;
997- if (inputBufferSize > 0 )
1015+ if (inputBufferSize > 0 )
9981016 inputArgs.push_back (" __global " + fpTypeStr + " * input" );
999- if (nVariates_ > 0 )
1017+ if (nVariates_ > 0 )
10001018 inputArgs.push_back (" __global " + fpTypeStr + " * rn" );
1001- if (outputBufferSize > 0 )
1019+ if (outputBufferSize > 0 )
10021020 inputArgs.push_back (" __global " + fpTypeStr + " * output" );
10031021
10041022 std::string kernelSource = includeSource + " __kernel void " + kernelName + " (" + boost::join (inputArgs, " ," ) +
@@ -1013,13 +1031,12 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
10131031 std::size_t offset = i * size_[currentId_ - 1 ];
10141032 std::string output;
10151033 if (outputVariables_[i] < inputVarOffset_.size ()) {
1016- output = " input[" + std::to_string (outputVariables_[i]) + " UL " +
1034+ output = " input[" + std::to_string (outputVariables_[i]) + " U " +
10171035 (inputVarIsScalar_[outputVariables_[i]] ? " ]" : " + i] " );
1018- }
1019- else if (outputVariables_[i] < inputVarOffset_.size () + nVariates_) {
1036+ } else if (outputVariables_[i] < inputVarOffset_.size () + nVariates_) {
10201037 output = " rn[" +
10211038 std::to_string ((outputVariables_[i] - inputVarOffset_.size ()) * size_[currentId_ - 1 ]) +
1022- " + i]" ;
1039+ " U + i]" ;
10231040 } else {
10241041 output = " v" + std::to_string (outputVariables_[i]);
10251042 }
@@ -1114,8 +1131,6 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
11141131 std::vector<cl_event> runWaitEvents;
11151132 if (inputBufferSize > 0 )
11161133 runWaitEvents.push_back (inputBufferEvent);
1117- if (!variatesWaitEvents_.empty ())
1118- runWaitEvents.push_back (variatesWaitEvents_.back ());
11191134
11201135 cl_event runEvent;
11211136 err = clEnqueueNDRangeKernel (queue_, kernel_[currentId_ - 1 ], 1 , NULL , &size_[currentId_ - 1 ], NULL ,
@@ -1142,7 +1157,7 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
11421157 }
11431158 for (std::size_t i = 0 ; i < output.size (); ++i) {
11441159 outputBufferEvents.push_back (cl_event ());
1145- err = clEnqueueReadBuffer (queue_, outputBuffer, CL_FALSE, i * size_[currentId_ - 1 ],
1160+ err = clEnqueueReadBuffer (queue_, outputBuffer, CL_FALSE, fpSize * i * size_[currentId_ - 1 ],
11461161 fpSize * size_[currentId_ - 1 ],
11471162 settings_.useDoublePrecision ? (void *)&output[i][0 ] : (void *)&outputFloat[i][0 ],
11481163 1 , &runEvent, &outputBufferEvents.back ());
0 commit comments