Skip to content

Commit 270fc26

Browse files
pcaspersjenkins
authored andcommitted
QPR-12386 updates for mersenne-twister implementation in OpenCL
1 parent 2c9dd3f commit 270fc26

3 files changed

Lines changed: 125 additions & 68 deletions

File tree

QuantExt/qle/math/basiccpuenvironment.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -236,8 +236,8 @@ std::vector<std::vector<std::size_t>> BasicCpuContext::createInputVariates(const
236236
rng_ = std::make_unique<MersenneTwisterUniformRng>(settings_.rngSeed);
237237
}
238238

239-
if (variates_.size() < dim * steps) {
240-
for (std::size_t i = variates_.size(); i < dim * steps; ++i) {
239+
if (variates_.size() < numberOfVariates_[currentId_ - 1] + dim * steps) {
240+
for (std::size_t i = variates_.size(); i < numberOfVariates_[currentId_ - 1] + dim * steps; ++i) {
241241
variates_.push_back(RandomVariable(size_[currentId_ - 1]));
242242
for (std::size_t j = 0; j < variates_.back().size(); ++j)
243243
variates_.back().set(j, icn_(rng_->nextReal()));
@@ -247,11 +247,11 @@ std::vector<std::vector<std::size_t>> BasicCpuContext::createInputVariates(const
247247
std::vector<std::vector<std::size_t>> resultIds(dim, std::vector<std::size_t>(steps));
248248
for (std::size_t i = 0; i < dim; ++i) {
249249
for (std::size_t j = 0; j < steps; ++j) {
250-
resultIds[i][j] = numberOfInputVars_[currentId_ - 1] + i * steps + j;
250+
resultIds[i][j] = numberOfInputVars_[currentId_ - 1] + numberOfVariates_[currentId_ - 1] + i * steps + j;
251251
}
252252
}
253253

254-
numberOfVariates_[currentId_ - 1] = dim * steps;
254+
numberOfVariates_[currentId_ - 1] += dim * steps;
255255

256256
return resultIds;
257257
}
@@ -368,8 +368,9 @@ void BasicCpuContext::finalizeCalculation(std::vector<double*>& output) {
368368
RandomVariable* v;
369369
if (id < numberOfInputVars_[currentId_ - 1])
370370
v = &values_[id];
371-
else if (id < numberOfInputVars_[currentId_ - 1] + numberOfVariates_[currentId_ - 1])
371+
else if (id < numberOfInputVars_[currentId_ - 1] + numberOfVariates_[currentId_ - 1]) {
372372
v = &variates_[id - numberOfInputVars_[currentId_ - 1]];
373+
}
373374
else
374375
v = &values_[id - numberOfVariates_[currentId_ - 1]];
375376
for (Size j = 0; j < size_[currentId_ - 1]; ++j) {

QuantExt/qle/math/openclenvironment.cpp

Lines changed: 77 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -226,14 +226,13 @@ class OpenClContext : public ComputeContext {
226226

227227
// 1b variates (shared pool of mersenne twister based normal variates)
228228

229-
std::size_t variatesPoolSize_ = 0;
229+
std::size_t variatesPoolSize_ = 0; // count of single random numbers
230230
cl_mem variatesPool_;
231231
cl_mem variatesMtStateBuffer_;
232232
cl_program variatesProgram_;
233233
cl_kernel variatesKernelSeedInit_;
234234
cl_kernel variatesKernelTwist_;
235235
cl_kernel variatesKernelGenerate_;
236-
std::vector<cl_event> variatesWaitEvents_;
237236

238237
// 2 curent calc
239238

@@ -519,6 +518,7 @@ void OpenClContext::updateVariatesPool() {
519518

520519
std::size_t fpSize = settings_.useDoublePrecision ? sizeof(double) : sizeof(float);
521520

521+
cl_event initEvent;
522522
if (variatesPoolSize_ == 0) {
523523

524524
// build the kernels to fill the variates pool
@@ -581,24 +581,24 @@ void OpenClContext::updateVariatesPool() {
581581

582582
// from QuantLib::MersenneTwisterUniformRng
583583

584-
std::string kernelSourceSeedInit = "__kernel void ore_seedInitialization(const uint s, __global uint* mt) {\n"
585-
" const uint N = 624;\n"
584+
std::string kernelSourceSeedInit = "__kernel void ore_seedInitialization(const ulong s, __global ulong* mt) {\n"
585+
" const ulong N = 624;\n"
586586
" mt[0]= s & 0xffffffffU;\n"
587-
" for (uint mti=1; mti<N; ++mti) {\n"
587+
" for (ulong mti=1; mti<N; ++mti) {\n"
588588
" mt[mti] = (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);\n"
589-
" mt[mti] &= 0xffffffffU;\n"
589+
" mt[mti] &= 0xffffffffUL;\n"
590590
" }\n"
591591
"}\n\n";
592592

593-
std::string kernelSourceTwist = "__kernel void ore_twist(__global uint* mt) {\n"
594-
" const uint N = 624;\n"
595-
" const uint M = 397;\n"
596-
" const uint MATRIX_A = 0x9908b0dfUL;\n"
597-
" const uint UPPER_MASK=0x80000000UL;\n"
598-
" const uint LOWER_MASK=0x7fffffffUL;\n"
599-
" const uint mag01[2]={0x0UL, MATRIX_A};\n"
600-
" uint kk;\n"
601-
" uint y;\n"
593+
std::string kernelSourceTwist = "__kernel void ore_twist(__global ulong* mt) {\n"
594+
" const ulong N = 624;\n"
595+
" const ulong M = 397;\n"
596+
" const ulong MATRIX_A = 0x9908b0dfUL;\n"
597+
" const ulong UPPER_MASK=0x80000000UL;\n"
598+
" const ulong LOWER_MASK=0x7fffffffUL;\n"
599+
" const ulong mag01[2]={0x0UL, MATRIX_A};\n"
600+
" ulong kk;\n"
601+
" ulong y;\n"
602602
" for (kk=0;kk<N-M;++kk) {\n"
603603
" y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);\n"
604604
" mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];\n"
@@ -612,14 +612,14 @@ void OpenClContext::updateVariatesPool() {
612612
"}\n\n";
613613

614614
std::string kernelSourceGenerate =
615-
"__kernel void ore_generate(const uint offset, __global uint* mt, __global " + fpTypeStr + "* output) {\n"
616-
" uint mti = get_global_id(0);\n"
617-
" uint y = mt[mti];\n"
615+
"__kernel void ore_generate(const ulong offset, __global ulong* mt, __global " + fpTypeStr + "* output) {\n"
616+
" ulong mti = get_global_id(0);\n"
617+
" ulong y = mt[mti];\n"
618618
" y ^= (y >> 11);\n"
619619
" y ^= (y << 7) & 0x9d2c5680U;\n"
620620
" y ^= (y << 15) & 0xefc60000U;\n"
621621
" y ^= (y >> 18);\n"
622-
" output[offset + mti] = ore_invCumN(y);\n"
622+
" output[offset + mti] = ore_invCumN((uint)y);\n"
623623
"}\n\n";
624624
// clang-format on
625625

@@ -653,79 +653,98 @@ void OpenClContext::updateVariatesPool() {
653653
QL_REQUIRE(err == CL_SUCCESS,
654654
"OpenClContext::updateVariatesPool(): error creating kernel generate: " << errorText(err));
655655

656-
variatesMtStateBuffer_ = clCreateBuffer(context_, CL_MEM_READ_WRITE, fpSize * mt_N, NULL, &err);
656+
variatesMtStateBuffer_ = clCreateBuffer(context_, CL_MEM_READ_WRITE, sizeof(cl_ulong) * mt_N, NULL, &err);
657657
QL_REQUIRE(err == CL_SUCCESS,
658658
"OpenClContext::updateVariatesPool(): error creating mt state buffer: " << errorText(err));
659659

660-
cl_uint tmpSeed = (cl_uint)settings_.rngSeed;
661-
err = clSetKernelArg(variatesKernelSeedInit_, 0, sizeof(cl_uint), &tmpSeed);
660+
cl_ulong tmpSeed = (cl_ulong)settings_.rngSeed;
661+
err = clSetKernelArg(variatesKernelSeedInit_, 0, sizeof(cl_ulong), &tmpSeed);
662662
err |= clSetKernelArg(variatesKernelSeedInit_, 1, sizeof(cl_mem), &variatesMtStateBuffer_);
663663
QL_REQUIRE(err == CL_SUCCESS,
664664
"OpenClContext::updateVariatesPool(): error setting kernel args seed init: " << errorText(err));
665665

666-
QL_REQUIRE(variatesWaitEvents_.empty(),
667-
"OpenClContext::updateVariatesPool(): internal error, expected empty variatesWaitEvents_ ("
668-
<< variatesWaitEvents_.size() << ")");
669-
variatesWaitEvents_.push_back(cl_event());
670666
constexpr std::size_t sizeOne = 1;
671-
err = clEnqueueNDRangeKernel(queue_, variatesKernelSeedInit_, 1, NULL, &sizeOne, NULL, 0, NULL,
672-
&variatesWaitEvents_.back());
667+
err = clEnqueueNDRangeKernel(queue_, variatesKernelSeedInit_, 1, NULL, &sizeOne, NULL, 0, NULL, &initEvent);
673668
QL_REQUIRE(err == CL_SUCCESS,
674669
"OpenClContext::updateVariatesPool(): error running kernel seed init: " << errorText(err));
675-
} else {
676-
QL_REQUIRE(!variatesWaitEvents_.empty(),
677-
"OpenClContext::updateVariatesPool(): internal error, expected non-empty variatesWaitEvents_ list.");
670+
}
671+
672+
// if the variates pool is big enough, we exit early
673+
674+
if (variatesPoolSize_ >= nVariates_ * size_[currentId_ - 1]) {
675+
if (variatesPoolSize_ == 0)
676+
clWaitForEvents(1, &initEvent);
677+
return;
678678
}
679679

680680
// create new buffer to hold the variates and copy the current buffer contents to the new buffer
681681

682-
Size alignedSize = 624 * (size_[currentId_ - 1] / 624 + (size_[currentId_ - 1] % 624 == 0 ? 0 : 1));
682+
Size alignedSize =
683+
624 * (nVariates_ * size_[currentId_ - 1] / 624 + (nVariates_ * size_[currentId_ - 1] % 624 == 0 ? 0 : 1));
683684

684685
cl_int err;
685686
cl_mem oldBuffer = variatesPool_;
686-
variatesPool_ = clCreateBuffer(context_, CL_MEM_READ_WRITE, fpSize * nVariates_ * alignedSize, NULL, &err);
687+
variatesPool_ = clCreateBuffer(context_, CL_MEM_READ_WRITE, fpSize * alignedSize, NULL, &err);
687688
QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::updateVariatesPool(): error creating variates buffer with size "
688-
<< fpSize * nVariates_ * alignedSize << " bytes: " << errorText(err));
689+
<< fpSize * alignedSize << " bytes: " << errorText(err));
690+
cl_event copyEvent;
689691
if (variatesPoolSize_ > 0) {
690-
cl_event copyEvent;
691-
err = clEnqueueCopyBuffer(queue_, oldBuffer, variatesPool_, 0, 0, fpSize * variatesPoolSize_ * alignedSize, 1,
692-
&variatesWaitEvents_.back(), &copyEvent);
693-
variatesWaitEvents_.push_back(copyEvent);
692+
err = clEnqueueCopyBuffer(queue_, oldBuffer, variatesPool_, 0, 0, fpSize * variatesPoolSize_, 0, NULL,
693+
&copyEvent);
694694
QL_REQUIRE(err == CL_SUCCESS,
695695
"OpenClContext::updateVariatesPool(): error copying existing variates buffer to new buffer: "
696696
<< errorText(err));
697697
}
698698

699699
// fill in the new variates
700700

701-
for (std::size_t currentPoolSize = variatesPoolSize_ * alignedSize; currentPoolSize < nVariates_ * alignedSize;
701+
std::size_t currentPoolSize;
702+
cl_event generateEvent;
703+
bool haveGenerated = false;
704+
for (currentPoolSize = variatesPoolSize_; currentPoolSize < nVariates_ * size_[currentId_ - 1];
702705
currentPoolSize += mt_N) {
703706
err = clSetKernelArg(variatesKernelTwist_, 0, sizeof(cl_mem), &variatesMtStateBuffer_);
704707
QL_REQUIRE(err == CL_SUCCESS,
705708
"OpenClContext::updateVariatesPool(): error setting args for kernel twist: " << errorText(err));
706709
cl_event twistEvent;
707-
err = clEnqueueNDRangeKernel(queue_, variatesKernelTwist_, 1, NULL, &mt_N, NULL, 1, &variatesWaitEvents_.back(),
708-
&twistEvent);
709-
variatesWaitEvents_.push_back(twistEvent);
710+
err = clEnqueueNDRangeKernel(
711+
queue_, variatesKernelTwist_, 1, NULL, &mt_N, NULL, variatesPoolSize_ == 0 || haveGenerated ? 1 : 0,
712+
variatesPoolSize_ == 0 ? &initEvent : (haveGenerated ? &generateEvent : NULL), &twistEvent);
710713
QL_REQUIRE(err == CL_SUCCESS,
711714
"OpenClContext::updateVariatesPool(): error running kernel twist: " << errorText(err));
712715

713-
err = clSetKernelArg(variatesKernelGenerate_, 0, sizeof(cl_uint), &currentPoolSize);
716+
err = clSetKernelArg(variatesKernelGenerate_, 0, sizeof(cl_ulong), &currentPoolSize);
714717
err |= clSetKernelArg(variatesKernelGenerate_, 1, sizeof(cl_mem), &variatesMtStateBuffer_);
715718
err |= clSetKernelArg(variatesKernelGenerate_, 2, sizeof(cl_mem), &variatesPool_);
716719
QL_REQUIRE(err == CL_SUCCESS,
717720
"OpenClContext::updateVariatesPool(): error settings args for kernel generate: " << errorText(err));
718-
cl_event generateEvent;
719-
err = clEnqueueNDRangeKernel(queue_, variatesKernelGenerate_, 1, NULL, &mt_N, NULL, 1,
720-
&variatesWaitEvents_.back(), &generateEvent);
721-
variatesWaitEvents_.push_back(generateEvent);
721+
err = clEnqueueNDRangeKernel(queue_, variatesKernelGenerate_, 1, NULL, &mt_N, NULL, 1, &twistEvent,
722+
&generateEvent);
722723
QL_REQUIRE(err == CL_SUCCESS,
723724
"OpenClContext::updateVariatesPool(): error running kernel generate: " << errorText(err));
725+
haveGenerated = true;
724726
}
725727

728+
// wait for events to finish
729+
730+
std::vector<cl_event> waitList;
731+
if (variatesPoolSize_ > 0)
732+
waitList.push_back(copyEvent);
733+
if (haveGenerated)
734+
waitList.push_back(generateEvent);
735+
if (!waitList.empty())
736+
clWaitForEvents(waitList.size(), &waitList[0]);
737+
738+
// release old buffer
739+
740+
if (variatesPoolSize_ > 0)
741+
releaseMem(oldBuffer);
742+
726743
// update current variates pool size
727744

728-
variatesPoolSize_ = nVariates_;
745+
QL_REQUIRE(currentPoolSize == alignedSize, "OpenClContext::updateVariatesPool(): internal error, currentPoolSize = "
746+
<< currentPoolSize << " does not match alignedSize " << alignedSize);
747+
variatesPoolSize_ = currentPoolSize;
729748
}
730749

731750
std::vector<std::vector<std::size_t>> OpenClContext::createInputVariates(const std::size_t dim,
@@ -745,8 +764,7 @@ std::vector<std::vector<std::size_t>> OpenClContext::createInputVariates(const s
745764
}
746765
}
747766
nVariates_ += dim * steps;
748-
if (nVariates_ > variatesPoolSize_)
749-
updateVariatesPool();
767+
updateVariatesPool();
750768
return resultIds;
751769
}
752770

@@ -780,10 +798,10 @@ std::size_t OpenClContext::applyOperation(const std::size_t randomVariableOpCode
780798
std::vector<std::string> argStr(args.size());
781799
for (std::size_t i = 0; i < args.size(); ++i) {
782800
if (args[i] < inputVarOffset_.size()) {
783-
argStr[i] = "input[" + std::to_string(inputVarOffset_[args[i]]) + "UL" +
801+
argStr[i] = "input[" + std::to_string(inputVarOffset_[args[i]]) + "U" +
784802
(inputVarIsScalar_[args[i]] ? "]" : " + i]");
785803
} else if (args[i] < inputVarOffset_.size() + nVariates_) {
786-
argStr[i] = "rn[" + std::to_string((args[i] - inputVarOffset_.size()) * size_[currentId_ - 1]) + " + i]";
804+
argStr[i] = "rn[" + std::to_string((args[i] - inputVarOffset_.size()) * size_[currentId_ - 1]) + "U + i]";
787805
} else {
788806
// variable is an (intermediate) result
789807
argStr[i] = "v" + std::to_string(args[i]);
@@ -994,11 +1012,11 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
9941012
"ore_kernel_" + std::to_string(currentId_) + "_" + std::to_string(version_[currentId_ - 1]);
9951013

9961014
std::vector<std::string> inputArgs;
997-
if(inputBufferSize > 0)
1015+
if (inputBufferSize > 0)
9981016
inputArgs.push_back("__global " + fpTypeStr + "* input");
999-
if(nVariates_ > 0)
1017+
if (nVariates_ > 0)
10001018
inputArgs.push_back("__global " + fpTypeStr + "* rn");
1001-
if(outputBufferSize > 0)
1019+
if (outputBufferSize > 0)
10021020
inputArgs.push_back("__global " + fpTypeStr + "* output");
10031021

10041022
std::string kernelSource = includeSource + "__kernel void " + kernelName + "(" + boost::join(inputArgs, ",") +
@@ -1013,13 +1031,12 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
10131031
std::size_t offset = i * size_[currentId_ - 1];
10141032
std::string output;
10151033
if (outputVariables_[i] < inputVarOffset_.size()) {
1016-
output = "input[" + std::to_string(outputVariables_[i]) + "UL" +
1034+
output = "input[" + std::to_string(outputVariables_[i]) + "U" +
10171035
(inputVarIsScalar_[outputVariables_[i]] ? "]" : " + i] ");
1018-
}
1019-
else if(outputVariables_[i] < inputVarOffset_.size() + nVariates_) {
1036+
} else if (outputVariables_[i] < inputVarOffset_.size() + nVariates_) {
10201037
output = "rn[" +
10211038
std::to_string((outputVariables_[i] - inputVarOffset_.size()) * size_[currentId_ - 1]) +
1022-
" + i]";
1039+
"U + i]";
10231040
} else {
10241041
output = "v" + std::to_string(outputVariables_[i]);
10251042
}
@@ -1114,8 +1131,6 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
11141131
std::vector<cl_event> runWaitEvents;
11151132
if (inputBufferSize > 0)
11161133
runWaitEvents.push_back(inputBufferEvent);
1117-
if (!variatesWaitEvents_.empty())
1118-
runWaitEvents.push_back(variatesWaitEvents_.back());
11191134

11201135
cl_event runEvent;
11211136
err = clEnqueueNDRangeKernel(queue_, kernel_[currentId_ - 1], 1, NULL, &size_[currentId_ - 1], NULL,
@@ -1142,7 +1157,7 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
11421157
}
11431158
for (std::size_t i = 0; i < output.size(); ++i) {
11441159
outputBufferEvents.push_back(cl_event());
1145-
err = clEnqueueReadBuffer(queue_, outputBuffer, CL_FALSE, i * size_[currentId_ - 1],
1160+
err = clEnqueueReadBuffer(queue_, outputBuffer, CL_FALSE, fpSize * i * size_[currentId_ - 1],
11461161
fpSize * size_[currentId_ - 1],
11471162
settings_.useDoublePrecision ? (void*)&output[i][0] : (void*)&outputFloat[i][0],
11481163
1, &runEvent, &outputBufferEvents.back());

QuantExt/test/computeenvironment.cpp

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@ void checkRngOutput(const std::vector<std::vector<double>>& output) {
244244
BOOST_TEST_MESSAGE(" mean = " << boost::accumulators::mean(acc)
245245
<< ", variance = " << boost::accumulators::variance(acc));
246246
BOOST_CHECK_SMALL(boost::accumulators::mean(acc), 0.05);
247-
BOOST_CHECK_CLOSE(boost::accumulators::variance(acc), 1.0, 1.0);
247+
BOOST_CHECK_CLOSE(boost::accumulators::variance(acc), 1.0, 2.0);
248248
}
249249
}
250250
} // namespace
@@ -305,6 +305,47 @@ BOOST_AUTO_TEST_CASE(testReplayFlowError) {
305305
}
306306
}
307307

308+
BOOST_AUTO_TEST_CASE(testRngGenerationTmp) {
309+
ComputeEnvironmentFixture fixture;
310+
const std::size_t n = 1500;
311+
for (auto const& d : ComputeEnvironment::instance().getAvailableDevices()) {
312+
BOOST_TEST_MESSAGE("testing rng generation on device '" << d << "'.");
313+
ComputeEnvironment::instance().selectContext(d);
314+
auto& c = ComputeEnvironment::instance().context();
315+
ComputeContext::Settings settings;
316+
settings.rngSequenceType = QuantExt::SequenceType::MersenneTwister;
317+
settings.useDoublePrecision = c.supportsDoublePrecision();
318+
BOOST_TEST_MESSAGE("using double precision = " << std::boolalpha << settings.useDoublePrecision);
319+
c.initiateCalculation(n, 0, 0, settings);
320+
auto vs = c.createInputVariates(1, 1);
321+
for (auto const& d : vs) {
322+
for (auto const& r : d) {
323+
c.declareOutputVariable(r);
324+
}
325+
}
326+
auto vs2 = c.createInputVariates(1, 1);
327+
for (auto const& d : vs2) {
328+
for (auto const& r : d) {
329+
c.declareOutputVariable(r);
330+
}
331+
}
332+
std::vector<std::vector<double>> output(2, std::vector<double>(n));
333+
c.finalizeCalculation(output);
334+
335+
auto sg = GenericPseudoRandom<MersenneTwisterUniformRng, InverseCumulativeNormal>::make_sequence_generator(
336+
1, settings.rngSeed);
337+
338+
double tol = settings.useDoublePrecision ? 1E-7 : 1E-3;
339+
340+
for (Size j = 0; j < 2; ++j) {
341+
for (Size i = 0; i < n; ++i) {
342+
Real ref = sg.nextSequence().value[0];
343+
BOOST_CHECK_SMALL(output[j][i] - ref, tol);
344+
}
345+
}
346+
}
347+
}
348+
308349
BOOST_AUTO_TEST_SUITE_END()
309350

310351
BOOST_AUTO_TEST_SUITE_END()

0 commit comments

Comments
 (0)