QPR-12386 updates for mersenne-twister implementation in OpenCL

pcaspers · jenkins · commit 270fc262252d · 2024-04-22T13:45:11.000Z
diff --git a/QuantExt/qle/math/basiccpuenvironment.cpp b/QuantExt/qle/math/basiccpuenvironment.cpp
@@ -236,8 +236,8 @@ std::vector<std::vector<std::size_t>> BasicCpuContext::createInputVariates(const
         rng_ = std::make_unique<MersenneTwisterUniformRng>(settings_.rngSeed);
     }
 
-    if (variates_.size() < dim * steps) {
-        for (std::size_t i = variates_.size(); i < dim * steps; ++i) {
+    if (variates_.size() < numberOfVariates_[currentId_ - 1] + dim * steps) {
+        for (std::size_t i = variates_.size(); i < numberOfVariates_[currentId_ - 1] + dim * steps; ++i) {
             variates_.push_back(RandomVariable(size_[currentId_ - 1]));
             for (std::size_t j = 0; j < variates_.back().size(); ++j)
                 variates_.back().set(j, icn_(rng_->nextReal()));
@@ -247,11 +247,11 @@ std::vector<std::vector<std::size_t>> BasicCpuContext::createInputVariates(const
     std::vector<std::vector<std::size_t>> resultIds(dim, std::vector<std::size_t>(steps));
     for (std::size_t i = 0; i < dim; ++i) {
         for (std::size_t j = 0; j < steps; ++j) {
-            resultIds[i][j] = numberOfInputVars_[currentId_ - 1] + i * steps + j;
+            resultIds[i][j] = numberOfInputVars_[currentId_ - 1] + numberOfVariates_[currentId_ - 1] + i * steps + j;
         }
     }
 
-    numberOfVariates_[currentId_ - 1] = dim * steps;
+    numberOfVariates_[currentId_ - 1] += dim * steps;
 
     return resultIds;
 }
@@ -368,8 +368,9 @@ void BasicCpuContext::finalizeCalculation(std::vector<double*>& output) {
         RandomVariable* v;
         if (id < numberOfInputVars_[currentId_ - 1])
             v = &values_[id];
-        else if (id < numberOfInputVars_[currentId_ - 1] + numberOfVariates_[currentId_ - 1])
+        else if (id < numberOfInputVars_[currentId_ - 1] + numberOfVariates_[currentId_ - 1]) {
             v = &variates_[id - numberOfInputVars_[currentId_ - 1]];
+        }
         else
             v = &values_[id - numberOfVariates_[currentId_ - 1]];
         for (Size j = 0; j < size_[currentId_ - 1]; ++j) {
diff --git a/QuantExt/qle/math/openclenvironment.cpp b/QuantExt/qle/math/openclenvironment.cpp
@@ -226,14 +226,13 @@ class OpenClContext : public ComputeContext {
 
     // 1b variates (shared pool of mersenne twister based normal variates)
 
-    std::size_t variatesPoolSize_ = 0;
+    std::size_t variatesPoolSize_ = 0; // count of single random numbers
     cl_mem variatesPool_;
     cl_mem variatesMtStateBuffer_;
     cl_program variatesProgram_;
     cl_kernel variatesKernelSeedInit_;
     cl_kernel variatesKernelTwist_;
     cl_kernel variatesKernelGenerate_;
-    std::vector<cl_event> variatesWaitEvents_;
 
     // 2 curent calc
 
@@ -519,6 +518,7 @@ void OpenClContext::updateVariatesPool() {
 
     std::size_t fpSize = settings_.useDoublePrecision ? sizeof(double) : sizeof(float);
 
+    cl_event initEvent;
     if (variatesPoolSize_ == 0) {
 
         // build the kernels to fill the variates pool
@@ -581,24 +581,24 @@ void OpenClContext::updateVariatesPool() {
 
         // from QuantLib::MersenneTwisterUniformRng
 
-        std::string kernelSourceSeedInit = "__kernel void ore_seedInitialization(const uint s, __global uint* mt) {\n"
-            "  const uint N = 624;\n"
+        std::string kernelSourceSeedInit = "__kernel void ore_seedInitialization(const ulong s, __global ulong* mt) {\n"
+            "  const ulong N = 624;\n"
             "  mt[0]= s & 0xffffffffU;\n"
-            "  for (uint mti=1; mti<N; ++mti) {\n"
+            "  for (ulong mti=1; mti<N; ++mti) {\n"
             "    mt[mti] = (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);\n"
-            "    mt[mti] &= 0xffffffffU;\n"
+            "    mt[mti] &= 0xffffffffUL;\n"
             "  }\n"
             "}\n\n";
 
-        std::string kernelSourceTwist = "__kernel void ore_twist(__global uint* mt) {\n"
-            " const uint N = 624;\n"
-            " const uint M = 397;\n"
-            " const uint MATRIX_A = 0x9908b0dfUL;\n"
-            " const uint UPPER_MASK=0x80000000UL;\n"
-            " const uint LOWER_MASK=0x7fffffffUL;\n"
-            " const uint mag01[2]={0x0UL, MATRIX_A};\n"
-            " uint kk;\n"
-            " uint y;\n"
+        std::string kernelSourceTwist = "__kernel void ore_twist(__global ulong* mt) {\n"
+            " const ulong N = 624;\n"
+            " const ulong M = 397;\n"
+            " const ulong MATRIX_A = 0x9908b0dfUL;\n"
+            " const ulong UPPER_MASK=0x80000000UL;\n"
+            " const ulong LOWER_MASK=0x7fffffffUL;\n"
+            " const ulong mag01[2]={0x0UL, MATRIX_A};\n"
+            " ulong kk;\n"
+            " ulong y;\n"
             " for (kk=0;kk<N-M;++kk) {\n"
             "     y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);\n"
             "     mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];\n"
@@ -612,14 +612,14 @@ void OpenClContext::updateVariatesPool() {
             "}\n\n";
 
         std::string kernelSourceGenerate =
-            "__kernel void ore_generate(const uint offset, __global uint* mt, __global " + fpTypeStr + "* output) {\n"
-            "   uint mti = get_global_id(0);\n"
-            "   uint y = mt[mti];\n"
+            "__kernel void ore_generate(const ulong offset, __global ulong* mt, __global " + fpTypeStr + "* output) {\n"
+            "   ulong mti = get_global_id(0);\n"
+            "   ulong y = mt[mti];\n"
             "   y ^= (y >> 11);\n"
             "   y ^= (y << 7) & 0x9d2c5680U;\n"
             "   y ^= (y << 15) & 0xefc60000U;\n"
             "   y ^= (y >> 18);\n"
-            "   output[offset + mti] = ore_invCumN(y);\n"
+            "   output[offset + mti] = ore_invCumN((uint)y);\n"
             "}\n\n";
         // clang-format on
 
@@ -653,79 +653,98 @@ void OpenClContext::updateVariatesPool() {
         QL_REQUIRE(err == CL_SUCCESS,
                    "OpenClContext::updateVariatesPool(): error creating kernel generate: " << errorText(err));
 
-        variatesMtStateBuffer_ = clCreateBuffer(context_, CL_MEM_READ_WRITE, fpSize * mt_N, NULL, &err);
+        variatesMtStateBuffer_ = clCreateBuffer(context_, CL_MEM_READ_WRITE, sizeof(cl_ulong) * mt_N, NULL, &err);
         QL_REQUIRE(err == CL_SUCCESS,
                    "OpenClContext::updateVariatesPool(): error creating mt state buffer: " << errorText(err));
 
-        cl_uint tmpSeed = (cl_uint)settings_.rngSeed;
-        err = clSetKernelArg(variatesKernelSeedInit_, 0, sizeof(cl_uint), &tmpSeed);
+        cl_ulong tmpSeed = (cl_ulong)settings_.rngSeed;
+        err = clSetKernelArg(variatesKernelSeedInit_, 0, sizeof(cl_ulong), &tmpSeed);
         err |= clSetKernelArg(variatesKernelSeedInit_, 1, sizeof(cl_mem), &variatesMtStateBuffer_);
         QL_REQUIRE(err == CL_SUCCESS,
                    "OpenClContext::updateVariatesPool(): error setting kernel args seed init: " << errorText(err));
 
-        QL_REQUIRE(variatesWaitEvents_.empty(),
-                   "OpenClContext::updateVariatesPool(): internal error, expected empty variatesWaitEvents_ ("
-                       << variatesWaitEvents_.size() << ")");
-        variatesWaitEvents_.push_back(cl_event());
         constexpr std::size_t sizeOne = 1;
-        err = clEnqueueNDRangeKernel(queue_, variatesKernelSeedInit_, 1, NULL, &sizeOne, NULL, 0, NULL,
-                                     &variatesWaitEvents_.back());
+        err = clEnqueueNDRangeKernel(queue_, variatesKernelSeedInit_, 1, NULL, &sizeOne, NULL, 0, NULL, &initEvent);
         QL_REQUIRE(err == CL_SUCCESS,
                    "OpenClContext::updateVariatesPool(): error running kernel seed init: " << errorText(err));
-    } else {
-        QL_REQUIRE(!variatesWaitEvents_.empty(),
-                   "OpenClContext::updateVariatesPool(): internal error, expected non-empty variatesWaitEvents_ list.");
+    }
+
+    // if the variates pool is big enough, we exit early
+
+    if (variatesPoolSize_ >= nVariates_ * size_[currentId_ - 1]) {
+        if (variatesPoolSize_ == 0)
+            clWaitForEvents(1, &initEvent);
+        return;
     }
 
     // create new buffer to hold the variates and copy the current buffer contents to the new buffer
 
-    Size alignedSize = 624 * (size_[currentId_ - 1] / 624 + (size_[currentId_ - 1] % 624 == 0 ? 0 : 1));
+    Size alignedSize =
+        624 * (nVariates_ * size_[currentId_ - 1] / 624 + (nVariates_ * size_[currentId_ - 1] % 624 == 0 ? 0 : 1));
 
     cl_int err;
     cl_mem oldBuffer = variatesPool_;
-    variatesPool_ = clCreateBuffer(context_, CL_MEM_READ_WRITE, fpSize * nVariates_ * alignedSize, NULL, &err);
+    variatesPool_ = clCreateBuffer(context_, CL_MEM_READ_WRITE, fpSize * alignedSize, NULL, &err);
     QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::updateVariatesPool(): error creating variates buffer with size "
-                                      << fpSize * nVariates_ * alignedSize << " bytes: " << errorText(err));
+                                      << fpSize * alignedSize << " bytes: " << errorText(err));
+    cl_event copyEvent;
     if (variatesPoolSize_ > 0) {
-        cl_event copyEvent;
-        err = clEnqueueCopyBuffer(queue_, oldBuffer, variatesPool_, 0, 0, fpSize * variatesPoolSize_ * alignedSize, 1,
-                                  &variatesWaitEvents_.back(), &copyEvent);
-        variatesWaitEvents_.push_back(copyEvent);
+        err = clEnqueueCopyBuffer(queue_, oldBuffer, variatesPool_, 0, 0, fpSize * variatesPoolSize_, 0, NULL,
+                                  &copyEvent);
         QL_REQUIRE(err == CL_SUCCESS,
                    "OpenClContext::updateVariatesPool(): error copying existing variates buffer to new buffer: "
                        << errorText(err));
     }
 
     // fill in the new variates
 
-    for (std::size_t currentPoolSize = variatesPoolSize_ * alignedSize; currentPoolSize < nVariates_ * alignedSize;
+    std::size_t currentPoolSize;
+    cl_event generateEvent;
+    bool haveGenerated = false;
+    for (currentPoolSize = variatesPoolSize_; currentPoolSize < nVariates_ * size_[currentId_ - 1];
          currentPoolSize += mt_N) {
         err = clSetKernelArg(variatesKernelTwist_, 0, sizeof(cl_mem), &variatesMtStateBuffer_);
         QL_REQUIRE(err == CL_SUCCESS,
                    "OpenClContext::updateVariatesPool(): error setting args for kernel twist: " << errorText(err));
         cl_event twistEvent;
-        err = clEnqueueNDRangeKernel(queue_, variatesKernelTwist_, 1, NULL, &mt_N, NULL, 1, &variatesWaitEvents_.back(),
-                                     &twistEvent);
-        variatesWaitEvents_.push_back(twistEvent);
+        err = clEnqueueNDRangeKernel(
+            queue_, variatesKernelTwist_, 1, NULL, &mt_N, NULL, variatesPoolSize_ == 0 || haveGenerated ? 1 : 0,
+            variatesPoolSize_ == 0 ? &initEvent : (haveGenerated ? &generateEvent : NULL), &twistEvent);
         QL_REQUIRE(err == CL_SUCCESS,
                    "OpenClContext::updateVariatesPool(): error running kernel twist: " << errorText(err));
 
-        err = clSetKernelArg(variatesKernelGenerate_, 0, sizeof(cl_uint), &currentPoolSize);
+        err = clSetKernelArg(variatesKernelGenerate_, 0, sizeof(cl_ulong), &currentPoolSize);
         err |= clSetKernelArg(variatesKernelGenerate_, 1, sizeof(cl_mem), &variatesMtStateBuffer_);
         err |= clSetKernelArg(variatesKernelGenerate_, 2, sizeof(cl_mem), &variatesPool_);
         QL_REQUIRE(err == CL_SUCCESS,
                    "OpenClContext::updateVariatesPool(): error settings args for kernel generate: " << errorText(err));
-        cl_event generateEvent;
-        err = clEnqueueNDRangeKernel(queue_, variatesKernelGenerate_, 1, NULL, &mt_N, NULL, 1,
-                                     &variatesWaitEvents_.back(), &generateEvent);
-        variatesWaitEvents_.push_back(generateEvent);
+        err = clEnqueueNDRangeKernel(queue_, variatesKernelGenerate_, 1, NULL, &mt_N, NULL, 1, &twistEvent,
+                                     &generateEvent);
         QL_REQUIRE(err == CL_SUCCESS,
                    "OpenClContext::updateVariatesPool(): error running kernel generate: " << errorText(err));
+        haveGenerated = true;
     }
 
+    // wait for events to finish
+
+    std::vector<cl_event> waitList;
+    if (variatesPoolSize_ > 0)
+        waitList.push_back(copyEvent);
+    if (haveGenerated)
+        waitList.push_back(generateEvent);
+    if (!waitList.empty())
+        clWaitForEvents(waitList.size(), &waitList[0]);
+
+    // release old buffer
+
+    if (variatesPoolSize_ > 0)
+        releaseMem(oldBuffer);
+
     // update current variates pool size
 
-    variatesPoolSize_ = nVariates_;
+    QL_REQUIRE(currentPoolSize == alignedSize, "OpenClContext::updateVariatesPool(): internal error, currentPoolSize = "
+                                                   << currentPoolSize << " does not match alignedSize " << alignedSize);
+    variatesPoolSize_ = currentPoolSize;
 }
 
 std::vector<std::vector<std::size_t>> OpenClContext::createInputVariates(const std::size_t dim,
@@ -745,8 +764,7 @@ std::vector<std::vector<std::size_t>> OpenClContext::createInputVariates(const s
         }
     }
     nVariates_ += dim * steps;
-    if (nVariates_ > variatesPoolSize_)
-        updateVariatesPool();
+    updateVariatesPool();
     return resultIds;
 }
 
@@ -780,10 +798,10 @@ std::size_t OpenClContext::applyOperation(const std::size_t randomVariableOpCode
     std::vector<std::string> argStr(args.size());
     for (std::size_t i = 0; i < args.size(); ++i) {
         if (args[i] < inputVarOffset_.size()) {
-            argStr[i] = "input[" + std::to_string(inputVarOffset_[args[i]]) + "UL" +
+            argStr[i] = "input[" + std::to_string(inputVarOffset_[args[i]]) + "U" +
                         (inputVarIsScalar_[args[i]] ? "]" : " + i]");
         } else if (args[i] < inputVarOffset_.size() + nVariates_) {
-            argStr[i] = "rn[" + std::to_string((args[i] - inputVarOffset_.size()) * size_[currentId_ - 1]) + " + i]";
+            argStr[i] = "rn[" + std::to_string((args[i] - inputVarOffset_.size()) * size_[currentId_ - 1]) + "U + i]";
         } else {
             // variable is an (intermediate) result
             argStr[i] = "v" + std::to_string(args[i]);
@@ -994,11 +1012,11 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
             "ore_kernel_" + std::to_string(currentId_) + "_" + std::to_string(version_[currentId_ - 1]);
 
         std::vector<std::string> inputArgs;
-        if(inputBufferSize > 0)
+        if (inputBufferSize > 0)
             inputArgs.push_back("__global " + fpTypeStr + "* input");
-        if(nVariates_ > 0)
+        if (nVariates_ > 0)
             inputArgs.push_back("__global " + fpTypeStr + "* rn");
-        if(outputBufferSize > 0)
+        if (outputBufferSize > 0)
             inputArgs.push_back("__global " + fpTypeStr + "* output");
 
         std::string kernelSource = includeSource + "__kernel void " + kernelName + "(" + boost::join(inputArgs, ",") +
@@ -1013,13 +1031,12 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
             std::size_t offset = i * size_[currentId_ - 1];
             std::string output;
             if (outputVariables_[i] < inputVarOffset_.size()) {
-                output = "input[" + std::to_string(outputVariables_[i]) + "UL" +
+                output = "input[" + std::to_string(outputVariables_[i]) + "U" +
                          (inputVarIsScalar_[outputVariables_[i]] ? "]" : " + i] ");
-            }
-            else if(outputVariables_[i] < inputVarOffset_.size() + nVariates_) {
+            } else if (outputVariables_[i] < inputVarOffset_.size() + nVariates_) {
                 output = "rn[" +
                          std::to_string((outputVariables_[i] - inputVarOffset_.size()) * size_[currentId_ - 1]) +
-                         " + i]";
+                         "U + i]";
             } else {
                 output = "v" + std::to_string(outputVariables_[i]);
             }
@@ -1114,8 +1131,6 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
     std::vector<cl_event> runWaitEvents;
     if (inputBufferSize > 0)
         runWaitEvents.push_back(inputBufferEvent);
-    if (!variatesWaitEvents_.empty())
-        runWaitEvents.push_back(variatesWaitEvents_.back());
 
     cl_event runEvent;
     err = clEnqueueNDRangeKernel(queue_, kernel_[currentId_ - 1], 1, NULL, &size_[currentId_ - 1], NULL,
@@ -1142,7 +1157,7 @@ void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
         }
         for (std::size_t i = 0; i < output.size(); ++i) {
             outputBufferEvents.push_back(cl_event());
-            err = clEnqueueReadBuffer(queue_, outputBuffer, CL_FALSE, i * size_[currentId_ - 1],
+            err = clEnqueueReadBuffer(queue_, outputBuffer, CL_FALSE, fpSize * i * size_[currentId_ - 1],
                                       fpSize * size_[currentId_ - 1],
                                       settings_.useDoublePrecision ? (void*)&output[i][0] : (void*)&outputFloat[i][0],
                                       1, &runEvent, &outputBufferEvents.back());
diff --git a/QuantExt/test/computeenvironment.cpp b/QuantExt/test/computeenvironment.cpp
@@ -244,7 +244,7 @@ void checkRngOutput(const std::vector<std::vector<double>>& output) {
         BOOST_TEST_MESSAGE("  mean = " << boost::accumulators::mean(acc)
                                        << ", variance = " << boost::accumulators::variance(acc));
         BOOST_CHECK_SMALL(boost::accumulators::mean(acc), 0.05);
-        BOOST_CHECK_CLOSE(boost::accumulators::variance(acc), 1.0, 1.0);
+        BOOST_CHECK_CLOSE(boost::accumulators::variance(acc), 1.0, 2.0);
     }
 }
 } // namespace
@@ -305,6 +305,47 @@ BOOST_AUTO_TEST_CASE(testReplayFlowError) {
     }
 }
 
+BOOST_AUTO_TEST_CASE(testRngGenerationTmp) {
+    ComputeEnvironmentFixture fixture;
+    const std::size_t n = 1500;
+    for (auto const& d : ComputeEnvironment::instance().getAvailableDevices()) {
+        BOOST_TEST_MESSAGE("testing rng generation on device '" << d << "'.");
+        ComputeEnvironment::instance().selectContext(d);
+        auto& c = ComputeEnvironment::instance().context();
+        ComputeContext::Settings settings;
+        settings.rngSequenceType = QuantExt::SequenceType::MersenneTwister;
+        settings.useDoublePrecision = c.supportsDoublePrecision();
+        BOOST_TEST_MESSAGE("using double precision = " << std::boolalpha << settings.useDoublePrecision);
+        c.initiateCalculation(n, 0, 0, settings);
+        auto vs = c.createInputVariates(1, 1);
+        for (auto const& d : vs) {
+            for (auto const& r : d) {
+                c.declareOutputVariable(r);
+            }
+        }
+        auto vs2 = c.createInputVariates(1, 1);
+        for (auto const& d : vs2) {
+            for (auto const& r : d) {
+                c.declareOutputVariable(r);
+            }
+        }
+        std::vector<std::vector<double>> output(2, std::vector<double>(n));
+        c.finalizeCalculation(output);
+
+        auto sg = GenericPseudoRandom<MersenneTwisterUniformRng, InverseCumulativeNormal>::make_sequence_generator(
+            1, settings.rngSeed);
+
+        double tol = settings.useDoublePrecision ? 1E-7 : 1E-3;
+
+        for (Size j = 0; j < 2; ++j) {
+            for (Size i = 0; i < n; ++i) {
+                Real ref = sg.nextSequence().value[0];
+                BOOST_CHECK_SMALL(output[j][i] - ref, tol);
+            }
+        }
+    }
+}
+
 BOOST_AUTO_TEST_SUITE_END()
 
 BOOST_AUTO_TEST_SUITE_END()