diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 8f88ab18dacbd..9eccb9638d882 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -140,6 +140,9 @@ void OrtModel::initEnvironment() void OrtModel::initSessionFromBuffer(const char* buffer, size_t bufferSize) { + if (mAllocateDeviceMemory) { + memoryOnDevice(mDeviceId); + } mPImplOrt->sessionOptions.AddConfigEntry("session.load_model_format", "ONNX"); mPImplOrt->sessionOptions.AddConfigEntry("session.use_ort_model_bytes_directly", "1"); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 040a4b84a0f64..8628abb1c0374 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -631,34 +631,47 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel) } \ } -void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) +void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions, int32_t stream, int32_t* deviceId) { GPUChkErr(cudaGetDevice(deviceId)); + #if !defined(__HIPCC__) && defined(ORT_CUDA_BUILD) const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION); - OrtCUDAProviderOptionsV2* cuda_options = nullptr; - ORTCHK(api->CreateCUDAProviderOptions(&cuda_options)); +#ifdef ORT_TENSORRT_BUILD + OrtTensorRTProviderOptionsV2* trtOptions = nullptr; + ORTCHK(api->CreateTensorRTProviderOptions(&trtOptions)); + + const std::string device = std::to_string(*deviceId); + const char* keys[] = {"device_id", "trt_int8_enable"}; + const char* values[] = {device.c_str(), "1"}; + + ORTCHK(api->UpdateTensorRTProviderOptions(trtOptions, keys, values, sizeof(keys) / sizeof(keys[0]))); + ORTCHK(api->UpdateTensorRTProviderOptionsWithValue(trtOptions, "user_compute_stream", mInternals->Streams[stream])); + ORTCHK(api->SessionOptionsAppendExecutionProvider_TensorRT_V2(sessionOptions, trtOptions)); // Register TensorRT first: it consequently has higher priority. + api->ReleaseTensorRTProviderOptions(trtOptions); +#endif + + // CUDA is the fallback for nodes unsupported by TensorRT. + OrtCUDAProviderOptionsV2* cudaOptions = nullptr; + ORTCHK(api->CreateCUDAProviderOptions(&cudaOptions)); // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); + ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cudaOptions, "user_compute_stream", mInternals->Streams[stream])); + ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(sessionOptions, cudaOptions)); + api->ReleaseCUDAProviderOptions(cudaOptions); - // this implicitly sets "has_user_compute_stream" - ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream])); - ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options)); - - // Finally, don't forget to release the provider options - api->ReleaseCUDAProviderOptions(cuda_options); #elif defined(ORT_ROCM_BUILD) // const auto& api = Ort::GetApi(); // api.GetCurrentGpuDeviceId(deviceId); - OrtROCMProviderOptions rocm_options; - rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream - rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code + OrtROCMProviderOptions rocmOptions; + rocmOptions.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocmOptions.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit - rocm_options.user_compute_stream = mInternals->Streams[stream]; - session_options.AppendExecutionProvider_ROCM(rocm_options); -#endif // ORT_ROCM_BUILD + rocmOptions.user_compute_stream = mInternals->Streams[stream]; + sessionOptions.AppendExecutionProvider_ROCM(rocmOptions); +#endif } #ifndef __HIPCC__ // CUDA diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index e3327f2f8b661..40f7a34e699c9 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -300,6 +300,7 @@ AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishe AddOption(nnCCDBRegressionLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") AddOption(nnCCDBBeamType, std::string, "pp", "", 0, "Distinguishes between networks trained for different beam types. Options: pp, pPb, PbPb") AddOption(nnCCDBInteractionRate, std::string, "500", "", 0, "Distinguishes between networks for different interaction rates [kHz].") +AddOption(nnCCDBExtraMetadata, std::string, "", "", 0, "Extra metadata to distinguish between networks, e.g. for different internal datatypes, etc.") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv index 97761a86b966b..5a8c7cf4ddaec 100644 --- a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv +++ b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv @@ -76,7 +76,7 @@ GPUMemClean16,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,,,, GPUitoa,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,,,, GPUTPCCFNoiseSuppression_noiseSuppression,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,,,,448 GPUTPCCFNoiseSuppression_updatePeaks,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,,,,448 -GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCCFClusterizer""",,,,,,,,,,,,,,, GPUTPCNNClusterizerKernels_fillInputNNCPU,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,,,, GPUTPCNNClusterizerKernels_fillInputNNGPU,1024,,,,,,,,,,,,,,, GPUTPCNNClusterizerKernels_determineClass1Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,,,, diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 76dcd54a89289..462b7798ce337 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1269,15 +1269,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32); + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32); + } else { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16); } } else if (clustererNNShadow.mNnInferenceInputDType == 1) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32); + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32); + } else { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16); } } if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time... @@ -1289,15 +1289,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32); + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32); + } else { + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16); } - } else if (clustererNNShadow.mNnInferenceInputDType == 1) { + } else { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32); + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32); + } else { + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16); } } if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); } @@ -1305,15 +1305,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32); + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32); + } else { + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16); } } else if (clustererNNShadow.mNnInferenceInputDType == 1) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32); + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32); + } else { + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16); } } if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 6fac0e417ac26..0e77393be1ce3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -30,34 +30,34 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) void* startMem = mem; if (mNnClusterizerBatchedMode > 0) { if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, mInputData_16, mNnClusterizerBatchedMode * mNnClusterizerElementSize); - } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) { computePointerWithAlignment(mem, mInputData_32, mNnClusterizerBatchedMode * mNnClusterizerElementSize); + } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, mInputData_16, mNnClusterizerBatchedMode * mNnClusterizerElementSize); } computePointerWithAlignment(mem, mClusterFlags, 2 * mNnClusterizerBatchedMode); if (mNnInferenceOutputDType == 0 && mNnClusterizerElementSize > 0) { if (mNnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, mModelProbabilities_16, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes); + computePointerWithAlignment(mem, mModelProbabilities_32, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes); } if (!mNnClusterizerUseCfRegression) { if (mNnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, mOutputDataReg1_16, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes); + computePointerWithAlignment(mem, mOutputDataReg1_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes); } if (mNnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, mOutputDataReg2_16, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); + computePointerWithAlignment(mem, mOutputDataReg2_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); } } } else if (mNnInferenceOutputDType == 1 && mNnClusterizerElementSize > 0) { if (mNnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, mModelProbabilities_32, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes); + computePointerWithAlignment(mem, mModelProbabilities_16, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes); } if (!mNnClusterizerUseCfRegression) { if (mNnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, mOutputDataReg1_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes); + computePointerWithAlignment(mem, mOutputDataReg1_16, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes); } if (mNnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, mOutputDataReg2_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); + computePointerWithAlignment(mem, mOutputDataReg2_16, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); } } } @@ -78,26 +78,26 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) // Element counts (number of array entries, not bytes) size_t elemsClusterFlags = (mClusterFlags && mNnClusterizerBatchedMode > 0) ? (size_t)2 * mNnClusterizerBatchedMode : 0; - size_t elemsInput16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; - size_t elemsInput32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; - size_t elemsProb16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; - size_t elemsProb32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; - size_t elemsReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0; - size_t elemsReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0; + size_t elemsInput_32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; + size_t elemsInput_16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; + size_t elemsProb_32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; + size_t elemsProb_16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; size_t elemsReg1_32 = (mOutputDataReg1_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0; + size_t elemsReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0; size_t elemsReg2_32 = (mOutputDataReg2_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0; + size_t elemsReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0; size_t elemsOutputDataClass = (mOutputDataClass && mNnClusterizerTotalClusters > 0) ? (size_t)mNnClusterizerTotalClusters : 0; // Byte sizes size_t szClusterFlags = elemsClusterFlags * sizeof(int8_t); - size_t szInput16 = elemsInput16 * sizeof(OrtDataType::Float16_t); - size_t szInput32 = elemsInput32 * sizeof(float); - size_t szProb16 = elemsProb16 * sizeof(OrtDataType::Float16_t); - size_t szProb32 = elemsProb32 * sizeof(float); - size_t szReg1_16 = elemsReg1_16 * sizeof(OrtDataType::Float16_t); - size_t szReg2_16 = elemsReg2_16 * sizeof(OrtDataType::Float16_t); + size_t szInput_32 = elemsInput_32 * sizeof(float); + size_t szInput_16 = elemsInput_16 * sizeof(OrtDataType::Float16_t); + size_t szProb_32 = elemsProb_32 * sizeof(float); + size_t szProb_16 = elemsProb_16 * sizeof(OrtDataType::Float16_t); size_t szReg1_32 = elemsReg1_32 * sizeof(float); + size_t szReg1_16 = elemsReg1_16 * sizeof(OrtDataType::Float16_t); size_t szReg2_32 = elemsReg2_32 * sizeof(float); + size_t szReg2_16 = elemsReg2_16 * sizeof(OrtDataType::Float16_t); size_t szOutputDataClass = elemsOutputDataClass * sizeof(int32_t); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Pointers set for clusterizer with memoryID " << mMemoryId << " deviceID " << mDeviceId << " and sector " << mISector; @@ -108,11 +108,11 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) << " | elements=" << elemsClusterFlags << " (= 2 * mNnClusterizerBatchedMode)" << " | " << fmt(szClusterFlags); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_16 pointer: " << mInputData_16 - << " | elements=" << elemsInput16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)" - << " | " << fmt(szInput16); + << " | elements=" << elemsInput_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)" + << " | " << fmt(szInput_16); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_16 pointer: " << mModelProbabilities_16 - << " | elements=" << elemsProb16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)" - << " | " << fmt(szProb16); + << " | elements=" << elemsProb_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)" + << " | " << fmt(szProb_16); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_16 pointer: " << mOutputDataReg1_16 << " | elements=" << elemsReg1_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)" << " | " << fmt(szReg1_16); @@ -120,11 +120,11 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) << " | elements=" << elemsReg2_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)" << " | " << fmt(szReg2_16); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_32 pointer: " << mInputData_32 - << " | elements=" << elemsInput32 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)" - << " | " << fmt(szInput32); + << " | elements=" << elemsInput_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)" + << " | " << fmt(szInput_32); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_32 pointer: " << mModelProbabilities_32 - << " | elements=" << elemsProb32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)" - << " | " << fmt(szProb32); + << " | elements=" << elemsProb_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)" + << " | " << fmt(szProb_32); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_32 pointer: " << mOutputDataReg1_32 << " | elements=" << elemsReg1_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)" << " | " << fmt(szReg1_32); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index b7bc1575d349a..7aa23489eb2f4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -89,7 +89,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int8_t* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr int32_t* mOutputDataClass = nullptr; - // FP32 + // FP32, also used for int8 models float* mInputData_32 = nullptr; float* mModelProbabilities_32 = nullptr; float* mOutputDataReg1_32 = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 77d5ee13f85fb..96b8a1d7ed2fd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -123,8 +123,17 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } else { clustererNN.mNnClusterizerVerbosity = settings.nnClusterizerVerbosity; } - clustererNN.mNnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos; - clustererNN.mNnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos; + // Define the datatype for input and output + if (settings.nnInferenceInputDType.find("32") != std::string::npos) { + clustererNN.mNnInferenceInputDType = 0; + } else { + clustererNN.mNnInferenceInputDType = 1; // Default to float16 + } + if (settings.nnInferenceOutputDType.find("32") != std::string::npos) { + clustererNN.mNnInferenceOutputDType = 0; + } else { + clustererNN.mNnInferenceOutputDType = 1; // Default to float16 + } clustererNN.mNnClusterizerModelClassNumOutputNodes = mModelClass.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { if (mModelClass.getNumOutputNodes()[0][1] == 1 || !mModelReg2.isInitialized()) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index ee0fa217b8095..693ee4dd78e8d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -17,7 +17,6 @@ #include "GPUTPCNNClusterizerKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "GPUTPCCFClusterizer.h" #include "GPUTPCGeometry.h" using namespace o2::gpu; @@ -37,6 +36,8 @@ using namespace o2::gpu::tpccf; #include "GPUTPCCFClusterizer.inc" #endif +static_assert(GPUTPCNNClusterizerKernels::SCRATCH_PAD_WORK_GROUP_SIZE == GPUTPCCFClusterizer::SCRATCH_PAD_WORK_GROUP_SIZE, "Work group sizes do not match"); + // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) @@ -48,7 +49,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters - 1)] > 0) : 1); - GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, reinterpret_cast(smem), chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted); + GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted); } template <> @@ -92,17 +93,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.mInputData_16_Test[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) { @@ -116,13 +117,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(sector) / o2::tpc::constants::MAXSECTOR); - clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); - } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; + } else { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); + clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); } } @@ -142,6 +143,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { + // Statically quantized S8S8 ONNX models with graph-contained scaling expose + // FP32 graph boundaries and must use dtype == 0 here. Their QuantizeLinear + // nodes execute on the ONNX GPU stream. dtype == 2 is reserved for models + // with true external INT8 I/O and requires matching external scale metadata. const uint32_t glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; @@ -173,13 +178,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(sector) / o2::tpc::constants::MAXSECTOR); - clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads); - } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads; + } else { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); + clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads); } } @@ -197,9 +202,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) { for (uint32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; - } else { clustererNN.mInputData_32[write_idx] = output_value; + } else { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } write_idx += clustererNN.mNnClusterizerFullTimeSize; } @@ -224,9 +229,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread clustererNN.mNnClassThreshold); - } else if (dtype == 1) { clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)(clustererNN.mModelProbabilities_32[glo_idx] > clustererNN.mNnClassThreshold); + } else { + clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold); } } else { clustererNN.mOutputDataClass[glo_idx + batchStart] = 1; @@ -271,15 +276,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.mModelProbabilities_16[pIdx]); - } else if (dtype == 1) { current_max_prob = clustererNN.mModelProbabilities_32[pIdx]; + } else { + current_max_prob = static_cast(clustererNN.mModelProbabilities_16[pIdx]); } } else { if (dtype == 0) { - current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat()); - } else if (dtype == 1) { current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_32[pIdx]); + } else { + current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat()); } } } @@ -368,25 +373,25 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(); - publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(); + publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index]; + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1]; isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); - pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(), + pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4], publishPadPosition, - notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f, + notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f, (clusterer.mPmemory->fragment).start + publishTimePosition, - notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f, + notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f, clustererNN.mClusterFlags[2 * glo_idx], clustererNN.mClusterFlags[2 * glo_idx + 1]); } else { - publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index]; - publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1]; + publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(); + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(); isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); - pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4], + pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(), publishPadPosition, - notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f, + notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f, (clusterer.mPmemory->fragment).start + publishTimePosition, - notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f, + notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f, clustererNN.mClusterFlags[2 * glo_idx], clustererNN.mClusterFlags[2 * glo_idx + 1]); } @@ -554,17 +559,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(); - publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(); - isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); - pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(), - publishPadPosition, - clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(), - (clusterer.mPmemory->fragment).start + publishTimePosition, - clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(), - clustererNN.mClusterFlags[2 * glo_idx], - clustererNN.mClusterFlags[2 * glo_idx + 1]); - } else if (dtype == 1) { publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index]; publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 1]; isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); @@ -575,6 +569,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(); + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(); + isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); + pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(), + publishPadPosition, + clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(), + (clusterer.mPmemory->fragment).start + publishTimePosition, + clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(), + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); } tpc::ClusterNative myCluster; @@ -608,17 +613,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(); - publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(); - isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); - pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(), - publishPadPosition, - clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(), - (clusterer.mPmemory->fragment).start + publishTimePosition, - clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(), - clustererNN.mClusterFlags[2 * glo_idx], - clustererNN.mClusterFlags[2 * glo_idx + 1]); - } else if (dtype == 1) { publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1]; publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3]; isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); @@ -629,6 +623,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(); + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(); + isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); + pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(), + publishPadPosition, + clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(), + (clusterer.mPmemory->fragment).start + publishTimePosition, + clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(), + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); } rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index c77a99bec3a70..d7194d163f341 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -18,6 +18,7 @@ #include "CfArray2D.h" #include "GPUGeneralKernels.h" #include "GPUTPCNNClusterizer.h" +#include "GPUTPCCFClusterizer.h" namespace o2::tpc { @@ -36,13 +37,8 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate { public: // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below - static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer); - struct GPUSharedMemory { - // Regular cluster finder - CfChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; - PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; - uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; - }; + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFClusterizer); + using GPUSharedMemory = GPUTPCCFClusterizer::GPUSharedMemory; GPUhdi() constexpr static gpudatatypes::RecoStep GetRecoStep() { diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index e54d1dadb6ce2..d928cfdd502c9 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -1263,6 +1263,7 @@ Inputs GPURecoWorkflowSpec::inputs() metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBClassificationLayerType; // FC, CNN metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz metadata["nnCCDBBeamType"] = nnClusterizerSettings.nnCCDBBeamType; // pp, pPb, PbPb + metadata["nnCCDBExtraMetadata"] = nnClusterizerSettings.nnCCDBExtraMetadata; // Extra metadata for CCDB auto convert_map_to_metadata = [](const std::map& inputMap, std::vector& outputMetadata) { for (const auto& [key, value] : inputMap) {