From e7e21b4d3ab49f6f7bbf2863ec81c61378e0143e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 30 Jun 2026 09:28:13 +0200 Subject: [PATCH 1/8] Adding support for int8 inputs --- Common/ML/src/OrtInterface.cxx | 10 ++ GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 60 ++++++-- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 86 +++++++---- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 6 + .../GPUTPCNNClusterizerHost.cxx | 21 ++- .../GPUTPCNNClusterizerKernels.cxx | 135 +++++++++++++----- GPU/Workflow/src/GPUWorkflowSpec.cxx | 1 + 8 files changed, 242 insertions(+), 78 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 8f88ab18dacbd..7e3404a288447 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -354,6 +354,11 @@ template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, float*); template void OrtModel::inference(float*, int64_t, OrtDataType::Float16_t*); template void OrtModel::inference(float*, int64_t, float*); +template void OrtModel::inference(int8_t*, int64_t, int8_t*); +template void OrtModel::inference(int8_t*, int64_t, float*); +template void OrtModel::inference(float*, int64_t, int8_t*); +template void OrtModel::inference(int8_t*, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, int8_t*); template void OrtModel::inference(I** input, int64_t input_size, O* output) @@ -414,6 +419,11 @@ template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, float*); template void OrtModel::inference(float**, int64_t, OrtDataType::Float16_t*); template void OrtModel::inference(float**, int64_t, float*); +template void OrtModel::inference(int8_t**, int64_t, int8_t*); +template void OrtModel::inference(int8_t**, int64_t, float*); +template void OrtModel::inference(float**, int64_t, int8_t*); +template void OrtModel::inference(int8_t**, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, int8_t*); template std::vector OrtModel::inference(std::vector>& inputs) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index e3327f2f8b661..40f7a34e699c9 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -300,6 +300,7 @@ AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishe AddOption(nnCCDBRegressionLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") AddOption(nnCCDBBeamType, std::string, "pp", "", 0, "Distinguishes between networks trained for different beam types. Options: pp, pPb, PbPb") AddOption(nnCCDBInteractionRate, std::string, "500", "", 0, "Distinguishes between networks for different interaction rates [kHz].") +AddOption(nnCCDBExtraMetadata, std::string, "", "", 0, "Extra metadata to distinguish between networks, e.g. for different internal datatypes, etc.") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 76dcd54a89289..76f4c6479287e 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1269,15 +1269,27 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16); + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32); } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32); + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_8); } } else if (clustererNNShadow.mNnInferenceInputDType == 1) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16); + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32); } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32); + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_8); + } + } else if (clustererNNShadow.mNnInferenceInputDType == 2) { + if (clustererNNShadow.mNnInferenceOutputDType == 0) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_32); + } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_8); } } if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time... @@ -1289,15 +1301,27 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16); + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32); } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32); + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_8); } } else if (clustererNNShadow.mNnInferenceInputDType == 1) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16); + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32); } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32); + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_8); + } + } else if (clustererNNShadow.mNnInferenceInputDType == 2) { + if (clustererNNShadow.mNnInferenceOutputDType == 0) { + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_32); + } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_8); } } if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); } @@ -1305,15 +1329,27 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16); + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32); } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32); + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_8); } } else if (clustererNNShadow.mNnInferenceInputDType == 1) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16); + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32); } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32); + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_8); + } + } else if (clustererNNShadow.mNnInferenceInputDType == 2) { + if (clustererNNShadow.mNnInferenceOutputDType == 0) { + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_32); + } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_8); } } if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 6fac0e417ac26..3decd231aec62 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -30,13 +30,27 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) void* startMem = mem; if (mNnClusterizerBatchedMode > 0) { if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, mInputData_16, mNnClusterizerBatchedMode * mNnClusterizerElementSize); - } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) { computePointerWithAlignment(mem, mInputData_32, mNnClusterizerBatchedMode * mNnClusterizerElementSize); + } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, mInputData_16, mNnClusterizerBatchedMode * mNnClusterizerElementSize); + } else if (mNnInferenceInputDType == 2 && mNnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, mInputData_8, mNnClusterizerBatchedMode * mNnClusterizerElementSize); } computePointerWithAlignment(mem, mClusterFlags, 2 * mNnClusterizerBatchedMode); if (mNnInferenceOutputDType == 0 && mNnClusterizerElementSize > 0) { + if (mNnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, mModelProbabilities_32, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes); + } + if (!mNnClusterizerUseCfRegression) { + if (mNnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, mOutputDataReg1_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes); + } + if (mNnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, mOutputDataReg2_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); + } + } + } else if (mNnInferenceOutputDType == 1 && mNnClusterizerElementSize > 0) { if (mNnClusterizerModelClassNumOutputNodes > 0) { computePointerWithAlignment(mem, mModelProbabilities_16, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes); } @@ -48,16 +62,16 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) computePointerWithAlignment(mem, mOutputDataReg2_16, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); } } - } else if (mNnInferenceOutputDType == 1 && mNnClusterizerElementSize > 0) { + } else if (mNnInferenceOutputDType == 2 && mNnClusterizerElementSize > 0) { if (mNnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, mModelProbabilities_32, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes); + computePointerWithAlignment(mem, mModelProbabilities_8, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes); } if (!mNnClusterizerUseCfRegression) { if (mNnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, mOutputDataReg1_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes); + computePointerWithAlignment(mem, mOutputDataReg1_8, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes); } if (mNnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, mOutputDataReg2_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); + computePointerWithAlignment(mem, mOutputDataReg2_8, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); } } } @@ -78,26 +92,34 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) // Element counts (number of array entries, not bytes) size_t elemsClusterFlags = (mClusterFlags && mNnClusterizerBatchedMode > 0) ? (size_t)2 * mNnClusterizerBatchedMode : 0; - size_t elemsInput16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; - size_t elemsInput32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; - size_t elemsProb16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; - size_t elemsProb32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; - size_t elemsReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0; - size_t elemsReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0; + size_t elemsInput_32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; + size_t elemsInput_16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; + size_t elemsInput_8 = (mInputData_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; + size_t elemsProb_32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; + size_t elemsProb_16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; + size_t elemsProb_8 = (mModelProbabilities_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; size_t elemsReg1_32 = (mOutputDataReg1_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0; + size_t elemsReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0; + size_t elemsReg1_8 = (mOutputDataReg1_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0; size_t elemsReg2_32 = (mOutputDataReg2_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0; + size_t elemsReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0; + size_t elemsReg2_8 = (mOutputDataReg2_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0; size_t elemsOutputDataClass = (mOutputDataClass && mNnClusterizerTotalClusters > 0) ? (size_t)mNnClusterizerTotalClusters : 0; // Byte sizes size_t szClusterFlags = elemsClusterFlags * sizeof(int8_t); - size_t szInput16 = elemsInput16 * sizeof(OrtDataType::Float16_t); - size_t szInput32 = elemsInput32 * sizeof(float); - size_t szProb16 = elemsProb16 * sizeof(OrtDataType::Float16_t); - size_t szProb32 = elemsProb32 * sizeof(float); - size_t szReg1_16 = elemsReg1_16 * sizeof(OrtDataType::Float16_t); - size_t szReg2_16 = elemsReg2_16 * sizeof(OrtDataType::Float16_t); + size_t szInput_32 = elemsInput_32 * sizeof(float); + size_t szInput_16 = elemsInput_16 * sizeof(OrtDataType::Float16_t); + size_t szInput_8 = elemsInput_8 * sizeof(int8_t); + size_t szProb_32 = elemsProb_32 * sizeof(float); + size_t szProb_16 = elemsProb_16 * sizeof(OrtDataType::Float16_t); + size_t szProb_8 = elemsProb_8 * sizeof(int8_t); size_t szReg1_32 = elemsReg1_32 * sizeof(float); + size_t szReg1_16 = elemsReg1_16 * sizeof(OrtDataType::Float16_t); + size_t szReg1_8 = elemsReg1_8 * sizeof(int8_t); size_t szReg2_32 = elemsReg2_32 * sizeof(float); + size_t szReg2_16 = elemsReg2_16 * sizeof(OrtDataType::Float16_t); + size_t szReg2_8 = elemsReg2_8 * sizeof(int8_t); size_t szOutputDataClass = elemsOutputDataClass * sizeof(int32_t); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Pointers set for clusterizer with memoryID " << mMemoryId << " deviceID " << mDeviceId << " and sector " << mISector; @@ -107,12 +129,24 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mClusterFlags pointer: " << static_cast(mClusterFlags) << " | elements=" << elemsClusterFlags << " (= 2 * mNnClusterizerBatchedMode)" << " | " << fmt(szClusterFlags); + LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_8 pointer: " << mInputData_8 + << " | elements=" << elemsInput_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)" + << " | " << fmt(szInput_8); + LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_8 pointer: " << mModelProbabilities_8 + << " | elements=" << elemsProb_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)" + << " | " << fmt(szProb_8); + LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_8 pointer: " << mOutputDataReg1_8 + << " | elements=" << elemsReg1_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)" + << " | " << fmt(szReg1_8); + LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_8 pointer: " << mOutputDataReg2_8 + << " | elements=" << elemsReg2_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)" + << " | " << fmt(szReg2_8); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_16 pointer: " << mInputData_16 - << " | elements=" << elemsInput16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)" - << " | " << fmt(szInput16); + << " | elements=" << elemsInput_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)" + << " | " << fmt(szInput_16); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_16 pointer: " << mModelProbabilities_16 - << " | elements=" << elemsProb16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)" - << " | " << fmt(szProb16); + << " | elements=" << elemsProb_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)" + << " | " << fmt(szProb_16); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_16 pointer: " << mOutputDataReg1_16 << " | elements=" << elemsReg1_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)" << " | " << fmt(szReg1_16); @@ -120,11 +154,11 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) << " | elements=" << elemsReg2_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)" << " | " << fmt(szReg2_16); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_32 pointer: " << mInputData_32 - << " | elements=" << elemsInput32 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)" - << " | " << fmt(szInput32); + << " | elements=" << elemsInput_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)" + << " | " << fmt(szInput_32); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_32 pointer: " << mModelProbabilities_32 - << " | elements=" << elemsProb32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)" - << " | " << fmt(szProb32); + << " | elements=" << elemsProb_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)" + << " | " << fmt(szProb_32); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_32 pointer: " << mOutputDataReg1_32 << " | elements=" << elemsReg1_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)" << " | " << fmt(szReg1_32); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index b7bc1575d349a..9ef1a658a9c3a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -101,6 +101,12 @@ class GPUTPCNNClusterizer : public GPUProcessor OrtDataType::Float16_t* mOutputDataReg1_16 = nullptr; OrtDataType::Float16_t* mOutputDataReg2_16 = nullptr; + // INT8 + int8_t* mInputData_8 = nullptr; + int8_t* mModelProbabilities_8 = nullptr; + int8_t* mOutputDataReg1_8 = nullptr; + int8_t* mOutputDataReg2_8 = nullptr; + int16_t mMemoryId = -1; }; // class GPUTPCNNClusterizer diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 77d5ee13f85fb..f4f52d7ec3298 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -123,8 +123,25 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } else { clustererNN.mNnClusterizerVerbosity = settings.nnClusterizerVerbosity; } - clustererNN.mNnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos; - clustererNN.mNnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos; + // Define the datatype for input and output + if (settings.nnInferenceInputDType.find("32") != std::string::npos) { + clustererNN.mNnInferenceInputDType = 0; + } else if (settings.nnInferenceInputDType.find("16") != std::string::npos) { + clustererNN.mNnInferenceInputDType = 1; + } else if (settings.nnInferenceInputDType.find("8") != std::string::npos) { + clustererNN.mNnInferenceInputDType = 2; + } else { + clustererNN.mNnInferenceInputDType = 1; // Default to float32 + } + if (settings.nnInferenceOutputDType.find("32") != std::string::npos) { + clustererNN.mNnInferenceOutputDType = 0; + } else if (settings.nnInferenceOutputDType.find("16") != std::string::npos) { + clustererNN.mNnInferenceOutputDType = 1; + } else if (settings.nnInferenceOutputDType.find("8") != std::string::npos) { + clustererNN.mNnInferenceOutputDType = 2; + } else { + clustererNN.mNnInferenceOutputDType = 1; // Default to float32 + } clustererNN.mNnClusterizerModelClassNumOutputNodes = mModelClass.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { if (mModelClass.getNumOutputNodes()[0][1] == 1 || !mModelReg2.isInitialized()) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index ee0fa217b8095..a33a9eaed58ef 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -92,17 +92,21 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.mInputData_16_Test[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) { @@ -116,13 +120,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(sector) / o2::tpc::constants::MAXSECTOR); - clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); - } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; + } else if (dtype == 1) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); + clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); + } else if (dtype == 2) { + clustererNN.mInputData_8[write_idx] = (int8_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); + clustererNN.mInputData_8[write_idx + 1] = (int8_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); + clustererNN.mInputData_8[write_idx + 2] = (int8_t)(static_cast(pad) / npads_row); } } @@ -142,6 +150,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { + // Statically quantized S8S8 ONNX models with graph-contained scaling expose + // FP32 graph boundaries and must use dtype == 0 here. Their QuantizeLinear + // nodes execute on the ONNX GPU stream. dtype == 2 is reserved for models + // with true external INT8 I/O and requires matching external scale metadata. const uint32_t glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; @@ -173,13 +185,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(sector) / o2::tpc::constants::MAXSECTOR); - clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads); - } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads; + } else if (dtype == 1) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); + clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads); + } else if (dtype == 2) { + clustererNN.mInputData_8[write_idx] = (int8_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); + clustererNN.mInputData_8[write_idx + 1] = (int8_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); + clustererNN.mInputData_8[write_idx + 2] = (int8_t)(static_cast(pad) / npads); } } @@ -197,9 +213,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) { for (uint32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { if (dtype == 0) { + clustererNN.mInputData_32[write_idx] = output_value; + } else if (dtype == 1) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - clustererNN.mInputData_32[write_idx] = output_value; + clustererNN.mInputData_8[write_idx] = (int8_t)output_value; } write_idx += clustererNN.mNnClusterizerFullTimeSize; } @@ -224,9 +242,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread clustererNN.mNnClassThreshold); - } else if (dtype == 1) { clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)(clustererNN.mModelProbabilities_32[glo_idx] > clustererNN.mNnClassThreshold); + } else if (dtype == 1) { + clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold); + } else if (dtype == 2) { + clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)(clustererNN.mModelProbabilities_8[glo_idx] > clustererNN.mNnClassThreshold); } } else { clustererNN.mOutputDataClass[glo_idx + batchStart] = 1; @@ -271,15 +293,19 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.mModelProbabilities_16[pIdx]); - } else if (dtype == 1) { current_max_prob = clustererNN.mModelProbabilities_32[pIdx]; + } else if (dtype == 1) { + current_max_prob = static_cast(clustererNN.mModelProbabilities_16[pIdx]); + } else if (dtype == 2) { + current_max_prob = static_cast(clustererNN.mModelProbabilities_8[pIdx]); } } else { if (dtype == 0) { - current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat()); - } else if (dtype == 1) { current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_32[pIdx]); + } else if (dtype == 1) { + current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat()); + } else if (dtype == 2) { + current_max_prob = CAMath::Max(current_max_prob, static_cast(clustererNN.mModelProbabilities_8[pIdx])); } } } @@ -368,6 +394,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index]; + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1]; + isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); + pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4], + publishPadPosition, + notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f, + (clusterer.mPmemory->fragment).start + publishTimePosition, + notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f, + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(); publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(); isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); @@ -378,15 +415,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index]; - publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1]; + } else if (dtype == 2) { + publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg1_8[model_output_index]; + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_8[model_output_index + 1]; isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); - pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4], + pc.setFull(central_charge * clustererNN.mOutputDataReg1_8[model_output_index + 4], publishPadPosition, - notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f, + notSinglePad ? clustererNN.mOutputDataReg1_8[model_output_index + 2] : 0.f, (clusterer.mPmemory->fragment).start + publishTimePosition, - notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f, + notSingleTime ? clustererNN.mOutputDataReg1_8[model_output_index + 3] : 0.f, clustererNN.mClusterFlags[2 * glo_idx], clustererNN.mClusterFlags[2 * glo_idx + 1]); } @@ -554,6 +591,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index]; + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 1]; + isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); + pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8], + publishPadPosition, + clustererNN.mOutputDataReg2_32[model_output_index + 4], + (clusterer.mPmemory->fragment).start + publishTimePosition, + clustererNN.mOutputDataReg2_32[model_output_index + 6], + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(); publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(); isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); @@ -564,15 +612,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index]; - publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 1]; + } else if (dtype == 2) { + publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg2_8[model_output_index]; + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_8[model_output_index + 1]; isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); - pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8], + pc.setFull(central_charge * clustererNN.mOutputDataReg2_8[model_output_index + 8], publishPadPosition, - clustererNN.mOutputDataReg2_32[model_output_index + 4], + clustererNN.mOutputDataReg2_8[model_output_index + 4], (clusterer.mPmemory->fragment).start + publishTimePosition, - clustererNN.mOutputDataReg2_32[model_output_index + 6], + clustererNN.mOutputDataReg2_8[model_output_index + 6], clustererNN.mClusterFlags[2 * glo_idx], clustererNN.mClusterFlags[2 * glo_idx + 1]); } @@ -608,6 +656,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1]; + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3]; + isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); + pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9], + publishPadPosition, + clustererNN.mOutputDataReg2_32[model_output_index + 5], + (clusterer.mPmemory->fragment).start + publishTimePosition, + clustererNN.mOutputDataReg2_32[model_output_index + 7], + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(); publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(); isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); @@ -618,15 +677,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1]; - publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3]; + } else if (dtype == 2) { + publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg2_8[model_output_index + 1]; + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_8[model_output_index + 3]; isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); - pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9], + pc.setFull(central_charge * clustererNN.mOutputDataReg2_8[model_output_index + 9], publishPadPosition, - clustererNN.mOutputDataReg2_32[model_output_index + 5], + clustererNN.mOutputDataReg2_8[model_output_index + 5], (clusterer.mPmemory->fragment).start + publishTimePosition, - clustererNN.mOutputDataReg2_32[model_output_index + 7], + clustererNN.mOutputDataReg2_8[model_output_index + 7], clustererNN.mClusterFlags[2 * glo_idx], clustererNN.mClusterFlags[2 * glo_idx + 1]); } diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index e54d1dadb6ce2..d928cfdd502c9 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -1263,6 +1263,7 @@ Inputs GPURecoWorkflowSpec::inputs() metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBClassificationLayerType; // FC, CNN metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz metadata["nnCCDBBeamType"] = nnClusterizerSettings.nnCCDBBeamType; // pp, pPb, PbPb + metadata["nnCCDBExtraMetadata"] = nnClusterizerSettings.nnCCDBExtraMetadata; // Extra metadata for CCDB auto convert_map_to_metadata = [](const std::map& inputMap, std::vector& outputMetadata) { for (const auto& [key, value] : inputMap) { From f09b710dcffc1b94d09dac64b2894b4214059ba6 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 30 Jun 2026 13:10:33 +0200 Subject: [PATCH 2/8] Reverting int8 changes as int8 computations are taken care of in the ONNX graph due to requirement of additional scaling --- Common/ML/src/OrtInterface.cxx | 13 +-- .../Base/cuda/GPUReconstructionCUDA.cu | 48 ++++++----- .../Global/GPUChainTrackingClusterizer.cxx | 50 ++---------- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 34 -------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 8 +- .../GPUTPCNNClusterizerKernels.cxx | 79 +++---------------- 6 files changed, 50 insertions(+), 182 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 7e3404a288447..9eccb9638d882 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -140,6 +140,9 @@ void OrtModel::initEnvironment() void OrtModel::initSessionFromBuffer(const char* buffer, size_t bufferSize) { + if (mAllocateDeviceMemory) { + memoryOnDevice(mDeviceId); + } mPImplOrt->sessionOptions.AddConfigEntry("session.load_model_format", "ONNX"); mPImplOrt->sessionOptions.AddConfigEntry("session.use_ort_model_bytes_directly", "1"); @@ -354,11 +357,6 @@ template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, float*); template void OrtModel::inference(float*, int64_t, OrtDataType::Float16_t*); template void OrtModel::inference(float*, int64_t, float*); -template void OrtModel::inference(int8_t*, int64_t, int8_t*); -template void OrtModel::inference(int8_t*, int64_t, float*); -template void OrtModel::inference(float*, int64_t, int8_t*); -template void OrtModel::inference(int8_t*, int64_t, OrtDataType::Float16_t*); -template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, int8_t*); template void OrtModel::inference(I** input, int64_t input_size, O* output) @@ -419,11 +417,6 @@ template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, float*); template void OrtModel::inference(float**, int64_t, OrtDataType::Float16_t*); template void OrtModel::inference(float**, int64_t, float*); -template void OrtModel::inference(int8_t**, int64_t, int8_t*); -template void OrtModel::inference(int8_t**, int64_t, float*); -template void OrtModel::inference(float**, int64_t, int8_t*); -template void OrtModel::inference(int8_t**, int64_t, OrtDataType::Float16_t*); -template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, int8_t*); template std::vector OrtModel::inference(std::vector>& inputs) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 040a4b84a0f64..23847fad18c9f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -631,34 +631,40 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel) } \ } -void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) -{ +void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions, int32_t stream, int32_t* deviceId) { GPUChkErr(cudaGetDevice(deviceId)); + #if !defined(__HIPCC__) && defined(ORT_CUDA_BUILD) const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION); - OrtCUDAProviderOptionsV2* cuda_options = nullptr; - ORTCHK(api->CreateCUDAProviderOptions(&cuda_options)); - // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; - // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; - // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); +#ifdef ORT_TENSORRT_BUILD + OrtTensorRTProviderOptionsV2* trtOptions = nullptr; + ORTCHK(api->CreateTensorRTProviderOptions(&trtOptions)); + + const std::string device = std::to_string(*deviceId); + const char* keys[] = {"device_id", "trt_int8_enable"}; + const char* values[] = {device.c_str(), "1"}; - // this implicitly sets "has_user_compute_stream" - ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream])); - ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options)); + ORTCHK(api->UpdateTensorRTProviderOptions(trtOptions,keys,values,sizeof(keys) / sizeof(keys[0]))); + ORTCHK(api->UpdateTensorRTProviderOptionsWithValue(trtOptions,"user_compute_stream",mInternals->Streams[stream])); + ORTCHK(api->SessionOptionsAppendExecutionProvider_TensorRT_V2(sessionOptions,trtOptions)); // Register TensorRT first: it consequently has higher priority. + api->ReleaseTensorRTProviderOptions(trtOptions); +#endif + + // CUDA is the fallback for nodes unsupported by TensorRT. + OrtCUDAProviderOptionsV2* cudaOptions = nullptr; + ORTCHK(api->CreateCUDAProviderOptions(&cudaOptions)); + ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cudaOptions,"user_compute_stream",mInternals->Streams[stream])); + ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(sessionOptions,cudaOptions)); + api->ReleaseCUDAProviderOptions(cudaOptions); - // Finally, don't forget to release the provider options - api->ReleaseCUDAProviderOptions(cuda_options); #elif defined(ORT_ROCM_BUILD) - // const auto& api = Ort::GetApi(); - // api.GetCurrentGpuDeviceId(deviceId); - OrtROCMProviderOptions rocm_options; - rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream - rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code - // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit - rocm_options.user_compute_stream = mInternals->Streams[stream]; - session_options.AppendExecutionProvider_ROCM(rocm_options); -#endif // ORT_ROCM_BUILD + OrtROCMProviderOptions rocmOptions; + rocmOptions.has_user_compute_stream = 1; + rocmOptions.arena_extend_strategy = 0; + rocmOptions.user_compute_stream = mInternals->Streams[stream]; + sessionOptions.AppendExecutionProvider_ROCM(rocmOptions); +#endif } #ifndef __HIPCC__ // CUDA diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 76f4c6479287e..462b7798ce337 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1270,26 +1270,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + } else { (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_8); } } else if (clustererNNShadow.mNnInferenceInputDType == 1) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + } else { (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_8); - } - } else if (clustererNNShadow.mNnInferenceInputDType == 2) { - if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_32); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_8); } } if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time... @@ -1302,26 +1290,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + } else { (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_8); } - } else if (clustererNNShadow.mNnInferenceInputDType == 1) { + } else { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + } else { (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_8); - } - } else if (clustererNNShadow.mNnInferenceInputDType == 2) { - if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_32); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { - (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_8); } } if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); } @@ -1330,26 +1306,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + } else { (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_8); } } else if (clustererNNShadow.mNnInferenceInputDType == 1) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + } else { (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_8); - } - } else if (clustererNNShadow.mNnInferenceInputDType == 2) { - if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_32); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 2) { - (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_8); } } if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 3decd231aec62..0e77393be1ce3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -33,8 +33,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) computePointerWithAlignment(mem, mInputData_32, mNnClusterizerBatchedMode * mNnClusterizerElementSize); } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) { computePointerWithAlignment(mem, mInputData_16, mNnClusterizerBatchedMode * mNnClusterizerElementSize); - } else if (mNnInferenceInputDType == 2 && mNnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, mInputData_8, mNnClusterizerBatchedMode * mNnClusterizerElementSize); } computePointerWithAlignment(mem, mClusterFlags, 2 * mNnClusterizerBatchedMode); @@ -62,18 +60,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) computePointerWithAlignment(mem, mOutputDataReg2_16, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); } } - } else if (mNnInferenceOutputDType == 2 && mNnClusterizerElementSize > 0) { - if (mNnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, mModelProbabilities_8, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes); - } - if (!mNnClusterizerUseCfRegression) { - if (mNnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, mOutputDataReg1_8, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes); - } - if (mNnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, mOutputDataReg2_8, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); - } - } } } if (mNnClusterizerTotalClusters > 0) { @@ -94,32 +80,24 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) size_t elemsClusterFlags = (mClusterFlags && mNnClusterizerBatchedMode > 0) ? (size_t)2 * mNnClusterizerBatchedMode : 0; size_t elemsInput_32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; size_t elemsInput_16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; - size_t elemsInput_8 = (mInputData_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0; size_t elemsProb_32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; size_t elemsProb_16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; - size_t elemsProb_8 = (mModelProbabilities_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0; size_t elemsReg1_32 = (mOutputDataReg1_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0; size_t elemsReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0; - size_t elemsReg1_8 = (mOutputDataReg1_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0; size_t elemsReg2_32 = (mOutputDataReg2_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0; size_t elemsReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0; - size_t elemsReg2_8 = (mOutputDataReg2_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0; size_t elemsOutputDataClass = (mOutputDataClass && mNnClusterizerTotalClusters > 0) ? (size_t)mNnClusterizerTotalClusters : 0; // Byte sizes size_t szClusterFlags = elemsClusterFlags * sizeof(int8_t); size_t szInput_32 = elemsInput_32 * sizeof(float); size_t szInput_16 = elemsInput_16 * sizeof(OrtDataType::Float16_t); - size_t szInput_8 = elemsInput_8 * sizeof(int8_t); size_t szProb_32 = elemsProb_32 * sizeof(float); size_t szProb_16 = elemsProb_16 * sizeof(OrtDataType::Float16_t); - size_t szProb_8 = elemsProb_8 * sizeof(int8_t); size_t szReg1_32 = elemsReg1_32 * sizeof(float); size_t szReg1_16 = elemsReg1_16 * sizeof(OrtDataType::Float16_t); - size_t szReg1_8 = elemsReg1_8 * sizeof(int8_t); size_t szReg2_32 = elemsReg2_32 * sizeof(float); size_t szReg2_16 = elemsReg2_16 * sizeof(OrtDataType::Float16_t); - size_t szReg2_8 = elemsReg2_8 * sizeof(int8_t); size_t szOutputDataClass = elemsOutputDataClass * sizeof(int32_t); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Pointers set for clusterizer with memoryID " << mMemoryId << " deviceID " << mDeviceId << " and sector " << mISector; @@ -129,18 +107,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mClusterFlags pointer: " << static_cast(mClusterFlags) << " | elements=" << elemsClusterFlags << " (= 2 * mNnClusterizerBatchedMode)" << " | " << fmt(szClusterFlags); - LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_8 pointer: " << mInputData_8 - << " | elements=" << elemsInput_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)" - << " | " << fmt(szInput_8); - LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_8 pointer: " << mModelProbabilities_8 - << " | elements=" << elemsProb_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)" - << " | " << fmt(szProb_8); - LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_8 pointer: " << mOutputDataReg1_8 - << " | elements=" << elemsReg1_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)" - << " | " << fmt(szReg1_8); - LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_8 pointer: " << mOutputDataReg2_8 - << " | elements=" << elemsReg2_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)" - << " | " << fmt(szReg2_8); LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_16 pointer: " << mInputData_16 << " | elements=" << elemsInput_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)" << " | " << fmt(szInput_16); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 9ef1a658a9c3a..7aa23489eb2f4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -89,7 +89,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int8_t* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr int32_t* mOutputDataClass = nullptr; - // FP32 + // FP32, also used for int8 models float* mInputData_32 = nullptr; float* mModelProbabilities_32 = nullptr; float* mOutputDataReg1_32 = nullptr; @@ -101,12 +101,6 @@ class GPUTPCNNClusterizer : public GPUProcessor OrtDataType::Float16_t* mOutputDataReg1_16 = nullptr; OrtDataType::Float16_t* mOutputDataReg2_16 = nullptr; - // INT8 - int8_t* mInputData_8 = nullptr; - int8_t* mModelProbabilities_8 = nullptr; - int8_t* mOutputDataReg1_8 = nullptr; - int8_t* mOutputDataReg2_8 = nullptr; - int16_t mMemoryId = -1; }; // class GPUTPCNNClusterizer diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index a33a9eaed58ef..882a1e684d746 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -93,20 +93,16 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.mInputData_16_Test[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) { @@ -123,14 +119,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; - } else if (dtype == 1) { + } else { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); - } else if (dtype == 2) { - clustererNN.mInputData_8[write_idx] = (int8_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); - clustererNN.mInputData_8[write_idx + 1] = (int8_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_8[write_idx + 2] = (int8_t)(static_cast(pad) / npads_row); } } @@ -188,14 +180,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads; - } else if (dtype == 1) { + } else { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads); - } else if (dtype == 2) { - clustererNN.mInputData_8[write_idx] = (int8_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); - clustererNN.mInputData_8[write_idx + 1] = (int8_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_8[write_idx + 2] = (int8_t)(static_cast(pad) / npads); } } @@ -214,10 +202,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread clustererNN.mNnClassThreshold); - } else if (dtype == 1) { + } else { clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold); - } else if (dtype == 2) { - clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)(clustererNN.mModelProbabilities_8[glo_idx] > clustererNN.mNnClassThreshold); } } else { clustererNN.mOutputDataClass[glo_idx + batchStart] = 1; @@ -294,18 +276,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.mModelProbabilities_16[pIdx]); - } else if (dtype == 2) { - current_max_prob = static_cast(clustererNN.mModelProbabilities_8[pIdx]); } } else { if (dtype == 0) { current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_32[pIdx]); - } else if (dtype == 1) { + } else { current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat()); - } else if (dtype == 2) { - current_max_prob = CAMath::Max(current_max_prob, static_cast(clustererNN.mModelProbabilities_8[pIdx])); } } } @@ -404,7 +382,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(); publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(); isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); @@ -415,17 +393,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg1_8[model_output_index]; - publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_8[model_output_index + 1]; - isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); - pc.setFull(central_charge * clustererNN.mOutputDataReg1_8[model_output_index + 4], - publishPadPosition, - notSinglePad ? clustererNN.mOutputDataReg1_8[model_output_index + 2] : 0.f, - (clusterer.mPmemory->fragment).start + publishTimePosition, - notSingleTime ? clustererNN.mOutputDataReg1_8[model_output_index + 3] : 0.f, - clustererNN.mClusterFlags[2 * glo_idx], - clustererNN.mClusterFlags[2 * glo_idx + 1]); } // if (boundaryFlag != 0) { // Prints the entire NN input for the given index @@ -601,7 +568,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(); publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(); isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); @@ -612,17 +579,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_8[model_output_index]; - publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_8[model_output_index + 1]; - isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); - pc.setFull(central_charge * clustererNN.mOutputDataReg2_8[model_output_index + 8], - publishPadPosition, - clustererNN.mOutputDataReg2_8[model_output_index + 4], - (clusterer.mPmemory->fragment).start + publishTimePosition, - clustererNN.mOutputDataReg2_8[model_output_index + 6], - clustererNN.mClusterFlags[2 * glo_idx], - clustererNN.mClusterFlags[2 * glo_idx + 1]); } tpc::ClusterNative myCluster; @@ -666,7 +622,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(); publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(); isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); @@ -677,17 +633,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_8[model_output_index + 1]; - publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_8[model_output_index + 3]; - isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); - pc.setFull(central_charge * clustererNN.mOutputDataReg2_8[model_output_index + 9], - publishPadPosition, - clustererNN.mOutputDataReg2_8[model_output_index + 5], - (clusterer.mPmemory->fragment).start + publishTimePosition, - clustererNN.mOutputDataReg2_8[model_output_index + 7], - clustererNN.mClusterFlags[2 * glo_idx], - clustererNN.mClusterFlags[2 * glo_idx + 1]); } rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); From 039f813e4bd5c287da5fab8fa328e1b189547e08 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 30 Jun 2026 13:13:01 +0200 Subject: [PATCH 3/8] Bug-fix --- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index f4f52d7ec3298..96b8a1d7ed2fd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -126,21 +126,13 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust // Define the datatype for input and output if (settings.nnInferenceInputDType.find("32") != std::string::npos) { clustererNN.mNnInferenceInputDType = 0; - } else if (settings.nnInferenceInputDType.find("16") != std::string::npos) { - clustererNN.mNnInferenceInputDType = 1; - } else if (settings.nnInferenceInputDType.find("8") != std::string::npos) { - clustererNN.mNnInferenceInputDType = 2; } else { - clustererNN.mNnInferenceInputDType = 1; // Default to float32 + clustererNN.mNnInferenceInputDType = 1; // Default to float16 } if (settings.nnInferenceOutputDType.find("32") != std::string::npos) { clustererNN.mNnInferenceOutputDType = 0; - } else if (settings.nnInferenceOutputDType.find("16") != std::string::npos) { - clustererNN.mNnInferenceOutputDType = 1; - } else if (settings.nnInferenceOutputDType.find("8") != std::string::npos) { - clustererNN.mNnInferenceOutputDType = 2; } else { - clustererNN.mNnInferenceOutputDType = 1; // Default to float32 + clustererNN.mNnInferenceOutputDType = 1; // Default to float16 } clustererNN.mNnClusterizerModelClassNumOutputNodes = mModelClass.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { From 1023771d2baa3c71ffa8ff768caa8010697b0204 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Tue, 30 Jun 2026 13:01:18 +0000 Subject: [PATCH 4/8] Please consider the following formatting changes --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 23847fad18c9f..7ad12ef90f253 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -631,7 +631,8 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel) } \ } -void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions, int32_t stream, int32_t* deviceId) { +void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions, int32_t stream, int32_t* deviceId) +{ GPUChkErr(cudaGetDevice(deviceId)); #if !defined(__HIPCC__) && defined(ORT_CUDA_BUILD) @@ -645,17 +646,17 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions const char* keys[] = {"device_id", "trt_int8_enable"}; const char* values[] = {device.c_str(), "1"}; - ORTCHK(api->UpdateTensorRTProviderOptions(trtOptions,keys,values,sizeof(keys) / sizeof(keys[0]))); - ORTCHK(api->UpdateTensorRTProviderOptionsWithValue(trtOptions,"user_compute_stream",mInternals->Streams[stream])); - ORTCHK(api->SessionOptionsAppendExecutionProvider_TensorRT_V2(sessionOptions,trtOptions)); // Register TensorRT first: it consequently has higher priority. + ORTCHK(api->UpdateTensorRTProviderOptions(trtOptions, keys, values, sizeof(keys) / sizeof(keys[0]))); + ORTCHK(api->UpdateTensorRTProviderOptionsWithValue(trtOptions, "user_compute_stream", mInternals->Streams[stream])); + ORTCHK(api->SessionOptionsAppendExecutionProvider_TensorRT_V2(sessionOptions, trtOptions)); // Register TensorRT first: it consequently has higher priority. api->ReleaseTensorRTProviderOptions(trtOptions); #endif // CUDA is the fallback for nodes unsupported by TensorRT. OrtCUDAProviderOptionsV2* cudaOptions = nullptr; ORTCHK(api->CreateCUDAProviderOptions(&cudaOptions)); - ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cudaOptions,"user_compute_stream",mInternals->Streams[stream])); - ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(sessionOptions,cudaOptions)); + ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cudaOptions, "user_compute_stream", mInternals->Streams[stream])); + ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(sessionOptions, cudaOptions)); api->ReleaseCUDAProviderOptions(cudaOptions); #elif defined(ORT_ROCM_BUILD) From 599cd6718c7629131c874201fed306833bfe293c Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 2 Jul 2026 10:26:56 +0200 Subject: [PATCH 5/8] Changing memory layout usage --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 3 +-- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.h | 8 ++------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 882a1e684d746..9d114b40997e6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -17,7 +17,6 @@ #include "GPUTPCNNClusterizerKernels.h" #include "GPUConstantMem.h" #include "GPUTPCClusterFinder.h" -#include "GPUTPCCFClusterizer.h" #include "GPUTPCGeometry.h" using namespace o2::gpu; @@ -48,7 +47,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters - 1)] > 0) : 1); - GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, reinterpret_cast(smem), chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted); + GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted); } template <> diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index c77a99bec3a70..15258dbacb115 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -18,6 +18,7 @@ #include "CfArray2D.h" #include "GPUGeneralKernels.h" #include "GPUTPCNNClusterizer.h" +#include "GPUTPCCFClusterizer.h" namespace o2::tpc { @@ -37,12 +38,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate public: // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer); - struct GPUSharedMemory { - // Regular cluster finder - CfChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; - PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; - uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; - }; + using GPUSharedMemory = GPUTPCCFClusterizer::GPUSharedMemory; GPUhdi() constexpr static gpudatatypes::RecoStep GetRecoStep() { From 37cc87f4aa9850e02331cefa91c254dcf476cb48 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 2 Jul 2026 13:02:19 +0200 Subject: [PATCH 6/8] Adjusting launch bounds --- GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 ++ GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv index 97761a86b966b..5a8c7cf4ddaec 100644 --- a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv +++ b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv @@ -76,7 +76,7 @@ GPUMemClean16,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,,,, GPUitoa,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,,,, GPUTPCCFNoiseSuppression_noiseSuppression,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,,,,448 GPUTPCCFNoiseSuppression_updatePeaks,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,,,,448 -GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCCFClusterizer""",,,,,,,,,,,,,,, GPUTPCNNClusterizerKernels_fillInputNNCPU,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,,,, GPUTPCNNClusterizerKernels_fillInputNNGPU,1024,,,,,,,,,,,,,,, GPUTPCNNClusterizerKernels_determineClass1Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,,,, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 9d114b40997e6..693ee4dd78e8d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -36,6 +36,8 @@ using namespace o2::gpu::tpccf; #include "GPUTPCCFClusterizer.inc" #endif +static_assert(GPUTPCNNClusterizerKernels::SCRATCH_PAD_WORK_GROUP_SIZE == GPUTPCCFClusterizer::SCRATCH_PAD_WORK_GROUP_SIZE, "Work group sizes do not match"); + // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index 15258dbacb115..d7194d163f341 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -37,7 +37,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate { public: // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below - static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer); + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFClusterizer); using GPUSharedMemory = GPUTPCCFClusterizer::GPUSharedMemory; GPUhdi() constexpr static gpudatatypes::RecoStep GetRecoStep() From a9b7adcd94c909040bfa3a3c672c28714fb43832 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 2 Jul 2026 13:07:34 +0200 Subject: [PATCH 7/8] Adding back some helpful comments --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 7ad12ef90f253..5c719a50638e6 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -655,14 +655,20 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions // CUDA is the fallback for nodes unsupported by TensorRT. OrtCUDAProviderOptionsV2* cudaOptions = nullptr; ORTCHK(api->CreateCUDAProviderOptions(&cudaOptions)); + // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; + // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; + // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cudaOptions, "user_compute_stream", mInternals->Streams[stream])); ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(sessionOptions, cudaOptions)); api->ReleaseCUDAProviderOptions(cudaOptions); #elif defined(ORT_ROCM_BUILD) + // const auto& api = Ort::GetApi(); + // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocmOptions; - rocmOptions.has_user_compute_stream = 1; - rocmOptions.arena_extend_strategy = 0; + rocmOptions.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocmOptions.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code + // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit rocmOptions.user_compute_stream = mInternals->Streams[stream]; sessionOptions.AppendExecutionProvider_ROCM(rocmOptions); #endif From 9de8fdc40310ba7cae085399a97d7275c2f6d7df Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Thu, 2 Jul 2026 11:11:26 +0000 Subject: [PATCH 8/8] Please consider the following formatting changes --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 5c719a50638e6..8628abb1c0374 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -667,7 +667,7 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocmOptions; rocmOptions.has_user_compute_stream = 1; // Indicate that we are passing a user stream - rocmOptions.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code + rocmOptions.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit rocmOptions.user_compute_stream = mInternals->Streams[stream]; sessionOptions.AppendExecutionProvider_ROCM(rocmOptions);