Skip to content

Commit 49a4983

Browse files
kpoosa authored and rodrigovivi committed
drm/xe/hwmon: Expose individual VRAM channel temperature
Expose individual VRAM temperature attributes.
Update Xe hwmon documentation for this entry.

v2:
- Avoid using default switch case for VRAM individual temperatures.
- Append labels with VRAM channel number.
- Update kernel version in Xe hwmon documentation.

v3:
- Add missing brackets in Xe hwmon documentation from VRAM channel sysfs.
- Reorder BMG_VRAM_TEMPERATURE_N macro in xe_pcode_regs.h.
- Add api to check if VRAM is available on the channel.

v4:
- Improve VRAM label handling to eliminate temp variable by introducing a dedicated array vram_label in xe_hwmon_thermal_info.
- Remove a magic number.
- Change the label from vram_X to vram_ch_X.

v5:
- Address review comments from Raag.
- Change vram to VRAM in commit title and subject.
- Refactor BMG_VRAM_TEMPERATURE_N macro.
- Refactor is_vram_ch_available().
- Rephrase a comment.
- Check individual VRAM temperature limits in addition to VRAM availability in xe_hwmon_temp_is_visible. (Raag)
- Move VRAM label change out of this patch.

v6:
- Use in_range() for VRAM_N index check instead of if check. (Raag)
- Minor aesthetic changes.

Signed-off-by: Karthik Poosa <karthik.poosa@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Link: https://patch.msgid.link/20260112203521.1014388-5-karthik.poosa@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
1 parent 8d25116 commit 49a4983

3 files changed

Lines changed: 91 additions & 0 deletions

File tree

Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,28 @@ KernelVersion: 7.0
211211
Contact: intel-xe@lists.freedesktop.org
212212
Description: RO. GPU PCIe temperature in millidegree Celsius.
213213

214+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp[6-21]_crit
215+
Date: January 2026
216+
KernelVersion: 7.0
217+
Contact: intel-xe@lists.freedesktop.org
218+
Description: RO. VRAM channel critical temperature in millidegree Celsius.
219+
220+
Only supported for particular Intel Xe graphics platforms.
221+
222+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp[6-21]_emergency
223+
Date: January 2026
224+
KernelVersion: 7.0
225+
Contact: intel-xe@lists.freedesktop.org
226+
Description: RO. VRAM channel shutdown temperature in millidegree Celsius.
227+
228+
Only supported for particular Intel Xe graphics platforms.
229+
230+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp[6-21]_input
231+
Date: January 2026
232+
KernelVersion: 7.0
233+
Contact: intel-xe@lists.freedesktop.org
234+
Description: RO. VRAM channel temperature in millidegree Celsius.
235+
214236
Only supported for particular Intel Xe graphics platforms.
215237

216238
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/fan1_input

drivers/gpu/drm/xe/regs/xe_pcode_regs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@
2121
#define BMG_FAN_1_SPEED XE_REG(0x138140)
2222
#define BMG_FAN_2_SPEED XE_REG(0x138170)
2323
#define BMG_FAN_3_SPEED XE_REG(0x1381a0)
24+
#define BMG_VRAM_TEMPERATURE_N(n) XE_REG(0x138260 + ((n) * (sizeof(u32))))
2425
#define BMG_VRAM_TEMPERATURE XE_REG(0x1382c0)
26+
#define TEMP_MASK_VRAM_N REG_GENMASK(30, 8)
27+
#define TEMP_SIGN_MASK REG_BIT(31)
2528
#define BMG_PACKAGE_TEMPERATURE XE_REG(0x138434)
2629

2730
#endif /* _XE_PCODE_REGS_H_ */

drivers/gpu/drm/xe/xe_hwmon.c

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,16 @@ enum xe_hwmon_reg_operation {
3939
REG_READ64,
4040
};
4141

42+
#define MAX_VRAM_CHANNELS (16)
43+
4244
/*
 * hwmon channel indices used by the Xe hwmon code in this file. The
 * per-channel VRAM sensors occupy MAX_VRAM_CHANNELS consecutive values
 * starting at CHANNEL_VRAM_N.
 */
enum xe_hwmon_channel {
4345
CHANNEL_CARD,
4446
CHANNEL_PKG,
4547
CHANNEL_VRAM,
4648
CHANNEL_MCTRL,
4749
CHANNEL_PCIE,
50+
CHANNEL_VRAM_N, /* first per-channel VRAM sensor; (channel - CHANNEL_VRAM_N) is the VRAM channel index */
51+
CHANNEL_VRAM_N_MAX = CHANNEL_VRAM_N + MAX_VRAM_CHANNELS, /* one past the last per-channel VRAM index, not a sensor itself */
4852
CHANNEL_MAX, /* implicitly CHANNEL_VRAM_N_MAX + 1 */
4953
};
5054

@@ -105,6 +109,9 @@ enum sensor_attr_power {
105109
/* Index of memory controller in READ_THERMAL_DATA output */
106110
#define TEMP_INDEX_MCTRL 2
107111

112+
/* Maximum characters in hwmon label name */
113+
#define MAX_LABEL_SIZE 16
114+
108115
/**
109116
* struct xe_hwmon_energy_info - to accumulate energy
110117
*/
@@ -139,6 +146,8 @@ struct xe_hwmon_thermal_info {
139146
u8 count;
140147
/** @value: signed value from each sensor */
141148
s8 value[U8_MAX];
149+
/** @vram_label: vram label names */
150+
char vram_label[MAX_VRAM_CHANNELS][MAX_LABEL_SIZE];
142151
};
143152

144153
/**
@@ -255,6 +264,8 @@ static struct xe_reg xe_hwmon_get_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg
255264
return BMG_PACKAGE_TEMPERATURE;
256265
else if (channel == CHANNEL_VRAM)
257266
return BMG_VRAM_TEMPERATURE;
267+
else if (in_range(channel, CHANNEL_VRAM_N, CHANNEL_VRAM_N_MAX))
268+
return BMG_VRAM_TEMPERATURE_N(channel - CHANNEL_VRAM_N);
258269
} else if (xe->info.platform == XE_DG2) {
259270
if (channel == CHANNEL_PKG)
260271
return PCU_CR_PACKAGE_TEMPERATURE;
@@ -714,6 +725,22 @@ static const struct hwmon_channel_info * const hwmon_info[] = {
714725
HWMON_T_MAX,
715726
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
716727
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
728+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
729+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
730+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
731+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
732+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
733+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
734+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
735+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
736+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
737+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
738+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
739+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
740+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
741+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
742+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
743+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
717744
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL),
718745
HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_LABEL | HWMON_P_CRIT |
719746
HWMON_P_CAP,
@@ -888,6 +915,21 @@ static void xe_hwmon_get_voltage(struct xe_hwmon *hwmon, int channel, long *valu
888915
*value = DIV_ROUND_CLOSEST(REG_FIELD_GET(VOLTAGE_MASK, reg_val) * 2500, SF_VOLTAGE);
889916
}
890917

918+
static inline bool is_vram_ch_available(struct xe_hwmon *hwmon, int channel)
919+
{
920+
struct xe_mmio *mmio = xe_root_tile_mmio(hwmon->xe);
921+
int vram_id = channel - CHANNEL_VRAM_N;
922+
struct xe_reg vram_reg;
923+
924+
vram_reg = xe_hwmon_get_reg(hwmon, REG_TEMP, channel);
925+
if (!xe_reg_is_valid(vram_reg) || !xe_mmio_read32(mmio, vram_reg))
926+
return false;
927+
928+
/* Create label only for available vram channel */
929+
sprintf(hwmon->temp.vram_label[vram_id], "vram_ch_%d", vram_id);
930+
return true;
931+
}
932+
891933
static umode_t
892934
xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
893935
{
@@ -901,6 +943,9 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
901943
case CHANNEL_MCTRL:
902944
case CHANNEL_PCIE:
903945
return hwmon->temp.count ? 0444 : 0;
946+
case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
947+
return (is_vram_ch_available(hwmon, channel) &&
948+
hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN]) ? 0444 : 0;
904949
default:
905950
return 0;
906951
}
@@ -913,6 +958,9 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
913958
case CHANNEL_MCTRL:
914959
case CHANNEL_PCIE:
915960
return hwmon->temp.count ? 0444 : 0;
961+
case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
962+
return (is_vram_ch_available(hwmon, channel) &&
963+
hwmon->temp.limit[TEMP_LIMIT_MEM_CRIT]) ? 0444 : 0;
916964
default:
917965
return 0;
918966
}
@@ -933,6 +981,8 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
933981
case CHANNEL_MCTRL:
934982
case CHANNEL_PCIE:
935983
return hwmon->temp.count ? 0444 : 0;
984+
case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
985+
return is_vram_ch_available(hwmon, channel) ? 0444 : 0;
936986
default:
937987
return 0;
938988
}
@@ -961,6 +1011,16 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
9611011
return get_mc_temp(hwmon, val);
9621012
case CHANNEL_PCIE:
9631013
return get_pcie_temp(hwmon, val);
1014+
case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
1015+
reg_val = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_TEMP, channel));
1016+
/*
1017+
* This temperature format is 24 bit [31:8] signed integer and 8 bit
1018+
* [7:0] fraction.
1019+
*/
1020+
*val = (s32)(REG_FIELD_GET(TEMP_MASK_VRAM_N, reg_val)) *
1021+
(REG_FIELD_GET(TEMP_SIGN_MASK, reg_val) ? -1 : 1) *
1022+
MILLIDEGREE_PER_DEGREE;
1023+
return 0;
9641024
default:
9651025
return -EOPNOTSUPP;
9661026
}
@@ -972,6 +1032,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
9721032
*val = hwmon->temp.limit[TEMP_LIMIT_PKG_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
9731033
return 0;
9741034
case CHANNEL_VRAM:
1035+
case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
9751036
*val = hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
9761037
return 0;
9771038
default:
@@ -985,6 +1046,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
9851046
*val = hwmon->temp.limit[TEMP_LIMIT_PKG_CRIT] * MILLIDEGREE_PER_DEGREE;
9861047
return 0;
9871048
case CHANNEL_VRAM:
1049+
case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
9881050
*val = hwmon->temp.limit[TEMP_LIMIT_MEM_CRIT] * MILLIDEGREE_PER_DEGREE;
9891051
return 0;
9901052
default:
@@ -1353,6 +1415,8 @@ static int xe_hwmon_read_label(struct device *dev,
13531415
enum hwmon_sensor_types type,
13541416
u32 attr, int channel, const char **str)
13551417
{
1418+
struct xe_hwmon *hwmon = dev_get_drvdata(dev);
1419+
13561420
switch (type) {
13571421
case hwmon_temp:
13581422
if (channel == CHANNEL_PKG)
@@ -1363,6 +1427,8 @@ static int xe_hwmon_read_label(struct device *dev,
13631427
*str = "mctrl";
13641428
else if (channel == CHANNEL_PCIE)
13651429
*str = "pcie";
1430+
else if (in_range(channel, CHANNEL_VRAM_N, CHANNEL_VRAM_N_MAX))
1431+
*str = hwmon->temp.vram_label[channel - CHANNEL_VRAM_N];
13661432
return 0;
13671433
case hwmon_power:
13681434
case hwmon_energy:

0 commit comments

Comments
 (0)