Skip to content

Commit c881c40

Browse files
committed
drm/asahi: Handle channel errors
Signed-off-by: Asahi Lina <lina@asahilina.net>
1 parent 20398d9 commit c881c40

7 files changed

Lines changed: 201 additions & 14 deletions

File tree

drivers/gpu/drm/asahi/channel.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,31 @@ impl EventChannel::ver {
383383
dev_crit!(self.dev, "EventChannel: No GPU manager available!\n")
384384
}
385385
},
386+
EventMsg::ChannelError {
387+
error_type,
388+
pipe_type,
389+
event_slot,
390+
event_value,
391+
} => match self.gpu.as_ref() {
392+
Some(gpu) => {
393+
let error_type = match error_type {
394+
0 => ChannelErrorType::MemoryError,
395+
1 => ChannelErrorType::DMKill,
396+
2 => ChannelErrorType::Aborted,
397+
3 => ChannelErrorType::Unk3,
398+
a => ChannelErrorType::Unknown(a),
399+
};
400+
gpu.handle_channel_error(
401+
error_type,
402+
pipe_type,
403+
event_slot,
404+
event_value,
405+
);
406+
}
407+
None => {
408+
dev_crit!(self.dev, "EventChannel: No GPU manager available!\n")
409+
}
410+
},
386411
msg => {
387412
dev_crit!(self.dev, "Unknown event message: {:?}\n", msg);
388413
}

drivers/gpu/drm/asahi/debug.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ pub(crate) enum DebugFlags {
6666
Debug6 = 54,
6767
Debug7 = 55,
6868

69+
VerboseFaults = 61,
6970
AllowUnknownOverrides = 62,
7071
OopsOnGpuCrash = 63,
7172
}

drivers/gpu/drm/asahi/event.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,16 @@ impl EventManager {
216216
}
217217
}
218218

219+
/// Returns a reference to the workqueue owning an event.
220+
pub(crate) fn get_owner(
221+
&self,
222+
slot: u32,
223+
) -> Option<Arc<dyn workqueue::WorkQueue + Send + Sync>> {
224+
self.alloc
225+
.with_inner(|inner| inner.owners[slot as usize].as_ref().cloned())
226+
.map(|a| a.clone())
227+
}
228+
219229
/// Fail all commands, used when the GPU crashes.
220230
pub(crate) fn fail_all(&self, error: workqueue::WorkError) {
221231
let mut owners: Vec<Arc<dyn workqueue::WorkQueue + Send + Sync>> = Vec::new();

drivers/gpu/drm/asahi/fw/channels.rs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,16 @@ pub(crate) enum DeviceControlMsg {
184184
halt_count: U64,
185185
__pad: Pad<{ DEVICECONTROL_SZ::ver - 0x1c }>,
186186
},
187-
Unk0e(Array<DEVICECONTROL_SZ::ver, u8>),
188-
Unk0f(Array<DEVICECONTROL_SZ::ver, u8>),
187+
RecoverChannel {
188+
pipe_type: u32,
189+
work_queue: GpuWeakPointer<super::workqueue::QueueInfo::ver>,
190+
event_value: u32,
191+
__pad: Pad<{ DEVICECONTROL_SZ::ver - 0x10 }>,
192+
},
193+
IdlePowerOff {
194+
val: u32,
195+
__pad: Pad<{ DEVICECONTROL_SZ::ver - 0x4 }>,
196+
},
189197
Unk10(Array<DEVICECONTROL_SZ::ver, u8>),
190198
Unk11(Array<DEVICECONTROL_SZ::ver, u8>),
191199
Unk12(Array<DEVICECONTROL_SZ::ver, u8>),
@@ -236,6 +244,17 @@ pub(crate) struct FwCtlMsg {
236244

237245
pub(crate) const EVENT_SZ: usize = 0x34;
238246

247+
/// Error class carried by a channel error event.
///
/// The firmware reports the error class as a raw `u32`; use the `From<u32>`
/// implementation to decode it. Values without a dedicated variant are
/// preserved in [`ChannelErrorType::Unknown`].
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C, u32)]
#[allow(dead_code)]
pub(crate) enum ChannelErrorType {
    /// Raw value 0.
    MemoryError,
    /// Raw value 1.
    DMKill,
    /// Raw value 2.
    Aborted,
    /// Raw value 3 (meaning not yet known).
    Unk3,
    /// Any other raw value, kept verbatim for logging.
    Unknown(u32),
}

impl From<u32> for ChannelErrorType {
    /// Decodes the raw error type value reported by the firmware.
    fn from(raw: u32) -> Self {
        match raw {
            0 => Self::MemoryError,
            1 => Self::DMKill,
            2 => Self::Aborted,
            3 => Self::Unk3,
            other => Self::Unknown(other),
        }
    }
}
257+
239258
#[derive(Debug, Copy, Clone)]
240259
#[repr(C, u32)]
241260
#[allow(dead_code)]
@@ -258,12 +277,19 @@ pub(crate) enum EventMsg {
258277
vm_slot: u32,
259278
buffer_slot: u32,
260279
counter: u32,
261-
}, // Max discriminant: 0x7
280+
},
281+
ChannelError {
282+
error_type: u32,
283+
pipe_type: u32,
284+
event_slot: u32,
285+
event_value: u32,
286+
},
287+
// Max discriminant: 0x8
262288
}
263289

264290
static_assert!(core::mem::size_of::<EventMsg>() == 4 + EVENT_SZ);
265291

266-
pub(crate) const EVENT_MAX: u32 = 0x7;
292+
pub(crate) const EVENT_MAX: u32 = 0x8;
267293

268294
#[derive(Copy, Clone)]
269295
#[repr(C)]

drivers/gpu/drm/asahi/fw/workqueue.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ pub(crate) mod raw {
135135
#[ver(V >= V13_2 && G < G14X)]
136136
pub(crate) unk_84_0: u32,
137137
pub(crate) unk_84_state: AtomicU32,
138-
pub(crate) unk_88: u32,
138+
pub(crate) error_count: AtomicU32,
139139
pub(crate) unk_8c: u32,
140140
pub(crate) unk_90: u32,
141141
pub(crate) unk_94: u32,

drivers/gpu/drm/asahi/gpu.rs

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ use kernel::{
3535
use crate::alloc::Allocator;
3636
use crate::debug::*;
3737
use crate::driver::{AsahiDevRef, AsahiDevice};
38-
use crate::fw::channels::PipeType;
38+
use crate::fw::channels::{ChannelErrorType, PipeType};
3939
use crate::fw::types::{U32, U64};
4040
use crate::{
4141
alloc, buffer, channel, event, fw, gem, hw, initdata, mem, mmu, queue, regs, workqueue,
@@ -256,6 +256,14 @@ pub(crate) trait GpuManager: Send + Sync {
256256
fn handle_timeout(&self, counter: u32, event_slot: i32, unk: u32);
257257
/// Handle a GPU fault event.
258258
fn handle_fault(&self);
259+
/// Handle a channel error event.
260+
fn handle_channel_error(
261+
&self,
262+
error_type: ChannelErrorType,
263+
pipe_type: u32,
264+
event_slot: u32,
265+
event_value: u32,
266+
);
259267
/// Acknowledge a Buffer grow op.
260268
fn ack_grow(&self, buffer_slot: u32, vm_slot: u32, counter: u32);
261269
/// Wait for the GPU to become idle and power off.
@@ -1331,6 +1339,82 @@ impl GpuManager for GpuManager::ver {
13311339
self.recover();
13321340
}
13331341

1342+
fn handle_channel_error(
1343+
&self,
1344+
error_type: ChannelErrorType,
1345+
pipe_type: u32,
1346+
event_slot: u32,
1347+
event_value: u32,
1348+
) {
1349+
dev_err!(self.dev, " (\\________/) \n");
1350+
dev_err!(self.dev, " | | \n");
1351+
dev_err!(self.dev, "'.| \\ , / |.'\n");
1352+
dev_err!(self.dev, "--| / (( \\ |--\n");
1353+
dev_err!(self.dev, ".'| _-_- |'.\n");
1354+
dev_err!(self.dev, " |________| \n");
1355+
dev_err!(self.dev, "GPU channel error nya~!!!!!\n");
1356+
dev_err!(self.dev, " Error type: {:?}\n", error_type);
1357+
dev_err!(self.dev, " Pipe type: {}\n", pipe_type);
1358+
dev_err!(self.dev, " Event slot: {}\n", event_slot);
1359+
dev_err!(self.dev, " Event value: {:#x?}\n", event_value);
1360+
1361+
self.event_manager.mark_error(
1362+
event_slot,
1363+
event_value,
1364+
workqueue::WorkError::ChannelError(error_type),
1365+
);
1366+
1367+
let wq = match self.event_manager.get_owner(event_slot) {
1368+
Some(wq) => wq,
1369+
None => {
1370+
dev_err!(self.dev, "Workqueue not found for this event slot!\n");
1371+
return;
1372+
}
1373+
};
1374+
1375+
let wq = match wq.as_any().downcast_ref::<workqueue::WorkQueue::ver>() {
1376+
Some(wq) => wq,
1377+
None => {
1378+
dev_crit!(self.dev, "GpuManager mismatched with WorkQueue!\n");
1379+
return;
1380+
}
1381+
};
1382+
1383+
if debug_enabled(DebugFlags::VerboseFaults) {
1384+
wq.dump_info();
1385+
}
1386+
1387+
let dc = fw::channels::DeviceControlMsg::ver::RecoverChannel {
1388+
pipe_type,
1389+
work_queue: wq.info_pointer(),
1390+
event_value,
1391+
__pad: Default::default(),
1392+
};
1393+
1394+
mod_dev_dbg!(self.dev, "Recover Channel command: {:?}\n", &dc);
1395+
let mut txch = self.tx_channels.lock();
1396+
1397+
let token = txch.device_control.send(&dc);
1398+
{
1399+
let mut guard = self.rtkit.lock();
1400+
let rtk = guard.as_mut().unwrap();
1401+
if rtk
1402+
.send_message(EP_DOORBELL, MSG_TX_DOORBELL | DOORBELL_DEVCTRL)
1403+
.is_err()
1404+
{
1405+
dev_err!(self.dev, "Failed to send Recover Channel command\n");
1406+
}
1407+
}
1408+
1409+
if txch.device_control.wait_for(token).is_err() {
1410+
dev_err!(self.dev, "Timed out waiting for Recover Channel command\n");
1411+
}
1412+
1413+
if debug_enabled(DebugFlags::VerboseFaults) {
1414+
wq.dump_info();
1415+
}
1416+
}
1417+
13341418
fn ack_grow(&self, buffer_slot: u32, vm_slot: u32, counter: u32) {
13351419
let halt_count = self
13361420
.initdata

drivers/gpu/drm/asahi/workqueue.rs

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,14 @@
1414
//! up its associated event.
1515
1616
use crate::debug::*;
17-
use crate::fw::channels::PipeType;
17+
use crate::fw::channels::{ChannelErrorType, PipeType};
1818
use crate::fw::types::*;
1919
use crate::fw::workqueue::*;
2020
use crate::no_debug;
2121
use crate::object::OpaqueGpuObject;
2222
use crate::regs::FaultReason;
2323
use crate::{channel, driver, event, fw, gpu, object, regs};
24+
use core::any::Any;
2425
use core::num::NonZeroU64;
2526
use core::sync::atomic::Ordering;
2627
use kernel::{
@@ -48,6 +49,8 @@ pub(crate) enum WorkError {
4849
Fault(regs::FaultInfo),
4950
/// Work failed due to an error caused by other concurrent GPU work.
5051
Killed,
52+
/// Channel error
53+
ChannelError(ChannelErrorType),
5154
/// The GPU crashed.
5255
NoDevice,
5356
/// Unknown reason.
@@ -79,6 +82,9 @@ impl From<WorkError> for uapi::drm_asahi_result_info {
7982
status: match a {
8083
WorkError::Timeout => uapi::drm_asahi_status_DRM_ASAHI_STATUS_TIMEOUT,
8184
WorkError::Killed => uapi::drm_asahi_status_DRM_ASAHI_STATUS_KILLED,
85+
WorkError::ChannelError(_) => {
86+
uapi::drm_asahi_status_DRM_ASAHI_STATUS_CHANNEL_ERROR
87+
}
8288
WorkError::NoDevice => uapi::drm_asahi_status_DRM_ASAHI_STATUS_NO_DEVICE,
8389
_ => uapi::drm_asahi_status_DRM_ASAHI_STATUS_UNKNOWN_ERROR,
8490
},
@@ -97,6 +103,7 @@ impl From<WorkError> for kernel::error::Error {
97103
WorkError::Unknown => ENODATA,
98104
WorkError::Killed => ECANCELED,
99105
WorkError::NoDevice => ENODEV,
106+
WorkError::ChannelError(_) => EIO,
100107
}
101108
}
102109
}
@@ -601,20 +608,26 @@ impl WorkQueue::ver {
601608
size: u32,
602609
) -> Result<Arc<WorkQueue::ver>> {
603610
let gpu_buf = alloc.private.array_empty_tagged(0x2c18, b"GPBF")?;
604-
let shared = &mut alloc.shared;
611+
let mut state = alloc.shared.new_default::<RingState>()?;
612+
let ring = alloc.shared.array_empty(size as usize)?;
605613
let inner = WorkQueueInner::ver {
606614
dev: dev.into(),
607615
event_manager,
608-
info: alloc.private.new_init(
616+
// Use shared (coherent) state with verbose faults so we can dump state correctly
617+
info: if debug_enabled(DebugFlags::VerboseFaults) {
618+
&mut alloc.shared
619+
} else {
620+
&mut alloc.private
621+
}
622+
.new_init(
609623
try_init!(QueueInfo::ver {
610624
state: {
611-
let mut s = shared.new_default::<RingState>()?;
612-
s.with_mut(|raw, _inner| {
625+
state.with_mut(|raw, _inner| {
613626
raw.rb_size = size;
614627
});
615-
s
628+
state
616629
},
617-
ring: shared.array_empty(size as usize)?,
630+
ring,
618631
gpu_buf,
619632
notifier_list: notifier_list,
620633
gpu_context: gpu_context,
@@ -639,7 +652,7 @@ impl WorkQueue::ver {
639652
#[ver(V >= V13_2 && G < G14X)]
640653
unk_84_0: 0,
641654
unk_84_state: Default::default(),
642-
unk_88: 0,
655+
error_count: Default::default(),
643656
unk_8c: 0,
644657
unk_90: 0,
645658
unk_94: 0,
@@ -744,18 +757,46 @@ impl WorkQueue::ver {
744757
pub(crate) fn pipe_type(&self) -> PipeType {
745758
self.inner.lock().pipe_type
746759
}
760+
761+
/// Dump selected fields of the queue info structure to the kernel log.
///
/// Intended for debugging: prints the three GPU read pointers, the event ID,
/// the busy and pending values, the `unk_84_state` field and the error count,
/// each read with relaxed atomic loads.
pub(crate) fn dump_info(&self) {
    pr_info!("WorkQueue @ {:?}:\n", self.info_pointer);
    self.inner.lock().info.with(|raw, _inner| {
        pr_info!(" GPU rptr1: {:#x}\n", raw.gpu_rptr1.load(Ordering::Relaxed));
        pr_info!(" GPU rptr2: {:#x}\n", raw.gpu_rptr2.load(Ordering::Relaxed));
        pr_info!(" GPU rptr3: {:#x}\n", raw.gpu_rptr3.load(Ordering::Relaxed));
        pr_info!(" Event ID: {:#x}\n", raw.event_id.load(Ordering::Relaxed));
        pr_info!(" Busy: {:#x}\n", raw.busy.load(Ordering::Relaxed));
        pr_info!(" Unk 84: {:#x}\n", raw.unk_84_state.load(Ordering::Relaxed));
        pr_info!(
            " Error count: {:#x}\n",
            raw.error_count.load(Ordering::Relaxed)
        );
        pr_info!(" Pending: {:#x}\n", raw.pending.load(Ordering::Relaxed));
    });
}
777+
778+
/// Returns the GPU weak pointer to this queue's `QueueInfo` structure.
pub(crate) fn info_pointer(&self) -> GpuWeakPointer<QueueInfo::ver> {
    let pointer = self.info_pointer;
    pointer
}
747781
}
748782

749783
/// Trait used to erase the version-specific type of WorkQueues, to avoid leaking
750784
/// version-specificity into the event module.
751785
pub(crate) trait WorkQueue {
786+
/// Cast as an Any type.
787+
fn as_any(&self) -> &dyn Any;
788+
752789
fn signal(&self) -> bool;
753790
fn mark_error(&self, value: event::EventValue, error: WorkError);
754791
fn fail_all(&self, error: WorkError);
755792
}
756793

757794
#[versions(AGX)]
758795
impl WorkQueue for WorkQueue::ver {
796+
fn as_any(&self) -> &dyn Any {
797+
self
798+
}
799+
759800
/// Signal a workqueue that some work was completed.
760801
///
761802
/// This will check the event stamp value to find out exactly how many commands were processed.

0 commit comments

Comments
 (0)