Skip to content

Commit ffd123f

Browse files
Tomer Tayar authored and gregkh committed
habanalabs: Disable file operations after device is removed
A device can be removed from the PCI subsystem while a process still holds the file descriptor open. In such a case, the driver attempts to kill the process, but because the process may still be alive after this step, the device removal will complete, and we will end up with a process object that points to a device object which was already released. To prevent the usage of this released device object, disable any subsequent file operations for this process object, and avoid the cleanup steps when the file descriptor is eventually closed. The latter is just a best effort, as a memory leak will occur.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
1 parent 27ac5aa commit ffd123f

2 files changed

Lines changed: 46 additions & 6 deletions

File tree

drivers/misc/habanalabs/common/device.c

Lines changed: 34 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -93,12 +93,19 @@ void hl_hpriv_put(struct hl_fpriv *hpriv)
9393
static int hl_device_release(struct inode *inode, struct file *filp)
9494
{
9595
struct hl_fpriv *hpriv = filp->private_data;
96+
struct hl_device *hdev = hpriv->hdev;
97+
98+
filp->private_data = NULL;
99+
100+
if (!hdev) {
101+
pr_crit("Closing FD after device was removed. Memory leak will occur and it is advised to reboot.\n");
102+
put_pid(hpriv->taskpid);
103+
return 0;
104+
}
96105

97106
hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
98107
hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
99108

100-
filp->private_data = NULL;
101-
102109
hl_hpriv_put(hpriv);
103110

104111
return 0;
@@ -107,16 +114,19 @@ static int hl_device_release(struct inode *inode, struct file *filp)
107114
static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
108115
{
109116
struct hl_fpriv *hpriv = filp->private_data;
110-
struct hl_device *hdev;
117+
struct hl_device *hdev = hpriv->hdev;
111118

112119
filp->private_data = NULL;
113120

114-
hdev = hpriv->hdev;
121+
if (!hdev) {
122+
pr_err("Closing FD after device was removed\n");
123+
goto out;
124+
}
115125

116126
mutex_lock(&hdev->fpriv_list_lock);
117127
list_del(&hpriv->dev_node);
118128
mutex_unlock(&hdev->fpriv_list_lock);
119-
129+
out:
120130
put_pid(hpriv->taskpid);
121131

122132
kfree(hpriv);
@@ -136,8 +146,14 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
136146
static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
137147
{
138148
struct hl_fpriv *hpriv = filp->private_data;
149+
struct hl_device *hdev = hpriv->hdev;
139150
unsigned long vm_pgoff;
140151

152+
if (!hdev) {
153+
pr_err_ratelimited("Trying to mmap after device was removed! Please close FD\n");
154+
return -ENODEV;
155+
}
156+
141157
vm_pgoff = vma->vm_pgoff;
142158
vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
143159

@@ -885,6 +901,16 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
885901
return -EBUSY;
886902
}
887903

904+
static void device_disable_open_processes(struct hl_device *hdev)
905+
{
906+
struct hl_fpriv *hpriv;
907+
908+
mutex_lock(&hdev->fpriv_list_lock);
909+
list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
910+
hpriv->hdev = NULL;
911+
mutex_unlock(&hdev->fpriv_list_lock);
912+
}
913+
888914
/*
889915
* hl_device_reset - reset the device
890916
*
@@ -1558,8 +1584,10 @@ void hl_device_fini(struct hl_device *hdev)
15581584
HL_PENDING_RESET_LONG_SEC);
15591585

15601586
rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC);
1561-
if (rc)
1587+
if (rc) {
15621588
dev_crit(hdev->dev, "Failed to kill all open processes\n");
1589+
device_disable_open_processes(hdev);
1590+
}
15631591

15641592
hl_cb_pool_fini(hdev);
15651593

drivers/misc/habanalabs/common/habanalabs_ioctl.c

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,8 @@
55
* All Rights Reserved.
66
*/
77

8+
#define pr_fmt(fmt) "habanalabs: " fmt
9+
810
#include <uapi/misc/habanalabs.h>
911
#include "habanalabs.h"
1012

@@ -682,6 +684,11 @@ long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
682684
const struct hl_ioctl_desc *ioctl = NULL;
683685
unsigned int nr = _IOC_NR(cmd);
684686

687+
if (!hdev) {
688+
pr_err_ratelimited("Sending ioctl after device was removed! Please close FD\n");
689+
return -ENODEV;
690+
}
691+
685692
if ((nr >= HL_COMMAND_START) && (nr < HL_COMMAND_END)) {
686693
ioctl = &hl_ioctls[nr];
687694
} else {
@@ -700,6 +707,11 @@ long hl_ioctl_control(struct file *filep, unsigned int cmd, unsigned long arg)
700707
const struct hl_ioctl_desc *ioctl = NULL;
701708
unsigned int nr = _IOC_NR(cmd);
702709

710+
if (!hdev) {
711+
pr_err_ratelimited("Sending ioctl after device was removed! Please close FD\n");
712+
return -ENODEV;
713+
}
714+
703715
if (nr == _IOC_NR(HL_IOCTL_INFO)) {
704716
ioctl = &hl_ioctls_control[nr];
705717
} else {

0 commit comments

Comments
 (0)