Skip to content

Commit e76e0e3

Browse files
author
Darrick J. Wong
committed
xfs: convey externally discovered fsdax media errors to the health monitor
Connect the fsdax media failure notification code to the health monitor so that xfs can send events about those failures to the xfs_healer daemon. Later on we'll add the ability for the xfs_scrub media scan (phase 6) to report the errors that it finds to the kernel so that those are also logged by xfs_healer.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
1 parent 74c4795 commit e76e0e3

6 files changed

Lines changed: 148 additions & 5 deletions

File tree

fs/xfs/libxfs/xfs_fs.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,6 +1014,11 @@ struct xfs_rtgroup_geometry {
10141014
#define XFS_HEALTH_MONITOR_DOMAIN_INODE (3)
10151015
#define XFS_HEALTH_MONITOR_DOMAIN_RTGROUP (4)
10161016

1017+
/*
 * disk events
 *
 * Domains for events that concern a whole device: the data device (DATADEV),
 * the realtime device (RTDEV) and the log device (LOGDEV).  The kernel's
 * media error reports are translated into these values.
 */
1018+
#define XFS_HEALTH_MONITOR_DOMAIN_DATADEV (5)
1019+
#define XFS_HEALTH_MONITOR_DOMAIN_RTDEV (6)
1020+
#define XFS_HEALTH_MONITOR_DOMAIN_LOGDEV (7)
1021+
10171022
/* Health monitor event types */
10181023

10191024
/* status of the monitor itself */
@@ -1031,6 +1036,9 @@ struct xfs_rtgroup_geometry {
10311036
/* filesystem shutdown */
10321037
#define XFS_HEALTH_MONITOR_TYPE_SHUTDOWN (6)
10331038

1039+
/* media errors */
1040+
#define XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR (7)
1041+
10341042
/* lost events */
10351043
struct xfs_health_monitor_lost {
10361044
__u64 count;
@@ -1071,6 +1079,12 @@ struct xfs_health_monitor_shutdown {
10711079
__u32 reasons;
10721080
};
10731081

1082+
/*
 * disk media errors
 *
 * Describes the failed range of a device.  Carried as the "media" member of
 * struct xfs_health_monitor_event and filled in for events in the DATADEV,
 * RTDEV and LOGDEV domains.
 */
1083+
struct xfs_health_monitor_media {
1084+
/* start of the failed range; NOTE(review): presumably a disk address in 512-byte basic blocks -- confirm units */
__u64 daddr;
1085+
/* length of the failed range; NOTE(review): presumably counted in basic blocks -- confirm units */
__u64 bbcount;
1086+
};
1087+
10741088
struct xfs_health_monitor_event {
10751089
/* XFS_HEALTH_MONITOR_DOMAIN_* */
10761090
__u32 domain;
@@ -1092,6 +1106,7 @@ struct xfs_health_monitor_event {
10921106
struct xfs_health_monitor_group group;
10931107
struct xfs_health_monitor_inode inode;
10941108
struct xfs_health_monitor_shutdown shutdown;
1109+
struct xfs_health_monitor_media media;
10951110
} e;
10961111

10971112
/* zeroes */

fs/xfs/xfs_healthmon.c

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "xfs_health.h"
2222
#include "xfs_healthmon.h"
2323
#include "xfs_fsops.h"
24+
#include "xfs_notify_failure.h"
2425

2526
#include <linux/anon_inodes.h>
2627
#include <linux/eventpoll.h>
@@ -208,6 +209,19 @@ xfs_healthmon_merge_events(
208209
/* yes, we can race to shutdown */
209210
existing->flags |= new->flags;
210211
return true;
212+
213+
case XFS_HEALTHMON_MEDIA_ERROR:
214+
/* physically adjacent errors can merge */
215+
if (existing->daddr + existing->bbcount == new->daddr) {
216+
existing->bbcount += new->bbcount;
217+
return true;
218+
}
219+
if (new->daddr + new->bbcount == existing->daddr) {
220+
existing->daddr = new->daddr;
221+
existing->bbcount += new->bbcount;
222+
return true;
223+
}
224+
return false;
211225
}
212226

213227
return false;
@@ -522,6 +536,48 @@ xfs_healthmon_report_shutdown(
522536
xfs_healthmon_put(hm);
523537
}
524538

539+
/*
 * Translate the device on which a media error was found into the healthmon
 * event domain used to report it:
 *
 *   XFS_DEV_DATA -> XFS_HEALTHMON_DATADEV
 *   XFS_DEV_LOG  -> XFS_HEALTHMON_LOGDEV
 *   XFS_DEV_RT   -> XFS_HEALTHMON_RTDEV
 *
 * Any other value is a caller bug: it trips ASSERT(0) and the function
 * returns 0.
 */
static inline enum xfs_healthmon_domain
540+
media_error_domain(
541+
enum xfs_device fdev)
542+
{
543+
switch (fdev) {
544+
case XFS_DEV_DATA:
545+
return XFS_HEALTHMON_DATADEV;
546+
case XFS_DEV_LOG:
547+
return XFS_HEALTHMON_LOGDEV;
548+
case XFS_DEV_RT:
549+
return XFS_HEALTHMON_RTDEV;
550+
}
551+
552+
/* not one of the three known devices */
ASSERT(0);
553+
return 0;
554+
}
555+
556+
/*
 * Add a media error event to the reporting queue.
 *
 * @mp:      mount whose health monitor should receive the event
 * @fdev:    device on which the error was found; selects the event domain
 * @daddr:   start of the failed range on that device
 * @bbcount: length of the failed range
 *
 * Builds a XFS_HEALTHMON_MEDIA_ERROR event, emits the
 * xfs_healthmon_report_media tracepoint and pushes the event to the monitor.
 * Does nothing if xfs_healthmon_get() returns NULL.
 */
557+
void
558+
xfs_healthmon_report_media(
559+
struct xfs_mount *mp,
560+
enum xfs_device fdev,
561+
xfs_daddr_t daddr,
562+
uint64_t bbcount)
563+
{
564+
struct xfs_healthmon_event event = {
565+
.type = XFS_HEALTHMON_MEDIA_ERROR,
566+
.domain = media_error_domain(fdev),
567+
.daddr = daddr,
568+
.bbcount = bbcount,
569+
};
570+
struct xfs_healthmon *hm = xfs_healthmon_get(mp);
571+
572+
/* NOTE(review): presumably NULL means no health monitor is attached to this mount -- confirm */
if (!hm)
573+
return;
574+
575+
trace_xfs_healthmon_report_media(hm, fdev, &event);
576+
577+
xfs_healthmon_push(hm, &event);
/* drop the reference obtained from xfs_healthmon_get() above */
578+
xfs_healthmon_put(hm);
579+
}
580+
525581
static inline void
526582
xfs_healthmon_reset_outbuf(
527583
struct xfs_healthmon *hm)
@@ -574,6 +630,9 @@ static const unsigned int domain_map[] = {
574630
[XFS_HEALTHMON_AG] = XFS_HEALTH_MONITOR_DOMAIN_AG,
575631
[XFS_HEALTHMON_INODE] = XFS_HEALTH_MONITOR_DOMAIN_INODE,
576632
[XFS_HEALTHMON_RTGROUP] = XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
633+
[XFS_HEALTHMON_DATADEV] = XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
634+
[XFS_HEALTHMON_RTDEV] = XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
635+
[XFS_HEALTHMON_LOGDEV] = XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
577636
};
578637

579638
static const unsigned int type_map[] = {
@@ -584,6 +643,7 @@ static const unsigned int type_map[] = {
584643
[XFS_HEALTHMON_HEALTHY] = XFS_HEALTH_MONITOR_TYPE_HEALTHY,
585644
[XFS_HEALTHMON_UNMOUNT] = XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
586645
[XFS_HEALTHMON_SHUTDOWN] = XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
646+
[XFS_HEALTHMON_MEDIA_ERROR] = XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
587647
};
588648

589649
/* Render event as a V0 structure */
@@ -635,6 +695,12 @@ xfs_healthmon_format_v0(
635695
hme.e.inode.ino = event->ino;
636696
hme.e.inode.gen = event->gen;
637697
break;
698+
case XFS_HEALTHMON_DATADEV:
699+
case XFS_HEALTHMON_LOGDEV:
700+
case XFS_HEALTHMON_RTDEV:
701+
hme.e.media.daddr = event->daddr;
702+
hme.e.media.bbcount = event->bbcount;
703+
break;
638704
default:
639705
break;
640706
}

fs/xfs/xfs_healthmon.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ enum xfs_healthmon_type {
7979
XFS_HEALTHMON_SICK, /* runtime corruption observed */
8080
XFS_HEALTHMON_CORRUPT, /* fsck reported corruption */
8181
XFS_HEALTHMON_HEALTHY, /* fsck reported healthy structure */
82+
83+
/* media errors */
84+
XFS_HEALTHMON_MEDIA_ERROR,
8285
};
8386

8487
enum xfs_healthmon_domain {
@@ -89,6 +92,11 @@ enum xfs_healthmon_domain {
8992
XFS_HEALTHMON_AG, /* allocation group metadata */
9093
XFS_HEALTHMON_INODE, /* inode metadata */
9194
XFS_HEALTHMON_RTGROUP, /* realtime group metadata */
95+
96+
/* media errors */
97+
XFS_HEALTHMON_DATADEV,
98+
XFS_HEALTHMON_RTDEV,
99+
XFS_HEALTHMON_LOGDEV,
92100
};
93101

94102
struct xfs_healthmon_event {
@@ -126,6 +134,11 @@ struct xfs_healthmon_event {
126134
struct {
127135
unsigned int flags;
128136
};
137+
/* media errors */
138+
struct {
139+
xfs_daddr_t daddr;
140+
uint64_t bbcount;
141+
};
129142
};
130143
};
131144

@@ -141,6 +154,9 @@ void xfs_healthmon_report_inode(struct xfs_inode *ip,
141154

142155
void xfs_healthmon_report_shutdown(struct xfs_mount *mp, uint32_t flags);
143156

157+
void xfs_healthmon_report_media(struct xfs_mount *mp, enum xfs_device fdev,
158+
xfs_daddr_t daddr, uint64_t bbcount);
159+
144160
long xfs_ioc_health_monitor(struct file *file,
145161
struct xfs_health_monitor __user *arg);
146162

fs/xfs/xfs_notify_failure.c

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "xfs_notify_failure.h"
2323
#include "xfs_rtgroup.h"
2424
#include "xfs_rtrmap_btree.h"
25+
#include "xfs_healthmon.h"
2526

2627
#include <linux/mm.h>
2728
#include <linux/dax.h>
@@ -219,6 +220,8 @@ xfs_dax_notify_logdev_failure(
219220
if (error)
220221
return error;
221222

223+
xfs_healthmon_report_media(mp, XFS_DEV_LOG, daddr, bblen);
224+
222225
/*
223226
* In the pre-remove case the failure notification is attempting to
224227
* trigger a force unmount. The expectation is that the device is
@@ -252,16 +255,20 @@ xfs_dax_notify_dev_failure(
252255
uint64_t bblen;
253256
struct xfs_group *xg = NULL;
254257

255-
if (!xfs_has_rmapbt(mp)) {
256-
xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
257-
return -EOPNOTSUPP;
258-
}
259-
260258
error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type),
261259
offset, len, &daddr, &bblen);
262260
if (error)
263261
return error;
264262

263+
xfs_healthmon_report_media(mp,
264+
type == XG_TYPE_RTG ? XFS_DEV_RT : XFS_DEV_DATA,
265+
daddr, bblen);
266+
267+
if (!xfs_has_rmapbt(mp)) {
268+
xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
269+
return -EOPNOTSUPP;
270+
}
271+
265272
if (type == XG_TYPE_RTG) {
266273
start_bno = xfs_daddr_to_rtb(mp, daddr);
267274
end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);

fs/xfs/xfs_trace.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
#include "xfs_zone_priv.h"
5454
#include "xfs_health.h"
5555
#include "xfs_healthmon.h"
56+
#include "xfs_notify_failure.h"
5657

5758
/*
5859
* We include this last to have the helpers above available for the trace

fs/xfs/xfs_trace.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6086,6 +6086,12 @@ DECLARE_EVENT_CLASS(xfs_healthmon_event_class,
60866086
__entry->ino = event->ino;
60876087
__entry->gen = event->gen;
60886088
break;
6089+
case XFS_HEALTHMON_DATADEV:
6090+
case XFS_HEALTHMON_LOGDEV:
6091+
case XFS_HEALTHMON_RTDEV:
6092+
__entry->offset = event->daddr;
6093+
__entry->length = event->bbcount;
6094+
break;
60896095
}
60906096
),
60916097
TP_printk("dev %d:%d type %s domain %s mask 0x%x ino 0x%llx gen 0x%x offset 0x%llx len 0x%llx group 0x%x lost %llu",
@@ -6228,6 +6234,38 @@ TRACE_EVENT(xfs_healthmon_report_shutdown,
62286234
__print_flags(__entry->shutdown_flags, "|", XFS_SHUTDOWN_STRINGS))
62296235
);
62306236

6237+
/*
 * Value/name pairs for enum xfs_device, passed to __print_symbolic() so that
 * trace output shows "datadev", "rtdev" or "logdev" instead of a raw number.
 * The TRACE_DEFINE_ENUM() lines after the macro register the same three enum
 * values with the tracing code.
 */
#define XFS_DEVICE_STRINGS \
6238+
{ XFS_DEV_DATA, "datadev" }, \
6239+
{ XFS_DEV_RT, "rtdev" }, \
6240+
{ XFS_DEV_LOG, "logdev" }
6241+
6242+
TRACE_DEFINE_ENUM(XFS_DEV_DATA);
6243+
TRACE_DEFINE_ENUM(XFS_DEV_RT);
6244+
TRACE_DEFINE_ENUM(XFS_DEV_LOG);
6245+
6246+
/*
 * Tracepoint for a media error being handed to the health monitor.
 *
 * Records the monitor's dev_t (hm->dev), the device the error was found on
 * (fdev, printed symbolically via XFS_DEVICE_STRINGS), and the start and
 * length of the failed range taken from the event (daddr, bbcount), both
 * printed in hex.
 */
TRACE_EVENT(xfs_healthmon_report_media,
6247+
TP_PROTO(const struct xfs_healthmon *hm, enum xfs_device fdev,
6248+
const struct xfs_healthmon_event *event),
6249+
TP_ARGS(hm, fdev, event),
6250+
TP_STRUCT__entry(
6251+
__field(dev_t, dev)
6252+
__field(unsigned int, error_dev)
6253+
__field(uint64_t, daddr)
6254+
__field(uint64_t, bbcount)
6255+
),
6256+
TP_fast_assign(
6257+
__entry->dev = hm->dev;
6258+
__entry->error_dev = fdev;
6259+
__entry->daddr = event->daddr;
6260+
__entry->bbcount = event->bbcount;
6261+
),
6262+
TP_printk("dev %d:%d %s daddr 0x%llx bbcount 0x%llx",
6263+
MAJOR(__entry->dev), MINOR(__entry->dev),
6264+
__print_symbolic(__entry->error_dev, XFS_DEVICE_STRINGS),
6265+
__entry->daddr,
6266+
__entry->bbcount)
6267+
);
6268+
62316269
#endif /* _TRACE_XFS_H */
62326270

62336271
#undef TRACE_INCLUDE_PATH

0 commit comments

Comments
 (0)