Skip to content

Commit 16069e1

Browse files
committed
afs: Parse the VolSync record in the reply of a number of RPC ops
A number of fileserver RPC operations return a VolSync record as part of their reply that gives some information about the state of the volume being accessed, including:

(1) A volume Creation timestamp. For an RW volume, this is the time at which the volume was created; if it changes, the RW volume was presumably restored from a backup and all cached data should be scrubbed as Data Version numbers could regress on the files in the volume. For an RO volume, this is the time it was last snapshotted from the RW volume. It is expected to advance each time this happens; if it regresses, cached data should be scrubbed.

(2) A volume Update timestamp (Auristor only). For an RW volume, this is updated any time any change is made to a volume or its contents. If it regresses, all cached data must be scrubbed. For an RO volume, this is a copy of the RW volume's Update timestamp at the point of snapshotting. It can be used as a version number when checking to see if a callback on a RO volume was due to a snapshot. If it regresses, all cached data must be scrubbed.

However, this information is currently not made use of by the in-kernel afs filesystem.

Make the afs filesystem use this by:

(1) Add an update time field to the afs_volsync struct and use a value of TIME64_MIN in both that and the creation time to indicate that they are unset.

(2) Add creation and update time fields to the afs_volume struct and use these to track the two timestamps.

(3) Add a volsync_lock mutex to the afs_volume struct to control modification access for when we detect a change in these values.

(4) Add a 'pre-op volsync' struct to the afs_operation struct to record the state of the volume tracking before the op.

(5) Add a new counter, cb_scrub, to the afs_volume struct to count events that require all data to be scrubbed. A copy is placed in the afs_vnode struct (inode) and if they no longer match, a scrub takes place.

(6) When the result of an operation is being parsed, parse the VolSync data too, if it is provided.
Note that the two timestamps are handled separately, since they don't work in quite the same way.

- If the afs_volume tracking is unset, just set it and do nothing else.

- If the result timestamps are the same as the ones in afs_volume, do nothing.

- If the timestamps regress, increment cb_scrub if that has not already been done.

- If the creation timestamp on a RW volume changes, increment cb_scrub if that has not already been done.

- If the creation timestamp on a RO volume advances, update the server list and see if the current server has been excluded; if so, reissue the op. Once over half of the replication sites have been updated, increment cb_ro_snapshot to indicate updates may be required and switch over to excluding unupdated replication sites.

- If the creation timestamp on a Backup volume advances, just increment cb_ro_snapshot to trigger updates.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
1 parent d3acd81 commit 16069e1

11 files changed

Lines changed: 268 additions & 20 deletions

File tree

fs/afs/afs.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,8 @@ struct afs_status_cb {
165165
* AFS volume synchronisation information
166166
*/
167167
struct afs_volsync {
168-
time64_t creation; /* volume creation time */
168+
time64_t creation; /* Volume creation time (or TIME64_MIN) */
169+
time64_t update; /* Volume update time (or TIME64_MIN) */
169170
};
170171

171172
/*

fs/afs/callback.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
8181
clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
8282
if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
8383
vnode->cb_break++;
84-
vnode->cb_v_break = vnode->volume->cb_v_break;
84+
vnode->cb_v_break = atomic_read(&vnode->volume->cb_v_break);
8585
afs_clear_permits(vnode);
8686

8787
if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
@@ -159,12 +159,13 @@ static void afs_break_one_callback(struct afs_volume *volume,
159159
struct super_block *sb;
160160
struct afs_vnode *vnode;
161161
struct inode *inode;
162+
unsigned int cb_v_break;
162163

163164
if (fid->vnode == 0 && fid->unique == 0) {
164165
/* The callback break applies to an entire volume. */
165166
write_lock(&volume->cb_v_break_lock);
166-
volume->cb_v_break++;
167-
trace_afs_cb_break(fid, volume->cb_v_break,
167+
cb_v_break = atomic_inc_return(&volume->cb_v_break);
168+
trace_afs_cb_break(fid, cb_v_break,
168169
afs_cb_break_for_volume_callback, false);
169170
write_unlock(&volume->cb_v_break_lock);
170171
return;

fs/afs/fs_operation.c

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,13 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
3535
key_get(key);
3636
}
3737

38-
op->key = key;
39-
op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op);
40-
op->net = volume->cell->net;
41-
op->cb_v_break = volume->cb_v_break;
42-
op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
38+
op->key = key;
39+
op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op);
40+
op->net = volume->cell->net;
41+
op->cb_v_break = atomic_read(&volume->cb_v_break);
42+
op->pre_volsync.creation = volume->creation_time;
43+
op->pre_volsync.update = volume->update_time;
44+
op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
4345
op->nr_iterations = -1;
4446
afs_op_set_error(op, -EDESTADDRREQ);
4547

@@ -147,7 +149,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op)
147149

148150
afs_prepare_vnode(op, &op->file[0], 0);
149151
afs_prepare_vnode(op, &op->file[1], 1);
150-
op->cb_v_break = op->volume->cb_v_break;
152+
op->cb_v_break = atomic_read(&op->volume->cb_v_break);
151153
_leave(" = true");
152154
return true;
153155
}

fs/afs/fsclient.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1870,7 +1870,10 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
18701870
return ret;
18711871

18721872
bp = call->buffer;
1873-
xdr_decode_AFSVolSync(&bp, &op->volsync);
1873+
/* Unfortunately, prior to OpenAFS-1.6, volsync here is filled
1874+
* with rubbish.
1875+
*/
1876+
xdr_decode_AFSVolSync(&bp, NULL);
18741877

18751878
call->unmarshall++;
18761879
fallthrough;

fs/afs/inode.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,7 +542,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
542542
BUG_ON(!(inode->i_state & I_NEW));
543543

544544
vnode = AFS_FS_I(inode);
545-
vnode->cb_v_break = as->volume->cb_v_break,
545+
vnode->cb_v_break = atomic_read(&as->volume->cb_v_break),
546546
afs_set_netfs_context(vnode);
547547

548548
op = afs_alloc_operation(key, as->volume);

fs/afs/internal.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,15 @@ struct afs_volume {
662662
rwlock_t servers_lock; /* Lock for ->servers */
663663
unsigned int servers_seq; /* Incremented each time ->servers changes */
664664

665-
unsigned cb_v_break; /* Break-everything counter. */
665+
/* RO release tracking */
666+
struct mutex volsync_lock; /* Time/state evaluation lock */
667+
time64_t creation_time; /* Volume creation time (or TIME64_MIN) */
668+
time64_t update_time; /* Volume update time (or TIME64_MIN) */
669+
670+
/* Callback management */
671+
atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */
672+
atomic_t cb_v_break; /* Volume-break event counter. */
673+
atomic_t cb_scrub; /* Scrub-all-data event counter. */
666674
rwlock_t cb_v_break_lock;
667675

668676
afs_voltype_t type; /* type of volume */
@@ -856,7 +864,8 @@ struct afs_operation {
856864
struct afs_volume *volume; /* Volume being accessed */
857865
struct afs_vnode_param file[2];
858866
struct afs_vnode_param *more_files;
859-
struct afs_volsync volsync;
867+
struct afs_volsync pre_volsync; /* Volsync before op */
868+
struct afs_volsync volsync; /* Volsync returned by op */
860869
struct dentry *dentry; /* Dentry to be altered */
861870
struct dentry *dentry_2; /* Second dentry to be altered */
862871
struct timespec64 mtime; /* Modification time to record */
@@ -1063,7 +1072,7 @@ static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
10631072
static inline bool afs_cb_is_broken(unsigned int cb_break,
10641073
const struct afs_vnode *vnode)
10651074
{
1066-
return cb_break != (vnode->cb_break + vnode->volume->cb_v_break);
1075+
return cb_break != (vnode->cb_break + atomic_read(&vnode->volume->cb_v_break));
10671076
}
10681077

10691078
/*
@@ -1555,6 +1564,7 @@ extern void afs_fs_exit(void);
15551564
/*
15561565
* validation.c
15571566
*/
1567+
int afs_update_volume_state(struct afs_operation *op);
15581568
bool afs_check_validity(struct afs_vnode *vnode);
15591569
bool afs_pagecache_valid(struct afs_vnode *vnode);
15601570
int afs_validate(struct afs_vnode *vnode, struct key *key);

fs/afs/rotate.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ bool afs_select_fileserver(struct afs_operation *op)
486486
vnode->cb_server = server;
487487
vnode->cb_s_break = server->cb_s_break;
488488
vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break);
489-
vnode->cb_v_break = vnode->volume->cb_v_break;
489+
vnode->cb_v_break = atomic_read(&vnode->volume->cb_v_break);
490490
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
491491
}
492492

@@ -519,6 +519,8 @@ bool afs_select_fileserver(struct afs_operation *op)
519519
op->addr_index = addr_index;
520520
set_bit(addr_index, &op->addr_tried);
521521

522+
op->volsync.creation = TIME64_MIN;
523+
op->volsync.update = TIME64_MIN;
522524
op->call_responded = false;
523525
_debug("address [%u] %u/%u %pISp",
524526
op->server_index, addr_index, alist->nr_addrs,

fs/afs/validation.c

Lines changed: 197 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,201 @@
1010
#include <linux/sched.h>
1111
#include "internal.h"
1212

13+
/*
14+
* See if the server we've just talked to is currently excluded.
*
* Looks up op->server in the volume's current server list (read under
* rcu_read_lock()) and returns the state of that entry's AFS_SE_EXCLUDED
* flag.  If op->server does not appear in the list at all, true is returned,
* i.e. an unlisted server is reported as excluded.
15+
*/
16+
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
17+
{
18+
const struct afs_server_entry *se;
19+
const struct afs_server_list *slist;
20+
/* Default answer if the server isn't found in the list below. */
bool is_excluded = true;
21+
int i;
22+
23+
rcu_read_lock();
24+
25+
slist = rcu_dereference(volume->servers);
26+
for (i = 0; i < slist->nr_servers; i++) {
27+
se = &slist->servers[i];
28+
if (op->server == se->server) {
29+
/* Found the server the op used; report its exclusion flag. */
is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
30+
break;
31+
}
32+
}
33+
34+
rcu_read_unlock();
35+
return is_excluded;
36+
}
37+
38+
/*
39+
* Update the volume's server list when the creation time changes and see if
40+
* the server we've just talked to is currently excluded.
*
* Returns 1 if the server is excluded, 0 if it is not, or a negative error
* code if afs_check_volume_status() fails.
41+
*/
42+
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
43+
{
44+
int ret;
45+
46+
/* Already excluded according to the list we have; no refresh needed. */
if (__afs_is_server_excluded(op, volume))
47+
return 1;
48+
49+
/* Flag the volume as needing an update and run the status check.
 * NOTE(review): presumably this re-reads the volume record from the VL
 * server and may replace volume->servers - confirm against
 * afs_check_volume_status().
 */
set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
50+
ret = afs_check_volume_status(op->volume, op);
51+
if (ret < 0)
52+
return ret;
53+
54+
/* Look again now that the status check has run. */
return __afs_is_server_excluded(op, volume);
55+
}
56+
57+
/*
58+
* Handle a change to the volume creation time in the VolSync record.
*
* Compares three values: the time currently recorded on the volume (cur), the
* time sampled into op->pre_volsync before the operation (old) and the time
* returned by the server in op->volsync (new).
*
* The only caller visible here, afs_update_volume_times(), invokes this with
* volume->volsync_lock held.
*
* Returns 0 if nothing further is required, a negative error code from
* afs_is_server_excluded(), or a positive value if the server that was used
* is currently excluded (in which case volume->creation_time is left
* unchanged).
59+
*/
60+
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
61+
{
62+
unsigned int snap;
63+
time64_t cur = volume->creation_time;
64+
time64_t old = op->pre_volsync.creation;
65+
time64_t new = op->volsync.creation;
66+
int ret;
67+
68+
_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
69+
70+
/* Nothing recorded yet: just adopt the server's value. */
if (cur == TIME64_MIN) {
71+
volume->creation_time = new;
72+
return 0;
73+
}
74+
75+
if (new == cur)
76+
return 0;
77+
78+
/* Try to advance the creation timestamp from what we had before the
79+
* operation to what we got back from the server. This should
80+
* hopefully ensure that in a race between multiple operations only one
81+
* of them will do this.
82+
*/
83+
if (cur != old)
84+
return 0;
85+
86+
/* If the creation time changes in an unexpected way, we need to scrub
87+
* our caches. For a RW vol, this will only change if the volume is
88+
* restored from a backup; for a RO/Backup vol, this will advance when
89+
* the volume is updated to a new snapshot (eg. "vos release").
90+
*/
91+
/* Any change at all on a RW volume takes the scrub path, whichever
 * direction the timestamp moved in.
 */
if (volume->type == AFSVL_RWVOL)
92+
goto regressed;
93+
if (volume->type == AFSVL_BACKVOL) {
94+
if (new < old)
95+
goto regressed;
96+
goto advance;
97+
}
98+
99+
/* We have an RO volume, we need to query the VL server and look at the
100+
* server flags to see if RW->RO replication is in progress.
101+
*/
102+
ret = afs_is_server_excluded(op, volume);
103+
if (ret < 0)
104+
return ret;
105+
if (ret > 0) {
106+
/* Excluded server: trace the current snapshot counter without
 * bumping it and hand the positive value back to the caller.
 */
snap = atomic_read(&volume->cb_ro_snapshot);
107+
trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
108+
return ret;
109+
}
110+
111+
advance:
112+
/* Bump the snapshot counter and record the new creation time. */
snap = atomic_inc_return(&volume->cb_ro_snapshot);
113+
trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
114+
volume->creation_time = new;
115+
return 0;
116+
117+
regressed:
118+
/* Bump the scrub counter and record the new creation time. */
atomic_inc(&volume->cb_scrub);
119+
trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
120+
volume->creation_time = new;
121+
return 0;
122+
}
123+
124+
/*
125+
* Handle a change to the volume update time in the VolSync record.
*
* Compares the time currently recorded on the volume (cur), the time sampled
* into op->pre_volsync before the operation (old) and the time returned by
* the server in op->volsync (new).  The recorded time is only replaced if it
* still matches the pre-op sample; if the new time is also earlier than that
* sample, volume->cb_scrub is incremented first.
*
* The only caller visible here, afs_update_volume_times(), invokes this with
* volume->volsync_lock held.
126+
*/
127+
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
128+
{
129+
enum afs_cb_break_reason reason = afs_cb_break_no_break;
130+
time64_t cur = volume->update_time;
131+
time64_t old = op->pre_volsync.update;
132+
time64_t new = op->volsync.update;
133+
134+
_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
135+
136+
/* Nothing recorded yet: just adopt the server's value. */
if (cur == TIME64_MIN) {
137+
volume->update_time = new;
138+
return;
139+
}
140+
141+
if (new == cur)
142+
return;
143+
144+
/* If the volume update time changes in an unexpected way, we need to
145+
* scrub our caches. For a RW vol, this will advance on every
146+
* modification op; for a RO/Backup vol, this will advance when the
147+
* volume is updated to a new snapshot (eg. "vos release").
148+
*/
149+
if (new < old)
150+
reason = afs_cb_break_for_update_regress;
151+
152+
/* Try to advance the update timestamp from what we had before the
153+
* operation to what we got back from the server. This should
154+
* hopefully ensure that in a race between multiple operations only one
155+
* of them will do this.
156+
*/
157+
if (cur == old) {
158+
if (reason == afs_cb_break_for_update_regress) {
159+
atomic_inc(&volume->cb_scrub);
160+
trace_afs_cb_v_break(volume->vid, 0, reason);
161+
}
162+
/* Recorded whether the time advanced or regressed. */
volume->update_time = new;
163+
}
164+
}
165+
166+
/*
 * Reconcile the creation and update times returned in op->volsync with those
 * recorded on the volume.
 *
 * Returns 0 if both already match or once they have been processed; otherwise
 * returns whatever afs_update_volume_creation_time() returned (a negative
 * error, or a positive value).  On a negative error the update time is not
 * processed.
 */
static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
167+
{
168+
int ret = 0;
169+
170+
/* Fast path: compare without taking the lock; nothing to do if both
 * timestamps already match.
 */
if (likely(op->volsync.creation == volume->creation_time &&
171+
op->volsync.update == volume->update_time))
172+
return 0;
173+
174+
/* Slow path: recheck each timestamp under volsync_lock. */
mutex_lock(&volume->volsync_lock);
175+
if (op->volsync.creation != volume->creation_time) {
176+
ret = afs_update_volume_creation_time(op, volume);
177+
if (ret < 0)
178+
goto out;
179+
}
180+
if (op->volsync.update != volume->update_time)
181+
afs_update_volume_update_time(op, volume);
182+
out:
183+
mutex_unlock(&volume->volsync_lock);
184+
return ret;
185+
}
186+
187+
/*
188+
* Update the state of a volume. Returns 1 to redo the operation from the start.
*
* The volume times are only examined if at least one of the timestamps in
* op->volsync is not TIME64_MIN (the value used to mark a timestamp as unset).
* Any non-zero result from afs_update_volume_times(), including a negative
* error code, is passed straight back to the caller; otherwise 0 is returned.
189+
*/
190+
int afs_update_volume_state(struct afs_operation *op)
191+
{
192+
struct afs_volume *volume = op->volume;
193+
int ret;
194+
195+
_enter("%llx", op->volume->vid);
196+
197+
if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
198+
ret = afs_update_volume_times(op, volume);
199+
if (ret != 0) {
200+
_leave(" = %d", ret);
201+
return ret;
202+
}
203+
}
204+
205+
return 0;
206+
}
207+
13208
/*
14209
* mark the data attached to an inode as obsolete due to a write on the server
15210
* - might also want to ditch all the outstanding writes and dirty pages
@@ -74,7 +269,7 @@ bool afs_check_validity(struct afs_vnode *vnode)
74269
cb_break = vnode->cb_break;
75270

76271
if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
77-
if (vnode->cb_v_break != vnode->volume->cb_v_break)
272+
if (vnode->cb_v_break != atomic_read(&vnode->volume->cb_v_break))
78273
need_clear = afs_cb_break_for_v_break;
79274
else if (!afs_check_server_good(vnode))
80275
need_clear = afs_cb_break_for_s_reinit;
@@ -95,7 +290,7 @@ bool afs_check_validity(struct afs_vnode *vnode)
95290

96291
write_seqlock(&vnode->cb_lock);
97292
if (need_clear == afs_cb_break_no_promise)
98-
vnode->cb_v_break = vnode->volume->cb_v_break;
293+
vnode->cb_v_break = atomic_read(&vnode->volume->cb_v_break);
99294
else if (cb_break == vnode->cb_break)
100295
__afs_break_callback(vnode, need_clear);
101296
else

fs/afs/volume.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,14 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
9090
volume->type = params->type;
9191
volume->type_force = params->force;
9292
volume->name_len = vldb->name_len;
93+
volume->creation_time = TIME64_MIN;
94+
volume->update_time = TIME64_MIN;
9395

9496
refcount_set(&volume->ref, 1);
9597
INIT_HLIST_NODE(&volume->proc_link);
9698
INIT_WORK(&volume->destructor, afs_destroy_volume);
9799
rwlock_init(&volume->servers_lock);
100+
mutex_init(&volume->volsync_lock);
98101
rwlock_init(&volume->cb_v_break_lock);
99102
memcpy(volume->name, vldb->name, vldb->name_len + 1);
100103

fs/afs/yfsclient.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,12 +245,15 @@ static void xdr_decode_YFSVolSync(const __be32 **_bp,
245245
struct afs_volsync *volsync)
246246
{
247247
struct yfs_xdr_YFSVolSync *x = (void *)*_bp;
248-
u64 creation;
248+
u64 creation, update;
249249

250250
if (volsync) {
251251
creation = xdr_to_u64(x->vol_creation_date);
252252
do_div(creation, 10 * 1000 * 1000);
253253
volsync->creation = creation;
254+
update = xdr_to_u64(x->vol_update_date);
255+
do_div(update, 10 * 1000 * 1000);
256+
volsync->update = update;
254257
}
255258

256259
*_bp += xdr_size(x);

0 commit comments

Comments
 (0)