@@ -1274,6 +1274,8 @@ static int bch2_fill_extent(struct bch_fs *c,
12741274 struct bkey_s_c k = bkey_i_to_s_c (fe -> kbuf .k );
12751275 unsigned flags = fe -> flags ;
12761276
1277+ BUG_ON (!k .k -> size );
1278+
12771279 if (bkey_extent_is_direct_data (k .k )) {
12781280 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c (k );
12791281 const union bch_extent_entry * entry ;
@@ -1326,36 +1328,6 @@ static int bch2_fill_extent(struct bch_fs *c,
13261328 }
13271329}
13281330
1329- static int bch2_fiemap_extent (struct btree_trans * trans ,
1330- struct btree_iter * iter , struct bkey_s_c k ,
1331- struct bch_fiemap_extent * cur )
1332- {
1333- s64 offset_into_extent = iter -> pos .offset - bkey_start_offset (k .k );
1334- unsigned sectors = k .k -> size - offset_into_extent ;
1335-
1336- bch2_bkey_buf_reassemble (& cur -> kbuf , trans -> c , k );
1337-
1338- enum btree_id data_btree = BTREE_ID_extents ;
1339- int ret = bch2_read_indirect_extent (trans , & data_btree , & offset_into_extent ,
1340- & cur -> kbuf );
1341- if (ret )
1342- return ret ;
1343-
1344- k = bkey_i_to_s_c (cur -> kbuf .k );
1345- sectors = min_t (unsigned , sectors , k .k -> size - offset_into_extent );
1346-
1347- bch2_cut_front (POS (k .k -> p .inode ,
1348- bkey_start_offset (k .k ) + offset_into_extent ),
1349- cur -> kbuf .k );
1350- bch2_key_resize (& cur -> kbuf .k -> k , sectors );
1351- cur -> kbuf .k -> k .p = iter -> pos ;
1352- cur -> kbuf .k -> k .p .offset += cur -> kbuf .k -> k .size ;
1353-
1354- cur -> flags = 0 ;
1355-
1356- return 0 ;
1357- }
1358-
13591331/*
13601332 * Scan a range of an inode for data in pagecache.
13611333 *
@@ -1371,13 +1343,19 @@ bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
13711343 dstart = bch2_seek_pagecache_data (vinode , * start , * end , 0 , nonblock );
13721344 if (dstart < 0 )
13731345 return dstart ;
1374- if (dstart >= * end )
1375- return - ENOENT ;
1346+
1347+ if (dstart == * end ) {
1348+ * start = dstart ;
1349+ return 0 ;
1350+ }
13761351
13771352 dend = bch2_seek_pagecache_hole (vinode , dstart , * end , 0 , nonblock );
13781353 if (dend < 0 )
13791354 return dend ;
13801355
1356+ /* race */
1357+ BUG_ON (dstart == dend );
1358+
13811359 * start = dstart ;
13821360 * end = dend ;
13831361 return 0 ;
@@ -1387,18 +1365,15 @@ bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
13871365 * Scan a range of pagecache that corresponds to a file mapping hole in the
13881366 * extent btree. If data is found, fake up an extent key so it looks like a
13891367 * delalloc extent to the rest of the fiemap processing code.
1390- *
1391- * Returns 0 if cached data was found, -ENOENT if not.
13921368 */
13931369static int
1394- bch2_fiemap_hole (struct btree_trans * trans , struct inode * vinode , u64 start ,
1395- u64 end , struct bch_fiemap_extent * cur )
1370+ bch2_next_fiemap_pagecache_extent (struct btree_trans * trans , struct bch_inode_info * inode ,
1371+ u64 start , u64 end , struct bch_fiemap_extent * cur )
13961372{
1397- struct bch_fs * c = vinode -> i_sb -> s_fs_info ;
1398- struct bch_inode_info * ei = to_bch_ei (vinode );
1373+ struct bch_fs * c = trans -> c ;
13991374 struct bkey_i_extent * delextent ;
14001375 struct bch_extent_ptr ptr = {};
1401- loff_t dstart = start , dend = end ;
1376+ loff_t dstart = start << 9 , dend = end << 9 ;
14021377 int ret ;
14031378
14041379 /*
@@ -1411,13 +1386,10 @@ bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start,
14111386 * fundamentally racy with writeback anyways. Therefore, just report the
14121387 * range as delalloc regardless of whether we have to cycle trans locks.
14131388 */
1414- ret = bch2_fiemap_hole_pagecache (vinode , & dstart , & dend , true);
1415- if (ret == - EAGAIN ) {
1416- /* open coded drop_locks_do() to relock even on error */
1417- bch2_trans_unlock (trans );
1418- ret = bch2_fiemap_hole_pagecache (vinode , & dstart , & dend , false);
1419- bch2_trans_relock (trans );
1420- }
1389+ ret = bch2_fiemap_hole_pagecache (& inode -> v , & dstart , & dend , true);
1390+ if (ret == - EAGAIN )
1391+ ret = drop_locks_do (trans ,
1392+ bch2_fiemap_hole_pagecache (& inode -> v , & dstart , & dend , false));
14211393 if (ret < 0 )
14221394 return ret ;
14231395
@@ -1428,124 +1400,151 @@ bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start,
14281400 */
14291401 bch2_bkey_buf_realloc (& cur -> kbuf , c , sizeof (* delextent ) / sizeof (u64 ));
14301402 delextent = bkey_extent_init (cur -> kbuf .k );
1431- delextent -> k .p = POS (ei -> v . i_ino , dstart >> 9 );
1432- bch2_key_resize ( & delextent -> k , (dend - dstart ) >> 9 ) ;
1403+ delextent -> k .p = POS (inode -> ei_inum . inum , dend >> 9 );
1404+ delextent -> k . size = (dend - dstart ) >> 9 ;
14331405 bch2_bkey_append_ptr (& delextent -> k_i , ptr );
14341406
14351407 cur -> flags = FIEMAP_EXTENT_DELALLOC ;
14361408
14371409 return 0 ;
14381410}
14391411
1412+ static int bch2_next_fiemap_extent (struct btree_trans * trans ,
1413+ struct bch_inode_info * inode ,
1414+ u64 start , u64 end ,
1415+ struct bch_fiemap_extent * cur )
1416+ {
1417+ u32 snapshot ;
1418+ int ret = bch2_subvolume_get_snapshot (trans , inode -> ei_inum .subvol , & snapshot );
1419+ if (ret )
1420+ return ret ;
1421+
1422+ struct btree_iter iter ;
1423+ bch2_trans_iter_init (trans , & iter , BTREE_ID_extents ,
1424+ SPOS (inode -> ei_inum .inum , start , snapshot ), 0 );
1425+
1426+ struct bkey_s_c k =
1427+ bch2_btree_iter_peek_max (trans , & iter , POS (inode -> ei_inum .inum , end ));
1428+ ret = bkey_err (k );
1429+ if (ret )
1430+ goto err ;
1431+
1432+ ret = bch2_next_fiemap_pagecache_extent (trans , inode , start , end , cur );
1433+ if (ret )
1434+ goto err ;
1435+
1436+ struct bpos pagecache_start = bkey_start_pos (& cur -> kbuf .k -> k );
1437+
1438+ /*
1439+ * Does the pagecache or the btree take precedence?
1440+ *
1441+ * It _should_ be the pagecache, so that we correctly report delalloc
1442+ * extents when dirty in the pagecache (we're COW, after all).
1443+ *
1444+ * But we'd have to add per-sector writeback tracking to
1445+ * bch_folio_state, otherwise we report delalloc extents for clean
1446+ * cached data in the pagecache.
1447+ *
1448+ * We should do this, but even then fiemap won't report stable mappings:
1449+ * on bcachefs data moves around in the background (copygc, rebalance)
1450+ * and we don't provide a way for userspace to lock that out.
1451+ */
1452+ if (k .k &&
1453+ bkey_le (bpos_max (iter .pos , bkey_start_pos (k .k )),
1454+ pagecache_start )) {
1455+ bch2_bkey_buf_reassemble (& cur -> kbuf , trans -> c , k );
1456+ bch2_cut_front (iter .pos , cur -> kbuf .k );
1457+ bch2_cut_back (POS (inode -> ei_inum .inum , end ), cur -> kbuf .k );
1458+ cur -> flags = 0 ;
1459+ } else if (k .k ) {
1460+ bch2_cut_back (bkey_start_pos (k .k ), cur -> kbuf .k );
1461+ }
1462+
1463+ if (cur -> kbuf .k -> k .type == KEY_TYPE_reflink_p ) {
1464+ unsigned sectors = cur -> kbuf .k -> k .size ;
1465+ s64 offset_into_extent = 0 ;
1466+ enum btree_id data_btree = BTREE_ID_extents ;
1467+ int ret = bch2_read_indirect_extent (trans , & data_btree , & offset_into_extent ,
1468+ & cur -> kbuf );
1469+ if (ret )
1470+ goto err ;
1471+
1472+ struct bkey_i * k = cur -> kbuf .k ;
1473+ sectors = min_t (unsigned , sectors , k -> k .size - offset_into_extent );
1474+
1475+ bch2_cut_front (POS (k -> k .p .inode ,
1476+ bkey_start_offset (& k -> k ) + offset_into_extent ),
1477+ k );
1478+ bch2_key_resize (& k -> k , sectors );
1479+ k -> k .p = iter .pos ;
1480+ k -> k .p .offset += k -> k .size ;
1481+ }
1482+ err :
1483+ bch2_trans_iter_exit (trans , & iter );
1484+ return ret ;
1485+ }
1486+
14401487static int bch2_fiemap (struct inode * vinode , struct fiemap_extent_info * info ,
14411488 u64 start , u64 len )
14421489{
14431490 struct bch_fs * c = vinode -> i_sb -> s_fs_info ;
14441491 struct bch_inode_info * ei = to_bch_ei (vinode );
14451492 struct btree_trans * trans ;
1446- struct btree_iter iter ;
1447- struct bkey_s_c k ;
14481493 struct bch_fiemap_extent cur , prev ;
1449- bool have_extent = false;
14501494 int ret = 0 ;
14511495
14521496 ret = fiemap_prep (& ei -> v , info , start , & len , 0 );
14531497 if (ret )
14541498 return ret ;
14551499
1456- struct bpos end = POS (ei -> v .i_ino , (start + len ) >> 9 );
14571500 if (start + len < start )
14581501 return - EINVAL ;
14591502
14601503 start >>= 9 ;
1504+ u64 end = (start + len ) >> 9 ;
14611505
14621506 bch2_bkey_buf_init (& cur .kbuf );
14631507 bch2_bkey_buf_init (& prev .kbuf );
1464- trans = bch2_trans_get (c );
1465-
1466- bch2_trans_iter_init (trans , & iter , BTREE_ID_extents ,
1467- POS (ei -> v .i_ino , start ), 0 );
1508+ bkey_init (& prev .kbuf .k -> k );
14681509
1469- while (!ret || bch2_err_matches (ret , BCH_ERR_transaction_restart )) {
1470- bool have_delalloc = false;
1471-
1472- bch2_trans_begin (trans );
1510+ trans = bch2_trans_get (c );
14731511
1474- u32 snapshot ;
1475- ret = bch2_subvolume_get_snapshot (trans , ei -> ei_inum .subvol , & snapshot );
1512+ while (start < end ) {
1513+ ret = lockrestart_do (trans ,
1514+ bch2_next_fiemap_extent (trans , ei , start , end , & cur ));
14761515 if (ret )
1477- continue ;
1478-
1479- bch2_btree_iter_set_snapshot (trans , & iter , snapshot );
1516+ goto err ;
14801517
1481- k = bch2_btree_iter_peek_max (trans , & iter , end );
1482- ret = bkey_err (k );
1483- if (ret )
1484- continue ;
1518+ BUG_ON (bkey_start_offset (& cur .kbuf .k -> k ) < start );
1519+ BUG_ON (cur .kbuf .k -> k .p .offset > end );
14851520
1486- if (! k . k )
1521+ if (bkey_start_offset ( & cur . kbuf . k -> k ) == end )
14871522 break ;
14881523
1489- /*
1490- * If a hole exists before the start of the extent key, scan the
1491- * range for pagecache data that might be pending writeback and
1492- * thus not yet exist in the extent tree.
1493- */
1494- if (iter .pos .offset > start ) {
1495- ret = bch2_fiemap_hole (trans , vinode , start << 9 ,
1496- iter .pos .offset << 9 , & cur );
1497- if (!ret )
1498- have_delalloc = true;
1499- else if (ret != - ENOENT )
1500- break ;
1501- }
1502-
1503- /* process the current key if there's no delalloc to report */
1504- if (!have_delalloc ) {
1505- if (!bkey_extent_is_data (k .k ) &&
1506- k .k -> type != KEY_TYPE_reservation ) {
1507- start = bkey_start_offset (k .k ) + k .k -> size ;
1508- bch2_btree_iter_advance (trans , & iter );
1509- continue ;
1510- }
1511-
1512- ret = bch2_fiemap_extent (trans , & iter , k , & cur );
1513- if (ret )
1514- break ;
1515- }
1516-
1517- /*
1518- * Store the current extent in prev so we can flag the last
1519- * extent on the way out.
1520- */
1521- bch2_bkey_buf_realloc (& prev .kbuf , c , cur .kbuf .k -> k .u64s );
15221524 start = cur .kbuf .k -> k .p .offset ;
15231525
1524- if (have_extent ) {
1526+ if (! bkey_deleted ( & prev . kbuf . k -> k ) ) {
15251527 bch2_trans_unlock (trans );
15261528 ret = bch2_fill_extent (c , info , & prev );
15271529 if (ret )
1528- break ;
1530+ goto err ;
15291531 }
15301532
1531- bkey_copy ( prev .kbuf . k , cur .kbuf .k );
1533+ bch2_bkey_buf_copy ( & prev .kbuf , c , cur .kbuf .k );
15321534 prev .flags = cur .flags ;
1533- have_extent = true;
1534-
1535- bch2_btree_iter_set_pos (trans , & iter , POS (iter .pos .inode , start ));
15361535 }
1537- bch2_trans_iter_exit (trans , & iter );
15381536
1539- if (!ret && have_extent ) {
1537+ if (!bkey_deleted ( & prev . kbuf . k -> k ) ) {
15401538 bch2_trans_unlock (trans );
15411539 prev .flags |= FIEMAP_EXTENT_LAST ;
15421540 ret = bch2_fill_extent (c , info , & prev );
15431541 }
1544-
1542+ err :
15451543 bch2_trans_put (trans );
15461544 bch2_bkey_buf_exit (& cur .kbuf , c );
15471545 bch2_bkey_buf_exit (& prev .kbuf , c );
1548- return ret < 0 ? ret : 0 ;
1546+
1547+ return bch2_err_class (ret < 0 ? ret : 0 );
15491548}
15501549
15511550static const struct vm_operations_struct bch_vm_ops = {
0 commit comments