Skip to content

Commit 85c7000

Browse files
committed
Merge tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:

"The highlights are:

 - several changes to how snap context and snap realms are tracked (Xiubo Li). In particular, this should resolve a long-standing issue of high kworker CPU usage and various stalls caused by needless iteration over all inodes in the snap realm.

 - async create fixes to address hangs in some edge cases (Jeff Layton)

 - support for getvxattr MDS op for querying server-side xattrs, such as file/directory layouts and ephemeral pins (Milind Changire)

 - average latency is now maintained for all metrics (Venky Shankar)

 - some tweaks around handling inline data to make it fit better with netfs helper library (David Howells)

Also a couple of memory leaks got plugged along with a few assorted fixups. Last but not least, Xiubo has stepped up to serve as a CephFS co-maintainer"

* tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client: (27 commits)
  ceph: fix memory leak in ceph_readdir when note_last_dentry returns error
  ceph: uninitialized variable in debug output
  ceph: use tracked average r/w/m latencies to display metrics in debugfs
  ceph: include average/stdev r/w/m latency in mds metrics
  ceph: track average r/w/m latency
  ceph: use ktime_to_timespec64() rather than jiffies_to_timespec64()
  ceph: assign the ci only when the inode isn't NULL
  ceph: fix inode reference leakage in ceph_get_snapdir()
  ceph: misc fix for code style and logs
  ceph: allocate capsnap memory outside of ceph_queue_cap_snap()
  ceph: do not release the global snaprealm until unmounting
  ceph: remove incorrect and unused CEPH_INO_DOTDOT macro
  MAINTAINERS: add Xiubo Li as cephfs co-maintainer
  ceph: eliminate the recursion when rebuilding the snap context
  ceph: do not update snapshot context when there is no new snapshot
  ceph: zero the dir_entries memory when allocating it
  ceph: move to a dedicated slabcache for ceph_cap_snap
  ceph: add getvxattr op
  libceph: drop else branches in prepare_read_data{,_cont}
  ceph: fix comments mentioning i_mutex
  ...
2 parents b1b07ba + f639d98 commit 85c7000

20 files changed

Lines changed: 577 additions & 376 deletions

File tree

MAINTAINERS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4456,6 +4456,7 @@ F: drivers/power/supply/cw2015_battery.c
44564456
CEPH COMMON CODE (LIBCEPH)
44574457
M: Ilya Dryomov <idryomov@gmail.com>
44584458
M: Jeff Layton <jlayton@kernel.org>
4459+
M: Xiubo Li <xiubli@redhat.com>
44594460
L: ceph-devel@vger.kernel.org
44604461
S: Supported
44614462
W: http://ceph.com/
@@ -4466,6 +4467,7 @@ F: net/ceph/
44664467

44674468
CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH)
44684469
M: Jeff Layton <jlayton@kernel.org>
4470+
M: Xiubo Li <xiubli@redhat.com>
44694471
M: Ilya Dryomov <idryomov@gmail.com>
44704472
L: ceph-devel@vger.kernel.org
44714473
S: Supported

fs/ceph/addr.c

Lines changed: 112 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ static int ceph_releasepage(struct page *page, gfp_t gfp)
184184

185185
static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
186186
{
187-
struct inode *inode = rreq->mapping->host;
187+
struct inode *inode = rreq->inode;
188188
struct ceph_inode_info *ci = ceph_inode(inode);
189189
struct ceph_file_layout *lo = &ci->i_layout;
190190
u32 blockoff;
@@ -201,7 +201,7 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
201201

202202
static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
203203
{
204-
struct inode *inode = subreq->rreq->mapping->host;
204+
struct inode *inode = subreq->rreq->inode;
205205
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
206206
struct ceph_inode_info *ci = ceph_inode(inode);
207207
u64 objno, objoff;
@@ -244,10 +244,63 @@ static void finish_netfs_read(struct ceph_osd_request *req)
244244
iput(req->r_inode);
245245
}
246246

247+
static bool ceph_netfs_issue_op_inline(struct netfs_read_subrequest *subreq)
248+
{
249+
struct netfs_read_request *rreq = subreq->rreq;
250+
struct inode *inode = rreq->inode;
251+
struct ceph_mds_reply_info_parsed *rinfo;
252+
struct ceph_mds_reply_info_in *iinfo;
253+
struct ceph_mds_request *req;
254+
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
255+
struct ceph_inode_info *ci = ceph_inode(inode);
256+
struct iov_iter iter;
257+
ssize_t err = 0;
258+
size_t len;
259+
260+
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
261+
__clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
262+
263+
if (subreq->start >= inode->i_size)
264+
goto out;
265+
266+
/* We need to fetch the inline data. */
267+
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
268+
if (IS_ERR(req)) {
269+
err = PTR_ERR(req);
270+
goto out;
271+
}
272+
req->r_ino1 = ci->i_vino;
273+
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
274+
req->r_num_caps = 2;
275+
276+
err = ceph_mdsc_do_request(mdsc, NULL, req);
277+
if (err < 0)
278+
goto out;
279+
280+
rinfo = &req->r_reply_info;
281+
iinfo = &rinfo->targeti;
282+
if (iinfo->inline_version == CEPH_INLINE_NONE) {
283+
/* The data got uninlined */
284+
ceph_mdsc_put_request(req);
285+
return false;
286+
}
287+
288+
len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
289+
iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
290+
err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
291+
if (err == 0)
292+
err = -EFAULT;
293+
294+
ceph_mdsc_put_request(req);
295+
out:
296+
netfs_subreq_terminated(subreq, err, false);
297+
return true;
298+
}
299+
247300
static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
248301
{
249302
struct netfs_read_request *rreq = subreq->rreq;
250-
struct inode *inode = rreq->mapping->host;
303+
struct inode *inode = rreq->inode;
251304
struct ceph_inode_info *ci = ceph_inode(inode);
252305
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
253306
struct ceph_osd_request *req;
@@ -258,6 +311,10 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
258311
int err = 0;
259312
u64 len = subreq->len;
260313

314+
if (ci->i_inline_version != CEPH_INLINE_NONE &&
315+
ceph_netfs_issue_op_inline(subreq))
316+
return;
317+
261318
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
262319
0, 1, CEPH_OSD_OP_READ,
263320
CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
@@ -326,23 +383,9 @@ static int ceph_readpage(struct file *file, struct page *subpage)
326383
size_t len = folio_size(folio);
327384
u64 off = folio_file_pos(folio);
328385

329-
if (ci->i_inline_version != CEPH_INLINE_NONE) {
330-
/*
331-
* Uptodate inline data should have been added
332-
* into page cache while getting Fcr caps.
333-
*/
334-
if (off == 0) {
335-
folio_unlock(folio);
336-
return -EINVAL;
337-
}
338-
zero_user_segment(&folio->page, 0, folio_size(folio));
339-
folio_mark_uptodate(folio);
340-
folio_unlock(folio);
341-
return 0;
342-
}
343-
344-
dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n",
345-
vino.ino, vino.snap, file, off, len, folio, folio_index(folio));
386+
dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n inline %d",
387+
vino.ino, vino.snap, file, off, len, folio, folio_index(folio),
388+
ci->i_inline_version != CEPH_INLINE_NONE);
346389

347390
return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL);
348391
}
@@ -1281,45 +1324,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
12811324
struct page **pagep, void **fsdata)
12821325
{
12831326
struct inode *inode = file_inode(file);
1284-
struct ceph_inode_info *ci = ceph_inode(inode);
12851327
struct folio *folio = NULL;
1286-
pgoff_t index = pos >> PAGE_SHIFT;
12871328
int r;
12881329

1289-
/*
1290-
* Uninlining should have already been done and everything updated, EXCEPT
1291-
* for inline_version sent to the MDS.
1292-
*/
1293-
if (ci->i_inline_version != CEPH_INLINE_NONE) {
1294-
unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
1295-
if (aop_flags & AOP_FLAG_NOFS)
1296-
fgp_flags |= FGP_NOFS;
1297-
folio = __filemap_get_folio(mapping, index, fgp_flags,
1298-
mapping_gfp_mask(mapping));
1299-
if (!folio)
1300-
return -ENOMEM;
1301-
1302-
/*
1303-
* The inline_version on a new inode is set to 1. If that's the
1304-
* case, then the folio is brand new and isn't yet Uptodate.
1305-
*/
1306-
r = 0;
1307-
if (index == 0 && ci->i_inline_version != 1) {
1308-
if (!folio_test_uptodate(folio)) {
1309-
WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
1310-
ci->i_inline_version);
1311-
r = -EINVAL;
1312-
}
1313-
goto out;
1314-
}
1315-
zero_user_segment(&folio->page, 0, folio_size(folio));
1316-
folio_mark_uptodate(folio);
1317-
goto out;
1318-
}
1319-
13201330
r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL,
13211331
&ceph_netfs_read_ops, NULL);
1322-
out:
13231332
if (r == 0)
13241333
folio_wait_fscache(folio);
13251334
if (r < 0) {
@@ -1515,19 +1524,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
15151524
sb_start_pagefault(inode->i_sb);
15161525
ceph_block_sigs(&oldset);
15171526

1518-
if (ci->i_inline_version != CEPH_INLINE_NONE) {
1519-
struct page *locked_page = NULL;
1520-
if (off == 0) {
1521-
lock_page(page);
1522-
locked_page = page;
1523-
}
1524-
err = ceph_uninline_data(vma->vm_file, locked_page);
1525-
if (locked_page)
1526-
unlock_page(locked_page);
1527-
if (err < 0)
1528-
goto out_free;
1529-
}
1530-
15311527
if (off + thp_size(page) <= size)
15321528
len = thp_size(page);
15331529
else
@@ -1584,11 +1580,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
15841580
ceph_put_snap_context(snapc);
15851581
} while (err == 0);
15861582

1587-
if (ret == VM_FAULT_LOCKED ||
1588-
ci->i_inline_version != CEPH_INLINE_NONE) {
1583+
if (ret == VM_FAULT_LOCKED) {
15891584
int dirty;
15901585
spin_lock(&ci->i_ceph_lock);
1591-
ci->i_inline_version = CEPH_INLINE_NONE;
15921586
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
15931587
&prealloc_cf);
15941588
spin_unlock(&ci->i_ceph_lock);
@@ -1652,16 +1646,30 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
16521646
}
16531647
}
16541648

1655-
int ceph_uninline_data(struct file *filp, struct page *locked_page)
1649+
int ceph_uninline_data(struct file *file)
16561650
{
1657-
struct inode *inode = file_inode(filp);
1651+
struct inode *inode = file_inode(file);
16581652
struct ceph_inode_info *ci = ceph_inode(inode);
16591653
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
16601654
struct ceph_osd_request *req;
1661-
struct page *page = NULL;
1662-
u64 len, inline_version;
1655+
struct ceph_cap_flush *prealloc_cf;
1656+
struct folio *folio = NULL;
1657+
u64 inline_version = CEPH_INLINE_NONE;
1658+
struct page *pages[1];
16631659
int err = 0;
1664-
bool from_pagecache = false;
1660+
u64 len;
1661+
1662+
prealloc_cf = ceph_alloc_cap_flush();
1663+
if (!prealloc_cf)
1664+
return -ENOMEM;
1665+
1666+
folio = read_mapping_folio(inode->i_mapping, 0, file);
1667+
if (IS_ERR(folio)) {
1668+
err = PTR_ERR(folio);
1669+
goto out;
1670+
}
1671+
1672+
folio_lock(folio);
16651673

16661674
spin_lock(&ci->i_ceph_lock);
16671675
inline_version = ci->i_inline_version;
@@ -1672,53 +1680,19 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
16721680

16731681
if (inline_version == 1 || /* initial version, no data */
16741682
inline_version == CEPH_INLINE_NONE)
1675-
goto out;
1676-
1677-
if (locked_page) {
1678-
page = locked_page;
1679-
WARN_ON(!PageUptodate(page));
1680-
} else if (ceph_caps_issued(ci) &
1681-
(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
1682-
page = find_get_page(inode->i_mapping, 0);
1683-
if (page) {
1684-
if (PageUptodate(page)) {
1685-
from_pagecache = true;
1686-
lock_page(page);
1687-
} else {
1688-
put_page(page);
1689-
page = NULL;
1690-
}
1691-
}
1692-
}
1683+
goto out_unlock;
16931684

1694-
if (page) {
1695-
len = i_size_read(inode);
1696-
if (len > PAGE_SIZE)
1697-
len = PAGE_SIZE;
1698-
} else {
1699-
page = __page_cache_alloc(GFP_NOFS);
1700-
if (!page) {
1701-
err = -ENOMEM;
1702-
goto out;
1703-
}
1704-
err = __ceph_do_getattr(inode, page,
1705-
CEPH_STAT_CAP_INLINE_DATA, true);
1706-
if (err < 0) {
1707-
/* no inline data */
1708-
if (err == -ENODATA)
1709-
err = 0;
1710-
goto out;
1711-
}
1712-
len = err;
1713-
}
1685+
len = i_size_read(inode);
1686+
if (len > folio_size(folio))
1687+
len = folio_size(folio);
17141688

17151689
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
17161690
ceph_vino(inode), 0, &len, 0, 1,
17171691
CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
17181692
NULL, 0, 0, false);
17191693
if (IS_ERR(req)) {
17201694
err = PTR_ERR(req);
1721-
goto out;
1695+
goto out_unlock;
17221696
}
17231697

17241698
req->r_mtime = inode->i_mtime;
@@ -1727,7 +1701,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
17271701
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
17281702
ceph_osdc_put_request(req);
17291703
if (err < 0)
1730-
goto out;
1704+
goto out_unlock;
17311705

17321706
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
17331707
ceph_vino(inode), 0, &len, 1, 3,
@@ -1736,10 +1710,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
17361710
ci->i_truncate_size, false);
17371711
if (IS_ERR(req)) {
17381712
err = PTR_ERR(req);
1739-
goto out;
1713+
goto out_unlock;
17401714
}
17411715

1742-
osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
1716+
pages[0] = folio_page(folio, 0);
1717+
osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);
17431718

17441719
{
17451720
__le64 xattr_buf = cpu_to_le64(inline_version);
@@ -1749,7 +1724,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
17491724
CEPH_OSD_CMPXATTR_OP_GT,
17501725
CEPH_OSD_CMPXATTR_MODE_U64);
17511726
if (err)
1752-
goto out_put;
1727+
goto out_put_req;
17531728
}
17541729

17551730
{
@@ -1760,7 +1735,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
17601735
"inline_version",
17611736
xattr_buf, xattr_len, 0, 0);
17621737
if (err)
1763-
goto out_put;
1738+
goto out_put_req;
17641739
}
17651740

17661741
req->r_mtime = inode->i_mtime;
@@ -1771,19 +1746,28 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
17711746
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
17721747
req->r_end_latency, len, err);
17731748

1774-
out_put:
1749+
if (!err) {
1750+
int dirty;
1751+
1752+
/* Set to CAP_INLINE_NONE and dirty the caps */
1753+
down_read(&fsc->mdsc->snap_rwsem);
1754+
spin_lock(&ci->i_ceph_lock);
1755+
ci->i_inline_version = CEPH_INLINE_NONE;
1756+
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
1757+
spin_unlock(&ci->i_ceph_lock);
1758+
up_read(&fsc->mdsc->snap_rwsem);
1759+
if (dirty)
1760+
__mark_inode_dirty(inode, dirty);
1761+
}
1762+
out_put_req:
17751763
ceph_osdc_put_request(req);
17761764
if (err == -ECANCELED)
17771765
err = 0;
1766+
out_unlock:
1767+
folio_unlock(folio);
1768+
folio_put(folio);
17781769
out:
1779-
if (page && page != locked_page) {
1780-
if (from_pagecache) {
1781-
unlock_page(page);
1782-
put_page(page);
1783-
} else
1784-
__free_pages(page, 0);
1785-
}
1786-
1770+
ceph_free_cap_flush(prealloc_cf);
17871771
dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
17881772
inode, ceph_vinop(inode), inline_version, err);
17891773
return err;

0 commit comments

Comments
 (0)