From ba1b22b8339b1c1d2e510ff7cfa93c725b42bf89 Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Tue, 9 Jun 2026 18:50:49 +0100 Subject: [PATCH 1/2] Use HeadObject instead of GetObject for S3 metadata S3Client._get_metadata read object metadata via ObjectSummary.get() (a GetObject), which issues a full object read and leaves an unread response stream instead of returning the connection to the pool. Switch to head_object, which reads only the metadata and keeps Content-Length on S3-compatible gateways that drop it from GetObject responses, while bringing S3 in line with the other backends. Since head_object returns no body, a missing key surfaces as a generic 404 ClientError rather than NoSuchKey, so S3Path.stat now translates a 404 to NoStatError and lets other errors (e.g. 403) propagate. Updates the S3 test mock and adds regression tests. Closes #564 Co-Authored-By: Claude Opus 4.8 --- HISTORY.md | 1 + cloudpathlib/s3/s3client.py | 8 +++++--- cloudpathlib/s3/s3path.py | 13 +++++++++---- tests/mock_clients/mock_s3.py | 30 +++++++++++++++++++++++------- tests/test_cloudpath_file_io.py | 10 ++++++++++ tests/test_s3_specific.py | 17 +++++++++++++++++ 6 files changed, 65 insertions(+), 14 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 8d5166e0..a9bd511d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,7 @@ ## UNRELEASED - Fixed mypy 2.x type errors in `Client` and `CloudPath` that caused CI lint failures (Issue [#563](https://github.com/drivendataorg/cloudpathlib/issues/563), PR [#566](https://github.com/drivendataorg/cloudpathlib/pull/566)) +- Changed `S3Client._get_metadata` to read object metadata with `HeadObject` instead of `GetObject`, so `stat`, `etag`, and `size` no longer open the object body. Also fixes a `KeyError` on `ContentLength` against S3-compatible gateways that drop `Content-Length` from `GetObject` responses. (Issue [#564](https://github.com/drivendataorg/cloudpathlib/issues/564), PR [#TODO]) ## v0.24.0 (2026-04-29) - Added support for S3 Multi-Region Access Point (MRAP) URLs in `S3Path` (Issue [#556](https://github.com/drivendataorg/cloudpathlib/issues/556), PR [#557](https://github.com/drivendataorg/cloudpathlib/pull/557)) diff --git a/cloudpathlib/s3/s3client.py b/cloudpathlib/s3/s3client.py index 87e45a17..6d052273 100644 --- a/cloudpathlib/s3/s3client.py +++ b/cloudpathlib/s3/s3client.py @@ -135,9 +135,11 @@ def __init__( ) def _get_metadata(self, cloud_path: S3Path) -> Dict[str, Any]: - # get accepts all download extra args - data = self.s3.ObjectSummary(cloud_path.bucket, cloud_path.key).get( - **self.boto3_dl_extra_args + # head_object accepts all download extra args and reads metadata without the body + data = self.client.head_object( + Bucket=cloud_path.bucket, + Key=cloud_path.key, + **self.boto3_dl_extra_args, ) return { diff --git a/cloudpathlib/s3/s3path.py b/cloudpathlib/s3/s3path.py index 118d08ae..dd849915 100644 --- a/cloudpathlib/s3/s3path.py +++ b/cloudpathlib/s3/s3path.py @@ -60,10 +60,15 @@ def touch(self, exist_ok: bool = True, mode: Optional[Any] = None): def stat(self, follow_symlinks=True): try: meta = self.client._get_metadata(self) - except self.client.client.exceptions.NoSuchKey: - raise NoStatError( - f"No stats available for {self}; it may be a directory or not exist." - ) + except self.client.client.exceptions.ClientError as error: + # head_object returns a 404 (not NoSuchKey) for a missing key; let other errors raise + error_info = error.response.get("Error", {}) + status_code = error.response.get("ResponseMetadata", {}).get("HTTPStatusCode") + if error_info.get("Code") in ("404", "NoSuchKey") or status_code == 404: + raise NoStatError( + f"No stats available for {self}; it may be a directory or not exist." + ) + raise return os.stat_result( ( diff --git a/tests/mock_clients/mock_s3.py b/tests/mock_clients/mock_s3.py index 7f2a7aaa..18153c8d 100644 --- a/tests/mock_clients/mock_s3.py +++ b/tests/mock_clients/mock_s3.py @@ -220,11 +220,27 @@ def list_buckets(self): return {"Buckets": [{"Name": DEFAULT_S3_BUCKET_NAME}]} def head_object(self, Bucket, Key, **kwargs): - if not (self.root / Key).exists() or (self.root / Key).is_dir(): - raise ClientError({}, {}) - if Bucket != DEFAULT_S3_BUCKET_NAME and ".mrap" not in Bucket: - raise ClientError({}, {}) - return {"key": Key} + path = self.root / Key + if ( + not path.exists() + or path.is_dir() + or (Bucket != DEFAULT_S3_BUCKET_NAME and ".mrap" not in Bucket) + ): + # missing key -> 404 ClientError (head_object has no body, so not NoSuchKey) + raise ClientError( + { + "Error": {"Code": "404", "Message": "Not Found"}, + "ResponseMetadata": {"HTTPStatusCode": 404}, + }, + "HeadObject", + ) + return { + "LastModified": datetime.fromtimestamp(path.stat().st_mtime), + "ContentLength": path.stat().st_size, + "ETag": hash(str(path)), + "ContentType": self.session.metadata_cache.get(path, None), + "Metadata": {}, + } def generate_presigned_url(self, op: str, Params: dict, ExpiresIn: int): mock_presigned_url = f"https://{Params['Bucket']}.s3.amazonaws.com/{Params['Key']}?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=TEST%2FTEST%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240131T194721Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=TEST" @@ -232,8 +248,8 @@ def generate_presigned_url(self, op: str, Params: dict, ExpiresIn: int): @property def exceptions(self): - Ex = collections.namedtuple("Ex", "NoSuchKey") - return Ex(NoSuchKey=NoSuchKey) + Ex = collections.namedtuple("Ex", "NoSuchKey ClientError") + return Ex(NoSuchKey=NoSuchKey, ClientError=ClientError) class MockBoto3Paginator: diff --git a/tests/test_cloudpath_file_io.py b/tests/test_cloudpath_file_io.py index a7f6f0e9..7a177c80 100644 --- a/tests/test_cloudpath_file_io.py +++ b/tests/test_cloudpath_file_io.py @@ -13,6 +13,7 @@ CloudPathIsADirectoryError, CloudPathNotImplementedError, DirectoryNotEmptyError, + NoStatError, ) from cloudpathlib.http.httpclient import HttpClient, HttpsClient from cloudpathlib.http.httppath import HttpPath, HttpsPath @@ -362,6 +363,15 @@ def test_is_dir_is_file(rig, tmp_path): assert not non_existent.is_dir() +def test_stat_nonexistent(rig): + # stat on a path that does not exist raises NoStatError on every backend + non_existent = rig.create_cloud_path("dir_0/not_a_real_file.txt") + assert not non_existent.exists() + + with pytest.raises(NoStatError): + non_existent.stat() + + def test_file_read_writes(rig, tmp_path): p = rig.create_cloud_path("dir_0/file0_0.txt") p2 = rig.create_cloud_path("dir_0/not_a_file.txt") diff --git a/tests/test_s3_specific.py b/tests/test_s3_specific.py index b32ff538..7e9bd324 100644 --- a/tests/test_s3_specific.py +++ b/tests/test_s3_specific.py @@ -3,6 +3,7 @@ import os from time import sleep import time +from unittest.mock import patch from urllib.parse import urlparse, parse_qs import pytest @@ -56,6 +57,22 @@ def test_transfer_config(s3_rig, tmp_path): p2.unlink() +def test_get_metadata_uses_head_object(s3_rig): + # metadata lookups should use HeadObject, not GetObject (issue #564) + client = s3_rig.client_class() + client.set_as_default_client() + p = s3_rig.create_cloud_path("dir_0/file0_0.txt") + + head_spy = patch.object(client.client, "head_object", wraps=client.client.head_object) + get_spy = patch.object(client.s3, "ObjectSummary", wraps=client.s3.ObjectSummary) + with head_spy as head_object, get_spy as object_summary: + meta = client._get_metadata(p) + + head_object.assert_called_once() + object_summary.assert_not_called() + assert set(meta) == {"last_modified", "size", "etag", "content_type", "extra"} + + def _download_with_threads(s3_rig, tmp_path, use_threads): """Job used by tests to ensure Transfer config changes are actually passed through to boto3 and respected. From 8f0a5367b141da42b05ab0230ce95da2a60477a8 Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Tue, 9 Jun 2026 19:21:48 +0100 Subject: [PATCH 2/2] add PR number --- HISTORY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index a9bd511d..70eedb1e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,7 +3,7 @@ ## UNRELEASED - Fixed mypy 2.x type errors in `Client` and `CloudPath` that caused CI lint failures (Issue [#563](https://github.com/drivendataorg/cloudpathlib/issues/563), PR [#566](https://github.com/drivendataorg/cloudpathlib/pull/566)) -- Changed `S3Client._get_metadata` to read object metadata with `HeadObject` instead of `GetObject`, so `stat`, `etag`, and `size` no longer open the object body. Also fixes a `KeyError` on `ContentLength` against S3-compatible gateways that drop `Content-Length` from `GetObject` responses. (Issue [#564](https://github.com/drivendataorg/cloudpathlib/issues/564), PR [#TODO]) +- Changed `S3Client._get_metadata` to read object metadata with `HeadObject` instead of `GetObject`, so `stat`, `etag`, and `size` no longer open the object body. Also fixes a `KeyError` on `ContentLength` against S3-compatible gateways that drop `Content-Length` from `GetObject` responses. (Issue [#564](https://github.com/drivendataorg/cloudpathlib/issues/564), PR [#565](https://github.com/drivendataorg/cloudpathlib/pull/565)) ## v0.24.0 (2026-04-29) - Added support for S3 Multi-Region Access Point (MRAP) URLs in `S3Path` (Issue [#556](https://github.com/drivendataorg/cloudpathlib/issues/556), PR [#557](https://github.com/drivendataorg/cloudpathlib/pull/557))