Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## UNRELEASED

- Fixed mypy 2.x type errors in `Client` and `CloudPath` that caused CI lint failures (Issue [#563](https://github.com/drivendataorg/cloudpathlib/issues/563), PR [#566](https://github.com/drivendataorg/cloudpathlib/pull/566))
- Changed `S3Client._get_metadata` to read object metadata with `HeadObject` instead of `GetObject`, so `stat`, `etag`, and `size` no longer open the object body. Also fixes a `KeyError` on `ContentLength` against S3-compatible gateways that drop `Content-Length` from `GetObject` responses. (Issue [#564](https://github.com/drivendataorg/cloudpathlib/issues/564), PR [#565](https://github.com/drivendataorg/cloudpathlib/pull/565))

## v0.24.0 (2026-04-29)
- Added support for S3 Multi-Region Access Point (MRAP) URLs in `S3Path` (Issue [#556](https://github.com/drivendataorg/cloudpathlib/issues/556), PR [#557](https://github.com/drivendataorg/cloudpathlib/pull/557))
Expand Down
8 changes: 5 additions & 3 deletions cloudpathlib/s3/s3client.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,11 @@ def __init__(
)

def _get_metadata(self, cloud_path: S3Path) -> Dict[str, Any]:
# get accepts all download extra args
data = self.s3.ObjectSummary(cloud_path.bucket, cloud_path.key).get(
**self.boto3_dl_extra_args
# head_object accepts all download extra args and reads metadata without the body
data = self.client.head_object(
Bucket=cloud_path.bucket,
Key=cloud_path.key,
**self.boto3_dl_extra_args,

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does head_object respect what we collect in these kwargs?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

)

return {
Expand Down
13 changes: 9 additions & 4 deletions cloudpathlib/s3/s3path.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,15 @@ def touch(self, exist_ok: bool = True, mode: Optional[Any] = None):
def stat(self, follow_symlinks=True):
try:
meta = self.client._get_metadata(self)
except self.client.client.exceptions.NoSuchKey:
raise NoStatError(
f"No stats available for {self}; it may be a directory or not exist."
)
except self.client.client.exceptions.ClientError as error:
# head_object returns a 404 (not NoSuchKey) for a missing key; let other errors raise
error_info = error.response.get("Error", {})
status_code = error.response.get("ResponseMetadata", {}).get("HTTPStatusCode")
if error_info.get("Code") in ("404", "NoSuchKey") or status_code == 404:
raise NoStatError(
f"No stats available for {self}; it may be a directory or not exist."
)
raise

return os.stat_result(
(
Expand Down
30 changes: 23 additions & 7 deletions tests/mock_clients/mock_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,20 +220,36 @@ def list_buckets(self):
return {"Buckets": [{"Name": DEFAULT_S3_BUCKET_NAME}]}

def head_object(self, Bucket, Key, **kwargs):
if not (self.root / Key).exists() or (self.root / Key).is_dir():
raise ClientError({}, {})
if Bucket != DEFAULT_S3_BUCKET_NAME and ".mrap" not in Bucket:
raise ClientError({}, {})
return {"key": Key}
path = self.root / Key
if (
not path.exists()
or path.is_dir()
or (Bucket != DEFAULT_S3_BUCKET_NAME and ".mrap" not in Bucket)
):
# missing key -> 404 ClientError (head_object has no body, so not NoSuchKey)
raise ClientError(
{
"Error": {"Code": "404", "Message": "Not Found"},
"ResponseMetadata": {"HTTPStatusCode": 404},
},
"HeadObject",
)
return {
"LastModified": datetime.fromtimestamp(path.stat().st_mtime),
"ContentLength": path.stat().st_size,
"ETag": hash(str(path)),
"ContentType": self.session.metadata_cache.get(path, None),
"Metadata": {},
}

def generate_presigned_url(self, op: str, Params: dict, ExpiresIn: int):
    """Return a canned presigned-style URL for the mock S3 client.

    Only ``Params["Bucket"]`` and ``Params["Key"]`` influence the result;
    ``op`` and ``ExpiresIn`` are accepted but not used, and the signing
    query string is a fixed test value.
    """
    bucket = Params["Bucket"]
    key = Params["Key"]
    return f"https://{bucket}.s3.amazonaws.com/{key}?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=TEST%2FTEST%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240131T194721Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=TEST"

@property
def exceptions(self):
Ex = collections.namedtuple("Ex", "NoSuchKey")
return Ex(NoSuchKey=NoSuchKey)
Ex = collections.namedtuple("Ex", "NoSuchKey ClientError")
return Ex(NoSuchKey=NoSuchKey, ClientError=ClientError)


class MockBoto3Paginator:
Expand Down
10 changes: 10 additions & 0 deletions tests/test_cloudpath_file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
CloudPathIsADirectoryError,
CloudPathNotImplementedError,
DirectoryNotEmptyError,
NoStatError,
)
from cloudpathlib.http.httpclient import HttpClient, HttpsClient
from cloudpathlib.http.httppath import HttpPath, HttpsPath
Expand Down Expand Up @@ -362,6 +363,15 @@ def test_is_dir_is_file(rig, tmp_path):
assert not non_existent.is_dir()


def test_stat_nonexistent(rig):
    """Calling ``stat()`` on a path that does not exist raises ``NoStatError``."""
    missing_path = rig.create_cloud_path("dir_0/not_a_real_file.txt")

    # Precondition: the path really is absent before stat() is attempted.
    assert not missing_path.exists()

    with pytest.raises(NoStatError):
        missing_path.stat()


def test_file_read_writes(rig, tmp_path):
p = rig.create_cloud_path("dir_0/file0_0.txt")
p2 = rig.create_cloud_path("dir_0/not_a_file.txt")
Expand Down
17 changes: 17 additions & 0 deletions tests/test_s3_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
from time import sleep
import time
from unittest.mock import patch

from urllib.parse import urlparse, parse_qs
import pytest
Expand Down Expand Up @@ -56,6 +57,22 @@ def test_transfer_config(s3_rig, tmp_path):
p2.unlink()


def test_get_metadata_uses_head_object(s3_rig):
    """Metadata lookups go through HeadObject rather than GetObject (issue #564)."""
    client = s3_rig.client_class()
    client.set_as_default_client()
    target = s3_rig.create_cloud_path("dir_0/file0_0.txt")

    # Wrap (not replace) both entry points so the real behaviour still runs
    # while the calls are recorded.
    head_patcher = patch.object(client.client, "head_object", wraps=client.client.head_object)
    summary_patcher = patch.object(client.s3, "ObjectSummary", wraps=client.s3.ObjectSummary)

    with head_patcher as head_object:
        with summary_patcher as object_summary:
            meta = client._get_metadata(target)

    expected_keys = {"last_modified", "size", "etag", "content_type", "extra"}

    head_object.assert_called_once()
    object_summary.assert_not_called()
    assert set(meta) == expected_keys


def _download_with_threads(s3_rig, tmp_path, use_threads):
"""Job used by tests to ensure Transfer config changes are
actually passed through to boto3 and respected.
Expand Down
Loading