From 6490021b4f19b745f70b4a26578ad0b0549d0889 Mon Sep 17 00:00:00 2001 From: tewbo Date: Sat, 21 Mar 2026 06:27:27 +0300 Subject: [PATCH 01/36] + otel sync tracing support --- examples/opentelemetry/compose-e2e.yaml | 61 +++ examples/opentelemetry/example.py | 42 +++ .../grafana/dashboards/README.md | 5 + .../provisioning/dashboards/dashboards.yaml | 13 + .../provisioning/datasources/datasources.yaml | 22 ++ .../opentelemetry/otel-collector-config.yaml | 44 +++ examples/opentelemetry/prometheus.yaml | 7 + examples/opentelemetry/tempo.yaml | 15 + examples/opentelemetry/ydb_config/README.md | 28 ++ .../ydb_config/otel-tracing-snippet.yaml | 26 ++ .../ydb_config/ydb-config-with-tracing.yaml | 349 ++++++++++++++++++ setup.py | 1 + ydb/connection.py | 4 + ydb/opentelemetry/__init__.py | 8 + ydb/opentelemetry/_plugin.py | 96 +++++ ydb/opentelemetry/tracing.py | 85 +++++ ydb/pool.py | 10 +- ydb/query/session.py | 54 +-- ydb/query/transaction.py | 93 +++-- 19 files changed, 894 insertions(+), 69 deletions(-) create mode 100644 examples/opentelemetry/compose-e2e.yaml create mode 100644 examples/opentelemetry/example.py create mode 100644 examples/opentelemetry/grafana/dashboards/README.md create mode 100644 examples/opentelemetry/grafana/provisioning/dashboards/dashboards.yaml create mode 100644 examples/opentelemetry/grafana/provisioning/datasources/datasources.yaml create mode 100644 examples/opentelemetry/otel-collector-config.yaml create mode 100644 examples/opentelemetry/prometheus.yaml create mode 100644 examples/opentelemetry/tempo.yaml create mode 100644 examples/opentelemetry/ydb_config/README.md create mode 100644 examples/opentelemetry/ydb_config/otel-tracing-snippet.yaml create mode 100644 examples/opentelemetry/ydb_config/ydb-config-with-tracing.yaml create mode 100644 ydb/opentelemetry/__init__.py create mode 100644 ydb/opentelemetry/_plugin.py create mode 100644 ydb/opentelemetry/tracing.py diff --git a/examples/opentelemetry/compose-e2e.yaml 
b/examples/opentelemetry/compose-e2e.yaml new file mode 100644 index 00000000..933d9a38 --- /dev/null +++ b/examples/opentelemetry/compose-e2e.yaml @@ -0,0 +1,61 @@ +version: "3.3" +services: + ydb: + image: ydbplatform/local-ydb:trunk + restart: always + hostname: localhost + platform: linux/amd64 + environment: + YDB_DEFAULT_LOG_LEVEL: NOTICE + GRPC_TLS_PORT: "2135" + GRPC_PORT: "2136" + MON_PORT: "8765" + YDB_USE_IN_MEMORY_PDISKS: "true" + command: [ "--config-path", "/ydb_config/ydb-config-with-tracing.yaml" ] + ports: + - "2135:2135" + - "2136:2136" + - "8765:8765" + volumes: + - ./ydb_config:/ydb_config:ro + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + command: [ "--config=/etc/otelcol/config.yaml" ] + volumes: + - ./otel-collector-config.yaml:/etc/otelcol/config.yaml:ro + ports: + - "4317:4317" + - "4318:4318" + - "9464:9464" + - "13133:13133" + - "13317:55679" + + prometheus: + image: prom/prometheus:latest + volumes: + - ./prometheus.yaml:/etc/prometheus/prometheus.yml:ro + ports: + - "9090:9090" + depends_on: [ otel-collector ] + + tempo: + image: grafana/tempo:2.4.1 + command: [ "-config.file=/etc/tempo.yaml" ] + volumes: + - ./tempo.yaml:/etc/tempo.yaml:ro + ports: + - "3200:3200" + depends_on: [ otel-collector ] + + grafana: + image: grafana/grafana:10.4.2 + environment: + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_ROLE: "Admin" + volumes: + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + ports: + - "3000:3000" + depends_on: [ prometheus, tempo ] diff --git a/examples/opentelemetry/example.py b/examples/opentelemetry/example.py new file mode 100644 index 00000000..797caff3 --- /dev/null +++ b/examples/opentelemetry/example.py @@ -0,0 +1,42 @@ +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter 
import OTLPSpanExporter +from opentelemetry.sdk.resources import Resource + +from random import randint + +resource = Resource(attributes={"service.name": "ydb-python-test"}) + +provider = TracerProvider(resource=resource) + +provider.add_span_processor( + BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317/v1/traces")) # или 4317 grpc +) +trace.set_tracer_provider(provider) + +tracer = trace.get_tracer(__name__) + +import ydb +from ydb.opentelemetry import enable_tracing + +enable_tracing() + +endpoint = "grpc://localhost:2136" +database = "/local" + +with ydb.Driver(endpoint=endpoint, database=database, credentials=ydb.default_credentials()) as driver: + driver.wait(timeout=5) + + with tracer.start_as_current_span("ydb-load-test"): + with ydb.QuerySessionPool(driver) as pool: + pool.execute_with_retries("CREATE TABLE IF NOT EXISTS example(key UInt64, value String, PRIMARY KEY (key))") + rand_value = randint(10000, 99999) + for i in range(rand_value, rand_value + 3): + val = f"value{i}" + pool.execute_with_retries(f"INSERT INTO example (key, value) VALUES ({i}, '{val}')") + + res = pool.execute_with_retries("SELECT * FROM example") + print(res.pop().rows) + +provider.shutdown() \ No newline at end of file diff --git a/examples/opentelemetry/grafana/dashboards/README.md b/examples/opentelemetry/grafana/dashboards/README.md new file mode 100644 index 00000000..eb47493a --- /dev/null +++ b/examples/opentelemetry/grafana/dashboards/README.md @@ -0,0 +1,5 @@ +This folder is intentionally left empty. + +Grafana is provisioned with Tempo + Prometheus datasources; use **Explore** to search traces. 
+ + diff --git a/examples/opentelemetry/grafana/provisioning/dashboards/dashboards.yaml b/examples/opentelemetry/grafana/provisioning/dashboards/dashboards.yaml new file mode 100644 index 00000000..5ccefdc1 --- /dev/null +++ b/examples/opentelemetry/grafana/provisioning/dashboards/dashboards.yaml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: true + editable: false + options: + path: /var/lib/grafana/dashboards + + diff --git a/examples/opentelemetry/grafana/provisioning/datasources/datasources.yaml b/examples/opentelemetry/grafana/provisioning/datasources/datasources.yaml new file mode 100644 index 00000000..05ba5bd9 --- /dev/null +++ b/examples/opentelemetry/grafana/provisioning/datasources/datasources.yaml @@ -0,0 +1,22 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false + + - name: Tempo + type: tempo + access: proxy + url: http://tempo:3200 + editable: false + jsonData: + tracesToMetrics: + datasourceUid: Prometheus + serviceMap: + datasourceUid: Prometheus + + diff --git a/examples/opentelemetry/otel-collector-config.yaml b/examples/opentelemetry/otel-collector-config.yaml new file mode 100644 index 00000000..7f784445 --- /dev/null +++ b/examples/opentelemetry/otel-collector-config.yaml @@ -0,0 +1,44 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: { } + +exporters: + prometheus: + endpoint: 0.0.0.0:9464 + resource_to_telemetry_conversion: + enabled: true + + otlp/tempo: + endpoint: tempo:4317 + tls: + insecure: true + + debug: + verbosity: detailed + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + + zpages: + endpoint: 0.0.0.0:55679 + +service: + extensions: [ health_check, zpages ] + pipelines: + metrics: + receivers: [ otlp ] + processors: [ batch ] + exporters: [ prometheus ] + + traces: + 
receivers: [ otlp ] + processors: [ batch ] + exporters: [ otlp/tempo, debug ] diff --git a/examples/opentelemetry/prometheus.yaml b/examples/opentelemetry/prometheus.yaml new file mode 100644 index 00000000..64b31821 --- /dev/null +++ b/examples/opentelemetry/prometheus.yaml @@ -0,0 +1,7 @@ +global: + scrape_interval: 5s + +scrape_configs: + - job_name: otel-collector + static_configs: + - targets: ["otel-collector:9464"] diff --git a/examples/opentelemetry/tempo.yaml b/examples/opentelemetry/tempo.yaml new file mode 100644 index 00000000..43dbb19c --- /dev/null +++ b/examples/opentelemetry/tempo.yaml @@ -0,0 +1,15 @@ +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + +storage: + trace: + backend: local + local: + path: /tmp/tempo diff --git a/examples/opentelemetry/ydb_config/README.md b/examples/opentelemetry/ydb_config/README.md new file mode 100644 index 00000000..cbffaaba --- /dev/null +++ b/examples/opentelemetry/ydb_config/README.md @@ -0,0 +1,28 @@ +# YDB server-side tracing (OpenTelemetry) + +This folder is used to keep a **custom YDB config** that enables server-side OpenTelemetry tracing. + +## 1) Export the default config from a running container + +If YDB is running as `ydb-local`: + +```bash +docker cp ydb-local:/ydb_data/cluster/kikimr_configs/config.yaml ./ydb_config/ydb-config.yaml +``` + +## 2) Enable OpenTelemetry exporter in the config + +Edit `ydb-config.yaml` and add the contents of `otel-tracing-snippet.yaml` (usually as a top-level section). 
+ +Default OTLP endpoint (inside docker-compose network): `grpc://otel-collector:4317` +Default service name (so you can find it in Tempo/Grafana): `ydb` + +## 3) Run with the overridden config + +Restart YDB (the main `compose-e2e.yaml` always starts YDB with `--config-path /ydb_config/ydb-config-with-tracing.yaml`, so save the edited config under that name in this folder): + +```bash +docker-compose -f compose-e2e.yaml up -d --force-recreate ydb +``` + +Now you should see additional server-side traces in Tempo/Grafana (service name defaults to `ydb` in the snippet). diff --git a/examples/opentelemetry/ydb_config/otel-tracing-snippet.yaml b/examples/opentelemetry/ydb_config/otel-tracing-snippet.yaml new file mode 100644 index 00000000..bd5978d2 --- /dev/null +++ b/examples/opentelemetry/ydb_config/otel-tracing-snippet.yaml @@ -0,0 +1,26 @@ +tracing_config: + backend: + opentelemetry: + collector_url: grpc://otel-collector:4317 + service_name: ydb + external_throttling: + - scope: + database: /local + max_traces_per_minute: 60 + max_traces_burst: 3 + # Highest tracing detail for *sampled* traces (YDB-generated trace-id). + # Note: requests with an external `traceparent` are traced at level 13 (Detailed) per YDB docs. 
+ sampling: + - scope: + database: /local + fraction: 1 + level: 15 + max_traces_per_minute: 1000 + max_traces_burst: 100 + uploader: + max_exported_spans_per_second: 30 + max_spans_in_batch: 100 + max_bytes_in_batch: 10485760 # 10 MiB + max_export_requests_inflight: 3 + max_batch_accumulation_milliseconds: 5000 + span_export_timeout_seconds: 120 diff --git a/examples/opentelemetry/ydb_config/ydb-config-with-tracing.yaml b/examples/opentelemetry/ydb_config/ydb-config-with-tracing.yaml new file mode 100644 index 00000000..ef93d0e6 --- /dev/null +++ b/examples/opentelemetry/ydb_config/ydb-config-with-tracing.yaml @@ -0,0 +1,349 @@ +actor_system_config: + batch_executor: 2 + executor: + - name: System + spin_threshold: 0 + threads: 2 + type: BASIC + - name: User + spin_threshold: 0 + threads: 3 + type: BASIC + - name: Batch + spin_threshold: 0 + threads: 2 + type: BASIC + - name: IO + threads: 1 + time_per_mailbox_micro_secs: 100 + type: IO + - name: IC + spin_threshold: 10 + threads: 1 + time_per_mailbox_micro_secs: 100 + type: BASIC + io_executor: 3 + scheduler: + progress_threshold: 10000 + resolution: 1024 + spin_threshold: 0 + service_executor: + - executor_id: 4 + service_name: Interconnect + sys_executor: 0 + user_executor: 1 +blob_storage_config: + service_set: + availability_domains: 1 + groups: + - erasure_species: 0 + group_generation: 1 + group_id: 0 + rings: + - fail_domains: + - vdisk_locations: + - node_id: 1 + pdisk_guid: 1 + pdisk_id: 1 + vdisk_slot_id: 0 + pdisks: + - node_id: 1 + path: SectorMap:1:64 + pdisk_category: 0 + pdisk_guid: 1 + pdisk_id: 1 + vdisks: + - vdisk_id: + domain: 0 + group_generation: 1 + group_id: 0 + ring: 0 + vdisk: 0 + vdisk_location: + node_id: 1 + pdisk_guid: 1 + pdisk_id: 1 + vdisk_slot_id: 0 +channel_profile_config: + profile: + - channel: + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: hdd + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: hdd + - erasure_species: none + pdisk_category: 
0 + storage_pool_kind: hdd + profile_id: 0 + - channel: + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: hdd + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: hdd + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: hdd + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: hdd + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: hdd + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: hdd + - erasure_species: none + pdisk_category: 0 + storage_pool_kind: hdd + profile_id: 1 +domains_config: + domain: + - domain_id: 1 + name: local + storage_pool_types: + - kind: hdd + pool_config: + box_id: 1 + erasure_species: none + kind: hdd + pdisk_filter: + - property: + - type: ROT + vdisk_kind: Default + - kind: hdd1 + pool_config: + box_id: 1 + erasure_species: none + kind: hdd + pdisk_filter: + - property: + - type: ROT + vdisk_kind: Default + - kind: hdd2 + pool_config: + box_id: 1 + erasure_species: none + kind: hdd + pdisk_filter: + - property: + - type: ROT + vdisk_kind: Default + - kind: hdde + pool_config: + box_id: 1 + encryption_mode: 1 + erasure_species: none + kind: hdd + pdisk_filter: + - property: + - type: ROT + vdisk_kind: Default + security_config: + default_users: + - name: root + password: '1234' + state_storage: + - ring: + nto_select: 1 + ring: + - node: + - 1 + use_ring_specific_node_selection: true + ssid: 1 +feature_flags: + enable_drain_on_shutdown: false + enable_mvcc_snapshot_reads: true + enable_persistent_query_stats: true + enable_public_api_external_blobs: false + enable_scheme_transactions_at_scheme_shard: true +federated_query_config: + audit: + enabled: false + uaconfig: + uri: '' + checkpoint_coordinator: + checkpointing_period_millis: 1000 + enabled: true + max_inflight: 1 + storage: + endpoint: '' + common: + ids_prefix: pt + use_bearer_for_ydb: true + control_plane_proxy: + enabled: true + request_timeout: 30s + control_plane_storage: + 
available_binding: + - DATA_STREAMS + - OBJECT_STORAGE + available_connection: + - YDB_DATABASE + - CLICKHOUSE_CLUSTER + - DATA_STREAMS + - OBJECT_STORAGE + - MONITORING + enabled: true + storage: + endpoint: '' + db_pool: + enabled: true + storage: + endpoint: '' + enabled: false + gateways: + dq: + default_settings: [] + enabled: true + pq: + cluster_mapping: [] + solomon: + cluster_mapping: [] + nodes_manager: + enabled: true + pending_fetcher: + enabled: true + pinger: + ping_period: 30s + private_api: + enabled: true + private_proxy: + enabled: true + resource_manager: + enabled: true + token_accessor: + enabled: true +grpc_config: + ca: /ydb_certs/ca.pem + cert: /ydb_certs/cert.pem + host: '[::]' + key: /ydb_certs/key.pem + services: + - nbs + - legacy + - tablet_service + - yql + - discovery + - cms + - locking + - kesus + - pq + - pqcd + - pqv1 + - topic + - datastreams + - scripting + - clickhouse_internal + - rate_limiter + - analytics + - export + - import + - yq + - keyvalue + - monitoring + - auth + - query_service + - view +interconnect_config: + start_tcp: true +kafka_proxy_config: + enable_kafka_proxy: true + listening_port: 9092 +kqpconfig: + settings: + - name: _ResultRowsLimit + value: '1000' +log_config: + default_level: 5 + entry: [] + sys_log: false +nameservice_config: + node: + - address: ::1 + host: localhost + node_id: 1 + port: 19001 + walle_location: + body: 1 + data_center: '1' + rack: '1' +net_classifier_config: + cms_config_timeout_seconds: 30 + net_data_file_path: /ydb_data/netData.tsv + updater_config: + net_data_update_interval_seconds: 60 + retry_interval_seconds: 30 +pqcluster_discovery_config: + enabled: false +pqconfig: + check_acl: false + cluster_table_path: '' + clusters_update_timeout_sec: 1 + enable_proto_source_id_info: true + enabled: true + max_storage_node_port: 65535 + meta_cache_timeout_sec: 1 + quoting_config: + enable_quoting: false + require_credentials_in_new_protocol: false + root: '' + 
topics_are_first_class_citizen: true + version_table_path: '' +sqs_config: + enable_dead_letter_queues: true + enable_sqs: false + force_queue_creation_v2: true + force_queue_deletion_v2: true + scheme_cache_hard_refresh_time_seconds: 0 + scheme_cache_soft_refresh_time_seconds: 0 +static_erasure: none +system_tablets: + default_node: + - 1 + flat_schemeshard: + - info: + tablet_id: 72057594046678944 + flat_tx_coordinator: + - node: + - 1 + tx_allocator: + - node: + - 1 + tx_mediator: + - node: + - 1 +table_service_config: + resource_manager: + channel_buffer_size: 262144 + mkql_heavy_program_memory_limit: 1048576 + mkql_light_program_memory_limit: 65536 + verbose_memory_limit_exception: true + sql_version: 1 +tracing_config: + backend: + opentelemetry: + collector_url: grpc://otel-collector:4317 + service_name: ydb + external_throttling: + - scope: + database: /local + max_traces_per_minute: 1000 + max_traces_burst: 100 + sampling: + - scope: + database: /local + fraction: 1.0 + level: 15 + max_traces_per_minute: 1000 +# max_traces_burst: 100 + uploader: + max_exported_spans_per_second: 30 + max_spans_in_batch: 100 + max_bytes_in_batch: 10485760 # 10 MiB + max_export_requests_inflight: 3 + max_batch_accumulation_milliseconds: 5000 + span_export_timeout_seconds: 120 diff --git a/setup.py b/setup.py index da56aebf..454fc467 100644 --- a/setup.py +++ b/setup.py @@ -37,5 +37,6 @@ options={"bdist_wheel": {"universal": True}}, extras_require={ "yc": ["yandexcloud", ], + "tracing": ["opentelemetry-api>=1.0.0", "opentelemetry-sdk>=1.0.0"], } ) diff --git a/ydb/connection.py b/ydb/connection.py index 85187f65..50f4571d 100644 --- a/ydb/connection.py +++ b/ydb/connection.py @@ -24,6 +24,7 @@ import grpc from . import issues, _apis, _utilities from . 
import default_pem +from .opentelemetry.tracing import get_trace_metadata _stubs_list = ( _apis.TableService.Stub, @@ -176,6 +177,9 @@ def _construct_metadata(driver_config, settings): metadata.extend(getattr(settings, "headers", [])) metadata.append(_utilities.x_ydb_sdk_build_info_header(getattr(driver_config, "_additional_sdk_headers", ()))) + + metadata.extend(get_trace_metadata()) + return metadata diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py new file mode 100644 index 00000000..c732dbf4 --- /dev/null +++ b/ydb/opentelemetry/__init__.py @@ -0,0 +1,8 @@ +def enable_tracing(): + """Enable OpenTelemetry trace context propagation and span creation for all YDB gRPC calls.""" + from ydb.opentelemetry._plugin import _enable_tracing + + _enable_tracing() + + +__all__ = ["enable_tracing"] diff --git a/ydb/opentelemetry/_plugin.py b/ydb/opentelemetry/_plugin.py new file mode 100644 index 00000000..828e7000 --- /dev/null +++ b/ydb/opentelemetry/_plugin.py @@ -0,0 +1,96 @@ +from contextlib import contextmanager + +_MIN_OTEL_VERSION = "1.0.0" + +_tracer = None +_enabled = False + + +def _check_dependencies(): + try: + from opentelemetry.version import __version__ as otel_version + except ImportError: + raise ImportError( + "OpenTelemetry packages are required for tracing support. " + "Install them with: pip install ydb[tracing]" + ) from None + + from packaging.version import Version + + if Version(otel_version) < Version(_MIN_OTEL_VERSION): + raise ImportError( + f"OpenTelemetry >= {_MIN_OTEL_VERSION} is required, " + f"but {otel_version} is installed. 
" + "Upgrade with: pip install ydb[tracing]" + ) + + +def _otel_metadata_hook(): + """Injects W3C Trace Context (traceparent/tracestate) into gRPC metadata.""" + from opentelemetry.propagate import inject + + headers = {} + inject(headers) + return list(headers.items()) + + +@contextmanager +def _otel_span(name, attributes=None, kind=None): + from opentelemetry import trace + + kind_map = { + "client": trace.SpanKind.CLIENT, + "internal": trace.SpanKind.INTERNAL, + } + otel_kind = kind_map.get(kind, trace.SpanKind.CLIENT) + with _tracer.start_as_current_span( + name, + kind=otel_kind, + attributes=attributes or {}, + ) as span: + try: + yield span + except Exception as e: + _otel_set_error(span, e) + raise + + +def _otel_set_error(span, exception): + """Records an exception on the span and sets ERROR status.""" + if span is None: + return + + from opentelemetry.trace import StatusCode + from ydb import issues + + attrs = {} + if isinstance(exception, issues.Error): + status_code = getattr(exception, "status", None) + if status_code is not None: + attrs["db.response.status_code"] = str(status_code) + attrs["error.type"] = status_code.name + else: + attrs["error.type"] = type(exception).__qualname__ + else: + attrs["error.type"] = type(exception).__qualname__ + + span.set_attributes(attrs) + span.set_status(StatusCode.ERROR, str(exception)) + span.record_exception(exception) + + +def _enable_tracing(): + global _enabled, _tracer + + if _enabled: + return + + _check_dependencies() + + from opentelemetry import trace + from ydb.opentelemetry.tracing import _registry + + _tracer = trace.get_tracer("ydb.sdk") + _enabled = True + _registry.set_metadata_hook(_otel_metadata_hook) + _registry.set_span_factory(_otel_span) diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py new file mode 100644 index 00000000..265eff94 --- /dev/null +++ b/ydb/opentelemetry/tracing.py @@ -0,0 +1,85 @@ +from contextlib import contextmanager + + +@contextmanager +def 
_noop_span(name, attributes=None, kind=None): + yield None + + +class OtelTracingRegistry: + """Singleton registry for OpenTelemetry tracing. + + Holds the span factory and metadata hook. + By default everything is no-op until :func:`enable_tracing` is called + from :mod:`ydb.opentelemetry`. + """ + + def __init__(self): + self._metadata_hook = None + self._span_factory = _noop_span + + def create_span(self, name, attributes=None, kind=None): + """Create a tracing span (context manager).""" + return self._span_factory(name, attributes, kind=kind) + + def get_trace_metadata(self): + """Return tracing metadata (e.g. W3C traceparent) for gRPC calls.""" + if self._metadata_hook is not None: + return self._metadata_hook() + return [] + + def set_metadata_hook(self, hook): + """Set a hook that returns tracing metadata for gRPC calls. + + *hook* must be a callable returning a list of ``(key, value)`` tuples. + """ + self._metadata_hook = hook + + def set_span_factory(self, factory): + """Set a span factory for tracing SDK operations. + + *factory* must be a context-manager factory: + ``factory(name, attributes, kind) -> context manager yielding span``. + """ + self._span_factory = factory + + +_registry = OtelTracingRegistry() + + + +def create_span(name, attributes=None, kind=None): + """Create a tracing span via the global registry.""" + return _registry.create_span(name, attributes, kind) + + +def get_trace_metadata(): + """Return tracing metadata for gRPC calls.""" + return _registry.get_trace_metadata() + + +def create_ydb_span(name, driver_config, session_id=None, node_id=None, tx_id=None, kind=None): + """Create a span pre-filled with standard YDB attributes. + + :param name: Span name (e.g. ``"ydb.ExecuteQuery"``). + :param driver_config: :class:`ydb.DriverConfig` instance. + :param session_id: Optional session ID. + :param node_id: Optional node ID. + :param tx_id: Optional transaction ID. + :param kind: Optional span kind (``"client"`` or ``"internal"``). 
+ """ + endpoint = getattr(driver_config, "endpoint", None) or "" + host, _, port = endpoint.rpartition(":") + attrs = { + "db.system.name": "ydb", + "db.namespace": getattr(driver_config, "database", None) or "", + "server.address": host, + "server.port": int(port) if port.isdigit() else 0, + } + if session_id is not None: + attrs["ydb.session.id"] = session_id or "" + if node_id is not None: + attrs["ydb.node.id"] = node_id or 0 + if tx_id is not None: + attrs["ydb.tx.id"] = tx_id or "" + return _registry.create_span(name, attributes=attrs, kind=kind) diff --git a/ydb/pool.py b/ydb/pool.py index 1d1374e6..bf03a612 100644 --- a/ydb/pool.py +++ b/ydb/pool.py @@ -10,6 +10,7 @@ from typing import Any, Callable, ContextManager, List, Optional, Set, Tuple, TYPE_CHECKING from . import connection as connection_impl, issues, resolver, _utilities, tracing +from .opentelemetry.tracing import create_ydb_span from abc import abstractmethod from .connection import Connection, EndpointKey @@ -412,10 +413,11 @@ def wait(self, timeout: Optional[float] = None, fail_fast: bool = False) -> None :param timeout: A timeout to wait in seconds :return: None """ - if fail_fast: - self._store.add_fast_fail().result(timeout) - else: - self._store.subscribe().result(timeout) + with create_ydb_span("ydb.Driver.Initialize", self._driver_config, kind="internal"): + if fail_fast: + self._store.add_fast_fail().result(timeout) + else: + self._store.subscribe().result(timeout) def _on_disconnected(self, connection: Connection) -> None: """ diff --git a/ydb/query/session.py b/ydb/query/session.py index b21c6ba4..d9f379f1 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -18,6 +18,7 @@ from .base import QueryExplainResultFormat from .. 
import _apis, issues, _utilities +from ..opentelemetry.tracing import create_ydb_span from ..settings import BaseRequestSettings from ..connection import _RpcState as RpcState, EndpointKey from .._grpc.grpcwrapper import common_utils @@ -368,8 +369,9 @@ def create(self, settings: Optional[BaseRequestSettings] = None) -> "QuerySessio if self._closed: raise RuntimeError("Session is already closed.") - self._create_call(settings=settings) - self._attach() + with create_ydb_span("ydb.CreateSession", self._driver._driver_config): + self._create_call(settings=settings) + self._attach() return self @@ -435,30 +437,32 @@ def execute( """ self._check_session_ready_to_use() - stream_it = self._execute_call( - query=query, - parameters=parameters, - commit_tx=True, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - concurrent_result_sets=concurrent_result_sets, - settings=settings, - ) + with create_ydb_span("ydb.ExecuteQuery", self._driver._driver_config, + session_id=self._session_id, node_id=self._node_id): + stream_it = self._execute_call( + query=query, + parameters=parameters, + commit_tx=True, + syntax=syntax, + exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + concurrent_result_sets=concurrent_result_sets, + settings=settings, + ) - return base.SyncResponseContextIterator( - stream_it, - lambda resp: base.wrap_execute_query_response( - rpc_state=None, - response_pb=resp, - session=self, - settings=self._settings, - ), - on_error=self._on_execute_stream_error, - ) + return base.SyncResponseContextIterator( + stream_it, + lambda resp: base.wrap_execute_query_response( + rpc_state=None, + response_pb=resp, + session=self, + settings=self._settings, + ), + on_error=self._on_execute_stream_error, + 
) def explain( self, diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index 687a5eaf..8631ba52 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -17,6 +17,7 @@ _apis, issues, ) +from ..opentelemetry.tracing import create_ydb_span from .._grpc.grpcwrapper import ydb_topic as _ydb_topic from .._grpc.grpcwrapper import ydb_query as _ydb_query from ..connection import _RpcState as RpcState @@ -553,13 +554,17 @@ def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: self._ensure_prev_stream_finished() - try: - self._execute_callbacks_sync(base.TxEvent.BEFORE_COMMIT) - self._commit_call(settings) - self._execute_callbacks_sync(base.TxEvent.AFTER_COMMIT, exc=None) - except BaseException as e: # TODO: probably should be less wide - self._execute_callbacks_sync(base.TxEvent.AFTER_COMMIT, exc=e) - raise e + with create_ydb_span("ydb.Commit", self._driver._driver_config, + session_id=self.session.session_id, + node_id=self.session.node_id, + tx_id=self._tx_state.tx_id): + try: + self._execute_callbacks_sync(base.TxEvent.BEFORE_COMMIT) + self._commit_call(settings) + self._execute_callbacks_sync(base.TxEvent.AFTER_COMMIT, exc=None) + except BaseException as e: # TODO: probably should be less wide + self._execute_callbacks_sync(base.TxEvent.AFTER_COMMIT, exc=e) + raise e def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None: """Calls rollback on a transaction if it is open otherwise is no-op. 
If transaction execution @@ -579,13 +584,17 @@ def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None: self._ensure_prev_stream_finished() - try: - self._execute_callbacks_sync(base.TxEvent.BEFORE_ROLLBACK) - self._rollback_call(settings) - self._execute_callbacks_sync(base.TxEvent.AFTER_ROLLBACK, exc=None) - except BaseException as e: # TODO: probably should be less wide - self._execute_callbacks_sync(base.TxEvent.AFTER_ROLLBACK, exc=e) - raise e + with create_ydb_span("ydb.Rollback", self._driver._driver_config, + session_id=self.session.session_id, + node_id=self.session.node_id, + tx_id=self._tx_state.tx_id): + try: + self._execute_callbacks_sync(base.TxEvent.BEFORE_ROLLBACK) + self._rollback_call(settings) + self._execute_callbacks_sync(base.TxEvent.AFTER_ROLLBACK, exc=None) + except BaseException as e: # TODO: probably should be less wide + self._execute_callbacks_sync(base.TxEvent.AFTER_ROLLBACK, exc=e) + raise e def execute( self, @@ -634,30 +643,34 @@ def execute( """ self._ensure_prev_stream_finished() - stream_it = self._execute_call( - query=query, - commit_tx=commit_tx, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - parameters=parameters, - concurrent_result_sets=concurrent_result_sets, - settings=settings, - ) - - self._prev_stream = base.SyncResponseContextIterator( - stream_it, - lambda resp: base.wrap_execute_query_response( - rpc_state=None, - response_pb=resp, - session=self.session, - tx=self, + with create_ydb_span("ydb.ExecuteQuery", self._driver._driver_config, + session_id=self.session.session_id, + node_id=self.session.node_id, + tx_id=self._tx_state.tx_id): + stream_it = self._execute_call( + query=query, commit_tx=commit_tx, - settings=self.session._settings, - ), - on_error=self.session._on_execute_stream_error, - ) - return self._prev_stream + syntax=syntax, + 
exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + parameters=parameters, + concurrent_result_sets=concurrent_result_sets, + settings=settings, + ) + + self._prev_stream = base.SyncResponseContextIterator( + stream_it, + lambda resp: base.wrap_execute_query_response( + rpc_state=None, + response_pb=resp, + session=self.session, + tx=self, + commit_tx=commit_tx, + settings=self.session._settings, + ), + on_error=self.session._on_execute_stream_error, + ) + return self._prev_stream From 5998749c1647611aeaf4eaa07948782369f33d0d Mon Sep 17 00:00:00 2001 From: tewbo Date: Sat, 21 Mar 2026 17:53:37 +0300 Subject: [PATCH 02/36] + add async spans --- examples/opentelemetry/example_async.py | 48 ++++++++++ .../{example.py => example_sync.py} | 9 +- examples/opentelemetry/example_tx.py | 60 ++++++++++++ ydb/aio/connection.py | 4 + ydb/aio/pool.py | 4 +- ydb/aio/query/session.py | 54 ++++++----- ydb/aio/query/transaction.py | 93 +++++++++++-------- 7 files changed, 201 insertions(+), 71 deletions(-) create mode 100644 examples/opentelemetry/example_async.py rename examples/opentelemetry/{example.py => example_sync.py} (81%) create mode 100644 examples/opentelemetry/example_tx.py diff --git a/examples/opentelemetry/example_async.py b/examples/opentelemetry/example_async.py new file mode 100644 index 00000000..4e7aa87a --- /dev/null +++ b/examples/opentelemetry/example_async.py @@ -0,0 +1,48 @@ +import asyncio + +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.resources import Resource + +from random import randint + +resource = Resource(attributes={"service.name": "ydb-python-test-async"}) + +provider = TracerProvider(resource=resource) + 
+provider.add_span_processor( + BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317/v1/traces")) +) +trace.set_tracer_provider(provider) + +tracer = trace.get_tracer(__name__) + +import ydb +from ydb.opentelemetry import enable_tracing + +enable_tracing() + +endpoint = "grpc://localhost:2136" +database = "/local" + + +async def main(): + async with ydb.aio.Driver(endpoint=endpoint, database=database, credentials=ydb.default_credentials()) as driver: + await driver.wait(timeout=5) + + with tracer.start_as_current_span("ydb-load-test-async"): + async with ydb.aio.QuerySessionPool(driver) as pool: + await pool.execute_with_retries("CREATE TABLE IF NOT EXISTS example(key UInt64, value String, PRIMARY KEY (key))") + rand_value = randint(10000, 100000) + val = f"value{rand_value}" + await pool.execute_with_retries(f"INSERT INTO example (key, value) VALUES ({rand_value}, '{val}')") + + res = await pool.execute_with_retries("SELECT * FROM example") + print(res.pop().rows) + + +asyncio.run(main()) + +provider.shutdown() diff --git a/examples/opentelemetry/example.py b/examples/opentelemetry/example_sync.py similarity index 81% rename from examples/opentelemetry/example.py rename to examples/opentelemetry/example_sync.py index 797caff3..dc9f45e8 100644 --- a/examples/opentelemetry/example.py +++ b/examples/opentelemetry/example_sync.py @@ -11,7 +11,7 @@ provider = TracerProvider(resource=resource) provider.add_span_processor( - BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317/v1/traces")) # или 4317 grpc + BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317/v1/traces")) ) trace.set_tracer_provider(provider) @@ -31,10 +31,9 @@ with tracer.start_as_current_span("ydb-load-test"): with ydb.QuerySessionPool(driver) as pool: pool.execute_with_retries("CREATE TABLE IF NOT EXISTS example(key UInt64, value String, PRIMARY KEY (key))") - rand_value = randint(10000, 99999) - for i in range(rand_value, rand_value + 3): - val = 
f"value{i}" - pool.execute_with_retries(f"INSERT INTO example (key, value) VALUES ({i}, '{val}')") + rand_value = randint(10000, 100000) + val = f"value{rand_value}" + pool.execute_with_retries(f"INSERT INTO example (key, value) VALUES ({rand_value}, '{val}')") res = pool.execute_with_retries("SELECT * FROM example") print(res.pop().rows) diff --git a/examples/opentelemetry/example_tx.py b/examples/opentelemetry/example_tx.py new file mode 100644 index 00000000..38f74de0 --- /dev/null +++ b/examples/opentelemetry/example_tx.py @@ -0,0 +1,60 @@ +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.resources import Resource + +from random import randint + +resource = Resource(attributes={"service.name": "ydb-python-test-tx"}) + +provider = TracerProvider(resource=resource) + +provider.add_span_processor( + BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317/v1/traces")) +) +trace.set_tracer_provider(provider) + +tracer = trace.get_tracer(__name__) + +import ydb +from ydb.opentelemetry import enable_tracing + +enable_tracing() + +endpoint = "grpc://localhost:2136" +database = "/local" + +with ydb.Driver(endpoint=endpoint, database=database, credentials=ydb.default_credentials()) as driver: + driver.wait(timeout=5) + + with tracer.start_as_current_span("ydb-tx-test"): + pool = ydb.QuerySessionPool(driver) + + pool.execute_with_retries("CREATE TABLE IF NOT EXISTS example(key UInt64, value String, PRIMARY KEY (key))") + + # Commit example: insert a row inside an explicit transaction + with tracer.start_as_current_span("commit-flow"): + with pool.checkout() as session: + with session.transaction() as tx: + rand_value = randint(10000, 100000) + with tx.execute(f"INSERT INTO example (key, value) VALUES ({rand_value}, 'committed')") as _: + pass + tx.commit() + 
+ # Rollback example: insert a row and then rollback + with tracer.start_as_current_span("rollback-flow"): + with pool.checkout() as session: + with session.transaction() as tx: + rand_value = randint(10000, 100000) + with tx.execute(f"INSERT INTO example (key, value) VALUES ({rand_value}, 'rolled_back')") as _: + pass + tx.rollback() + + # Verify: only the committed row should be present + res = pool.execute_with_retries("SELECT * FROM example ORDER BY key") + print(res.pop().rows) + + pool.stop() + +provider.shutdown() diff --git a/ydb/aio/connection.py b/ydb/aio/connection.py index 9e03450d..a0278c41 100644 --- a/ydb/aio/connection.py +++ b/ydb/aio/connection.py @@ -26,6 +26,7 @@ from ydb.driver import DriverConfig from ydb.settings import BaseRequestSettings from ydb import issues +from ydb.opentelemetry.tracing import get_trace_metadata # Workaround for good IDE and universal for runtime if TYPE_CHECKING: @@ -71,6 +72,9 @@ async def _construct_metadata( metadata.append((YDB_REQUEST_TYPE_HEADER, settings.request_type)) metadata.append(_utilities.x_ydb_sdk_build_info_header(getattr(driver_config, "_additional_sdk_headers", ()))) + + metadata.extend(get_trace_metadata()) + return metadata diff --git a/ydb/aio/pool.py b/ydb/aio/pool.py index 0e96602c..e3aafbe0 100644 --- a/ydb/aio/pool.py +++ b/ydb/aio/pool.py @@ -6,6 +6,7 @@ from typing import Any, Callable, Optional, Tuple, TYPE_CHECKING from ydb import issues +from ydb.opentelemetry.tracing import create_ydb_span from ydb.pool import ConnectionsCache as _ConnectionsCache, IConnectionPool from .connection import Connection, EndpointKey @@ -244,7 +245,8 @@ async def __wrapper__() -> None: return __wrapper__ async def wait(self, timeout: Optional[float] = 7.0, fail_fast: bool = False) -> None: # type: ignore[override] # async override of sync method - await self._store.get(fast_fail=fail_fast, wait_timeout=timeout if timeout is not None else 7.0) + with create_ydb_span("ydb.Driver.Initialize", self._driver_config, 
kind="internal"): + await self._store.get(fast_fail=fail_fast, wait_timeout=timeout if timeout is not None else 7.0) def discovery_debug_details(self) -> str: if self._discovery: diff --git a/ydb/aio/query/session.py b/ydb/aio/query/session.py index 67e62ff6..80a236df 100644 --- a/ydb/aio/query/session.py +++ b/ydb/aio/query/session.py @@ -19,6 +19,7 @@ from ...query import base from ...query.session import BaseQuerySession +from ...opentelemetry.tracing import create_ydb_span from ..._constants import DEFAULT_INITIAL_RESPONSE_TIMEOUT @@ -105,8 +106,9 @@ async def create(self, settings: Optional[BaseRequestSettings] = None) -> "Query if self._closed: raise RuntimeError("Session is already closed") - await self._create_call(settings=settings) - await self._attach() + with create_ydb_span("ydb.CreateSession", self._driver._driver_config): + await self._create_call(settings=settings) + await self._attach() return self @@ -159,30 +161,32 @@ async def execute( """ self._check_session_ready_to_use() - stream_it = await self._execute_call( - query=query, - parameters=parameters, - commit_tx=True, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - concurrent_result_sets=concurrent_result_sets, - settings=settings, - ) + with create_ydb_span("ydb.ExecuteQuery", self._driver._driver_config, + session_id=self._session_id, node_id=self._node_id): + stream_it = await self._execute_call( + query=query, + parameters=parameters, + commit_tx=True, + syntax=syntax, + exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + concurrent_result_sets=concurrent_result_sets, + settings=settings, + ) - return AsyncResponseContextIterator( - it=stream_it, - wrapper=lambda resp: base.wrap_execute_query_response( - 
rpc_state=None, - response_pb=resp, - session=self, - settings=self._settings, - ), - on_error=self._on_execute_stream_error, - ) + return AsyncResponseContextIterator( + it=stream_it, + wrapper=lambda resp: base.wrap_execute_query_response( + rpc_state=None, + response_pb=resp, + session=self, + settings=self._settings, + ), + on_error=self._on_execute_stream_error, + ) async def explain( self, diff --git a/ydb/aio/query/transaction.py b/ydb/aio/query/transaction.py index 69c77478..746831a4 100644 --- a/ydb/aio/query/transaction.py +++ b/ydb/aio/query/transaction.py @@ -12,6 +12,7 @@ BaseQueryTxContext, QueryTxStateEnum, ) +from ...opentelemetry.tracing import create_ydb_span if TYPE_CHECKING: from .session import QuerySession @@ -106,13 +107,17 @@ async def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: await self._ensure_prev_stream_finished() - try: - await self._execute_callbacks_async(base.TxEvent.BEFORE_COMMIT) - await self._commit_call(settings) - await self._execute_callbacks_async(base.TxEvent.AFTER_COMMIT, exc=None) - except BaseException as e: - await self._execute_callbacks_async(base.TxEvent.AFTER_COMMIT, exc=e) - raise e + with create_ydb_span("ydb.Commit", self._driver._driver_config, + session_id=self.session.session_id, + node_id=self.session.node_id, + tx_id=self._tx_state.tx_id): + try: + await self._execute_callbacks_async(base.TxEvent.BEFORE_COMMIT) + await self._commit_call(settings) + await self._execute_callbacks_async(base.TxEvent.AFTER_COMMIT, exc=None) + except BaseException as e: + await self._execute_callbacks_async(base.TxEvent.AFTER_COMMIT, exc=e) + raise e async def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None: """Calls rollback on a transaction if it is open otherwise is no-op. 
If transaction execution @@ -133,13 +138,17 @@ async def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None await self._ensure_prev_stream_finished() - try: - await self._execute_callbacks_async(base.TxEvent.BEFORE_ROLLBACK) - await self._rollback_call(settings) - await self._execute_callbacks_async(base.TxEvent.AFTER_ROLLBACK, exc=None) - except BaseException as e: - await self._execute_callbacks_async(base.TxEvent.AFTER_ROLLBACK, exc=e) - raise e + with create_ydb_span("ydb.Rollback", self._driver._driver_config, + session_id=self.session.session_id, + node_id=self.session.node_id, + tx_id=self._tx_state.tx_id): + try: + await self._execute_callbacks_async(base.TxEvent.BEFORE_ROLLBACK) + await self._rollback_call(settings) + await self._execute_callbacks_async(base.TxEvent.AFTER_ROLLBACK, exc=None) + except BaseException as e: + await self._execute_callbacks_async(base.TxEvent.AFTER_ROLLBACK, exc=e) + raise e async def execute( self, @@ -187,30 +196,34 @@ async def execute( """ await self._ensure_prev_stream_finished() - stream_it = await self._execute_call( - query=query, - parameters=parameters, - commit_tx=commit_tx, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - concurrent_result_sets=concurrent_result_sets, - settings=settings, - ) - - self._prev_stream = AsyncResponseContextIterator( - it=stream_it, - wrapper=lambda resp: base.wrap_execute_query_response( - rpc_state=None, - response_pb=resp, - session=self.session, - tx=self, + with create_ydb_span("ydb.ExecuteQuery", self._driver._driver_config, + session_id=self.session.session_id, + node_id=self.session.node_id, + tx_id=self._tx_state.tx_id): + stream_it = await self._execute_call( + query=query, + parameters=parameters, commit_tx=commit_tx, - settings=self.session._settings, - ), - on_error=self.session._on_execute_stream_error, - ) 
- return self._prev_stream + syntax=syntax, + exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + concurrent_result_sets=concurrent_result_sets, + settings=settings, + ) + + self._prev_stream = AsyncResponseContextIterator( + it=stream_it, + wrapper=lambda resp: base.wrap_execute_query_response( + rpc_state=None, + response_pb=resp, + session=self.session, + tx=self, + commit_tx=commit_tx, + settings=self.session._settings, + ), + on_error=self.session._on_execute_stream_error, + ) + return self._prev_stream From db012124823276b05946908d8dd1e650d6857520 Mon Sep 17 00:00:00 2001 From: tewbo Date: Tue, 24 Mar 2026 07:47:18 +0300 Subject: [PATCH 03/36] + test and refactor --- examples/opentelemetry/example.py | 65 ++++++ examples/opentelemetry/example_async.py | 48 ---- examples/opentelemetry/example_sync.py | 41 ---- examples/opentelemetry/example_tx.py | 60 ----- tests/tracing/__init__.py | 0 tests/tracing/conftest.py | 55 +++++ tests/tracing/test_tracing_async.py | 243 ++++++++++++++++++++ tests/tracing/test_tracing_sync.py | 281 ++++++++++++++++++++++++ ydb/aio/query/base.py | 18 +- ydb/aio/query/session.py | 13 +- ydb/aio/query/transaction.py | 41 +++- ydb/opentelemetry/__init__.py | 8 +- ydb/opentelemetry/_plugin.py | 122 +++++----- ydb/opentelemetry/tracing.py | 69 +++--- ydb/query/base.py | 18 +- ydb/query/session.py | 13 +- ydb/query/transaction.py | 41 +++- 17 files changed, 853 insertions(+), 283 deletions(-) create mode 100644 examples/opentelemetry/example.py delete mode 100644 examples/opentelemetry/example_async.py delete mode 100644 examples/opentelemetry/example_sync.py delete mode 100644 examples/opentelemetry/example_tx.py create mode 100644 tests/tracing/__init__.py create mode 100644 tests/tracing/conftest.py create mode 100644 tests/tracing/test_tracing_async.py create mode 100644 tests/tracing/test_tracing_sync.py diff 
--git a/examples/opentelemetry/example.py b/examples/opentelemetry/example.py new file mode 100644 index 00000000..fad3111a --- /dev/null +++ b/examples/opentelemetry/example.py @@ -0,0 +1,65 @@ +"""Minimal example: OpenTelemetry tracing for YDB Python SDK.""" + +import asyncio + +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.resources import Resource + +import ydb +from ydb.opentelemetry import enable_tracing + +resource = Resource(attributes={"service.name": "ydb-example"}) +provider = TracerProvider(resource=resource) +provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317"))) +trace.set_tracer_provider(provider) + +enable_tracing() + +tracer = trace.get_tracer(__name__) + +ENDPOINT = "grpc://localhost:2136" +DATABASE = "/local" + + +def sync_example(): + """Sync: session execute and transaction execute + commit.""" + with ydb.Driver(endpoint=ENDPOINT, database=DATABASE) as driver: + driver.wait(timeout=5) + + with ydb.QuerySessionPool(driver) as pool: + with tracer.start_as_current_span("sync-example"): + pool.execute_with_retries("SELECT 1") + + def tx_callee(session): + with session.transaction() as tx: + list(tx.execute("SELECT 1")) + tx.commit() + + pool.retry_operation_sync(tx_callee) + + +async def async_example(): + """Async: session execute and transaction execute + commit.""" + async with ydb.aio.Driver(endpoint=ENDPOINT, database=DATABASE) as driver: + await driver.wait(timeout=5) + + async with ydb.aio.QuerySessionPool(driver) as pool: + with tracer.start_as_current_span("async-example"): + await pool.execute_with_retries("SELECT 1") + + async def tx_callee(session): + async with session.transaction() as tx: + result = await tx.execute("SELECT 1") + async for _ in result: + pass + await tx.commit() + + 
await pool.retry_operation_async(tx_callee) + +sync_example() +asyncio.run(async_example()) + +provider.shutdown() diff --git a/examples/opentelemetry/example_async.py b/examples/opentelemetry/example_async.py deleted file mode 100644 index 4e7aa87a..00000000 --- a/examples/opentelemetry/example_async.py +++ /dev/null @@ -1,48 +0,0 @@ -import asyncio - -from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.resources import Resource - -from random import randint - -resource = Resource(attributes={"service.name": "ydb-python-test-async"}) - -provider = TracerProvider(resource=resource) - -provider.add_span_processor( - BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317/v1/traces")) -) -trace.set_tracer_provider(provider) - -tracer = trace.get_tracer(__name__) - -import ydb -from ydb.opentelemetry import enable_tracing - -enable_tracing() - -endpoint = "grpc://localhost:2136" -database = "/local" - - -async def main(): - async with ydb.aio.Driver(endpoint=endpoint, database=database, credentials=ydb.default_credentials()) as driver: - await driver.wait(timeout=5) - - with tracer.start_as_current_span("ydb-load-test-async"): - async with ydb.aio.QuerySessionPool(driver) as pool: - await pool.execute_with_retries("CREATE TABLE IF NOT EXISTS example(key UInt64, value String, PRIMARY KEY (key))") - rand_value = randint(10000, 100000) - val = f"value{rand_value}" - await pool.execute_with_retries(f"INSERT INTO example (key, value) VALUES ({rand_value}, '{val}')") - - res = await pool.execute_with_retries("SELECT * FROM example") - print(res.pop().rows) - - -asyncio.run(main()) - -provider.shutdown() diff --git a/examples/opentelemetry/example_sync.py b/examples/opentelemetry/example_sync.py deleted file mode 100644 index dc9f45e8..00000000 --- 
a/examples/opentelemetry/example_sync.py +++ /dev/null @@ -1,41 +0,0 @@ -from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.resources import Resource - -from random import randint - -resource = Resource(attributes={"service.name": "ydb-python-test"}) - -provider = TracerProvider(resource=resource) - -provider.add_span_processor( - BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317/v1/traces")) -) -trace.set_tracer_provider(provider) - -tracer = trace.get_tracer(__name__) - -import ydb -from ydb.opentelemetry import enable_tracing - -enable_tracing() - -endpoint = "grpc://localhost:2136" -database = "/local" - -with ydb.Driver(endpoint=endpoint, database=database, credentials=ydb.default_credentials()) as driver: - driver.wait(timeout=5) - - with tracer.start_as_current_span("ydb-load-test"): - with ydb.QuerySessionPool(driver) as pool: - pool.execute_with_retries("CREATE TABLE IF NOT EXISTS example(key UInt64, value String, PRIMARY KEY (key))") - rand_value = randint(10000, 100000) - val = f"value{rand_value}" - pool.execute_with_retries(f"INSERT INTO example (key, value) VALUES ({rand_value}, '{val}')") - - res = pool.execute_with_retries("SELECT * FROM example") - print(res.pop().rows) - -provider.shutdown() \ No newline at end of file diff --git a/examples/opentelemetry/example_tx.py b/examples/opentelemetry/example_tx.py deleted file mode 100644 index 38f74de0..00000000 --- a/examples/opentelemetry/example_tx.py +++ /dev/null @@ -1,60 +0,0 @@ -from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.resources import Resource - -from random import 
randint - -resource = Resource(attributes={"service.name": "ydb-python-test-tx"}) - -provider = TracerProvider(resource=resource) - -provider.add_span_processor( - BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317/v1/traces")) -) -trace.set_tracer_provider(provider) - -tracer = trace.get_tracer(__name__) - -import ydb -from ydb.opentelemetry import enable_tracing - -enable_tracing() - -endpoint = "grpc://localhost:2136" -database = "/local" - -with ydb.Driver(endpoint=endpoint, database=database, credentials=ydb.default_credentials()) as driver: - driver.wait(timeout=5) - - with tracer.start_as_current_span("ydb-tx-test"): - pool = ydb.QuerySessionPool(driver) - - pool.execute_with_retries("CREATE TABLE IF NOT EXISTS example(key UInt64, value String, PRIMARY KEY (key))") - - # Commit example: insert a row inside an explicit transaction - with tracer.start_as_current_span("commit-flow"): - with pool.checkout() as session: - with session.transaction() as tx: - rand_value = randint(10000, 100000) - with tx.execute(f"INSERT INTO example (key, value) VALUES ({rand_value}, 'committed')") as _: - pass - tx.commit() - - # Rollback example: insert a row and then rollback - with tracer.start_as_current_span("rollback-flow"): - with pool.checkout() as session: - with session.transaction() as tx: - rand_value = randint(10000, 100000) - with tx.execute(f"INSERT INTO example (key, value) VALUES ({rand_value}, 'rolled_back')") as _: - pass - tx.rollback() - - # Verify: only the committed row should be present - res = pool.execute_with_retries("SELECT * FROM example ORDER BY key") - print(res.pop().rows) - - pool.stop() - -provider.shutdown() diff --git a/tests/tracing/__init__.py b/tests/tracing/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/tracing/conftest.py b/tests/tracing/conftest.py new file mode 100644 index 00000000..b54ffd05 --- /dev/null +++ b/tests/tracing/conftest.py @@ -0,0 +1,55 @@ +"""Shared fixtures for OpenTelemetry 
tracing tests. + +Sets up an in-memory TracerProvider so that spans created by the SDK +can be collected and inspected without any external backend. +""" + +import pytest + +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter + +from ydb.opentelemetry.tracing import _registry + + +_provider = TracerProvider() +_exporter = InMemorySpanExporter() +_provider.add_span_processor(SimpleSpanProcessor(_exporter)) +trace.set_tracer_provider(_provider) + + +@pytest.fixture() +def otel_setup(): + """Enable SDK tracing, yield the exporter, then restore noop defaults. + + Each test gets a clean exporter (cleared before and after). + """ + import ydb.opentelemetry._plugin as _plugin + + _exporter.clear() + + _plugin._enabled = False + _plugin._tracer = None + + from ydb.opentelemetry import enable_tracing + + enable_tracing() + + yield _exporter + + # Restore noop state + _registry.set_create_span(None) + _registry.set_metadata_hook(None) + _plugin._enabled = False + _plugin._tracer = None + _exporter.clear() + + +class FakeDriverConfig: + def __init__(self, endpoint="test_endpoint:1337", database="/test_database"): + self.endpoint = endpoint + self.database = database + self.query_client_settings = None + self.tracer = None diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py new file mode 100644 index 00000000..08bcddba --- /dev/null +++ b/tests/tracing/test_tracing_async.py @@ -0,0 +1,243 @@ +"""Unit tests for OpenTelemetry tracing — asynchronous SDK operations. + +Mirrors the sync tests but exercises the async code paths in ydb.aio.query. 
+""" + +from opentelemetry.trace import StatusCode, SpanKind +from ydb.query.transaction import QueryTxStateEnum +from .conftest import FakeDriverConfig +from unittest.mock import AsyncMock, MagicMock, patch + +import asyncio +import pytest + +async def _empty_async_iter(): + return + yield # noqa: makes this an async generator + + +def _get_spans(exporter, name=None): + spans = exporter.get_finished_spans() + if name is not None: + spans = [s for s in spans if s.name == name] + return spans + + +def _get_single_span(exporter, name): + spans = _get_spans(exporter, name) + assert len(spans) == 1, f"Expected 1 span named '{name}', got {len(spans)}: {[s.name for s in exporter.get_finished_spans()]}" + return spans[0] + + +def _make_async_session_mock(driver_config=None): + """Create a mock that behaves like an async QuerySession after create().""" + cfg = driver_config or FakeDriverConfig() + driver = MagicMock() + driver._driver_config = cfg + + session = MagicMock() + session._driver = driver + session._session_id = "test-session-id" + session._node_id = 12345 + session.session_id = "test-session-id" + session.node_id = 12345 + return session, driver + + +def _make_async_tx(session, driver): + """Create a real async QueryTxContext wired to mocked session/driver.""" + from ydb._grpc.grpcwrapper.ydb_query_public_types import QuerySerializableReadWrite + from ydb.aio.query.transaction import QueryTxContext + + tx = QueryTxContext(driver, session, QuerySerializableReadWrite()) + tx._tx_state._change_state(QueryTxStateEnum.BEGINED) + tx._tx_state.tx_id = "test-tx-id" + return tx + + +class TestAsyncCreateSessionSpan: + @pytest.mark.asyncio + async def test_create_session_emits_span(self, otel_setup): + exporter = otel_setup + + from ydb.aio.query.session import QuerySession + + qs = QuerySession.__new__(QuerySession) + cfg = FakeDriverConfig() + driver = MagicMock() + driver._driver_config = cfg + qs._driver = driver + qs._session_id = None + qs._closed = False + + with 
patch.object(QuerySession, "_create_call", new_callable=AsyncMock): + with patch.object(QuerySession, "_attach", new_callable=AsyncMock): + await qs.create() + + span = _get_single_span(exporter, "ydb.CreateSession") + assert span.kind == SpanKind.CLIENT + attrs = dict(span.attributes) + assert attrs["db.system.name"] == "ydb" + assert attrs["db.namespace"] == "/test_database" + assert attrs["server.address"] == "test_endpoint" + assert attrs["server.port"] == 1337 + + +class TestAsyncExecuteQuerySpan: + @pytest.mark.asyncio + async def test_session_execute_emits_span(self, otel_setup): + exporter = otel_setup + + from ydb.aio.query.session import QuerySession + + qs = QuerySession.__new__(QuerySession) + cfg = FakeDriverConfig() + driver = MagicMock() + driver._driver_config = cfg + qs._driver = driver + qs._session_id = "test-session-id" + qs._node_id = 12345 + qs._closed = False + + fake_stream = _empty_async_iter() + with patch.object(QuerySession, "_execute_call", new_callable=AsyncMock, return_value=fake_stream): + result = await qs.execute("SELECT 1;") + async for _ in result: + pass + + span = _get_single_span(exporter, "ydb.ExecuteQuery") + attrs = dict(span.attributes) + assert attrs["ydb.session.id"] == "test-session-id" + assert attrs["ydb.node.id"] == 12345 + + @pytest.mark.asyncio + async def test_tx_execute_emits_span_with_tx_id(self, otel_setup): + exporter = otel_setup + session, driver = _make_async_session_mock() + tx = _make_async_tx(session, driver) + + fake_stream = _empty_async_iter() + with patch.object(type(tx), "_execute_call", new_callable=AsyncMock, return_value=fake_stream): + tx._prev_stream = None + result = await tx.execute("SELECT 1;") + async for _ in result: + pass + + span = _get_single_span(exporter, "ydb.ExecuteQuery") + attrs = dict(span.attributes) + assert attrs["ydb.tx.id"] == "test-tx-id" + assert attrs["ydb.session.id"] == "test-session-id" + assert attrs["ydb.node.id"] == 12345 + + +class TestAsyncCommitSpan: + 
@pytest.mark.asyncio + async def test_commit_emits_span(self, otel_setup): + exporter = otel_setup + session, driver = _make_async_session_mock() + tx = _make_async_tx(session, driver) + + with patch.object(type(tx), "_commit_call", new_callable=AsyncMock): + await tx.commit() + + span = _get_single_span(exporter, "ydb.Commit") + assert span.kind == SpanKind.CLIENT + attrs = dict(span.attributes) + assert attrs["ydb.tx.id"] == "test-tx-id" + assert attrs["ydb.session.id"] == "test-session-id" + + +class TestAsyncRollbackSpan: + @pytest.mark.asyncio + async def test_rollback_emits_span(self, otel_setup): + exporter = otel_setup + session, driver = _make_async_session_mock() + tx = _make_async_tx(session, driver) + + with patch.object(type(tx), "_rollback_call", new_callable=AsyncMock): + await tx.rollback() + + span = _get_single_span(exporter, "ydb.Rollback") + assert span.kind == SpanKind.CLIENT + attrs = dict(span.attributes) + assert attrs["ydb.tx.id"] == "test-tx-id" + assert attrs["ydb.session.id"] == "test-session-id" + + +class TestAsyncErrorHandling: + @pytest.mark.asyncio + async def test_error_sets_error_status(self, otel_setup): + exporter = otel_setup + + from ydb import issues + + class FakeStatus: + name = "SCHEME_ERROR" + + exc = issues.SchemeError("Table not found") + exc.status = FakeStatus() + + from ydb.aio.query.session import QuerySession + + qs = QuerySession.__new__(QuerySession) + cfg = FakeDriverConfig() + driver = MagicMock() + driver._driver_config = cfg + qs._driver = driver + qs._session_id = "test-session-id" + qs._node_id = 12345 + qs._closed = False + + with patch.object(QuerySession, "_execute_call", new_callable=AsyncMock, side_effect=exc): + with pytest.raises(issues.SchemeError): + await qs.execute("SELECT * FROM non_existing_table") + + span = _get_single_span(exporter, "ydb.ExecuteQuery") + assert span.status.status_code == StatusCode.ERROR + attrs = dict(span.attributes) + assert attrs["error.type"] == "SCHEME_ERROR" + assert 
len(span.events) > 0 + + +class TestAsyncConcurrentSpansIsolation: + @pytest.mark.asyncio + async def test_parallel_executes_do_not_become_parent_child(self, otel_setup): + """Two concurrent execute calls must produce sibling spans, not parent-child.""" + exporter = otel_setup + + from ydb.aio.query.session import QuerySession + + async def _slow_async_iter(): + await asyncio.sleep(0.5) + return + yield # noqa + + def _make_session(): + qs = QuerySession.__new__(QuerySession) + cfg = FakeDriverConfig() + driver = MagicMock() + driver._driver_config = cfg + qs._driver = driver + qs._session_id = "test-session-id" + qs._node_id = 1 + qs._closed = False + return qs + + async def do_execute(qs): + fake_stream = _slow_async_iter() + with patch.object(QuerySession, "_execute_call", new_callable=AsyncMock, return_value=fake_stream): + result = await qs.execute("SELECT 1") + async for _ in result: + pass + + qs1 = _make_session() + qs2 = _make_session() + await asyncio.gather(do_execute(qs1), do_execute(qs2)) + + spans = _get_spans(exporter, "ydb.ExecuteQuery") + assert len(spans) == 2 + + ids = {s.context.span_id for s in spans} + for s in spans: + if s.parent is not None: + assert s.parent.span_id not in ids, "Concurrent spans must be siblings, not parent-child" diff --git a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py new file mode 100644 index 00000000..1bc6f835 --- /dev/null +++ b/tests/tracing/test_tracing_sync.py @@ -0,0 +1,281 @@ +"""Unit tests for OpenTelemetry tracing — synchronous SDK operations. + +Uses an in-memory span exporter to verify that correct spans, attributes, +parent-child relationships, and error handling are produced by the SDK. +No real YDB connection is needed. 
+""" + +from unittest.mock import MagicMock, patch +from opentelemetry import trace +from opentelemetry.trace import StatusCode, SpanKind +from ydb.opentelemetry.tracing import _registry, create_ydb_span +from ydb.query.transaction import QueryTxStateEnum +from .conftest import FakeDriverConfig + +import pytest + +def _get_spans(exporter, name=None): + spans = exporter.get_finished_spans() + if name is not None: + spans = [s for s in spans if s.name == name] + return spans + + +def _get_single_span(exporter, name): + spans = _get_spans(exporter, name) + assert len(spans) == 1, f"Expected 1 span named '{name}', got {len(spans)}: {[s.name for s in exporter.get_finished_spans()]}" + return spans[0] + + +def _make_session_mock(driver_config=None): + """Create a mock that behaves like a sync QuerySession after create().""" + cfg = driver_config or FakeDriverConfig() + driver = MagicMock() + driver._driver_config = cfg + + session = MagicMock() + session._driver = driver + session._session_id = "test-session-id" + session._node_id = 12345 + session.session_id = "test-session-id" + session.node_id = 12345 + return session, driver + + +def _make_tx(session, driver): + """Create a real QueryTxContext wired to mocked session/driver.""" + from ydb._grpc.grpcwrapper.ydb_query_public_types import QuerySerializableReadWrite + from ydb.query.transaction import QueryTxContext + + tx = QueryTxContext(driver, session, QuerySerializableReadWrite()) + # Simulate that the transaction has been started (so commit/rollback create spans) + tx._tx_state._change_state(QueryTxStateEnum.BEGINED) + tx._tx_state.tx_id = "test-tx-id" + return tx + + +class TestCreateSessionSpan: + def test_create_session_emits_span(self, otel_setup): + exporter = otel_setup + + from ydb.query.session import QuerySession + + qs = QuerySession.__new__(QuerySession) + cfg = FakeDriverConfig() + driver = MagicMock() + driver._driver_config = cfg + qs._driver = driver + qs._session_id = None + qs._closed = False + + 
with patch.object(QuerySession, "_create_call", return_value=None): + with patch.object(QuerySession, "_attach", return_value=None): + qs.create() + + span = _get_single_span(exporter, "ydb.CreateSession") + assert span.kind == SpanKind.CLIENT + attrs = dict(span.attributes) + assert attrs["db.system.name"] == "ydb" + assert attrs["db.namespace"] == "/test_database" + assert attrs["server.address"] == "test_endpoint" + assert attrs["server.port"] == 1337 + assert span.status.status_code == StatusCode.UNSET + + +class TestExecuteQuerySpan: + def test_session_execute_emits_span(self, otel_setup): + exporter = otel_setup + + from ydb.query.session import QuerySession + + qs = QuerySession.__new__(QuerySession) + cfg = FakeDriverConfig() + driver = MagicMock() + driver._driver_config = cfg + qs._driver = driver + qs._session_id = "test-session-id" + qs._node_id = 12345 + qs._closed = False + + fake_stream = iter([]) # empty stream that raises StopIteration immediately + with patch.object(QuerySession, "_execute_call", return_value=fake_stream): + result = qs.execute("SELECT 1;") + # Consume the iterator to finish the span + list(result) + + span = _get_single_span(exporter, "ydb.ExecuteQuery") + assert span.kind == SpanKind.CLIENT + attrs = dict(span.attributes) + assert attrs["db.system.name"] == "ydb" + assert attrs["db.namespace"] == "/test_database" + assert attrs["server.address"] == "test_endpoint" + assert attrs["server.port"] == 1337 + assert attrs["ydb.session.id"] == "test-session-id" + assert attrs["ydb.node.id"] == 12345 + + def test_tx_execute_emits_span_with_tx_id(self, otel_setup): + exporter = otel_setup + session, driver = _make_session_mock() + tx = _make_tx(session, driver) + + fake_stream = iter([]) + with patch.object(type(tx), "_execute_call", return_value=fake_stream): + tx._prev_stream = None + result = tx.execute("SELECT 1;") + list(result) + + span = _get_single_span(exporter, "ydb.ExecuteQuery") + attrs = dict(span.attributes) + assert 
attrs["ydb.tx.id"] == "test-tx-id" + assert attrs["ydb.session.id"] == "test-session-id" + assert attrs["ydb.node.id"] == 12345 + + +class TestCommitSpan: + def test_commit_emits_span(self, otel_setup): + exporter = otel_setup + session, driver = _make_session_mock() + tx = _make_tx(session, driver) + + with patch.object(type(tx), "_commit_call", return_value=None): + tx.commit() + + span = _get_single_span(exporter, "ydb.Commit") + assert span.kind == SpanKind.CLIENT + attrs = dict(span.attributes) + assert attrs["db.system.name"] == "ydb" + assert attrs["ydb.tx.id"] == "test-tx-id" + assert attrs["ydb.session.id"] == "test-session-id" + assert attrs["ydb.node.id"] == 12345 + + +class TestRollbackSpan: + def test_rollback_emits_span(self, otel_setup): + exporter = otel_setup + session, driver = _make_session_mock() + tx = _make_tx(session, driver) + + with patch.object(type(tx), "_rollback_call", return_value=None): + tx.rollback() + + span = _get_single_span(exporter, "ydb.Rollback") + assert span.kind == SpanKind.CLIENT + attrs = dict(span.attributes) + assert attrs["db.system.name"] == "ydb" + assert attrs["ydb.tx.id"] == "test-tx-id" + assert attrs["ydb.session.id"] == "test-session-id" + assert attrs["ydb.node.id"] == 12345 + + +class TestErrorHandling: + def test_error_sets_error_status_and_attributes(self, otel_setup): + exporter = otel_setup + + from ydb import issues + + exc = issues.SchemeError("Table not found") + + from ydb.query.session import QuerySession + + qs = QuerySession.__new__(QuerySession) + cfg = FakeDriverConfig() + driver = MagicMock() + driver._driver_config = cfg + qs._driver = driver + qs._session_id = "test-session-id" + qs._node_id = 12345 + qs._closed = False + + with patch.object(QuerySession, "_execute_call", side_effect=exc): + with pytest.raises(issues.SchemeError): + qs.execute("SELECT * FROM non_existing_table") + + span = _get_single_span(exporter, "ydb.ExecuteQuery") + assert span.status.status_code == StatusCode.ERROR + 
attrs = dict(span.attributes) + assert attrs["error.type"] == "SCHEME_ERROR" + assert attrs["db.response.status_code"] == "SCHEME_ERROR" + assert len(span.events) > 0 + + +class TestNoSpansWhenDisabled: + def test_no_spans_without_enable_tracing(self): + """Without enable_tracing(), the registry uses noop — no spans are created.""" + + from tests.tracing.conftest import _exporter + + _registry.set_create_span(None) + _registry.set_metadata_hook(None) + _exporter.clear() + + with create_ydb_span("ydb.CreateSession", FakeDriverConfig()): + pass + + assert len(_exporter.get_finished_spans()) == 0 + + +class TestParentChildRelationship: + def test_sdk_span_is_child_of_user_span(self, otel_setup): + exporter = otel_setup + + tracer = trace.get_tracer("test.tracer") + + with tracer.start_as_current_span("user.operation") as parent_span: + with create_ydb_span("ydb.ExecuteQuery", FakeDriverConfig(), session_id="s1", node_id=1): + pass + + spans = exporter.get_finished_spans() + ydb_span = next(s for s in spans if s.name == "ydb.ExecuteQuery") + user_span = next(s for s in spans if s.name == "user.operation") + + assert ydb_span.parent is not None + assert ydb_span.parent.span_id == user_span.context.span_id + assert ydb_span.context.trace_id == user_span.context.trace_id + + +class TestTraceMetadataInjection: + def test_get_trace_metadata_returns_traceparent(self, otel_setup): + from ydb.opentelemetry.tracing import get_trace_metadata + + tracer = trace.get_tracer("test.tracer") + + with tracer.start_as_current_span("test.span"): + metadata = get_trace_metadata() + + keys = [k for k, v in metadata] + assert "traceparent" in keys + + +class TestDriverInitializeSpan: + def test_driver_initialize_emits_internal_span(self, otel_setup): + exporter = otel_setup + + cfg = FakeDriverConfig() + + with create_ydb_span("ydb.Driver.Initialize", cfg, kind="internal"): + pass + + span = _get_single_span(exporter, "ydb.Driver.Initialize") + assert span.kind == SpanKind.INTERNAL + attrs 
= dict(span.attributes) + assert attrs["db.system.name"] == "ydb" + assert attrs["db.namespace"] == "/test_database" + + +class TestCommonAttributes: + @pytest.mark.parametrize("endpoint,expected_host,expected_port", [ + ("grpc://host.example.com:2136", "grpc://host.example.com", 2136), + ("localhost:2136", "localhost", 2136), + ]) + def test_endpoint_parsing(self, otel_setup, endpoint, expected_host, expected_port): + exporter = otel_setup + cfg = FakeDriverConfig(endpoint=endpoint, database="/mydb") + + with create_ydb_span("ydb.Test", cfg): + pass + + span = _get_single_span(exporter, "ydb.Test") + attrs = dict(span.attributes) + assert attrs["server.address"] == expected_host + assert attrs["server.port"] == expected_port + assert attrs["db.namespace"] == "/mydb" diff --git a/ydb/aio/query/base.py b/ydb/aio/query/base.py index 66df3703..cbf22e98 100644 --- a/ydb/aio/query/base.py +++ b/ydb/aio/query/base.py @@ -2,9 +2,10 @@ class AsyncResponseContextIterator(_utilities.AsyncResponseIterator): - def __init__(self, it, wrapper, on_error=None): + def __init__(self, it, wrapper, on_error=None, span=None): super().__init__(it, wrapper) self._on_error = on_error + self._span = span async def __aenter__(self) -> "AsyncResponseContextIterator": return self @@ -12,12 +13,27 @@ async def __aenter__(self) -> "AsyncResponseContextIterator": async def _next(self): try: return await super()._next() + except StopAsyncIteration: + self._finish_span() + raise except Exception as e: if self._on_error: self._on_error(e) + self._finish_span(e) raise e + def _finish_span(self, exception=None): + if self._span is not None: + if exception is not None: + self._span.set_error(exception) + self._span.end() + self._span = None + + def __del__(self): + self._finish_span() + async def __aexit__(self, exc_type, exc_val, exc_tb): # To close stream on YDB it is necessary to scroll through it to the end async for _ in self: pass + self._finish_span() diff --git a/ydb/aio/query/session.py 
b/ydb/aio/query/session.py index 80a236df..bd291069 100644 --- a/ydb/aio/query/session.py +++ b/ydb/aio/query/session.py @@ -161,8 +161,11 @@ async def execute( """ self._check_session_ready_to_use() - with create_ydb_span("ydb.ExecuteQuery", self._driver._driver_config, - session_id=self._session_id, node_id=self._node_id): + span = create_ydb_span( + "ydb.ExecuteQuery", self._driver._driver_config, session_id=self._session_id, node_id=self._node_id + ) + + try: stream_it = await self._execute_call( query=query, parameters=parameters, @@ -186,7 +189,13 @@ async def execute( settings=self._settings, ), on_error=self._on_execute_stream_error, + span=span, ) + except Exception as e: + if span is not None: + span.set_error(e) + span.end() + raise async def explain( self, diff --git a/ydb/aio/query/transaction.py b/ydb/aio/query/transaction.py index 746831a4..87666984 100644 --- a/ydb/aio/query/transaction.py +++ b/ydb/aio/query/transaction.py @@ -107,10 +107,13 @@ async def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: await self._ensure_prev_stream_finished() - with create_ydb_span("ydb.Commit", self._driver._driver_config, - session_id=self.session.session_id, - node_id=self.session.node_id, - tx_id=self._tx_state.tx_id): + with create_ydb_span( + "ydb.Commit", + self._driver._driver_config, + session_id=self.session.session_id, + node_id=self.session.node_id, + tx_id=self._tx_state.tx_id, + ): try: await self._execute_callbacks_async(base.TxEvent.BEFORE_COMMIT) await self._commit_call(settings) @@ -138,10 +141,13 @@ async def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None await self._ensure_prev_stream_finished() - with create_ydb_span("ydb.Rollback", self._driver._driver_config, - session_id=self.session.session_id, - node_id=self.session.node_id, - tx_id=self._tx_state.tx_id): + with create_ydb_span( + "ydb.Rollback", + self._driver._driver_config, + session_id=self.session.session_id, + 
node_id=self.session.node_id, + tx_id=self._tx_state.tx_id, + ): try: await self._execute_callbacks_async(base.TxEvent.BEFORE_ROLLBACK) await self._rollback_call(settings) @@ -196,10 +202,15 @@ async def execute( """ await self._ensure_prev_stream_finished() - with create_ydb_span("ydb.ExecuteQuery", self._driver._driver_config, - session_id=self.session.session_id, - node_id=self.session.node_id, - tx_id=self._tx_state.tx_id): + span = create_ydb_span( + "ydb.ExecuteQuery", + self._driver._driver_config, + session_id=self.session.session_id, + node_id=self.session.node_id, + tx_id=self._tx_state.tx_id, + ) + + try: stream_it = await self._execute_call( query=query, parameters=parameters, @@ -225,5 +236,11 @@ async def execute( settings=self.session._settings, ), on_error=self.session._on_execute_stream_error, + span=span, ) return self._prev_stream + except Exception as e: + if span is not None: + span.set_error(e) + span.end() + raise diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py index c732dbf4..f9587e9e 100644 --- a/ydb/opentelemetry/__init__.py +++ b/ydb/opentelemetry/__init__.py @@ -1,6 +1,12 @@ def enable_tracing(): """Enable OpenTelemetry trace context propagation and span creation for all YDB gRPC calls.""" - from ydb.opentelemetry._plugin import _enable_tracing + try: + from ydb.opentelemetry._plugin import _enable_tracing + except ImportError: + raise ImportError( + "OpenTelemetry packages are required for tracing support. 
" + "Install them with: pip install ydb[tracing]" + ) from None _enable_tracing() diff --git a/ydb/opentelemetry/_plugin.py b/ydb/opentelemetry/_plugin.py index 828e7000..81a54186 100644 --- a/ydb/opentelemetry/_plugin.py +++ b/ydb/opentelemetry/_plugin.py @@ -1,82 +1,77 @@ -from contextlib import contextmanager +from opentelemetry import context, trace +from opentelemetry.propagate import inject +from opentelemetry.trace import StatusCode -_MIN_OTEL_VERSION = "1.0.0" +from ydb import issues +from ydb.opentelemetry.tracing import _registry _tracer = None _enabled = False - -def _check_dependencies(): - try: - from opentelemetry.version import __version__ as otel_version - except ImportError: - raise ImportError( - "OpenTelemetry packages are required for tracing support. " - "Install them with: pip install ydb[tracing]" - ) from None - - from packaging.version import Version - - if Version(otel_version) < Version(_MIN_OTEL_VERSION): - raise ImportError( - f"OpenTelemetry >= {_MIN_OTEL_VERSION} is required, " - f"but {otel_version} is installed. 
" - "Upgrade with: pip install ydb[tracing]" - ) +_KIND_MAP = { + "client": trace.SpanKind.CLIENT, + "internal": trace.SpanKind.INTERNAL, +} def _otel_metadata_hook(): """Injects W3C Trace Context (traceparent/tracestate) into gRPC metadata.""" - from opentelemetry.propagate import inject - headers = {} inject(headers) return list(headers.items()) -@contextmanager -def _otel_span(name, attributes=None, kind=None): - from opentelemetry import trace +def _set_error_on_span(span, exception): + if isinstance(exception, issues.Error) and exception.status is not None: + error_type = exception.status.name + span.set_attribute("db.response.status_code", error_type) + else: + error_type = type(exception).__qualname__ - kind_map = { - "client": trace.SpanKind.CLIENT, - "internal": trace.SpanKind.INTERNAL, - } - otel_kind = kind_map.get(kind, trace.SpanKind.CLIENT) - with _tracer.start_as_current_span( - name, - kind=otel_kind, - attributes=attributes or {}, - ) as span: - try: - yield span - except Exception as e: - _otel_set_error(span, e) - raise + span.set_attribute("error.type", error_type) + span.set_status(StatusCode.ERROR, str(exception)) + span.record_exception(exception) -def _otel_set_error(span, exception): - """Records an exception on the span and sets ERROR status.""" - if span is None: - return +class TracingSpan: + """Wrapper around an OTel span that manages context lifecycle. 
- from opentelemetry.trace import StatusCode - from ydb import issues - - attrs = {} - if isinstance(exception, issues.Error): - status_code = getattr(exception, "status", None) - if status_code is not None: - attrs["db.response.status_code"] = str(status_code) - attrs["error.type"] = status_code.name - else: - attrs["error.type"] = type(exception).__qualname__ - else: - attrs["error.type"] = type(exception).__qualname__ + Can be used as a context manager or manually + """ - span.set_attributes(attrs) - span.set_status(StatusCode.ERROR, str(exception)) - span.record_exception(exception) + def __init__(self, span, token): + self._span = span + self._token = token + + def set_error(self, exception): + _set_error_on_span(self._span, exception) + + def end(self): + self._span.end() + if self._token is not None: + context.detach(self._token) + self._token = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_val is not None: + self.set_error(exc_val) + self.end() + return False + + +def _create_span(name, attributes=None, kind=None): + # Can be used as a context manager or manually + span = _tracer.start_span( + name, + kind=_KIND_MAP.get(kind, trace.SpanKind.CLIENT), + attributes=attributes or {}, + ) + ctx = trace.set_span_in_context(span) + token = context.attach(ctx) + return TracingSpan(span, token) def _enable_tracing(): @@ -85,12 +80,7 @@ def _enable_tracing(): if _enabled: return - _check_dependencies() - - from opentelemetry import trace - from ydb.opentelemetry.tracing import _registry - _tracer = trace.get_tracer("ydb.sdk") _enabled = True _registry.set_metadata_hook(_otel_metadata_hook) - _registry.set_span_factory(_otel_span) + _registry.set_create_span(_create_span) diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index 265eff94..07a0ead7 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -1,26 +1,37 @@ -from contextlib import contextmanager +class 
_NoopSpan: + """Returned by create_ydb_span when tracing is disabled.""" + def set_error(self, exception): + pass -@contextmanager -def _noop_span(name, attributes=None, kind=None): - yield None + def end(self): + pass + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + + +_NOOP_SPAN = _NoopSpan() class OtelTracingRegistry: """Singleton registry for OpenTelemetry tracing. - Holds the span factory and metadata hook. - By default everything is no-op until :func:`enable_tracing` is called - from :mod:`ydb.opentelemetry`. + By default everything is no-op until :func:`enable_tracing` is called. """ def __init__(self): self._metadata_hook = None - self._span_factory = _noop_span + self._create_span_func = None def create_span(self, name, attributes=None, kind=None): - """Create a tracing span (context manager).""" - return self._span_factory(name, attributes, kind=kind) + """Create a span. Returns a TracingSpan or _NoopSpan.""" + if self._create_span_func is None: + return _NOOP_SPAN + return self._create_span_func(name, attributes, kind=kind) def get_trace_metadata(self): """Return tracing metadata (e.g. W3C traceparent) for gRPC calls.""" @@ -29,45 +40,21 @@ def get_trace_metadata(self): return [] def set_metadata_hook(self, hook): - """Set a hook that returns tracing metadata for gRPC calls. - - *hook* must be a callable returning a list of ``(key, value)`` tuples. - """ self._metadata_hook = hook - def set_span_factory(self, factory): - """Set a span factory for tracing SDK operations. - - *factory* must be a context-manager factory: - ``factory(name, attributes, kind) -> context manager yielding span``. 
- """ - self._span_factory = factory + def set_create_span(self, func): + self._create_span_func = func _registry = OtelTracingRegistry() - -def create_span(name, attributes=None, kind=None): - """Create a tracing span via the global registry.""" - return _registry.create_span(name, attributes, kind) - - def get_trace_metadata(): """Return tracing metadata for gRPC calls.""" return _registry.get_trace_metadata() -def create_ydb_span(name, driver_config, session_id=None, node_id=None, tx_id=None, kind=None): - """Create a span pre-filled with standard YDB attributes. - - :param name: Span name (e.g. ``"ydb.ExecuteQuery"``). - :param driver_config: :class:`ydb.DriverConfig` instance. - :param session_id: Optional session ID. - :param node_id: Optional node ID. - :param tx_id: Optional transaction ID. - :param kind: Optional span kind (``"client"`` or ``"internal"``). - """ +def _build_ydb_attrs(driver_config, session_id=None, node_id=None, tx_id=None): endpoint = getattr(driver_config, "endpoint", None) or "" host, _, port = endpoint.rpartition(":") attrs = { @@ -82,4 +69,12 @@ def create_ydb_span(name, driver_config, session_id=None, node_id=None, tx_id=No attrs["ydb.node.id"] = node_id or 0 if tx_id is not None: attrs["ydb.tx.id"] = tx_id or "" + return attrs + + +def create_ydb_span(name, driver_config, session_id=None, node_id=None, tx_id=None, kind=None): + """Create a span pre-filled with standard YDB attributes. + Can be used as a context manager or manually. 
+ """ + attrs = _build_ydb_attrs(driver_config, session_id, node_id, tx_id) return _registry.create_span(name, attributes=attrs, kind=kind) diff --git a/ydb/query/base.py b/ydb/query/base.py index e7764e1c..09b8a40b 100644 --- a/ydb/query/base.py +++ b/ydb/query/base.py @@ -73,9 +73,10 @@ class QueryResultSetFormat(enum.IntEnum): class SyncResponseContextIterator(_utilities.SyncResponseIterator): - def __init__(self, it, wrapper, on_error=None): + def __init__(self, it, wrapper, on_error=None, span=None): super().__init__(it, wrapper) self._on_error = on_error + self._span = span def __enter__(self) -> "SyncResponseContextIterator": return self @@ -83,15 +84,30 @@ def __enter__(self) -> "SyncResponseContextIterator": def _next(self): try: return super()._next() + except StopIteration: + self._finish_span() + raise except Exception as e: if self._on_error: self._on_error(e) + self._finish_span(e) raise e + def _finish_span(self, exception=None): + if self._span is not None: + if exception is not None: + self._span.set_error(exception) + self._span.end() + self._span = None + + def __del__(self): + self._finish_span() + def __exit__(self, exc_type, exc_val, exc_tb): # To close stream on YDB it is necessary to scroll through it to the end for _ in self: pass + self._finish_span() class QueryClientSettings: diff --git a/ydb/query/session.py b/ydb/query/session.py index d9f379f1..91a35dce 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -437,8 +437,11 @@ def execute( """ self._check_session_ready_to_use() - with create_ydb_span("ydb.ExecuteQuery", self._driver._driver_config, - session_id=self._session_id, node_id=self._node_id): + span = create_ydb_span( + "ydb.ExecuteQuery", self._driver._driver_config, session_id=self._session_id, node_id=self._node_id + ) + + try: stream_it = self._execute_call( query=query, parameters=parameters, @@ -462,7 +465,13 @@ def execute( settings=self._settings, ), on_error=self._on_execute_stream_error, + span=span, ) + 
except Exception as e: + if span is not None: + span.set_error(e) + span.end() + raise def explain( self, diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index 8631ba52..d2aef95c 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -554,10 +554,13 @@ def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: self._ensure_prev_stream_finished() - with create_ydb_span("ydb.Commit", self._driver._driver_config, - session_id=self.session.session_id, - node_id=self.session.node_id, - tx_id=self._tx_state.tx_id): + with create_ydb_span( + "ydb.Commit", + self._driver._driver_config, + session_id=self.session.session_id, + node_id=self.session.node_id, + tx_id=self._tx_state.tx_id, + ): try: self._execute_callbacks_sync(base.TxEvent.BEFORE_COMMIT) self._commit_call(settings) @@ -584,10 +587,13 @@ def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None: self._ensure_prev_stream_finished() - with create_ydb_span("ydb.Rollback", self._driver._driver_config, - session_id=self.session.session_id, - node_id=self.session.node_id, - tx_id=self._tx_state.tx_id): + with create_ydb_span( + "ydb.Rollback", + self._driver._driver_config, + session_id=self.session.session_id, + node_id=self.session.node_id, + tx_id=self._tx_state.tx_id, + ): try: self._execute_callbacks_sync(base.TxEvent.BEFORE_ROLLBACK) self._rollback_call(settings) @@ -643,10 +649,15 @@ def execute( """ self._ensure_prev_stream_finished() - with create_ydb_span("ydb.ExecuteQuery", self._driver._driver_config, - session_id=self.session.session_id, - node_id=self.session.node_id, - tx_id=self._tx_state.tx_id): + span = create_ydb_span( + "ydb.ExecuteQuery", + self._driver._driver_config, + session_id=self.session.session_id, + node_id=self.session.node_id, + tx_id=self._tx_state.tx_id, + ) + + try: stream_it = self._execute_call( query=query, commit_tx=commit_tx, @@ -672,5 +683,11 @@ def execute( settings=self.session._settings, ), 
on_error=self.session._on_execute_stream_error, + span=span, ) return self._prev_stream + except Exception as e: + if span is not None: + span.set_error(e) + span.end() + raise From acdc32f4904169caee0c5b4de20c1ac6e9b62519 Mon Sep 17 00:00:00 2001 From: tewbo Date: Tue, 24 Mar 2026 07:58:54 +0300 Subject: [PATCH 04/36] * format --- examples/opentelemetry/example.py | 1 + tests/tracing/conftest.py | 1 - tests/tracing/test_tracing_async.py | 5 ++++- tests/tracing/test_tracing_sync.py | 16 +++++++++++----- ydb/opentelemetry/__init__.py | 3 +-- ydb/query/base.py | 1 - ydb/query/session.py | 24 ++++++++---------------- ydb/query/transaction.py | 24 ++++++++---------------- 8 files changed, 33 insertions(+), 42 deletions(-) diff --git a/examples/opentelemetry/example.py b/examples/opentelemetry/example.py index fad3111a..55be4257 100644 --- a/examples/opentelemetry/example.py +++ b/examples/opentelemetry/example.py @@ -59,6 +59,7 @@ async def tx_callee(session): await pool.retry_operation_async(tx_callee) + sync_example() asyncio.run(async_example()) diff --git a/tests/tracing/conftest.py b/tests/tracing/conftest.py index b54ffd05..94f653b8 100644 --- a/tests/tracing/conftest.py +++ b/tests/tracing/conftest.py @@ -13,7 +13,6 @@ from ydb.opentelemetry.tracing import _registry - _provider = TracerProvider() _exporter = InMemorySpanExporter() _provider.add_span_processor(SimpleSpanProcessor(_exporter)) diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py index 08bcddba..4b059f2c 100644 --- a/tests/tracing/test_tracing_async.py +++ b/tests/tracing/test_tracing_async.py @@ -11,6 +11,7 @@ import asyncio import pytest + async def _empty_async_iter(): return yield # noqa: makes this an async generator @@ -25,7 +26,9 @@ def _get_spans(exporter, name=None): def _get_single_span(exporter, name): spans = _get_spans(exporter, name) - assert len(spans) == 1, f"Expected 1 span named '{name}', got {len(spans)}: {[s.name for s in 
exporter.get_finished_spans()]}" + assert ( + len(spans) == 1 + ), f"Expected 1 span named '{name}', got {len(spans)}: {[s.name for s in exporter.get_finished_spans()]}" return spans[0] diff --git a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py index 1bc6f835..42dfae74 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -14,6 +14,7 @@ import pytest + def _get_spans(exporter, name=None): spans = exporter.get_finished_spans() if name is not None: @@ -23,7 +24,9 @@ def _get_spans(exporter, name=None): def _get_single_span(exporter, name): spans = _get_spans(exporter, name) - assert len(spans) == 1, f"Expected 1 span named '{name}', got {len(spans)}: {[s.name for s in exporter.get_finished_spans()]}" + assert ( + len(spans) == 1 + ), f"Expected 1 span named '{name}', got {len(spans)}: {[s.name for s in exporter.get_finished_spans()]}" return spans[0] @@ -263,10 +266,13 @@ def test_driver_initialize_emits_internal_span(self, otel_setup): class TestCommonAttributes: - @pytest.mark.parametrize("endpoint,expected_host,expected_port", [ - ("grpc://host.example.com:2136", "grpc://host.example.com", 2136), - ("localhost:2136", "localhost", 2136), - ]) + @pytest.mark.parametrize( + "endpoint,expected_host,expected_port", + [ + ("grpc://host.example.com:2136", "grpc://host.example.com", 2136), + ("localhost:2136", "localhost", 2136), + ], + ) def test_endpoint_parsing(self, otel_setup, endpoint, expected_host, expected_port): exporter = otel_setup cfg = FakeDriverConfig(endpoint=endpoint, database="/mydb") diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py index f9587e9e..144e7bc4 100644 --- a/ydb/opentelemetry/__init__.py +++ b/ydb/opentelemetry/__init__.py @@ -4,8 +4,7 @@ def enable_tracing(): from ydb.opentelemetry._plugin import _enable_tracing except ImportError: raise ImportError( - "OpenTelemetry packages are required for tracing support. 
" - "Install them with: pip install ydb[tracing]" + "OpenTelemetry packages are required for tracing support. " "Install them with: pip install ydb[tracing]" ) from None _enable_tracing() diff --git a/ydb/query/base.py b/ydb/query/base.py index 09b8a40b..1aeb4f6b 100644 --- a/ydb/query/base.py +++ b/ydb/query/base.py @@ -27,7 +27,6 @@ from ydb._topic_common.common import CallFromSyncToAsync, _get_shared_event_loop from ydb._grpc.grpcwrapper.common_utils import to_thread - if typing.TYPE_CHECKING: from .transaction import BaseQueryTxContext from .session import BaseQuerySession diff --git a/ydb/query/session.py b/ydb/query/session.py index 91a35dce..f2ef9b7c 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -145,14 +145,12 @@ def _on_execute_stream_error(self, e: Exception) -> None: @overload def _create_call( self: "BaseQuerySession[SyncDriver]", settings: Optional[BaseRequestSettings] = None - ) -> "BaseQuerySession[SyncDriver]": - ... + ) -> "BaseQuerySession[SyncDriver]": ... @overload def _create_call( self: "BaseQuerySession[AsyncDriver]", settings: Optional[BaseRequestSettings] = None - ) -> Awaitable["BaseQuerySession[AsyncDriver]"]: - ... + ) -> Awaitable["BaseQuerySession[AsyncDriver]"]: ... def _create_call( self, settings: Optional[BaseRequestSettings] = None @@ -171,14 +169,12 @@ def _create_call( @overload def _delete_call( self: "BaseQuerySession[SyncDriver]", settings: Optional[BaseRequestSettings] = None - ) -> "BaseQuerySession[SyncDriver]": - ... + ) -> "BaseQuerySession[SyncDriver]": ... @overload def _delete_call( self: "BaseQuerySession[AsyncDriver]", settings: Optional[BaseRequestSettings] = None - ) -> Awaitable["BaseQuerySession[AsyncDriver]"]: - ... + ) -> Awaitable["BaseQuerySession[AsyncDriver]"]: ... 
def _delete_call( self, settings: Optional[BaseRequestSettings] = None @@ -198,14 +194,12 @@ def _delete_call( @overload def _attach_call( self: "BaseQuerySession[SyncDriver]", - ) -> GrpcStreamCall[_apis.ydb_query.SessionState]: - ... + ) -> GrpcStreamCall[_apis.ydb_query.SessionState]: ... @overload def _attach_call( self: "BaseQuerySession[AsyncDriver]", - ) -> Awaitable[GrpcStreamCall[_apis.ydb_query.SessionState]]: - ... + ) -> Awaitable[GrpcStreamCall[_apis.ydb_query.SessionState]]: ... def _attach_call( self, @@ -234,8 +228,7 @@ def _execute_call( arrow_format_settings: Optional[base.ArrowFormatSettings] = None, concurrent_result_sets: bool = False, settings: Optional[BaseRequestSettings] = None, - ) -> Iterable[_apis.ydb_query.ExecuteQueryResponsePart]: - ... + ) -> Iterable[_apis.ydb_query.ExecuteQueryResponsePart]: ... @overload def _execute_call( @@ -251,8 +244,7 @@ def _execute_call( arrow_format_settings: Optional[base.ArrowFormatSettings] = None, concurrent_result_sets: bool = False, settings: Optional[BaseRequestSettings] = None, - ) -> Awaitable[Iterable[_apis.ydb_query.ExecuteQueryResponsePart]]: - ... + ) -> Awaitable[Iterable[_apis.ydb_query.ExecuteQueryResponsePart]]: ... def _execute_call( self, diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index d2aef95c..9b4f427d 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -286,14 +286,12 @@ def _check_external_error_set(self): @overload def _begin_call( self: "BaseQueryTxContext[SyncDriver]", settings: Optional[BaseRequestSettings] - ) -> "BaseQueryTxContext[SyncDriver]": - ... + ) -> "BaseQueryTxContext[SyncDriver]": ... @overload def _begin_call( self: "BaseQueryTxContext[AsyncDriver]", settings: Optional[BaseRequestSettings] - ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: - ... + ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: ... 
def _begin_call( self, settings: Optional[BaseRequestSettings] @@ -315,14 +313,12 @@ def _begin_call( @overload def _commit_call( self: "BaseQueryTxContext[SyncDriver]", settings: Optional[BaseRequestSettings] - ) -> "BaseQueryTxContext[SyncDriver]": - ... + ) -> "BaseQueryTxContext[SyncDriver]": ... @overload def _commit_call( self: "BaseQueryTxContext[AsyncDriver]", settings: Optional[BaseRequestSettings] - ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: - ... + ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: ... def _commit_call( self, settings: Optional[BaseRequestSettings] @@ -345,14 +341,12 @@ def _commit_call( @overload def _rollback_call( self: "BaseQueryTxContext[SyncDriver]", settings: Optional[BaseRequestSettings] - ) -> "BaseQueryTxContext[SyncDriver]": - ... + ) -> "BaseQueryTxContext[SyncDriver]": ... @overload def _rollback_call( self: "BaseQueryTxContext[AsyncDriver]", settings: Optional[BaseRequestSettings] - ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: - ... + ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: ... def _rollback_call( self, settings: Optional[BaseRequestSettings] @@ -386,8 +380,7 @@ def _execute_call( arrow_format_settings: Optional[base.ArrowFormatSettings], concurrent_result_sets: Optional[bool], settings: Optional[BaseRequestSettings], - ) -> Iterable[_apis.ydb_query.ExecuteQueryResponsePart]: - ... + ) -> Iterable[_apis.ydb_query.ExecuteQueryResponsePart]: ... @overload def _execute_call( @@ -403,8 +396,7 @@ def _execute_call( arrow_format_settings: Optional[base.ArrowFormatSettings], concurrent_result_sets: Optional[bool], settings: Optional[BaseRequestSettings], - ) -> Awaitable[Iterable[_apis.ydb_query.ExecuteQueryResponsePart]]: - ... + ) -> Awaitable[Iterable[_apis.ydb_query.ExecuteQueryResponsePart]]: ... 
def _execute_call( self, From 7bf72a98cfe0b33975bff7015362c275e29e6663 Mon Sep 17 00:00:00 2001 From: tewbo Date: Tue, 24 Mar 2026 08:29:01 +0300 Subject: [PATCH 05/36] * add otel to test requirements --- pyproject.toml | 1 + test-requirements.txt | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 41e7ef6f..0b08f0b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ module = [ "requests.*", "ydb.public.api.*", "contrib.ydb.public.api.*", + "opentelemetry.*", ] ignore_missing_imports = true diff --git a/test-requirements.txt b/test-requirements.txt index a5b65963..0976ce50 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -43,6 +43,8 @@ sqlalchemy==1.4.26 pylint-protobuf cython freezegun>=1.3.0 +opentelemetry-api>=1.0.0 +opentelemetry-sdk>=1.0.0 # pytest-cov yandexcloud -e . From 74cc57d2bbb77b542af73c071b188b863fdc646d Mon Sep 17 00:00:00 2001 From: tewbo Date: Tue, 24 Mar 2026 08:41:55 +0300 Subject: [PATCH 06/36] fix black checkstyle --- ydb/query/session.py | 24 ++++++++++++++++-------- ydb/query/transaction.py | 24 ++++++++++++++++-------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/ydb/query/session.py b/ydb/query/session.py index f2ef9b7c..91a35dce 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -145,12 +145,14 @@ def _on_execute_stream_error(self, e: Exception) -> None: @overload def _create_call( self: "BaseQuerySession[SyncDriver]", settings: Optional[BaseRequestSettings] = None - ) -> "BaseQuerySession[SyncDriver]": ... + ) -> "BaseQuerySession[SyncDriver]": + ... @overload def _create_call( self: "BaseQuerySession[AsyncDriver]", settings: Optional[BaseRequestSettings] = None - ) -> Awaitable["BaseQuerySession[AsyncDriver]"]: ... + ) -> Awaitable["BaseQuerySession[AsyncDriver]"]: + ... 
def _create_call( self, settings: Optional[BaseRequestSettings] = None @@ -169,12 +171,14 @@ def _create_call( @overload def _delete_call( self: "BaseQuerySession[SyncDriver]", settings: Optional[BaseRequestSettings] = None - ) -> "BaseQuerySession[SyncDriver]": ... + ) -> "BaseQuerySession[SyncDriver]": + ... @overload def _delete_call( self: "BaseQuerySession[AsyncDriver]", settings: Optional[BaseRequestSettings] = None - ) -> Awaitable["BaseQuerySession[AsyncDriver]"]: ... + ) -> Awaitable["BaseQuerySession[AsyncDriver]"]: + ... def _delete_call( self, settings: Optional[BaseRequestSettings] = None @@ -194,12 +198,14 @@ def _delete_call( @overload def _attach_call( self: "BaseQuerySession[SyncDriver]", - ) -> GrpcStreamCall[_apis.ydb_query.SessionState]: ... + ) -> GrpcStreamCall[_apis.ydb_query.SessionState]: + ... @overload def _attach_call( self: "BaseQuerySession[AsyncDriver]", - ) -> Awaitable[GrpcStreamCall[_apis.ydb_query.SessionState]]: ... + ) -> Awaitable[GrpcStreamCall[_apis.ydb_query.SessionState]]: + ... def _attach_call( self, @@ -228,7 +234,8 @@ def _execute_call( arrow_format_settings: Optional[base.ArrowFormatSettings] = None, concurrent_result_sets: bool = False, settings: Optional[BaseRequestSettings] = None, - ) -> Iterable[_apis.ydb_query.ExecuteQueryResponsePart]: ... + ) -> Iterable[_apis.ydb_query.ExecuteQueryResponsePart]: + ... @overload def _execute_call( @@ -244,7 +251,8 @@ def _execute_call( arrow_format_settings: Optional[base.ArrowFormatSettings] = None, concurrent_result_sets: bool = False, settings: Optional[BaseRequestSettings] = None, - ) -> Awaitable[Iterable[_apis.ydb_query.ExecuteQueryResponsePart]]: ... + ) -> Awaitable[Iterable[_apis.ydb_query.ExecuteQueryResponsePart]]: + ... 
def _execute_call( self, diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index 9b4f427d..d2aef95c 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -286,12 +286,14 @@ def _check_external_error_set(self): @overload def _begin_call( self: "BaseQueryTxContext[SyncDriver]", settings: Optional[BaseRequestSettings] - ) -> "BaseQueryTxContext[SyncDriver]": ... + ) -> "BaseQueryTxContext[SyncDriver]": + ... @overload def _begin_call( self: "BaseQueryTxContext[AsyncDriver]", settings: Optional[BaseRequestSettings] - ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: ... + ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: + ... def _begin_call( self, settings: Optional[BaseRequestSettings] @@ -313,12 +315,14 @@ def _begin_call( @overload def _commit_call( self: "BaseQueryTxContext[SyncDriver]", settings: Optional[BaseRequestSettings] - ) -> "BaseQueryTxContext[SyncDriver]": ... + ) -> "BaseQueryTxContext[SyncDriver]": + ... @overload def _commit_call( self: "BaseQueryTxContext[AsyncDriver]", settings: Optional[BaseRequestSettings] - ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: ... + ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: + ... def _commit_call( self, settings: Optional[BaseRequestSettings] @@ -341,12 +345,14 @@ def _commit_call( @overload def _rollback_call( self: "BaseQueryTxContext[SyncDriver]", settings: Optional[BaseRequestSettings] - ) -> "BaseQueryTxContext[SyncDriver]": ... + ) -> "BaseQueryTxContext[SyncDriver]": + ... @overload def _rollback_call( self: "BaseQueryTxContext[AsyncDriver]", settings: Optional[BaseRequestSettings] - ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: ... + ) -> Awaitable["BaseQueryTxContext[AsyncDriver]"]: + ... 
def _rollback_call( self, settings: Optional[BaseRequestSettings] @@ -380,7 +386,8 @@ def _execute_call( arrow_format_settings: Optional[base.ArrowFormatSettings], concurrent_result_sets: Optional[bool], settings: Optional[BaseRequestSettings], - ) -> Iterable[_apis.ydb_query.ExecuteQueryResponsePart]: ... + ) -> Iterable[_apis.ydb_query.ExecuteQueryResponsePart]: + ... @overload def _execute_call( @@ -396,7 +403,8 @@ def _execute_call( arrow_format_settings: Optional[base.ArrowFormatSettings], concurrent_result_sets: Optional[bool], settings: Optional[BaseRequestSettings], - ) -> Awaitable[Iterable[_apis.ydb_query.ExecuteQueryResponsePart]]: ... + ) -> Awaitable[Iterable[_apis.ydb_query.ExecuteQueryResponsePart]]: + ... def _execute_call( self, From de1d6d9c02b033e6355200ef02936f6346ed14a1 Mon Sep 17 00:00:00 2001 From: tewbo Date: Tue, 24 Mar 2026 08:45:33 +0300 Subject: [PATCH 07/36] fix flake8 checkstyle --- tests/conftest.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 9151ede0..6de23fe7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -261,7 +261,6 @@ def topic_consumer(): @pytest.fixture() -@pytest.mark.asyncio() async def topic_path(driver, topic_consumer, database) -> str: topic_path = database + "/test-topic" @@ -279,7 +278,6 @@ async def topic_path(driver, topic_consumer, database) -> str: @pytest.fixture() -@pytest.mark.asyncio() async def topic2_path(driver, topic_consumer, database) -> str: topic_path = database + "/test-topic2" @@ -297,7 +295,6 @@ async def topic2_path(driver, topic_consumer, database) -> str: @pytest.fixture() -@pytest.mark.asyncio() async def topic_with_two_partitions_path(driver, topic_consumer, database) -> str: topic_path = database + "/test-topic-two-partitions" @@ -317,7 +314,6 @@ async def topic_with_two_partitions_path(driver, topic_consumer, database) -> st @pytest.fixture() -@pytest.mark.asyncio() async def topic_with_messages(driver, topic_consumer, database): 
topic_path = database + "/test-topic-with-messages" try: @@ -348,7 +344,6 @@ async def topic_with_messages(driver, topic_consumer, database): @pytest.fixture() -@pytest.mark.asyncio() async def topic_with_messages_with_metadata(driver, topic_consumer, database): topic_path = database + "/test-topic-with-messages-with-metadata" try: @@ -373,7 +368,6 @@ async def topic_with_messages_with_metadata(driver, topic_consumer, database): @pytest.fixture() -@pytest.mark.asyncio() async def topic_reader(driver, topic_consumer, topic_path) -> ydb.TopicReaderAsyncIO: reader = driver.topic_client.reader(topic=topic_path, consumer=topic_consumer) yield reader From 3dda4176f7fe335fa36a718d4d24eda2d07c7590 Mon Sep 17 00:00:00 2001 From: tewbo Date: Tue, 24 Mar 2026 09:37:08 +0300 Subject: [PATCH 08/36] make property from driver config --- tests/conftest.py | 6 ++++++ tests/tracing/test_tracing_sync.py | 2 +- ydb/aio/query/session.py | 4 ++-- ydb/aio/query/transaction.py | 6 +++--- ydb/query/session.py | 10 +++++++--- ydb/query/transaction.py | 10 +++++++--- 6 files changed, 26 insertions(+), 12 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6de23fe7..9151ede0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -261,6 +261,7 @@ def topic_consumer(): @pytest.fixture() +@pytest.mark.asyncio() async def topic_path(driver, topic_consumer, database) -> str: topic_path = database + "/test-topic" @@ -278,6 +279,7 @@ async def topic_path(driver, topic_consumer, database) -> str: @pytest.fixture() +@pytest.mark.asyncio() async def topic2_path(driver, topic_consumer, database) -> str: topic_path = database + "/test-topic2" @@ -295,6 +297,7 @@ async def topic2_path(driver, topic_consumer, database) -> str: @pytest.fixture() +@pytest.mark.asyncio() async def topic_with_two_partitions_path(driver, topic_consumer, database) -> str: topic_path = database + "/test-topic-two-partitions" @@ -314,6 +317,7 @@ async def topic_with_two_partitions_path(driver, topic_consumer, 
database) -> st @pytest.fixture() +@pytest.mark.asyncio() async def topic_with_messages(driver, topic_consumer, database): topic_path = database + "/test-topic-with-messages" try: @@ -344,6 +348,7 @@ async def topic_with_messages(driver, topic_consumer, database): @pytest.fixture() +@pytest.mark.asyncio() async def topic_with_messages_with_metadata(driver, topic_consumer, database): topic_path = database + "/test-topic-with-messages-with-metadata" try: @@ -368,6 +373,7 @@ async def topic_with_messages_with_metadata(driver, topic_consumer, database): @pytest.fixture() +@pytest.mark.asyncio() async def topic_reader(driver, topic_consumer, topic_path) -> ydb.TopicReaderAsyncIO: reader = driver.topic_client.reader(topic=topic_path, consumer=topic_consumer) yield reader diff --git a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py index 42dfae74..03c020de 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -223,7 +223,7 @@ def test_sdk_span_is_child_of_user_span(self, otel_setup): tracer = trace.get_tracer("test.tracer") - with tracer.start_as_current_span("user.operation") as parent_span: + with tracer.start_as_current_span("user.operation"): with create_ydb_span("ydb.ExecuteQuery", FakeDriverConfig(), session_id="s1", node_id=1): pass diff --git a/ydb/aio/query/session.py b/ydb/aio/query/session.py index bd291069..4c8c1c99 100644 --- a/ydb/aio/query/session.py +++ b/ydb/aio/query/session.py @@ -106,7 +106,7 @@ async def create(self, settings: Optional[BaseRequestSettings] = None) -> "Query if self._closed: raise RuntimeError("Session is already closed") - with create_ydb_span("ydb.CreateSession", self._driver._driver_config): + with create_ydb_span("ydb.CreateSession", self._driver_config): await self._create_call(settings=settings) await self._attach() @@ -162,7 +162,7 @@ async def execute( self._check_session_ready_to_use() span = create_ydb_span( - "ydb.ExecuteQuery", self._driver._driver_config, 
session_id=self._session_id, node_id=self._node_id + "ydb.ExecuteQuery", self._driver_config, session_id=self._session_id, node_id=self._node_id ) try: diff --git a/ydb/aio/query/transaction.py b/ydb/aio/query/transaction.py index 87666984..cd764067 100644 --- a/ydb/aio/query/transaction.py +++ b/ydb/aio/query/transaction.py @@ -109,7 +109,7 @@ async def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: with create_ydb_span( "ydb.Commit", - self._driver._driver_config, + self._driver_config, session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, @@ -143,7 +143,7 @@ async def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None with create_ydb_span( "ydb.Rollback", - self._driver._driver_config, + self._driver_config, session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, @@ -204,7 +204,7 @@ async def execute( span = create_ydb_span( "ydb.ExecuteQuery", - self._driver._driver_config, + self._driver_config, session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, diff --git a/ydb/query/session.py b/ydb/query/session.py index 91a35dce..3b546f76 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -31,7 +31,7 @@ from .._constants import DEFAULT_INITIAL_RESPONSE_TIMEOUT, DEFAULT_LONG_STREAM_TIMEOUT if TYPE_CHECKING: - from ..driver import Driver as SyncDriver + from ..driver import Driver as SyncDriver, DriverConfig from ..aio.driver import Driver as AsyncDriver @@ -85,6 +85,10 @@ def __init__(self, driver: DriverT, settings: Optional[base.QueryClientSettings] self._last_query_stats = None + @property + def _driver_config(self) -> Optional["DriverConfig"]: + return getattr(self._driver, "_driver_config", None) + @property def session_id(self) -> Optional[str]: return self._session_id @@ -369,7 +373,7 @@ def create(self, settings: Optional[BaseRequestSettings] = None) -> "QuerySessio if self._closed: raise 
RuntimeError("Session is already closed.") - with create_ydb_span("ydb.CreateSession", self._driver._driver_config): + with create_ydb_span("ydb.CreateSession", self._driver_config): self._create_call(settings=settings) self._attach() @@ -438,7 +442,7 @@ def execute( self._check_session_ready_to_use() span = create_ydb_span( - "ydb.ExecuteQuery", self._driver._driver_config, session_id=self._session_id, node_id=self._node_id + "ydb.ExecuteQuery", self._driver_config, session_id=self._session_id, node_id=self._node_id ) try: diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index d2aef95c..f96b7788 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -245,6 +245,10 @@ def __init__(self, driver: DriverT, session: "BaseQuerySession", tx_mode: base.B self._external_error = None self._last_query_stats = None + @property + def _driver_config(self): + return getattr(self._driver, "_driver_config", None) + @property def session_id(self) -> Optional[str]: """ @@ -556,7 +560,7 @@ def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: with create_ydb_span( "ydb.Commit", - self._driver._driver_config, + self._driver_config, session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, @@ -589,7 +593,7 @@ def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None: with create_ydb_span( "ydb.Rollback", - self._driver._driver_config, + self._driver_config, session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, @@ -651,7 +655,7 @@ def execute( span = create_ydb_span( "ydb.ExecuteQuery", - self._driver._driver_config, + self._driver_config, session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, From b574b7759adfebfb278bfeacc6a90f992961a686 Mon Sep 17 00:00:00 2001 From: tewbo Date: Thu, 9 Apr 2026 20:26:12 +0300 Subject: [PATCH 09/36] add docs and fix pr review comments --- docs/index.rst | 17 ++- 
docs/opentelemetry.rst | 225 ++++++++++++++++++++++++++++++ examples/opentelemetry/example.py | 3 +- setup.py | 2 +- ydb/aio/connection.py | 1 - ydb/connection.py | 1 - ydb/opentelemetry/__init__.py | 12 +- ydb/opentelemetry/_plugin.py | 19 ++- 8 files changed, 266 insertions(+), 14 deletions(-) create mode 100644 docs/opentelemetry.rst diff --git a/docs/index.rst b/docs/index.rst index 3e53104e..77efd8b3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -26,6 +26,12 @@ Python client for `YDB `_ — a fault-tolerant distributed SQ coordination scheme +.. toctree:: + :hidden: + :caption: Observability + + opentelemetry + .. toctree:: :hidden: :caption: Reference @@ -82,7 +88,7 @@ Distributed Coordination ------------------------ The :doc:`coordination` page covers distributed semaphores and leader election. If you -need to limit concurrent access to a shared resource across multiple processes or hosts, +need to limit concurrent access to aЗе shared resource across multiple processes or hosts, this is the service to use. Schema Management @@ -103,6 +109,15 @@ use the ``@ydb_retry`` decorator. Skipping this section is a common source of pr incidents. +Observability +------------- + +The :doc:`opentelemetry` page explains how to add distributed tracing to your +application using OpenTelemetry. One call to ``enable_tracing()`` instruments +query sessions, transactions, and connection pool operations — so you can +visualize request flow in Jaeger, Grafana, or any OpenTelemetry-compatible backend. + + API Reference ------------- diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst new file mode 100644 index 00000000..32991b17 --- /dev/null +++ b/docs/opentelemetry.rst @@ -0,0 +1,225 @@ +ПрOpenTelemetry Tracing +===================== + +The SDK provides built-in distributed tracing via `OpenTelemetry `_. 
+When enabled, key YDB operations — such as session creation, query execution, transaction +commit/rollback, and driver initialization — produce OpenTelemetry spans. Trace +context is automatically propagated to the YDB server through gRPC metadata using the +`W3C Trace Context `_ standard. + +Tracing is **zero-cost when disabled**: the SDK uses no-op stubs by default, so there is +no overhead unless you explicitly opt in. + + +Installation +------------ + +OpenTelemetry packages are not included by default. Install the SDK with the +``opentelemetry`` extra: + +.. code-block:: sh + + pip install ydb[opentelemetry] + +This pulls in ``opentelemetry-api``. You will also need ``opentelemetry-sdk`` and an +exporter for your tracing backend, for example: + +.. code-block:: sh + + # OTLP/gRPC exporter (works with Jaeger, Tempo, and others) + pip install opentelemetry-exporter-otlp-proto-grpc + + +Enabling Tracing +---------------- + +Call ``enable_tracing()`` once, **after** configuring your OpenTelemetry tracer provider +and **before** creating a ``Driver``: + +.. code-block:: python + + from opentelemetry import trace + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.resources import Resource + + import ydb + from ydb.opentelemetry import enable_tracing + + # 1. Set up OpenTelemetry + resource = Resource(attributes={"service.name": "my-service"}) + provider = TracerProvider(resource=resource) + provider.add_span_processor( + BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317")) + ) + trace.set_tracer_provider(provider) + + # 2. Enable YDB tracing + enable_tracing() + + # 3. 
Use the SDK as usual — spans are created automatically + with ydb.Driver(endpoint="grpc://localhost:2136", database="/local") as driver: + driver.wait(timeout=5) + with ydb.QuerySessionPool(driver) as pool: + pool.execute_with_retries("SELECT 1") + + provider.shutdown() + +``enable_tracing()`` accepts an optional ``tracer`` argument. If omitted, the SDK +obtains a tracer named ``"ydb.sdk"`` from the global tracer provider. + + +What Is Instrumented +-------------------- + +The following operations produce spans: + +.. list-table:: + :header-rows: 1 + :widths: 35 20 45 + + * - Span Name + - Kind + - Description + * - ``ydb.Driver.Initialize`` + - INTERNAL + - Driver wait / endpoint discovery. + * - ``ydb.CreateSession`` + - CLIENT + - Creating a new query session. + * - ``ydb.ExecuteQuery`` + - CLIENT + - Executing a query (including ``execute_with_retries``). + * - ``ydb.CommitTransaction`` + - CLIENT + - Committing an explicit transaction. + * - ``ydb.RollbackTransaction`` + - CLIENT + - Rolling back a transaction. + +All spans are nested under the currently active span, so wrapping your application +logic in a parent span produces a complete trace tree: + +.. code-block:: python + + tracer = trace.get_tracer(__name__) + + with tracer.start_as_current_span("handle-request"): + pool.execute_with_retries("SELECT 1") + # ↳ ydb.CreateSession (if a new session is needed) + # ↳ ydb.ExecuteQuery + + +Span Attributes +--------------- + +Every YDB span carries these semantic attributes: + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Attribute + - Description + * - ``db.system.name`` + - Always ``"ydb"``. + * - ``db.namespace`` + - Database path (e.g. ``"/local"``). + * - ``server.address`` + - Endpoint host. + * - ``server.port`` + - Endpoint port. + +Additional attributes are set when available: + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Attribute + - Description + * - ``ydb.session.id`` + - Session identifier. 
+ * - ``ydb.node.id`` + - YDB node that handled the request. + * - ``ydb.tx.id`` + - Transaction identifier. + +On errors, the span also records: + +- ``error.type`` — ``"ydb_error"``, ``"transport_error"``, or the Python exception class name. +- ``db.response.status_code`` — the YDB status code name (e.g. ``"SCHEME_ERROR"``). + + +Trace Context Propagation +------------------------- + +When tracing is enabled, the SDK automatically injects trace context headers into +every gRPC call to YDB using the globally configured OpenTelemetry propagator +(``opentelemetry.propagate.inject``). By default, OpenTelemetry uses the +`W3C Trace Context `_ propagator, which adds +``traceparent`` and ``tracestate`` headers. + +YDB server expects W3C Trace Context headers, so the default propagator configuration +works out of the box. This allows the server to correlate client spans with +server-side processing, enabling end-to-end trace visibility across the entire +request path. + + +Async Usage +----------- + +Tracing works identically with the async driver. Call ``enable_tracing()`` once at +startup: + +.. code-block:: python + + import asyncio + import ydb + from ydb.opentelemetry import enable_tracing + + enable_tracing() + + async def main(): + async with ydb.aio.Driver( + endpoint="grpc://localhost:2136", + database="/local", + ) as driver: + await driver.wait(timeout=5) + async with ydb.aio.QuerySessionPool(driver) as pool: + await pool.execute_with_retries("SELECT 1") + + asyncio.run(main()) + + + +Using a Custom Tracer +--------------------- + +To use a specific tracer instead of the global one: + +.. code-block:: python + + from opentelemetry import trace + + my_tracer = trace.get_tracer("my.custom.tracer") + enable_tracing(tracer=my_tracer) + + +Running the Examples +-------------------- + +The ``examples/opentelemetry/`` directory contains ready-to-run examples with a Docker +Compose setup that starts YDB, an OTLP collector, Tempo, Prometheus, and Grafana: + +.. 
code-block:: sh + + cd examples/opentelemetry + docker compose -f compose-e2e.yaml up -d + + # Run the example + python example.py + +Open `http://localhost:3000 `_ (Grafana) to explore the +collected traces via the Tempo data source. diff --git a/examples/opentelemetry/example.py b/examples/opentelemetry/example.py index 55be4257..d36397c1 100644 --- a/examples/opentelemetry/example.py +++ b/examples/opentelemetry/example.py @@ -16,9 +16,8 @@ provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317"))) trace.set_tracer_provider(provider) -enable_tracing() - tracer = trace.get_tracer(__name__) +enable_tracing(tracer) ENDPOINT = "grpc://localhost:2136" DATABASE = "/local" diff --git a/setup.py b/setup.py index 50ccd578..0f850fbf 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,6 @@ options={"bdist_wheel": {"universal": True}}, extras_require={ "yc": ["yandexcloud", ], - "tracing": ["opentelemetry-api>=1.0.0", "opentelemetry-sdk>=1.0.0"], + "opentelemetry": ["opentelemetry-api>=1.0.0"], } ) diff --git a/ydb/aio/connection.py b/ydb/aio/connection.py index a0278c41..a3cf2ffc 100644 --- a/ydb/aio/connection.py +++ b/ydb/aio/connection.py @@ -161,7 +161,6 @@ def __init__( driver_config: Optional[DriverConfig] = None, endpoint_options: Optional[EndpointOptions] = None, ) -> None: - global _stubs_list self.endpoint = endpoint self.endpoint_key = EndpointKey(self.endpoint, getattr(endpoint_options, "node_id", None)) self.node_id = getattr(endpoint_options, "node_id", None) diff --git a/ydb/connection.py b/ydb/connection.py index d9bb6178..d1bcfdf5 100644 --- a/ydb/connection.py +++ b/ydb/connection.py @@ -423,7 +423,6 @@ def __init__( discovered by the YDB endpoint discovery mechanism :param driver_config: A driver config instance to be used for RPC call interception """ - global _stubs_list self.endpoint = endpoint self.node_id = getattr(endpoint_options, "node_id", None) self.endpoint_key = EndpointKey(endpoint, 
getattr(endpoint_options, "node_id", None)) diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py index 144e7bc4..45a94047 100644 --- a/ydb/opentelemetry/__init__.py +++ b/ydb/opentelemetry/__init__.py @@ -1,13 +1,17 @@ -def enable_tracing(): - """Enable OpenTelemetry trace context propagation and span creation for all YDB gRPC calls.""" +def enable_tracing(tracer=None): + """Enable OpenTelemetry trace context propagation and span creation for all YDB gRPC calls. + + Args: + tracer: Optional OTel tracer to use. If not provided, the default tracer from the global tracer provider will be used. + """ try: from ydb.opentelemetry._plugin import _enable_tracing except ImportError: raise ImportError( - "OpenTelemetry packages are required for tracing support. " "Install them with: pip install ydb[tracing]" + "OpenTelemetry packages are required for tracing support. " "Install them with: pip install ydb[opentelemetry]" ) from None - _enable_tracing() + _enable_tracing(tracer) __all__ = ["enable_tracing"] diff --git a/ydb/opentelemetry/_plugin.py b/ydb/opentelemetry/_plugin.py index 81a54186..e77b532b 100644 --- a/ydb/opentelemetry/_plugin.py +++ b/ydb/opentelemetry/_plugin.py @@ -3,8 +3,19 @@ from opentelemetry.trace import StatusCode from ydb import issues +from ydb.issues import StatusCode as YdbStatusCode from ydb.opentelemetry.tracing import _registry +_TRANSPORT_STATUSES = frozenset( + { + YdbStatusCode.CONNECTION_LOST, + YdbStatusCode.CONNECTION_FAILURE, + YdbStatusCode.DEADLINE_EXCEEDED, + YdbStatusCode.CLIENT_INTERNAL_ERROR, + YdbStatusCode.UNIMPLEMENTED, + } +) + _tracer = None _enabled = False @@ -23,8 +34,8 @@ def _otel_metadata_hook(): def _set_error_on_span(span, exception): if isinstance(exception, issues.Error) and exception.status is not None: - error_type = exception.status.name - span.set_attribute("db.response.status_code", error_type) + span.set_attribute("db.response.status_code", exception.status.name) + error_type = 
"transport_error" if exception.status in _TRANSPORT_STATUSES else "ydb_error" else: error_type = type(exception).__qualname__ @@ -74,13 +85,13 @@ def _create_span(name, attributes=None, kind=None): return TracingSpan(span, token) -def _enable_tracing(): +def _enable_tracing(tracer=None): global _enabled, _tracer if _enabled: return - _tracer = trace.get_tracer("ydb.sdk") + _tracer = tracer if tracer is not None else trace.get_tracer("ydb.sdk") _enabled = True _registry.set_metadata_hook(_otel_metadata_hook) _registry.set_create_span(_create_span) From 7af5e2c3e2752b268e9d7f81d26c14dc1955ff01 Mon Sep 17 00:00:00 2001 From: tewbo Date: Thu, 9 Apr 2026 20:46:06 +0300 Subject: [PATCH 10/36] fix checkstyle and tests --- tests/tracing/test_tracing_async.py | 9 +++------ tests/tracing/test_tracing_sync.py | 2 +- ydb/opentelemetry/__init__.py | 3 ++- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py index 4b059f2c..e3744341 100644 --- a/tests/tracing/test_tracing_async.py +++ b/tests/tracing/test_tracing_async.py @@ -169,16 +169,12 @@ async def test_rollback_emits_span(self, otel_setup): class TestAsyncErrorHandling: @pytest.mark.asyncio - async def test_error_sets_error_status(self, otel_setup): + async def test_error_sets_error_status_and_attributes(self, otel_setup): exporter = otel_setup from ydb import issues - class FakeStatus: - name = "SCHEME_ERROR" - exc = issues.SchemeError("Table not found") - exc.status = FakeStatus() from ydb.aio.query.session import QuerySession @@ -198,7 +194,8 @@ class FakeStatus: span = _get_single_span(exporter, "ydb.ExecuteQuery") assert span.status.status_code == StatusCode.ERROR attrs = dict(span.attributes) - assert attrs["error.type"] == "SCHEME_ERROR" + assert attrs["error.type"] == "ydb_error" + assert attrs["db.response.status_code"] == "SCHEME_ERROR" assert len(span.events) > 0 diff --git a/tests/tracing/test_tracing_sync.py 
b/tests/tracing/test_tracing_sync.py index 03c020de..f149f0c8 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -196,7 +196,7 @@ def test_error_sets_error_status_and_attributes(self, otel_setup): span = _get_single_span(exporter, "ydb.ExecuteQuery") assert span.status.status_code == StatusCode.ERROR attrs = dict(span.attributes) - assert attrs["error.type"] == "SCHEME_ERROR" + assert attrs["error.type"] == "ydb_error" assert attrs["db.response.status_code"] == "SCHEME_ERROR" assert len(span.events) > 0 diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py index 45a94047..1ea6d6c8 100644 --- a/ydb/opentelemetry/__init__.py +++ b/ydb/opentelemetry/__init__.py @@ -8,7 +8,8 @@ def enable_tracing(tracer=None): from ydb.opentelemetry._plugin import _enable_tracing except ImportError: raise ImportError( - "OpenTelemetry packages are required for tracing support. " "Install them with: pip install ydb[opentelemetry]" + "OpenTelemetry packages are required for tracing support. " + "Install them with: pip install ydb[opentelemetry]" ) from None _enable_tracing(tracer) From 3e6b95e61865c967a857fed89965d77656f86075 Mon Sep 17 00:00:00 2001 From: tewbo Date: Thu, 9 Apr 2026 21:04:14 +0300 Subject: [PATCH 11/36] ci: retry failed workflow From 350b3b6c1f3c8e37ca187e8e298192d61a7e5278 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Mon, 20 Apr 2026 14:38:05 +0300 Subject: [PATCH 12/36] feat(opentelemetry): retry-policy spans and per-node peer attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Query-service retries now emit an umbrella INTERNAL ydb.RunWithRetry span and a ydb.Try INTERNAL span per attempt. Each ydb.Try carries the ydb.retry.backoff_ms attribute (the sleep preceding the attempt — 0 for the first one, i.e. the next-attempt timeline includes the backoff). 
Retriable exceptions are recorded on the owning ydb.Try span, and an exception that escapes the whole retry loop (including an asyncio.CancelledError hitting a backoff sleep) is recorded on the outer ydb.RunWithRetry span. CLIENT spans (ydb.CreateSession, ydb.ExecuteQuery, ydb.Commit, ydb.Rollback) now also emit network.peer.address / network.peer.port for the concrete node serving the session, while server.address / server.port keep meaning the host from the connection string. Also fixes a "Пр" typo in docs/opentelemetry.rst and corrects span names (ydb.CommitTransaction -> ydb.Commit, ydb.RollbackTransaction -> ydb.Rollback). Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/opentelemetry.rst | 22 ++++-- tests/tracing/test_tracing_async.py | 68 +++++++++++++++++- tests/tracing/test_tracing_sync.py | 108 +++++++++++++++++++++++++++- ydb/aio/query/pool.py | 6 +- ydb/aio/query/session.py | 11 ++- ydb/aio/query/transaction.py | 3 + ydb/opentelemetry/_plugin.py | 3 + ydb/opentelemetry/tracing.py | 31 ++++++-- ydb/query/_retries.py | 97 +++++++++++++++++++++++++ ydb/query/pool.py | 6 +- ydb/query/session.py | 13 +++- ydb/query/transaction.py | 3 + ydb/retries.py | 5 +- 13 files changed, 347 insertions(+), 29 deletions(-) create mode 100644 ydb/query/_retries.py diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index 32991b17..ae7eaadc 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -1,4 +1,4 @@ -ПрOpenTelemetry Tracing +OpenTelemetry Tracing ===================== The SDK provides built-in distributed tracing via `OpenTelemetry `_. @@ -91,12 +91,18 @@ The following operations produce spans: * - ``ydb.ExecuteQuery`` - CLIENT - Executing a query (including ``execute_with_retries``). - * - ``ydb.CommitTransaction`` + * - ``ydb.Commit`` - CLIENT - Committing an explicit transaction. - * - ``ydb.RollbackTransaction`` + * - ``ydb.Rollback`` - CLIENT - Rolling back a transaction. 
+ * - ``ydb.RunWithRetry`` + - INTERNAL + - Umbrella span wrapping the whole retryable block (``retry_operation_*`` / ``retry_tx_*`` / ``execute_with_retries``). + * - ``ydb.Try`` + - INTERNAL + - A single retry attempt. Carries ``ydb.retry.backoff_ms`` — how long the retrier slept before starting this attempt (``0`` for the first one). All spans are nested under the currently active span, so wrapping your application logic in a parent span produces a complete trace tree: @@ -114,7 +120,7 @@ logic in a parent span produces a complete trace tree: Span Attributes --------------- -Every YDB span carries these semantic attributes: +Every YDB RPC (CLIENT-kind) span carries these semantic attributes: .. list-table:: :header-rows: 1 @@ -127,9 +133,13 @@ Every YDB span carries these semantic attributes: * - ``db.namespace`` - Database path (e.g. ``"/local"``). * - ``server.address`` - - Endpoint host. + - Host from the connection string. * - ``server.port`` - - Endpoint port. + - Port from the connection string. + * - ``network.peer.address`` + - Actual node host used for this call (set once the session is attached to a node). + * - ``network.peer.port`` + - Actual node port used for this call. 
Additional attributes are set when available: diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py index e3744341..5f3323ea 100644 --- a/tests/tracing/test_tracing_async.py +++ b/tests/tracing/test_tracing_async.py @@ -32,7 +32,7 @@ def _get_single_span(exporter, name): return spans[0] -def _make_async_session_mock(driver_config=None): +def _make_async_session_mock(driver_config=None, peer_endpoint=None): """Create a mock that behaves like an async QuerySession after create().""" cfg = driver_config or FakeDriverConfig() driver = MagicMock() @@ -42,6 +42,7 @@ def _make_async_session_mock(driver_config=None): session._driver = driver session._session_id = "test-session-id" session._node_id = 12345 + session._peer_endpoint = peer_endpoint session.session_id = "test-session-id" session.node_id = 12345 return session, driver @@ -199,6 +200,71 @@ async def test_error_sets_error_status_and_attributes(self, otel_setup): assert len(span.events) > 0 +class TestAsyncRetryPolicySpans: + @pytest.mark.asyncio + async def test_success_emits_single_try(self, otel_setup): + from ydb.query._retries import retry_operation_async + + exporter = otel_setup + + async def callee(): + return 7 + + assert await retry_operation_async(callee) == 7 + + run = _get_single_span(exporter, "ydb.RunWithRetry") + assert run.kind == SpanKind.INTERNAL + + tries = _get_spans(exporter, "ydb.Try") + assert len(tries) == 1 + assert tries[0].parent.span_id == run.context.span_id + assert dict(tries[0].attributes)["ydb.retry.backoff_ms"] == 0 + + @pytest.mark.asyncio + async def test_context_cancel_during_backoff_records_exception(self, otel_setup): + """Backoff sleep is the timeline of the next Try; a cancel hitting it + must be recorded on that Try span and propagate out through RunWithRetry. 
+ """ + from ydb import issues + from ydb.query._retries import retry_operation_async + from ydb.retries import BackoffSettings, RetrySettings + + exporter = otel_setup + calls = {"n": 0} + + async def flaky(): + calls["n"] += 1 + raise issues.Unavailable("transient") + + retry_settings = RetrySettings( + max_retries=10, + fast_backoff_settings=BackoffSettings(ceiling=0, slot_duration=10.0), + slow_backoff_settings=BackoffSettings(ceiling=0, slot_duration=10.0), + ) + + task = asyncio.ensure_future(retry_operation_async(flaky, retry_settings)) + # Let the first attempt fail and the backoff sleep start. + for _ in range(10): + await asyncio.sleep(0.01) + if calls["n"] >= 1: + break + task.cancel() + with pytest.raises(asyncio.CancelledError): + await task + + run = _get_single_span(exporter, "ydb.RunWithRetry") + assert run.status.status_code == StatusCode.ERROR + + tries = _get_spans(exporter, "ydb.Try") + assert len(tries) >= 2 + # Try span that carried the cancelled backoff must be errored. 
+ backoff_try = tries[-1] + assert backoff_try.status.status_code == StatusCode.ERROR + assert dict(backoff_try.attributes)["ydb.retry.backoff_ms"] > 0 + error_types = {dict(s.attributes).get("error.type") for s in tries} + assert "CancelledError" in error_types + + class TestAsyncConcurrentSpansIsolation: @pytest.mark.asyncio async def test_parallel_executes_do_not_become_parent_child(self, otel_setup): diff --git a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py index f149f0c8..ceb5e2d2 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -30,7 +30,7 @@ def _get_single_span(exporter, name): return spans[0] -def _make_session_mock(driver_config=None): +def _make_session_mock(driver_config=None, peer_endpoint=None): """Create a mock that behaves like a sync QuerySession after create().""" cfg = driver_config or FakeDriverConfig() driver = MagicMock() @@ -40,6 +40,7 @@ def _make_session_mock(driver_config=None): session._driver = driver session._session_id = "test-session-id" session._node_id = 12345 + session._peer_endpoint = peer_endpoint session.session_id = "test-session-id" session.node_id = 12345 return session, driver @@ -98,6 +99,7 @@ def test_session_execute_emits_span(self, otel_setup): qs._driver = driver qs._session_id = "test-session-id" qs._node_id = 12345 + qs._peer_endpoint = "node-7.cluster:2136" qs._closed = False fake_stream = iter([]) # empty stream that raises StopIteration immediately @@ -113,6 +115,8 @@ def test_session_execute_emits_span(self, otel_setup): assert attrs["db.namespace"] == "/test_database" assert attrs["server.address"] == "test_endpoint" assert attrs["server.port"] == 1337 + assert attrs["network.peer.address"] == "node-7.cluster" + assert attrs["network.peer.port"] == 2136 assert attrs["ydb.session.id"] == "test-session-id" assert attrs["ydb.node.id"] == 12345 @@ -285,3 +289,105 @@ def test_endpoint_parsing(self, otel_setup, endpoint, expected_host, expected_po 
assert attrs["server.address"] == expected_host assert attrs["server.port"] == expected_port assert attrs["db.namespace"] == "/mydb" + + def test_peer_attributes_are_optional(self, otel_setup): + exporter = otel_setup + cfg = FakeDriverConfig() + + with create_ydb_span("ydb.Test", cfg): + pass + + span = _get_single_span(exporter, "ydb.Test") + attrs = dict(span.attributes) + assert "network.peer.address" not in attrs + assert "network.peer.port" not in attrs + + def test_peer_attributes_emitted_when_known(self, otel_setup): + exporter = otel_setup + cfg = FakeDriverConfig() + + with create_ydb_span("ydb.Test", cfg, peer_endpoint="peer.example.com:2137"): + pass + + span = _get_single_span(exporter, "ydb.Test") + attrs = dict(span.attributes) + assert attrs["network.peer.address"] == "peer.example.com" + assert attrs["network.peer.port"] == 2137 + + +class TestRetryPolicySpans: + def test_success_on_first_try_emits_single_try(self, otel_setup): + from ydb.query._retries import retry_operation_sync + + exporter = otel_setup + + def callee(): + return 42 + + assert retry_operation_sync(callee) == 42 + + run = _get_single_span(exporter, "ydb.RunWithRetry") + assert run.kind == SpanKind.INTERNAL + assert run.status.status_code == StatusCode.UNSET + + tries = _get_spans(exporter, "ydb.Try") + assert len(tries) == 1 + assert tries[0].kind == SpanKind.INTERNAL + assert dict(tries[0].attributes)["ydb.retry.backoff_ms"] == 0 + assert tries[0].parent.span_id == run.context.span_id + + def test_retry_backoff_ms_on_each_try(self, otel_setup): + from ydb import issues + from ydb.query._retries import retry_operation_sync + from ydb.retries import RetrySettings, BackoffSettings + + exporter = otel_setup + counter = {"n": 0} + + def flaky(): + counter["n"] += 1 + if counter["n"] < 3: + raise issues.Unavailable("transient") + return "ok" + + retry_settings = RetrySettings( + max_retries=5, + fast_backoff_settings=BackoffSettings(ceiling=0, slot_duration=0.05), + 
slow_backoff_settings=BackoffSettings(ceiling=0, slot_duration=0.05), + ) + + assert retry_operation_sync(flaky, retry_settings) == "ok" + + tries = _get_spans(exporter, "ydb.Try") + assert len(tries) == 3 + # first attempt has no preceding backoff, later ones have a positive one + backoff_values = [dict(s.attributes)["ydb.retry.backoff_ms"] for s in tries] + assert backoff_values[0] == 0 + assert all(v >= 0 for v in backoff_values) + assert any(v > 0 for v in backoff_values[1:]) + # failed Try spans record the exception + assert tries[0].status.status_code == StatusCode.ERROR + assert tries[1].status.status_code == StatusCode.ERROR + assert tries[2].status.status_code == StatusCode.UNSET + + def test_non_retryable_error_propagates_to_run_span(self, otel_setup): + from ydb import issues + from ydb.query._retries import retry_operation_sync + + exporter = otel_setup + + def broken(): + raise issues.SchemeError("boom") + + with pytest.raises(issues.SchemeError): + retry_operation_sync(broken) + + run = _get_single_span(exporter, "ydb.RunWithRetry") + assert run.status.status_code == StatusCode.ERROR + + tries = _get_spans(exporter, "ydb.Try") + assert len(tries) == 1 + assert tries[0].status.status_code == StatusCode.ERROR + attrs = dict(tries[0].attributes) + assert attrs["error.type"] == "ydb_error" + assert attrs["db.response.status_code"] == "SCHEME_ERROR" diff --git a/ydb/aio/query/pool.py b/ydb/aio/query/pool.py index 7561a21b..42ec16da 100644 --- a/ydb/aio/query/pool.py +++ b/ydb/aio/query/pool.py @@ -14,10 +14,8 @@ from .session import ( QuerySession, ) -from ...retries import ( - RetrySettings, - retry_operation_async, -) +from ...retries import RetrySettings +from ...query._retries import retry_operation_async from ...query.base import BaseQueryTxMode, QueryExplainResultFormat from ...query.base import QueryClientSettings from ... 
import convert diff --git a/ydb/aio/query/session.py b/ydb/aio/query/session.py index 4c8c1c99..08c5dfe1 100644 --- a/ydb/aio/query/session.py +++ b/ydb/aio/query/session.py @@ -19,7 +19,7 @@ from ...query import base from ...query.session import BaseQuerySession -from ...opentelemetry.tracing import create_ydb_span +from ...opentelemetry.tracing import create_ydb_span, set_peer_attributes from ..._constants import DEFAULT_INITIAL_RESPONSE_TIMEOUT @@ -106,8 +106,9 @@ async def create(self, settings: Optional[BaseRequestSettings] = None) -> "Query if self._closed: raise RuntimeError("Session is already closed") - with create_ydb_span("ydb.CreateSession", self._driver_config): + with create_ydb_span("ydb.CreateSession", self._driver_config) as span: await self._create_call(settings=settings) + set_peer_attributes(span, self._peer_endpoint) await self._attach() return self @@ -162,7 +163,11 @@ async def execute( self._check_session_ready_to_use() span = create_ydb_span( - "ydb.ExecuteQuery", self._driver_config, session_id=self._session_id, node_id=self._node_id + "ydb.ExecuteQuery", + self._driver_config, + session_id=self._session_id, + node_id=self._node_id, + peer_endpoint=self._peer_endpoint, ) try: diff --git a/ydb/aio/query/transaction.py b/ydb/aio/query/transaction.py index cd764067..567cf231 100644 --- a/ydb/aio/query/transaction.py +++ b/ydb/aio/query/transaction.py @@ -113,6 +113,7 @@ async def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, + peer_endpoint=getattr(self.session, "_peer_endpoint", None), ): try: await self._execute_callbacks_async(base.TxEvent.BEFORE_COMMIT) @@ -147,6 +148,7 @@ async def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, + peer_endpoint=getattr(self.session, "_peer_endpoint", None), ): try: await 
self._execute_callbacks_async(base.TxEvent.BEFORE_ROLLBACK) @@ -208,6 +210,7 @@ async def execute( session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, + peer_endpoint=getattr(self.session, "_peer_endpoint", None), ) try: diff --git a/ydb/opentelemetry/_plugin.py b/ydb/opentelemetry/_plugin.py index e77b532b..f555ea1a 100644 --- a/ydb/opentelemetry/_plugin.py +++ b/ydb/opentelemetry/_plugin.py @@ -57,6 +57,9 @@ def __init__(self, span, token): def set_error(self, exception): _set_error_on_span(self._span, exception) + def set_attribute(self, key, value): + self._span.set_attribute(key, value) + def end(self): self._span.end() if self._token is not None: diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index 07a0ead7..0ff7f568 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -4,6 +4,9 @@ class _NoopSpan: def set_error(self, exception): pass + def set_attribute(self, key, value): + pass + def end(self): pass @@ -54,15 +57,24 @@ def get_trace_metadata(): return _registry.get_trace_metadata() -def _build_ydb_attrs(driver_config, session_id=None, node_id=None, tx_id=None): - endpoint = getattr(driver_config, "endpoint", None) or "" +def _split_endpoint(endpoint): + endpoint = endpoint or "" host, _, port = endpoint.rpartition(":") + return host, int(port) if port.isdigit() else 0 + + +def _build_ydb_attrs(driver_config, session_id=None, node_id=None, tx_id=None, peer_endpoint=None): + host, port = _split_endpoint(getattr(driver_config, "endpoint", None)) attrs = { "db.system.name": "ydb", "db.namespace": getattr(driver_config, "database", None) or "", "server.address": host, - "server.port": int(port) if port.isdigit() else 0, + "server.port": port, } + if peer_endpoint is not None: + peer_host, peer_port = _split_endpoint(peer_endpoint) + attrs["network.peer.address"] = peer_host + attrs["network.peer.port"] = peer_port if session_id is not None: attrs["ydb.session.id"] = 
session_id or "" if node_id is not None: @@ -72,9 +84,18 @@ def _build_ydb_attrs(driver_config, session_id=None, node_id=None, tx_id=None): return attrs -def create_ydb_span(name, driver_config, session_id=None, node_id=None, tx_id=None, kind=None): +def create_ydb_span(name, driver_config, session_id=None, node_id=None, tx_id=None, kind=None, peer_endpoint=None): """Create a span pre-filled with standard YDB attributes. Can be used as a context manager or manually. """ - attrs = _build_ydb_attrs(driver_config, session_id, node_id, tx_id) + attrs = _build_ydb_attrs(driver_config, session_id, node_id, tx_id, peer_endpoint) return _registry.create_span(name, attributes=attrs, kind=kind) + + +def set_peer_attributes(span, peer_endpoint): + """Fill in network.peer.* attributes on an existing span once the peer is known.""" + if peer_endpoint is None: + return + peer_host, peer_port = _split_endpoint(peer_endpoint) + span.set_attribute("network.peer.address", peer_host) + span.set_attribute("network.peer.port", peer_port) diff --git a/ydb/query/_retries.py b/ydb/query/_retries.py new file mode 100644 index 00000000..b5b991b0 --- /dev/null +++ b/ydb/query/_retries.py @@ -0,0 +1,97 @@ +"""Retry wrappers that emit OpenTelemetry spans around the query-service retry loop. + +``ydb.RunWithRetry`` is the umbrella INTERNAL span, and each attempt is wrapped in +a ``ydb.Try`` INTERNAL span with the ``ydb.retry.backoff_ms`` attribute capturing +the sleep that preceded it. When the retry fails, the offending exception is +recorded on the ``ydb.Try`` span; when it propagates out, it is also recorded on +the outer ``ydb.RunWithRetry`` span via the context-manager protocol. 
+""" +import asyncio +import time +from typing import Any, Callable, Optional + +from ..opentelemetry.tracing import _registry +from ..retries import RetrySettings, YdbRetryOperationSleepOpt, retry_operation_impl + + +_RUN_WITH_RETRY = "ydb.RunWithRetry" +_TRY = "ydb.Try" +_BACKOFF_ATTR = "ydb.retry.backoff_ms" + + +def _start_try_span(backoff_ms: int): + return _registry.create_span(_TRY, attributes={_BACKOFF_ATTR: backoff_ms}, kind="internal") + + +def retry_operation_sync( + callee: Callable[..., Any], + retry_settings: Optional[RetrySettings] = None, + *args: Any, + **kwargs: Any, +) -> Any: + with _registry.create_span(_RUN_WITH_RETRY, kind="internal"): + opt_generator = retry_operation_impl(callee, retry_settings, *args, **kwargs) + try_span = _start_try_span(0) + try: + for next_opt in opt_generator: + if isinstance(next_opt, YdbRetryOperationSleepOpt): + exc = getattr(next_opt, "exception", None) + if exc is not None: + try_span.set_error(exc) + try_span.end() + try_span = None + backoff_ms = int(next_opt.timeout * 1000) + try_span = _start_try_span(backoff_ms) + time.sleep(next_opt.timeout) + else: + try_span.end() + try_span = None + return next_opt.result + except BaseException as e: + if try_span is not None: + try_span.set_error(e) + try_span.end() + try_span = None + raise + if try_span is not None: + try_span.end() + return None + + +async def retry_operation_async( + callee: Callable[..., Any], + retry_settings: Optional[RetrySettings] = None, + *args: Any, + **kwargs: Any, +) -> Any: + with _registry.create_span(_RUN_WITH_RETRY, kind="internal"): + opt_generator = retry_operation_impl(callee, retry_settings, *args, **kwargs) + try_span = _start_try_span(0) + try: + for next_opt in opt_generator: + if isinstance(next_opt, YdbRetryOperationSleepOpt): + exc = getattr(next_opt, "exception", None) + if exc is not None: + try_span.set_error(exc) + try_span.end() + try_span = None + backoff_ms = int(next_opt.timeout * 1000) + try_span = 
_start_try_span(backoff_ms) + await asyncio.sleep(next_opt.timeout) + else: + try: + result = await next_opt.result + try_span.end() + try_span = None + return result + except BaseException as e: # pylint: disable=W0703 + next_opt.set_exception(e) + except BaseException as e: + if try_span is not None: + try_span.set_error(e) + try_span.end() + try_span = None + raise + if try_span is not None: + try_span.end() + return None diff --git a/ydb/query/pool.py b/ydb/query/pool.py index af344365..f7b25d14 100644 --- a/ydb/query/pool.py +++ b/ydb/query/pool.py @@ -20,10 +20,8 @@ from .session import ( QuerySession, ) -from ..retries import ( - RetrySettings, - retry_operation_sync, -) +from ..retries import RetrySettings +from ._retries import retry_operation_sync from .. import issues from .. import convert from ..settings import BaseRequestSettings diff --git a/ydb/query/session.py b/ydb/query/session.py index 3b546f76..10cc775b 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -18,7 +18,7 @@ from .base import QueryExplainResultFormat from .. 
import _apis, issues, _utilities -from ..opentelemetry.tracing import create_ydb_span +from ..opentelemetry.tracing import create_ydb_span, set_peer_attributes from ..settings import BaseRequestSettings from ..connection import _RpcState as RpcState, EndpointKey from .._grpc.grpcwrapper import common_utils @@ -47,6 +47,7 @@ def wrapper_create_session( issues._process_response(message.status) session._session_id = message.session_id session._node_id = message.node_id + session._peer_endpoint = getattr(rpc_state, "endpoint", None) return session @@ -71,6 +72,7 @@ class BaseQuerySession(abc.ABC, Generic[DriverT]): # Session data _session_id: Optional[str] = None _node_id: Optional[int] = None + _peer_endpoint: Optional[str] = None _closed: bool = False def __init__(self, driver: DriverT, settings: Optional[base.QueryClientSettings] = None): @@ -373,8 +375,9 @@ def create(self, settings: Optional[BaseRequestSettings] = None) -> "QuerySessio if self._closed: raise RuntimeError("Session is already closed.") - with create_ydb_span("ydb.CreateSession", self._driver_config): + with create_ydb_span("ydb.CreateSession", self._driver_config) as span: self._create_call(settings=settings) + set_peer_attributes(span, self._peer_endpoint) self._attach() return self @@ -442,7 +445,11 @@ def execute( self._check_session_ready_to_use() span = create_ydb_span( - "ydb.ExecuteQuery", self._driver_config, session_id=self._session_id, node_id=self._node_id + "ydb.ExecuteQuery", + self._driver_config, + session_id=self._session_id, + node_id=self._node_id, + peer_endpoint=self._peer_endpoint, ) try: diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index f96b7788..1d2cb855 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -564,6 +564,7 @@ def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, + peer_endpoint=getattr(self.session, 
"_peer_endpoint", None), ): try: self._execute_callbacks_sync(base.TxEvent.BEFORE_COMMIT) @@ -597,6 +598,7 @@ def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None: session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, + peer_endpoint=getattr(self.session, "_peer_endpoint", None), ): try: self._execute_callbacks_sync(base.TxEvent.BEFORE_ROLLBACK) @@ -659,6 +661,7 @@ def execute( session_id=self.session.session_id, node_id=self.session.node_id, tx_id=self._tx_state.tx_id, + peer_endpoint=getattr(self.session, "_peer_endpoint", None), ) try: diff --git a/ydb/retries.py b/ydb/retries.py index c151e3d2..21110876 100644 --- a/ydb/retries.py +++ b/ydb/retries.py @@ -72,8 +72,9 @@ def with_slow_backoff(self, backoff_settings: BackoffSettings) -> "RetrySettings class YdbRetryOperationSleepOpt: - def __init__(self, timeout: float) -> None: + def __init__(self, timeout: float, exception: Optional[BaseException] = None) -> None: self.timeout = timeout + self.exception = exception def __eq__(self, other: object) -> bool: return ( @@ -142,7 +143,7 @@ def retry_operation_impl( yield_sleep = False if yield_sleep: - yield YdbRetryOperationSleepOpt(retriable_info.sleep_timeout_seconds) + yield YdbRetryOperationSleepOpt(retriable_info.sleep_timeout_seconds, exception=e) except Exception as e: # you should provide your own handler you want From 3e55d612094b8ee826df7b2c8b4347f2584d5cfe Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Mon, 20 Apr 2026 15:15:28 +0300 Subject: [PATCH 13/36] refactor(opentelemetry): inline retry spans into ydb.retries Move ydb.RunWithRetry / ydb.Try span emission directly into retry_operation_sync / retry_operation_async in ydb/retries.py, and drop the short-lived ydb.query._retries shim. Tracing is still no-op by default, so there is no cost for the table-service callers that share the same retry loop; we just stop duplicating the retry logic to add spans. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/tracing/test_tracing_async.py | 4 +- tests/tracing/test_tracing_sync.py | 6 +- ydb/aio/query/pool.py | 6 +- ydb/query/_retries.py | 97 ----------------------------- ydb/query/pool.py | 6 +- ydb/retries.py | 75 +++++++++++++++++----- 6 files changed, 73 insertions(+), 121 deletions(-) delete mode 100644 ydb/query/_retries.py diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py index 5f3323ea..0c3d48f0 100644 --- a/tests/tracing/test_tracing_async.py +++ b/tests/tracing/test_tracing_async.py @@ -203,7 +203,7 @@ async def test_error_sets_error_status_and_attributes(self, otel_setup): class TestAsyncRetryPolicySpans: @pytest.mark.asyncio async def test_success_emits_single_try(self, otel_setup): - from ydb.query._retries import retry_operation_async + from ydb.retries import retry_operation_async exporter = otel_setup @@ -226,7 +226,7 @@ async def test_context_cancel_during_backoff_records_exception(self, otel_setup) must be recorded on that Try span and propagate out through RunWithRetry. 
""" from ydb import issues - from ydb.query._retries import retry_operation_async + from ydb.retries import retry_operation_async from ydb.retries import BackoffSettings, RetrySettings exporter = otel_setup diff --git a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py index ceb5e2d2..c294ed76 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -317,7 +317,7 @@ def test_peer_attributes_emitted_when_known(self, otel_setup): class TestRetryPolicySpans: def test_success_on_first_try_emits_single_try(self, otel_setup): - from ydb.query._retries import retry_operation_sync + from ydb.retries import retry_operation_sync exporter = otel_setup @@ -338,7 +338,7 @@ def callee(): def test_retry_backoff_ms_on_each_try(self, otel_setup): from ydb import issues - from ydb.query._retries import retry_operation_sync + from ydb.retries import retry_operation_sync from ydb.retries import RetrySettings, BackoffSettings exporter = otel_setup @@ -372,7 +372,7 @@ def flaky(): def test_non_retryable_error_propagates_to_run_span(self, otel_setup): from ydb import issues - from ydb.query._retries import retry_operation_sync + from ydb.retries import retry_operation_sync exporter = otel_setup diff --git a/ydb/aio/query/pool.py b/ydb/aio/query/pool.py index 42ec16da..7561a21b 100644 --- a/ydb/aio/query/pool.py +++ b/ydb/aio/query/pool.py @@ -14,8 +14,10 @@ from .session import ( QuerySession, ) -from ...retries import RetrySettings -from ...query._retries import retry_operation_async +from ...retries import ( + RetrySettings, + retry_operation_async, +) from ...query.base import BaseQueryTxMode, QueryExplainResultFormat from ...query.base import QueryClientSettings from ... 
import convert diff --git a/ydb/query/_retries.py b/ydb/query/_retries.py deleted file mode 100644 index b5b991b0..00000000 --- a/ydb/query/_retries.py +++ /dev/null @@ -1,97 +0,0 @@ -"""Retry wrappers that emit OpenTelemetry spans around the query-service retry loop. - -``ydb.RunWithRetry`` is the umbrella INTERNAL span, and each attempt is wrapped in -a ``ydb.Try`` INTERNAL span with the ``ydb.retry.backoff_ms`` attribute capturing -the sleep that preceded it. When the retry fails, the offending exception is -recorded on the ``ydb.Try`` span; when it propagates out, it is also recorded on -the outer ``ydb.RunWithRetry`` span via the context-manager protocol. -""" -import asyncio -import time -from typing import Any, Callable, Optional - -from ..opentelemetry.tracing import _registry -from ..retries import RetrySettings, YdbRetryOperationSleepOpt, retry_operation_impl - - -_RUN_WITH_RETRY = "ydb.RunWithRetry" -_TRY = "ydb.Try" -_BACKOFF_ATTR = "ydb.retry.backoff_ms" - - -def _start_try_span(backoff_ms: int): - return _registry.create_span(_TRY, attributes={_BACKOFF_ATTR: backoff_ms}, kind="internal") - - -def retry_operation_sync( - callee: Callable[..., Any], - retry_settings: Optional[RetrySettings] = None, - *args: Any, - **kwargs: Any, -) -> Any: - with _registry.create_span(_RUN_WITH_RETRY, kind="internal"): - opt_generator = retry_operation_impl(callee, retry_settings, *args, **kwargs) - try_span = _start_try_span(0) - try: - for next_opt in opt_generator: - if isinstance(next_opt, YdbRetryOperationSleepOpt): - exc = getattr(next_opt, "exception", None) - if exc is not None: - try_span.set_error(exc) - try_span.end() - try_span = None - backoff_ms = int(next_opt.timeout * 1000) - try_span = _start_try_span(backoff_ms) - time.sleep(next_opt.timeout) - else: - try_span.end() - try_span = None - return next_opt.result - except BaseException as e: - if try_span is not None: - try_span.set_error(e) - try_span.end() - try_span = None - raise - if try_span is not 
None: - try_span.end() - return None - - -async def retry_operation_async( - callee: Callable[..., Any], - retry_settings: Optional[RetrySettings] = None, - *args: Any, - **kwargs: Any, -) -> Any: - with _registry.create_span(_RUN_WITH_RETRY, kind="internal"): - opt_generator = retry_operation_impl(callee, retry_settings, *args, **kwargs) - try_span = _start_try_span(0) - try: - for next_opt in opt_generator: - if isinstance(next_opt, YdbRetryOperationSleepOpt): - exc = getattr(next_opt, "exception", None) - if exc is not None: - try_span.set_error(exc) - try_span.end() - try_span = None - backoff_ms = int(next_opt.timeout * 1000) - try_span = _start_try_span(backoff_ms) - await asyncio.sleep(next_opt.timeout) - else: - try: - result = await next_opt.result - try_span.end() - try_span = None - return result - except BaseException as e: # pylint: disable=W0703 - next_opt.set_exception(e) - except BaseException as e: - if try_span is not None: - try_span.set_error(e) - try_span.end() - try_span = None - raise - if try_span is not None: - try_span.end() - return None diff --git a/ydb/query/pool.py b/ydb/query/pool.py index f7b25d14..af344365 100644 --- a/ydb/query/pool.py +++ b/ydb/query/pool.py @@ -20,8 +20,10 @@ from .session import ( QuerySession, ) -from ..retries import RetrySettings -from ._retries import retry_operation_sync +from ..retries import ( + RetrySettings, + retry_operation_sync, +) from .. import issues from .. import convert from ..settings import BaseRequestSettings diff --git a/ydb/retries.py b/ydb/retries.py index 21110876..bd4fc4ad 100644 --- a/ydb/retries.py +++ b/ydb/retries.py @@ -7,6 +7,16 @@ from . 
import issues from ._errors import check_retriable_error +from .opentelemetry.tracing import _registry as _tracing_registry + + +_RUN_WITH_RETRY_SPAN = "ydb.RunWithRetry" +_TRY_SPAN = "ydb.Try" +_BACKOFF_ATTR = "ydb.retry.backoff_ms" + + +def _start_try_span(backoff_ms: int): + return _tracing_registry.create_span(_TRY_SPAN, attributes={_BACKOFF_ATTR: backoff_ms}, kind="internal") class BackoffSettings: @@ -160,12 +170,29 @@ def retry_operation_sync( *args: Any, **kwargs: Any, ) -> Any: - opt_generator = retry_operation_impl(callee, retry_settings, *args, **kwargs) - for next_opt in opt_generator: - if isinstance(next_opt, YdbRetryOperationSleepOpt): - time.sleep(next_opt.timeout) - else: - return next_opt.result + with _tracing_registry.create_span(_RUN_WITH_RETRY_SPAN, kind="internal"): + opt_generator = retry_operation_impl(callee, retry_settings, *args, **kwargs) + try_span = _start_try_span(0) + try: + for next_opt in opt_generator: + if isinstance(next_opt, YdbRetryOperationSleepOpt): + exc = next_opt.exception + if exc is not None: + try_span.set_error(exc) + try_span.end() + try_span = _start_try_span(int(next_opt.timeout * 1000)) + time.sleep(next_opt.timeout) + else: + try_span.end() + try_span = None + return next_opt.result + except BaseException as e: + if try_span is not None: + try_span.set_error(e) + try_span.end() + raise + if try_span is not None: + try_span.end() return None @@ -187,15 +214,33 @@ async def retry_operation_async( # pylint: disable=W1113 Returns awaitable result of coroutine. If retries are not succussful exception is raised. 
""" - opt_generator = retry_operation_impl(callee, retry_settings, *args, **kwargs) - for next_opt in opt_generator: - if isinstance(next_opt, YdbRetryOperationSleepOpt): - await asyncio.sleep(next_opt.timeout) - else: - try: - return await next_opt.result - except BaseException as e: # pylint: disable=W0703 - next_opt.set_exception(e) + with _tracing_registry.create_span(_RUN_WITH_RETRY_SPAN, kind="internal"): + opt_generator = retry_operation_impl(callee, retry_settings, *args, **kwargs) + try_span = _start_try_span(0) + try: + for next_opt in opt_generator: + if isinstance(next_opt, YdbRetryOperationSleepOpt): + exc = next_opt.exception + if exc is not None: + try_span.set_error(exc) + try_span.end() + try_span = _start_try_span(int(next_opt.timeout * 1000)) + await asyncio.sleep(next_opt.timeout) + else: + try: + result = await next_opt.result + try_span.end() + try_span = None + return result + except BaseException as e: # pylint: disable=W0703 + next_opt.set_exception(e) + except BaseException as e: + if try_span is not None: + try_span.set_error(e) + try_span.end() + raise + if try_span is not None: + try_span.end() return None From e11b180a29909d70dc913aec5e84966df14be764 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Mon, 20 Apr 2026 15:53:07 +0300 Subject: [PATCH 14/36] refactor(opentelemetry): peer from endpoint map; add ydb.node.dc; drop session.id/tx.id RPC (CLIENT-kind) spans now carry the peer metadata from the discovery endpoint map, not from the grpc-target string of the request: * network.peer.address = EndpointInfo.address (the node host) * network.peer.port = EndpointInfo.port * ydb.node.dc = EndpointInfo.location To do that, EndpointOptions and Connection now also carry address/port/ location populated by resolver.endpoints_with_options(); sessions resolve their peer tuple via driver._store.connections_by_node_id after CreateSession returns, which is the right place to ask which node owns this session. 
Dropped the noisy ydb.session.id and ydb.tx.id attributes - they pollute every span and are recoverable from trace context if really needed. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/opentelemetry.rst | 10 ++-- tests/tracing/test_tracing_async.py | 35 +++++++++----- tests/tracing/test_tracing_sync.py | 73 ++++++++++++++++++++++------- ydb/aio/query/session.py | 5 +- ydb/aio/query/transaction.py | 12 ++--- ydb/connection.py | 13 ++++- ydb/opentelemetry/tracing.py | 41 +++++++++------- ydb/query/session.py | 29 ++++++++++-- ydb/query/transaction.py | 12 ++--- ydb/resolver.py | 6 ++- 10 files changed, 155 insertions(+), 81 deletions(-) diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index ae7eaadc..f924ec48 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -137,9 +137,11 @@ Every YDB RPC (CLIENT-kind) span carries these semantic attributes: * - ``server.port`` - Port from the connection string. * - ``network.peer.address`` - - Actual node host used for this call (set once the session is attached to a node). + - Actual node host from the discovery endpoint map (set once the session is attached to a node). * - ``network.peer.port`` - - Actual node port used for this call. + - Actual node port from the discovery endpoint map. + * - ``ydb.node.dc`` + - Data-center / location reported by discovery for the node (e.g. ``"vla"``, ``"sas"``). Additional attributes are set when available: @@ -149,12 +151,8 @@ Additional attributes are set when available: * - Attribute - Description - * - ``ydb.session.id`` - - Session identifier. * - ``ydb.node.id`` - YDB node that handled the request. - * - ``ydb.tx.id`` - - Transaction identifier. 
On errors, the span also records: diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py index 0c3d48f0..2ff574a5 100644 --- a/tests/tracing/test_tracing_async.py +++ b/tests/tracing/test_tracing_async.py @@ -32,7 +32,7 @@ def _get_single_span(exporter, name): return spans[0] -def _make_async_session_mock(driver_config=None, peer_endpoint=None): +def _make_async_session_mock(driver_config=None, peer=None): """Create a mock that behaves like an async QuerySession after create().""" cfg = driver_config or FakeDriverConfig() driver = MagicMock() @@ -42,7 +42,7 @@ def _make_async_session_mock(driver_config=None, peer_endpoint=None): session._driver = driver session._session_id = "test-session-id" session._node_id = 12345 - session._peer_endpoint = peer_endpoint + session._peer = peer session.session_id = "test-session-id" session.node_id = 12345 return session, driver @@ -101,6 +101,7 @@ async def test_session_execute_emits_span(self, otel_setup): qs._driver = driver qs._session_id = "test-session-id" qs._node_id = 12345 + qs._peer = ("n1", 2136, "dc-a") qs._closed = False fake_stream = _empty_async_iter() @@ -111,13 +112,15 @@ async def test_session_execute_emits_span(self, otel_setup): span = _get_single_span(exporter, "ydb.ExecuteQuery") attrs = dict(span.attributes) - assert attrs["ydb.session.id"] == "test-session-id" assert attrs["ydb.node.id"] == 12345 + assert attrs["network.peer.address"] == "n1" + assert attrs["ydb.node.dc"] == "dc-a" + assert "ydb.session.id" not in attrs @pytest.mark.asyncio - async def test_tx_execute_emits_span_with_tx_id(self, otel_setup): + async def test_tx_execute_emits_span(self, otel_setup): exporter = otel_setup - session, driver = _make_async_session_mock() + session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_async_tx(session, driver) fake_stream = _empty_async_iter() @@ -129,16 +132,18 @@ async def test_tx_execute_emits_span_with_tx_id(self, otel_setup): span = 
_get_single_span(exporter, "ydb.ExecuteQuery") attrs = dict(span.attributes) - assert attrs["ydb.tx.id"] == "test-tx-id" - assert attrs["ydb.session.id"] == "test-session-id" assert attrs["ydb.node.id"] == 12345 + assert attrs["network.peer.address"] == "n1" + assert attrs["ydb.node.dc"] == "dc-a" + assert "ydb.tx.id" not in attrs + assert "ydb.session.id" not in attrs class TestAsyncCommitSpan: @pytest.mark.asyncio async def test_commit_emits_span(self, otel_setup): exporter = otel_setup - session, driver = _make_async_session_mock() + session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_async_tx(session, driver) with patch.object(type(tx), "_commit_call", new_callable=AsyncMock): @@ -147,15 +152,17 @@ async def test_commit_emits_span(self, otel_setup): span = _get_single_span(exporter, "ydb.Commit") assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) - assert attrs["ydb.tx.id"] == "test-tx-id" - assert attrs["ydb.session.id"] == "test-session-id" + assert attrs["network.peer.address"] == "n1" + assert attrs["ydb.node.dc"] == "dc-a" + assert "ydb.tx.id" not in attrs + assert "ydb.session.id" not in attrs class TestAsyncRollbackSpan: @pytest.mark.asyncio async def test_rollback_emits_span(self, otel_setup): exporter = otel_setup - session, driver = _make_async_session_mock() + session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_async_tx(session, driver) with patch.object(type(tx), "_rollback_call", new_callable=AsyncMock): @@ -164,8 +171,10 @@ async def test_rollback_emits_span(self, otel_setup): span = _get_single_span(exporter, "ydb.Rollback") assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) - assert attrs["ydb.tx.id"] == "test-tx-id" - assert attrs["ydb.session.id"] == "test-session-id" + assert attrs["network.peer.address"] == "n1" + assert attrs["ydb.node.dc"] == "dc-a" + assert "ydb.tx.id" not in attrs + assert "ydb.session.id" not in attrs class TestAsyncErrorHandling: 
diff --git a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py index c294ed76..37753462 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -30,7 +30,7 @@ def _get_single_span(exporter, name): return spans[0] -def _make_session_mock(driver_config=None, peer_endpoint=None): +def _make_session_mock(driver_config=None, peer=None): """Create a mock that behaves like a sync QuerySession after create().""" cfg = driver_config or FakeDriverConfig() driver = MagicMock() @@ -40,7 +40,7 @@ def _make_session_mock(driver_config=None, peer_endpoint=None): session._driver = driver session._session_id = "test-session-id" session._node_id = 12345 - session._peer_endpoint = peer_endpoint + session._peer = peer session.session_id = "test-session-id" session.node_id = 12345 return session, driver @@ -99,7 +99,7 @@ def test_session_execute_emits_span(self, otel_setup): qs._driver = driver qs._session_id = "test-session-id" qs._node_id = 12345 - qs._peer_endpoint = "node-7.cluster:2136" + qs._peer = ("node-7.cluster", 2136, "dc-east") qs._closed = False fake_stream = iter([]) # empty stream that raises StopIteration immediately @@ -117,12 +117,14 @@ def test_session_execute_emits_span(self, otel_setup): assert attrs["server.port"] == 1337 assert attrs["network.peer.address"] == "node-7.cluster" assert attrs["network.peer.port"] == 2136 - assert attrs["ydb.session.id"] == "test-session-id" + assert attrs["ydb.node.dc"] == "dc-east" assert attrs["ydb.node.id"] == 12345 + assert "ydb.session.id" not in attrs + assert "ydb.tx.id" not in attrs - def test_tx_execute_emits_span_with_tx_id(self, otel_setup): + def test_tx_execute_emits_span(self, otel_setup): exporter = otel_setup - session, driver = _make_session_mock() + session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_tx(session, driver) fake_stream = iter([]) @@ -133,15 +135,18 @@ def test_tx_execute_emits_span_with_tx_id(self, otel_setup): span = 
_get_single_span(exporter, "ydb.ExecuteQuery") attrs = dict(span.attributes) - assert attrs["ydb.tx.id"] == "test-tx-id" - assert attrs["ydb.session.id"] == "test-session-id" assert attrs["ydb.node.id"] == 12345 + assert attrs["network.peer.address"] == "n1" + assert attrs["network.peer.port"] == 2136 + assert attrs["ydb.node.dc"] == "dc-a" + assert "ydb.session.id" not in attrs + assert "ydb.tx.id" not in attrs class TestCommitSpan: def test_commit_emits_span(self, otel_setup): exporter = otel_setup - session, driver = _make_session_mock() + session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_tx(session, driver) with patch.object(type(tx), "_commit_call", return_value=None): @@ -151,15 +156,17 @@ def test_commit_emits_span(self, otel_setup): assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) assert attrs["db.system.name"] == "ydb" - assert attrs["ydb.tx.id"] == "test-tx-id" - assert attrs["ydb.session.id"] == "test-session-id" assert attrs["ydb.node.id"] == 12345 + assert attrs["network.peer.address"] == "n1" + assert attrs["ydb.node.dc"] == "dc-a" + assert "ydb.session.id" not in attrs + assert "ydb.tx.id" not in attrs class TestRollbackSpan: def test_rollback_emits_span(self, otel_setup): exporter = otel_setup - session, driver = _make_session_mock() + session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) tx = _make_tx(session, driver) with patch.object(type(tx), "_rollback_call", return_value=None): @@ -169,9 +176,11 @@ def test_rollback_emits_span(self, otel_setup): assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) assert attrs["db.system.name"] == "ydb" - assert attrs["ydb.tx.id"] == "test-tx-id" - assert attrs["ydb.session.id"] == "test-session-id" assert attrs["ydb.node.id"] == 12345 + assert attrs["network.peer.address"] == "n1" + assert attrs["ydb.node.dc"] == "dc-a" + assert "ydb.session.id" not in attrs + assert "ydb.tx.id" not in attrs class TestErrorHandling: @@ -228,7 +237,7 @@ def 
test_sdk_span_is_child_of_user_span(self, otel_setup): tracer = trace.get_tracer("test.tracer") with tracer.start_as_current_span("user.operation"): - with create_ydb_span("ydb.ExecuteQuery", FakeDriverConfig(), session_id="s1", node_id=1): + with create_ydb_span("ydb.ExecuteQuery", FakeDriverConfig(), node_id=1): pass spans = exporter.get_finished_spans() @@ -306,13 +315,45 @@ def test_peer_attributes_emitted_when_known(self, otel_setup): exporter = otel_setup cfg = FakeDriverConfig() - with create_ydb_span("ydb.Test", cfg, peer_endpoint="peer.example.com:2137"): + with create_ydb_span("ydb.Test", cfg, peer=("peer.example.com", 2137, "dc-west")): pass span = _get_single_span(exporter, "ydb.Test") attrs = dict(span.attributes) assert attrs["network.peer.address"] == "peer.example.com" assert attrs["network.peer.port"] == 2137 + assert attrs["ydb.node.dc"] == "dc-west" + + +class TestPeerFromEndpointMap: + def test_wrapper_create_session_pulls_peer_from_store(self, otel_setup): + """wrapper_create_session must resolve peer (host, port, dc) via the driver's + connections_by_node_id cache, not via the grpc target string of the rpc call. 
+ """ + from ydb.query.session import wrapper_create_session + + connection = MagicMock() + connection.endpoint = "ipv4:10.0.0.1:2136" + connection.peer_address = "node-42.dc-west.example" + connection.peer_port = 2136 + connection.peer_location = "dc-west" + + driver = MagicMock() + driver._store.connections_by_node_id = {42: connection} + + session = MagicMock() + session._driver = driver + + rpc_state = MagicMock() + rpc_state.endpoint = "ipv4:10.0.0.1:2136" # grpc-target string — should be ignored + + proto = MagicMock() + with patch("ydb.query.session._ydb_query.CreateSessionResponse.from_proto") as from_proto: + from_proto.return_value = MagicMock(session_id="s-1", node_id=42, status=MagicMock()) + with patch("ydb.issues._process_response"): + wrapper_create_session(rpc_state, proto, session) + + assert session._peer == ("node-42.dc-west.example", 2136, "dc-west") class TestRetryPolicySpans: diff --git a/ydb/aio/query/session.py b/ydb/aio/query/session.py index 08c5dfe1..d7e09419 100644 --- a/ydb/aio/query/session.py +++ b/ydb/aio/query/session.py @@ -108,7 +108,7 @@ async def create(self, settings: Optional[BaseRequestSettings] = None) -> "Query with create_ydb_span("ydb.CreateSession", self._driver_config) as span: await self._create_call(settings=settings) - set_peer_attributes(span, self._peer_endpoint) + set_peer_attributes(span, self._peer) await self._attach() return self @@ -165,9 +165,8 @@ async def execute( span = create_ydb_span( "ydb.ExecuteQuery", self._driver_config, - session_id=self._session_id, node_id=self._node_id, - peer_endpoint=self._peer_endpoint, + peer=self._peer, ) try: diff --git a/ydb/aio/query/transaction.py b/ydb/aio/query/transaction.py index 567cf231..f935641d 100644 --- a/ydb/aio/query/transaction.py +++ b/ydb/aio/query/transaction.py @@ -110,10 +110,8 @@ async def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: with create_ydb_span( "ydb.Commit", self._driver_config, - session_id=self.session.session_id, 
node_id=self.session.node_id, - tx_id=self._tx_state.tx_id, - peer_endpoint=getattr(self.session, "_peer_endpoint", None), + peer=getattr(self.session, "_peer", None), ): try: await self._execute_callbacks_async(base.TxEvent.BEFORE_COMMIT) @@ -145,10 +143,8 @@ async def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None with create_ydb_span( "ydb.Rollback", self._driver_config, - session_id=self.session.session_id, node_id=self.session.node_id, - tx_id=self._tx_state.tx_id, - peer_endpoint=getattr(self.session, "_peer_endpoint", None), + peer=getattr(self.session, "_peer", None), ): try: await self._execute_callbacks_async(base.TxEvent.BEFORE_ROLLBACK) @@ -207,10 +203,8 @@ async def execute( span = create_ydb_span( "ydb.ExecuteQuery", self._driver_config, - session_id=self.session.session_id, node_id=self.session.node_id, - tx_id=self._tx_state.tx_id, - peer_endpoint=getattr(self.session, "_peer_endpoint", None), + peer=getattr(self.session, "_peer", None), ) try: diff --git a/ydb/connection.py b/ydb/connection.py index d1bcfdf5..d64438ef 100644 --- a/ydb/connection.py +++ b/ydb/connection.py @@ -198,11 +198,14 @@ def _get_request_timeout(settings): class EndpointOptions(object): - __slots__ = ("ssl_target_name_override", "node_id") + __slots__ = ("ssl_target_name_override", "node_id", "address", "port", "location") - def __init__(self, ssl_target_name_override=None, node_id=None): + def __init__(self, ssl_target_name_override=None, node_id=None, address=None, port=None, location=None): self.ssl_target_name_override = ssl_target_name_override self.node_id = node_id + self.address = address + self.port = port + self.location = location def _construct_channel_options(driver_config, endpoint_options=None): @@ -409,6 +412,9 @@ class Connection(object): "closing", "endpoint_key", "node_id", + "peer_address", + "peer_port", + "peer_location", ) def __init__( @@ -425,6 +431,9 @@ def __init__( """ self.endpoint = endpoint self.node_id = 
getattr(endpoint_options, "node_id", None) + self.peer_address = getattr(endpoint_options, "address", None) + self.peer_port = getattr(endpoint_options, "port", None) + self.peer_location = getattr(endpoint_options, "location", None) self.endpoint_key = EndpointKey(endpoint, getattr(endpoint_options, "node_id", None)) self._channel = channel_factory(self.endpoint, driver_config, endpoint_options=endpoint_options) self._driver_config = driver_config diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index 0ff7f568..a936b269 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -63,7 +63,7 @@ def _split_endpoint(endpoint): return host, int(port) if port.isdigit() else 0 -def _build_ydb_attrs(driver_config, session_id=None, node_id=None, tx_id=None, peer_endpoint=None): +def _build_ydb_attrs(driver_config, node_id=None, peer=None): host, port = _split_endpoint(getattr(driver_config, "endpoint", None)) attrs = { "db.system.name": "ydb", @@ -71,31 +71,38 @@ def _build_ydb_attrs(driver_config, session_id=None, node_id=None, tx_id=None, p "server.address": host, "server.port": port, } - if peer_endpoint is not None: - peer_host, peer_port = _split_endpoint(peer_endpoint) - attrs["network.peer.address"] = peer_host - attrs["network.peer.port"] = peer_port - if session_id is not None: - attrs["ydb.session.id"] = session_id or "" + if peer is not None: + address, port_, location = peer + if address is not None: + attrs["network.peer.address"] = address + if port_ is not None: + attrs["network.peer.port"] = int(port_) + if location: + attrs["ydb.node.dc"] = location if node_id is not None: attrs["ydb.node.id"] = node_id or 0 - if tx_id is not None: - attrs["ydb.tx.id"] = tx_id or "" return attrs -def create_ydb_span(name, driver_config, session_id=None, node_id=None, tx_id=None, kind=None, peer_endpoint=None): +def create_ydb_span(name, driver_config, node_id=None, kind=None, peer=None): """Create a span pre-filled with standard 
YDB attributes. + + ``peer`` is a ``(address, port, location)`` tuple pulled from the endpoint + map for the specific node serving the call; missing fields are skipped. Can be used as a context manager or manually. """ - attrs = _build_ydb_attrs(driver_config, session_id, node_id, tx_id, peer_endpoint) + attrs = _build_ydb_attrs(driver_config, node_id, peer) return _registry.create_span(name, attributes=attrs, kind=kind) -def set_peer_attributes(span, peer_endpoint): - """Fill in network.peer.* attributes on an existing span once the peer is known.""" - if peer_endpoint is None: +def set_peer_attributes(span, peer): + """Fill in network.peer.* and ydb.node.dc on an existing span once the peer is known.""" + if peer is None: return - peer_host, peer_port = _split_endpoint(peer_endpoint) - span.set_attribute("network.peer.address", peer_host) - span.set_attribute("network.peer.port", peer_port) + address, port, location = peer + if address is not None: + span.set_attribute("network.peer.address", address) + if port is not None: + span.set_attribute("network.peer.port", int(port)) + if location: + span.set_attribute("ydb.node.dc", location) diff --git a/ydb/query/session.py b/ydb/query/session.py index 10cc775b..a7a32c40 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -47,10 +47,30 @@ def wrapper_create_session( issues._process_response(message.status) session._session_id = message.session_id session._node_id = message.node_id - session._peer_endpoint = getattr(rpc_state, "endpoint", None) + session._peer = _resolve_peer(session._driver, message.node_id) return session +def _resolve_peer(driver, node_id): + """Look up network.peer.* / ydb.node.dc for a node in the driver's endpoint map.""" + if node_id is None: + return None + store = getattr(driver, "_store", None) + if store is None: + return None + by_node = getattr(store, "connections_by_node_id", None) + if not by_node: + return None + connection = by_node.get(node_id) + if connection is None: + 
return None + return ( + getattr(connection, "peer_address", None), + getattr(connection, "peer_port", None), + getattr(connection, "peer_location", None), + ) + + def wrapper_delete_session( rpc_state: RpcState, response_pb: _apis.ydb_query.DeleteSessionResponse, @@ -72,7 +92,7 @@ class BaseQuerySession(abc.ABC, Generic[DriverT]): # Session data _session_id: Optional[str] = None _node_id: Optional[int] = None - _peer_endpoint: Optional[str] = None + _peer: Optional[tuple] = None _closed: bool = False def __init__(self, driver: DriverT, settings: Optional[base.QueryClientSettings] = None): @@ -377,7 +397,7 @@ def create(self, settings: Optional[BaseRequestSettings] = None) -> "QuerySessio with create_ydb_span("ydb.CreateSession", self._driver_config) as span: self._create_call(settings=settings) - set_peer_attributes(span, self._peer_endpoint) + set_peer_attributes(span, self._peer) self._attach() return self @@ -447,9 +467,8 @@ def execute( span = create_ydb_span( "ydb.ExecuteQuery", self._driver_config, - session_id=self._session_id, node_id=self._node_id, - peer_endpoint=self._peer_endpoint, + peer=self._peer, ) try: diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index 1d2cb855..b1134c0e 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -561,10 +561,8 @@ def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: with create_ydb_span( "ydb.Commit", self._driver_config, - session_id=self.session.session_id, node_id=self.session.node_id, - tx_id=self._tx_state.tx_id, - peer_endpoint=getattr(self.session, "_peer_endpoint", None), + peer=getattr(self.session, "_peer", None), ): try: self._execute_callbacks_sync(base.TxEvent.BEFORE_COMMIT) @@ -595,10 +593,8 @@ def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None: with create_ydb_span( "ydb.Rollback", self._driver_config, - session_id=self.session.session_id, node_id=self.session.node_id, - tx_id=self._tx_state.tx_id, - 
peer_endpoint=getattr(self.session, "_peer_endpoint", None), + peer=getattr(self.session, "_peer", None), ): try: self._execute_callbacks_sync(base.TxEvent.BEFORE_ROLLBACK) @@ -658,10 +654,8 @@ def execute( span = create_ydb_span( "ydb.ExecuteQuery", self._driver_config, - session_id=self.session.session_id, node_id=self.session.node_id, - tx_id=self._tx_state.tx_id, - peer_endpoint=getattr(self.session, "_peer_endpoint", None), + peer=getattr(self.session, "_peer", None), ) try: diff --git a/ydb/resolver.py b/ydb/resolver.py index 5047f4e5..d55de389 100644 --- a/ydb/resolver.py +++ b/ydb/resolver.py @@ -54,7 +54,11 @@ def endpoints_with_options(self) -> typing.Generator[typing.Tuple[str, conn_impl ssl_target_name_override = self.address endpoint_options = conn_impl.EndpointOptions( - ssl_target_name_override=ssl_target_name_override, node_id=self.node_id + ssl_target_name_override=ssl_target_name_override, + node_id=self.node_id, + address=self.address, + port=self.port, + location=self.location, ) if self.ipv6_addrs or self.ipv4_addrs: From 70b778dd6d2da8c786dbb1aa0f0c76a0195eab03 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Fri, 1 May 2026 09:33:00 +0300 Subject: [PATCH 15/36] fix issue --- .gitignore | 4 +- CHANGELOG.md | 7 + docs/index.rst | 2 +- docs/opentelemetry.rst | 40 +++- examples/opentelemetry/README.md | 80 ++++++++ examples/opentelemetry/compose-e2e.yaml | 19 ++ .../opentelemetry/docker-compose.otel.yaml | 90 +++++++++ examples/opentelemetry/example.py | 65 ------- examples/opentelemetry/otel_example.py | 128 ++++++++++++ examples/opentelemetry/requirements.txt | 10 + setup.py | 1 + test-requirements.txt | 2 + tests/tracing/conftest.py | 15 +- tests/tracing/test_tracing_async.py | 95 +++++++-- tests/tracing/test_tracing_sync.py | 79 +++++++- ydb/_errors.py | 21 +- ydb/aio/driver.py | 1 + ydb/aio/pool.py | 34 +++- ydb/aio/query/base.py | 11 +- ydb/aio/query/session.py | 40 ++-- ydb/aio/query/transaction.py | 35 ++-- 
ydb/opentelemetry/__init__.py | 26 ++- ydb/opentelemetry/_plugin.py | 90 +++++++-- ydb/opentelemetry/tracing.py | 75 +++++++- ydb/query/base.py | 16 +- ydb/query/session.py | 41 ++-- ydb/query/transaction.py | 35 ++-- ydb/retries.py | 182 ++++++++++-------- ydb/table_test.py | 8 +- 29 files changed, 973 insertions(+), 279 deletions(-) create mode 100644 examples/opentelemetry/README.md create mode 100644 examples/opentelemetry/docker-compose.otel.yaml delete mode 100644 examples/opentelemetry/example.py create mode 100644 examples/opentelemetry/otel_example.py create mode 100644 examples/opentelemetry/requirements.txt diff --git a/.gitignore b/.gitignore index 36b3d2e2..6ae4d6e8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,10 @@ __pycache__ ydb.egg-info/ -/.idea +.idea/ /.vscode /tox /venv -/.venv +.venv/ /ydb_certs /ydb_data /tmp diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d78349b..5c4b5fc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## Unreleased ## +* OpenTelemetry: W3C trace context for gRPC stays bound for the whole ``ExecuteQuery`` stream + (until the result iterator finishes); no long-lived ``context.attach`` on the span; + ``disable_tracing()``; correct ``server.*`` from ``grpc://`` endpoints; zero work in + ``create_ydb_span`` when tracing is off; one ``ydb.Try`` per attempt for fast retriable + errors in sync retries. + ## 3.26.10 ## * Fix asyncio.Cancelled error propagated to writer/reader * add nearest DC detection with TCP race diff --git a/docs/index.rst b/docs/index.rst index 77efd8b3..cbe2c5dd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -88,7 +88,7 @@ Distributed Coordination ------------------------ The :doc:`coordination` page covers distributed semaphores and leader election. If you -need to limit concurrent access to aЗе shared resource across multiple processes or hosts, +need to limit concurrent access to a shared resource across multiple processes or hosts, this is the service to use. 
Schema Management diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index f924ec48..4596dce9 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -69,6 +69,9 @@ and **before** creating a ``Driver``: ``enable_tracing()`` accepts an optional ``tracer`` argument. If omitted, the SDK obtains a tracer named ``"ydb.sdk"`` from the global tracer provider. +Repeated calls to ``enable_tracing()`` do nothing until you call ``disable_tracing()``, +which removes hooks so you can reconfigure or turn instrumentation off. + What Is Instrumented -------------------- @@ -218,16 +221,39 @@ To use a specific tracer instead of the global one: Running the Examples -------------------- -The ``examples/opentelemetry/`` directory contains ready-to-run examples with a Docker -Compose setup that starts YDB, an OTLP collector, Tempo, Prometheus, and Grafana: +The runnable script is ``examples/opentelemetry/otel_example.py`` (bank table + concurrent +Serializable transactions and ``app_startup`` / ``example_tli`` application spans). **Start +Docker (YDB or the full stack) first**, then install and run on the host — see +``examples/opentelemetry/README.md`` for the full order of commands and environment variables. + +**Full stack in one command** (YDB + OTLP + Tempo + Grafana; includes a one-shot ``otel-example`` container; compose file in ``examples/opentelemetry/``): + +.. code-block:: sh + + cd examples/opentelemetry && docker compose -f docker-compose.otel.yaml up + +**Alternative** (same file, from the repository root): + +.. code-block:: sh + + docker compose -f examples/opentelemetry/docker-compose.otel.yaml up + +**Typical local run** (YDB in Docker, script on the host — Compose **before** ``pip`` / ``python``): + +.. 
code-block:: sh + + docker compose up -d + pip install -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt + python examples/opentelemetry/otel_example.py + +**Stack from** ``examples/opentelemetry/`` **only** (then install and run from repo root as above): .. code-block:: sh cd examples/opentelemetry docker compose -f compose-e2e.yaml up -d + cd ../.. + pip install -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt + python examples/opentelemetry/otel_example.py - # Run the example - python example.py - -Open `http://localhost:3000 <http://localhost:3000>`_ (Grafana) to explore the -collected traces via the Tempo data source. +Open `http://localhost:3000 <http://localhost:3000>`_ (Grafana) to explore traces via Tempo. diff --git a/examples/opentelemetry/README.md b/examples/opentelemetry/README.md new file mode 100644 index 00000000..bc246a29 --- /dev/null +++ b/examples/opentelemetry/README.md @@ -0,0 +1,80 @@ +# OpenTelemetry example (YDB Python SDK) + +Async demo in [`otel_example.py`](otel_example.py): OTLP export, `enable_tracing()`, +`app_startup` and `example_tli` application spans, bank table, Serializable transactions (TLI-style load). + +Most steps assume the **repository root** as the current directory; the install step also shows the variant from this folder. + +## 1. Start YDB (or the full stack) with Docker **first** + +Without running containers, the example has nothing to connect to. + +**Only YDB** (minimal `docker-compose.yml` in the repo root — enough for the script on the host): + +```sh +cd /path/to/ydb-python-sdk +docker compose up -d +# wait until the ydb container is healthy / port 2136 is open, then continue +``` + +**Full stack** (YDB + OTLP collector + Tempo + Grafana; the `otel-example` service also runs the script once inside Compose). The compose file is `docker-compose.otel.yaml` next to this README. 
+ +```sh +cd /path/to/ydb-python-sdk/examples/opentelemetry +docker compose -f docker-compose.otel.yaml up +``` + +From the repository root you can use the same file with: + +```sh +cd /path/to/ydb-python-sdk +docker compose -f examples/opentelemetry/docker-compose.otel.yaml up +``` + +Grafana: http://localhost:3000 + +**Logs for `otel-example`:** the container name is prefixed (e.g. `opentelemetry-otel-example-1`); use `docker compose -f docker-compose.otel.yaml ps` or `docker ps -a` to find it. The service is one-shot (`restart: "no"`) — it may already have exited. + +**Only configs from this folder** (same idea, from `examples/opentelemetry`): + +```sh +cd /path/to/ydb-python-sdk/examples/opentelemetry +docker compose -f compose-e2e.yaml up -d +cd ../.. +``` + +## 2. Install dependencies (on the host, for a local `python` run) + +**From the repository root** (editable SDK + pins from this example): + +```sh +python3 -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +pip install -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt +``` + +**If your shell is already in** `examples/opentelemetry/` (same result): + +```sh +pip install -e '../..[opentelemetry]' -r requirements.txt +``` + +`requirements.txt` includes a merge of the repository’s core `requirements.txt` (grpc, ``packaging``, …) plus the OpenTelemetry lines. The `-e` install is only needed to register the package; otherwise this example prepends the repo to ``sys.path``. + +**Without** `pip -e` (``ydb`` from the checkout via `sys.path`): from this directory run `pip install -r requirements.txt`, then ``python otel_example.py``. + +## 3. Run the example (after YDB from step 1 is up) + +```sh +python examples/opentelemetry/otel_example.py +``` + +Defaults: YDB `grpc://localhost:2136`, OTLP `http://localhost:4317` (for a local collector, if you use one). + +## Environment (Docker / overrides) + +| Variable | Meaning | +|----------|---------| +| `YDB_ENDPOINT` | e.g. 
`grpc://ydb:2136` inside the Compose network | +| `YDB_DATABASE` | default `/local` | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | e.g. `http://otel-collector:4317` | diff --git a/examples/opentelemetry/compose-e2e.yaml b/examples/opentelemetry/compose-e2e.yaml index 933d9a38..d6ee604d 100644 --- a/examples/opentelemetry/compose-e2e.yaml +++ b/examples/opentelemetry/compose-e2e.yaml @@ -59,3 +59,22 @@ services: ports: - "3000:3000" depends_on: [ prometheus, tempo ] + + otel-example: + image: python:3.11-slim + working_dir: /workspace + volumes: + - ../..:/workspace + environment: + YDB_ENDPOINT: grpc://ydb:2136 + YDB_DATABASE: /local + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 + OTEL_SERVICE_NAME: ydb-otel-example + depends_on: + - ydb + - otel-collector + restart: "no" + command: > + bash -c "set -euo pipefail + pip install --no-cache-dir -e '.[opentelemetry]' -r requirements.txt + python otel_example.py" diff --git a/examples/opentelemetry/docker-compose.otel.yaml b/examples/opentelemetry/docker-compose.otel.yaml new file mode 100644 index 00000000..d7cb934c --- /dev/null +++ b/examples/opentelemetry/docker-compose.otel.yaml @@ -0,0 +1,90 @@ +# Full OpenTelemetry demo: YDB (server-side tracing config), collector, Tempo, Prometheus, Grafana, +# and a one-shot container that runs otel_example.py once. 
+# +# Run from this directory (paths below are relative to this file): +# cd examples/opentelemetry && docker compose -f docker-compose.otel.yaml up +# +# Or from the repository root: +# docker compose -f examples/opentelemetry/docker-compose.otel.yaml up + +version: "3.3" + +services: + ydb: + image: ydbplatform/local-ydb:trunk + restart: always + platform: linux/amd64 + environment: + YDB_DEFAULT_LOG_LEVEL: NOTICE + GRPC_TLS_PORT: "2135" + GRPC_PORT: "2136" + MON_PORT: "8765" + YDB_USE_IN_MEMORY_PDISKS: "true" + command: ["--config-path", "/ydb_config/ydb-config-with-tracing.yaml"] + ports: + - "2135:2135" + - "2136:2136" + - "8765:8765" + volumes: + - ./ydb_config:/ydb_config:ro + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + command: ["--config=/etc/otelcol/config.yaml"] + volumes: + - ./otel-collector-config.yaml:/etc/otelcol/config.yaml:ro + ports: + - "4317:4317" + - "4318:4318" + - "9464:9464" + - "13133:13133" + - "13317:55679" + + prometheus: + image: prom/prometheus:latest + volumes: + - ./prometheus.yaml:/etc/prometheus/prometheus.yml:ro + ports: + - "9090:9090" + depends_on: [otel-collector] + + tempo: + image: grafana/tempo:2.4.1 + command: ["-config.file=/etc/tempo.yaml"] + volumes: + - ./tempo.yaml:/etc/tempo.yaml:ro + ports: + - "3200:3200" + depends_on: [otel-collector] + + grafana: + image: grafana/grafana:10.4.2 + environment: + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_ROLE: "Admin" + volumes: + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + ports: + - "3000:3000" + depends_on: [prometheus, tempo] + + otel-example: + image: python:3.11-slim + working_dir: /workspace + volumes: + # repository root (../../ from this file) for editable `pip install -e .[opentelemetry]` + - ../../:/workspace + environment: + YDB_ENDPOINT: grpc://ydb:2136 + YDB_DATABASE: /local + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 + OTEL_SERVICE_NAME: 
ydb-otel-example + depends_on: + - ydb + - otel-collector + restart: "no" + command: > + bash -c "set -euo pipefail + pip install --no-cache-dir -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt + python examples/opentelemetry/otel_example.py" diff --git a/examples/opentelemetry/example.py b/examples/opentelemetry/example.py deleted file mode 100644 index d36397c1..00000000 --- a/examples/opentelemetry/example.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Minimal example: OpenTelemetry tracing for YDB Python SDK.""" - -import asyncio - -from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.resources import Resource - -import ydb -from ydb.opentelemetry import enable_tracing - -resource = Resource(attributes={"service.name": "ydb-example"}) -provider = TracerProvider(resource=resource) -provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317"))) -trace.set_tracer_provider(provider) - -tracer = trace.get_tracer(__name__) -enable_tracing(tracer) - -ENDPOINT = "grpc://localhost:2136" -DATABASE = "/local" - - -def sync_example(): - """Sync: session execute and transaction execute + commit.""" - with ydb.Driver(endpoint=ENDPOINT, database=DATABASE) as driver: - driver.wait(timeout=5) - - with ydb.QuerySessionPool(driver) as pool: - with tracer.start_as_current_span("sync-example"): - pool.execute_with_retries("SELECT 1") - - def tx_callee(session): - with session.transaction() as tx: - list(tx.execute("SELECT 1")) - tx.commit() - - pool.retry_operation_sync(tx_callee) - - -async def async_example(): - """Async: session execute and transaction execute + commit.""" - async with ydb.aio.Driver(endpoint=ENDPOINT, database=DATABASE) as driver: - await driver.wait(timeout=5) - - async with ydb.aio.QuerySessionPool(driver) as pool: - 
with tracer.start_as_current_span("async-example"): - await pool.execute_with_retries("SELECT 1") - - async def tx_callee(session): - async with session.transaction() as tx: - result = await tx.execute("SELECT 1") - async for _ in result: - pass - await tx.commit() - - await pool.retry_operation_async(tx_callee) - - -sync_example() -asyncio.run(async_example()) - -provider.shutdown() diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py new file mode 100644 index 00000000..45b8fa77 --- /dev/null +++ b/examples/opentelemetry/otel_example.py @@ -0,0 +1,128 @@ +"""OpenTelemetry + YDB demo: bank table and concurrent transactions (TLI-style workload). + +Uses ``disable_discovery=True`` so a single ``grpc://…`` to local YDB (e.g. ``local-ydb`` in Docker +from the host) is not replaced by internal discovery addresses. +""" + +from __future__ import annotations + +import asyncio +import os +import socket +import sys +from pathlib import Path + +# For ``python otel_example.py`` in this tree without an installed ``ydb`` package. 
+_repo_root = Path(__file__).resolve().parent.parent.parent +if str(_repo_root) not in sys.path: + sys.path.insert(0, str(_repo_root)) + +import ydb +from ydb import _utilities as _yutil +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from ydb.opentelemetry import enable_tracing + + +def _env(name: str, default: str) -> str: + v = os.environ.get(name) + return v if v is not None and v != "" else default + + +def _assert_tcp_reachable_for_endpoint(endpoint: str) -> None: + """Before ``Driver`` starts: fail fast if nothing listens (clearer than ``driver.wait`` timeout).""" + bare = _yutil.wrap_endpoint(endpoint) + if bare.count(":") < 1: + return + host, _, port_s = bare.rpartition(":") + if not port_s or not host: + return + try: + port = int(port_s) + except ValueError: + return + try: + with socket.create_connection((host, port), timeout=3.0): + pass + except OSError as e: + raise RuntimeError( + f"Nothing accepts TCP on {host}:{port} — start YDB first, e.g. from the repository root: " + f"docker compose up -d (then the script at grpc://{host}:{port} can connect). 
Original error: {e!s}" + ) from e + + +async def _first_amount(tx) -> int: + async with await tx.execute("SELECT amount FROM bank WHERE id = 1") as results: + async for rs in results: + for row in rs.rows: + return int(row["amount"]) + raise RuntimeError("no row for id=1") + + +async def _bank_read_update(tx) -> None: + count = await _first_amount(tx) + async with await tx.execute( + "UPDATE bank SET amount = $amt + 1 WHERE id = 1", + {"$amt": (count, ydb.PrimitiveType.Int32)}, + ): + pass + + +async def main() -> None: + endpoint = _env("YDB_ENDPOINT", "grpc://localhost:2136") + database = _env("YDB_DATABASE", "/local") + otlp_endpoint = _env("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") + + resource = Resource(attributes={"service.name": _env("OTEL_SERVICE_NAME", "ydb-otel-example")}) + provider = TracerProvider(resource=resource) + provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(endpoint=otlp_endpoint))) + trace.set_tracer_provider(provider) + + tracer = trace.get_tracer(__name__) + enable_tracing(tracer) + + _assert_tcp_reachable_for_endpoint(endpoint) + + async with ydb.aio.Driver( + endpoint=endpoint, + database=database, + disable_discovery=True, + ) as driver: + await driver.wait(timeout=60) + + async with ydb.aio.QuerySessionPool(driver) as pool: + with tracer.start_as_current_span("app_startup") as startup: + startup.set_attribute("app.message", "hello") + + await pool.execute_with_retries("DROP TABLE IF EXISTS bank") + await pool.execute_with_retries( + "CREATE TABLE bank (id Int32, amount Int32, PRIMARY KEY (id))" + ) + + print("Insert row...") + await pool.execute_with_retries("INSERT INTO bank (id, amount) VALUES (1, 0)") + + print("Preparing queries...") + await pool.retry_tx_async(_bank_read_update) + + print("Emulation TLI...") + + async def concurrent_task(task_num: int) -> None: + with tracer.start_as_current_span("example_tli") as act: + act.set_attribute("app.message", f"concurrent task {task_num}") + await 
pool.retry_tx_async(_bank_read_update) + + await asyncio.gather(*(concurrent_task(i) for i in range(10))) + + final_rows = await pool.execute_with_retries("SELECT amount FROM bank WHERE id = 1") + amount = int(list(final_rows[0].rows)[0]["amount"]) + print(f"Final amount (after serializable retries): {amount}") + + provider.shutdown() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/opentelemetry/requirements.txt b/examples/opentelemetry/requirements.txt new file mode 100644 index 00000000..fc6d399a --- /dev/null +++ b/examples/opentelemetry/requirements.txt @@ -0,0 +1,10 @@ +# Core ydb import/runtime (grpc, ``packaging``, etc.) — same as repository root ``requirements.txt``. +-r ../../requirements.txt +# Extras and OTLP (``ydb[opentelemetry]`` only brings ``opentelemetry-api``) +# With editable install: +# (repository root) pip install -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt +# (this directory) pip install -e '../..[opentelemetry]' -r requirements.txt +# Without ``pip -e``, set ``sys.path`` is enough for ``ydb`` from the checkout, but you still need these lines: +# pip install -r requirements.txt +opentelemetry-sdk>=1.0.0 +opentelemetry-exporter-otlp-proto-grpc>=1.0.0 diff --git a/setup.py b/setup.py index 0f850fbf..b3d04f0e 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ options={"bdist_wheel": {"universal": True}}, extras_require={ "yc": ["yandexcloud", ], + # Named ``opentelemetry`` (not ``tracing``): avoids clashing with ``ydb.tracing`` (PR #786, vgvoleg). "opentelemetry": ["opentelemetry-api>=1.0.0"], } ) diff --git a/test-requirements.txt b/test-requirements.txt index 0976ce50..849cb57a 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -45,6 +45,8 @@ cython freezegun>=1.3.0 opentelemetry-api>=1.0.0 opentelemetry-sdk>=1.0.0 +# Namespace ``opentelemetry.exporter`` (examples, OTLP); not part of ``opentelemetry-api``. 
+opentelemetry-exporter-otlp-proto-grpc>=1.0.0 # pytest-cov yandexcloud -e . diff --git a/tests/tracing/conftest.py b/tests/tracing/conftest.py index 94f653b8..26c39cef 100644 --- a/tests/tracing/conftest.py +++ b/tests/tracing/conftest.py @@ -11,8 +11,6 @@ from opentelemetry.sdk.trace.export import SimpleSpanProcessor from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter -from ydb.opentelemetry.tracing import _registry - _provider = TracerProvider() _exporter = InMemorySpanExporter() _provider.add_span_processor(SimpleSpanProcessor(_exporter)) @@ -25,24 +23,17 @@ def otel_setup(): Each test gets a clean exporter (cleared before and after). """ - import ydb.opentelemetry._plugin as _plugin - _exporter.clear() - _plugin._enabled = False - _plugin._tracer = None - - from ydb.opentelemetry import enable_tracing + from ydb.opentelemetry import disable_tracing, enable_tracing + disable_tracing() enable_tracing() yield _exporter # Restore noop state - _registry.set_create_span(None) - _registry.set_metadata_hook(None) - _plugin._enabled = False - _plugin._tracer = None + disable_tracing() _exporter.clear() diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py index 2ff574a5..de2ab249 100644 --- a/tests/tracing/test_tracing_async.py +++ b/tests/tracing/test_tracing_async.py @@ -227,16 +227,45 @@ async def callee(): tries = _get_spans(exporter, "ydb.Try") assert len(tries) == 1 assert tries[0].parent.span_id == run.context.span_id - assert dict(tries[0].attributes)["ydb.retry.backoff_ms"] == 0 + assert "ydb.retry.backoff_ms" not in dict(tries[0].attributes) + assert tries[0].status.status_code == StatusCode.UNSET + + @pytest.mark.asyncio + async def test_retry_failed_tries_set_error_status(self, otel_setup): + """Failed async attempts must set ``ydb.Try`` status to ERROR (not UNSET).""" + from ydb import issues + from ydb.retries import BackoffSettings, RetrySettings, retry_operation_async + + exporter = 
otel_setup + counter = {"n": 0} + + async def flaky(): + counter["n"] += 1 + if counter["n"] < 3: + raise issues.Unavailable("transient") + return "ok" + + retry_settings = RetrySettings( + max_retries=5, + fast_backoff_settings=BackoffSettings(ceiling=0, slot_duration=0.05), + slow_backoff_settings=BackoffSettings(ceiling=0, slot_duration=0.05), + ) + + assert await retry_operation_async(flaky, retry_settings) == "ok" + + tries = _get_spans(exporter, "ydb.Try") + assert len(tries) == 3 + assert tries[0].status.status_code == StatusCode.ERROR + assert tries[1].status.status_code == StatusCode.ERROR + assert tries[2].status.status_code == StatusCode.UNSET @pytest.mark.asyncio async def test_context_cancel_during_backoff_records_exception(self, otel_setup): - """Backoff sleep is the timeline of the next Try; a cancel hitting it - must be recorded on that Try span and propagate out through RunWithRetry. + """Inter-attempt sleep is outside ``ydb.Try``; cancellation during + ``asyncio.sleep`` is recorded on ``ydb.RunWithRetry`` (``record_exception``). """ from ydb import issues - from ydb.retries import retry_operation_async - from ydb.retries import BackoffSettings, RetrySettings + from ydb.retries import BackoffSettings, RetrySettings, retry_operation_async exporter = otel_setup calls = {"n": 0} @@ -252,7 +281,6 @@ async def flaky(): ) task = asyncio.ensure_future(retry_operation_async(flaky, retry_settings)) - # Let the first attempt fail and the backoff sleep start. for _ in range(10): await asyncio.sleep(0.01) if calls["n"] >= 1: @@ -263,15 +291,54 @@ async def flaky(): run = _get_single_span(exporter, "ydb.RunWithRetry") assert run.status.status_code == StatusCode.ERROR - + # TracingSpan / OTel will attach the cancellation as span events (record_exception) when enabled. + assert run.events is not None + # First attempt: ``ydb.Try``; cancel hits ``ydb.RunWithRetry`` during the inter-attempt sleep. 
tries = _get_spans(exporter, "ydb.Try") - assert len(tries) >= 2 - # Try span that carried the cancelled backoff must be errored. - backoff_try = tries[-1] - assert backoff_try.status.status_code == StatusCode.ERROR - assert dict(backoff_try.attributes)["ydb.retry.backoff_ms"] > 0 - error_types = {dict(s.attributes).get("error.type") for s in tries} - assert "CancelledError" in error_types + assert len(tries) >= 1 + + +class TestAsyncRetrySpanNesting: + @pytest.mark.asyncio + async def test_execute_query_is_child_of_try_under_run_with_retry(self, otel_setup): + """``ydb.RunWithRetry`` -> ``ydb.Try`` -> ``ydb.ExecuteQuery`` (deep nesting). + + The previous implementation produced sibling spans because ``ydb.Try`` was + opened *after* the awaitable was created, leaving the gRPC span without an + active ``ydb.Try`` context. This test pins the corrected nesting. + """ + from ydb.aio.query.session import QuerySession + from ydb.retries import retry_operation_async + + exporter = otel_setup + + qs = QuerySession.__new__(QuerySession) + cfg = FakeDriverConfig() + driver = MagicMock() + driver._driver_config = cfg + qs._driver = driver + qs._session_id = "test-session-id" + qs._node_id = 12345 + qs._peer = ("n1", 2136, "dc-a") + qs._closed = False + + async def callee(): + fake_stream = _empty_async_iter() + with patch.object(QuerySession, "_execute_call", new_callable=AsyncMock, return_value=fake_stream): + result = await qs.execute("SELECT 1;") + async for _ in result: + pass + return "ok" + + assert await retry_operation_async(callee) == "ok" + + run = _get_single_span(exporter, "ydb.RunWithRetry") + try_span = _get_single_span(exporter, "ydb.Try") + exec_span = _get_single_span(exporter, "ydb.ExecuteQuery") + + assert try_span.parent.span_id == run.context.span_id + assert exec_span.parent.span_id == try_span.context.span_id + assert exec_span.context.trace_id == run.context.trace_id class TestAsyncConcurrentSpansIsolation: diff --git 
a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py index 37753462..636ec63a 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -282,8 +282,9 @@ class TestCommonAttributes: @pytest.mark.parametrize( "endpoint,expected_host,expected_port", [ - ("grpc://host.example.com:2136", "grpc://host.example.com", 2136), + ("grpc://host.example.com:2136", "host.example.com", 2136), ("localhost:2136", "localhost", 2136), + ("[::1]:2136", "[::1]", 2136), ], ) def test_endpoint_parsing(self, otel_setup, endpoint, expected_host, expected_port): @@ -374,7 +375,7 @@ def callee(): tries = _get_spans(exporter, "ydb.Try") assert len(tries) == 1 assert tries[0].kind == SpanKind.INTERNAL - assert dict(tries[0].attributes)["ydb.retry.backoff_ms"] == 0 + assert "ydb.retry.backoff_ms" not in dict(tries[0].attributes) assert tries[0].parent.span_id == run.context.span_id def test_retry_backoff_ms_on_each_try(self, otel_setup): @@ -401,16 +402,46 @@ def flaky(): tries = _get_spans(exporter, "ydb.Try") assert len(tries) == 3 - # first attempt has no preceding backoff, later ones have a positive one - backoff_values = [dict(s.attributes)["ydb.retry.backoff_ms"] for s in tries] - assert backoff_values[0] == 0 - assert all(v >= 0 for v in backoff_values) - assert any(v > 0 for v in backoff_values[1:]) + # First attempt has no preceding backoff, so no attribute at all; later ones + # carry a positive integer ms. 
+ attrs0 = dict(tries[0].attributes) + assert "ydb.retry.backoff_ms" not in attrs0 + later_values = [dict(s.attributes).get("ydb.retry.backoff_ms") for s in tries[1:]] + assert all(isinstance(v, int) and v > 0 for v in later_values) # failed Try spans record the exception assert tries[0].status.status_code == StatusCode.ERROR assert tries[1].status.status_code == StatusCode.ERROR assert tries[2].status.status_code == StatusCode.UNSET + def test_skip_backoff_errors_still_emit_one_try_per_attempt(self, otel_setup): + """Aborted/BadSession path yields zero sleep but must rotate ydb.Try spans (sync loop).""" + from ydb import issues + from ydb.retries import RetrySettings, retry_operation_sync + + exporter = otel_setup + counter = {"n": 0} + + def flaky(): + counter["n"] += 1 + if counter["n"] < 3: + raise issues.Aborted("retry me") + return "ok" + + assert retry_operation_sync(flaky, RetrySettings(max_retries=5)) == "ok" + + tries = _get_spans(exporter, "ydb.Try") + assert len(tries) == 3 + assert tries[0].status.status_code == StatusCode.ERROR + assert tries[1].status.status_code == StatusCode.ERROR + assert tries[2].status.status_code == StatusCode.UNSET + # First Try has no preceding sleep -> attribute is absent. + # Skip-yield path means subsequent Tries had no real wait either, but the + # attribute is still set to 0 to make "we did go through a retry boundary" + # explicit. 
+ assert "ydb.retry.backoff_ms" not in dict(tries[0].attributes) + assert dict(tries[1].attributes)["ydb.retry.backoff_ms"] == 0 + assert dict(tries[2].attributes)["ydb.retry.backoff_ms"] == 0 + def test_non_retryable_error_propagates_to_run_span(self, otel_setup): from ydb import issues from ydb.retries import retry_operation_sync @@ -432,3 +463,37 @@ def broken(): attrs = dict(tries[0].attributes) assert attrs["error.type"] == "ydb_error" assert attrs["db.response.status_code"] == "SCHEME_ERROR" + + def test_execute_query_is_child_of_try_under_run_with_retry(self, otel_setup): + """``ydb.RunWithRetry`` -> ``ydb.Try`` -> ``ydb.ExecuteQuery`` (sync path).""" + from ydb.query.session import QuerySession + from ydb.retries import retry_operation_sync + + exporter = otel_setup + + qs = QuerySession.__new__(QuerySession) + cfg = FakeDriverConfig() + driver = MagicMock() + driver._driver_config = cfg + qs._driver = driver + qs._session_id = "test-session-id" + qs._node_id = 12345 + qs._peer = ("n1", 2136, "dc-a") + qs._closed = False + + def callee(): + fake_stream = iter([]) + with patch.object(QuerySession, "_execute_call", return_value=fake_stream): + result = qs.execute("SELECT 1;") + list(result) + return "ok" + + assert retry_operation_sync(callee) == "ok" + + run = _get_single_span(exporter, "ydb.RunWithRetry") + try_span = _get_single_span(exporter, "ydb.Try") + exec_span = _get_single_span(exporter, "ydb.ExecuteQuery") + + assert try_span.parent.span_id == run.context.span_id + assert exec_span.parent.span_id == try_span.context.span_id + assert exec_span.context.trace_id == run.context.trace_id diff --git a/ydb/_errors.py b/ydb/_errors.py index 1e969c09..4670024e 100644 --- a/ydb/_errors.py +++ b/ydb/_errors.py @@ -24,32 +24,32 @@ def check_retriable_error(err, retry_settings, attempt): if isinstance(err, issues.Cancelled): if retry_settings.retry_cancelled: - return ErrorRetryInfo(True, retry_settings.fast_backoff.calc_timeout(attempt)) + return 
ErrorRetryInfo(True, retry_settings.fast_backoff.calc_backoff_ms(attempt)) if isinstance(err, issues.NotFound): if retry_settings.retry_not_found: - return ErrorRetryInfo(True, retry_settings.fast_backoff.calc_timeout(attempt)) + return ErrorRetryInfo(True, retry_settings.fast_backoff.calc_backoff_ms(attempt)) else: return ErrorRetryInfo(False, None) if isinstance(err, issues.InternalError): if retry_settings.retry_internal_error: - return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_timeout(attempt)) + return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_backoff_ms(attempt)) else: return ErrorRetryInfo(False, None) for t in _errors_retriable_fast_backoff_types: if isinstance(err, t): - return ErrorRetryInfo(True, retry_settings.fast_backoff.calc_timeout(attempt)) + return ErrorRetryInfo(True, retry_settings.fast_backoff.calc_backoff_ms(attempt)) for t in _errors_retriable_slow_backoff_types: if isinstance(err, t): - return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_timeout(attempt)) + return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_backoff_ms(attempt)) if retry_settings.idempotent: for t in _errors_retriable_slow_backoff_idempotent_types: if isinstance(err, t): - return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_timeout(attempt)) + return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_backoff_ms(attempt)) return ErrorRetryInfo(False, None) @@ -57,4 +57,11 @@ def check_retriable_error(err, retry_settings, attempt): @dataclass class ErrorRetryInfo: is_retriable: bool - sleep_timeout_seconds: Optional[float] + # Single source: integer ms from ``BackoffSettings.calc_backoff_ms`` (before ``/ 1000`` to seconds). 
+ sleep_backoff_ms: Optional[int] = None + + @property + def sleep_timeout_seconds(self) -> Optional[float]: + if self.sleep_backoff_ms is None: + return None + return self.sleep_backoff_ms / 1000.0 diff --git a/ydb/aio/driver.py b/ydb/aio/driver.py index 405e5fcb..241f5b00 100644 --- a/ydb/aio/driver.py +++ b/ydb/aio/driver.py @@ -69,6 +69,7 @@ def __init__( root_certificates, credentials, config_class=DriverConfig, + **kwargs ) super(Driver, self).__init__(config) diff --git a/ydb/aio/pool.py b/ydb/aio/pool.py index 4f1b0cdd..7fd9a6bb 100644 --- a/ydb/aio/pool.py +++ b/ydb/aio/pool.py @@ -39,7 +39,25 @@ async def get( # async version with different Connection type if self._fast_fail_error: raise self._fast_fail_error else: - await asyncio.wait_for(self._event.wait(), timeout=wait_timeout) + # With ``disable_discovery``, the initial connection can fail (gRPC / TLS / etc.). + # ``_event`` is then never set; without also waiting on ``_fast_fail_event`` the caller + # would block until *wait_timeout* with no useful error. 
+ wait_conn = asyncio.create_task(self._event.wait()) + wait_fail = asyncio.create_task(self._fast_fail_event.wait()) + try: + done, pending = await asyncio.wait( + (wait_conn, wait_fail), + timeout=wait_timeout, + return_when=asyncio.FIRST_COMPLETED, + ) + finally: + for t in (wait_conn, wait_fail): + if not t.done(): + t.cancel() + if self._fast_fail_error is not None: + raise self._fast_fail_error + if not done: + raise asyncio.TimeoutError if preferred_endpoint is not None and preferred_endpoint.node_id in self.connections_by_node_id: return self.connections_by_node_id[preferred_endpoint.node_id] # type: ignore[return-value] @@ -254,11 +272,15 @@ def __init__(self, driver_config: "DriverConfig") -> None: if driver_config.disable_discovery: # If discovery is disabled, just add the initial endpoint to the store async def init_connection() -> None: - ready_connection = Connection(self._driver_config.endpoint, self._driver_config) - await ready_connection.connection_ready( - ready_timeout=getattr(self._driver_config, "discovery_request_timeout", 10) - ) - self._store.add(ready_connection) + try: + ready_connection = Connection(self._driver_config.endpoint, self._driver_config) + await ready_connection.connection_ready( + ready_timeout=getattr(self._driver_config, "discovery_request_timeout", 10) + ) + self._store.add(ready_connection) + except Exception as e: # noqa: BLE001 — surface to wait() via complete_discovery + self._store.complete_discovery(e) + return # Create and schedule the task to initialize the connection self._discovery_task = asyncio.get_event_loop().create_task(init_connection()) diff --git a/ydb/aio/query/base.py b/ydb/aio/query/base.py index cbf22e98..3cd49f74 100644 --- a/ydb/aio/query/base.py +++ b/ydb/aio/query/base.py @@ -2,10 +2,13 @@ class AsyncResponseContextIterator(_utilities.AsyncResponseIterator): - def __init__(self, it, wrapper, on_error=None, span=None): + """Async ExecuteQuery result stream; span + gRPC propagation token (see sync 
class doc).""" + + def __init__(self, it, wrapper, on_error=None, span=None, grpc_propagation_token=None): super().__init__(it, wrapper) self._on_error = on_error self._span = span + self._grpc_propagation_token = grpc_propagation_token async def __aenter__(self) -> "AsyncResponseContextIterator": return self @@ -23,6 +26,12 @@ async def _next(self): raise e def _finish_span(self, exception=None): + # Pop gRPC propagation before ending span (same contract as sync iterator). + if self._grpc_propagation_token is not None: + from ydb.opentelemetry.tracing import pop_otel_span_for_grpc + + pop_otel_span_for_grpc(self._grpc_propagation_token) + self._grpc_propagation_token = None if self._span is not None: if exception is not None: self._span.set_error(exception) diff --git a/ydb/aio/query/session.py b/ydb/aio/query/session.py index d7e09419..0bb082ef 100644 --- a/ydb/aio/query/session.py +++ b/ydb/aio/query/session.py @@ -19,7 +19,12 @@ from ...query import base from ...query.session import BaseQuerySession -from ...opentelemetry.tracing import create_ydb_span, set_peer_attributes +from ...opentelemetry.tracing import ( + create_ydb_span, + pop_otel_span_for_grpc, + push_otel_span_for_grpc, + set_peer_attributes, +) from ..._constants import DEFAULT_INITIAL_RESPONSE_TIMEOUT @@ -170,19 +175,25 @@ async def execute( ) try: - stream_it = await self._execute_call( - query=query, - parameters=parameters, - commit_tx=True, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - concurrent_result_sets=concurrent_result_sets, - settings=settings, - ) + # PR #786: async mirror of sync session.execute propagation (vgvoleg). 
+ tok = push_otel_span_for_grpc(span) + try: + stream_it = await self._execute_call( + query=query, + parameters=parameters, + commit_tx=True, + syntax=syntax, + exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + concurrent_result_sets=concurrent_result_sets, + settings=settings, + ) + except BaseException: + pop_otel_span_for_grpc(tok) + raise return AsyncResponseContextIterator( it=stream_it, @@ -194,6 +205,7 @@ async def execute( ), on_error=self._on_execute_stream_error, span=span, + grpc_propagation_token=tok, ) except Exception as e: if span is not None: diff --git a/ydb/aio/query/transaction.py b/ydb/aio/query/transaction.py index f935641d..915f912c 100644 --- a/ydb/aio/query/transaction.py +++ b/ydb/aio/query/transaction.py @@ -12,7 +12,7 @@ BaseQueryTxContext, QueryTxStateEnum, ) -from ...opentelemetry.tracing import create_ydb_span +from ...opentelemetry.tracing import create_ydb_span, pop_otel_span_for_grpc, push_otel_span_for_grpc if TYPE_CHECKING: from .session import QuerySession @@ -208,19 +208,25 @@ async def execute( ) try: - stream_it = await self._execute_call( - query=query, - parameters=parameters, - commit_tx=commit_tx, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - concurrent_result_sets=concurrent_result_sets, - settings=settings, - ) + # PR #786: async mirror of sync transaction.execute propagation. 
+ tok = push_otel_span_for_grpc(span) + try: + stream_it = await self._execute_call( + query=query, + parameters=parameters, + commit_tx=commit_tx, + syntax=syntax, + exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + concurrent_result_sets=concurrent_result_sets, + settings=settings, + ) + except BaseException: + pop_otel_span_for_grpc(tok) + raise self._prev_stream = AsyncResponseContextIterator( it=stream_it, @@ -234,6 +240,7 @@ async def execute( ), on_error=self.session._on_execute_stream_error, span=span, + grpc_propagation_token=tok, ) return self._prev_stream except Exception as e: diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py index 1ea6d6c8..a984405f 100644 --- a/ydb/opentelemetry/__init__.py +++ b/ydb/opentelemetry/__init__.py @@ -1,8 +1,20 @@ +"""Public OpenTelemetry entrypoints for YDB (PR #786). + +``disable_tracing`` exists because ``enable_tracing`` is idempotent: reviewers asked for +either documentation or an explicit reset before changing the tracer (vgvoleg). +""" + + def enable_tracing(tracer=None): """Enable OpenTelemetry trace context propagation and span creation for all YDB gRPC calls. + This call is **idempotent**: if tracing is already enabled, later calls do nothing + (including passing a different ``tracer``). Call :func:`disable_tracing` first to + reconfigure or turn instrumentation off. + Args: - tracer: Optional OTel tracer to use. If not provided, the default tracer from the global tracer provider will be used. + tracer: Optional OTel tracer to use. If not provided, the default tracer named + ``ydb.sdk`` from the global tracer provider will be used. 
""" try: from ydb.opentelemetry._plugin import _enable_tracing @@ -15,4 +27,14 @@ def enable_tracing(tracer=None): _enable_tracing(tracer) -__all__ = ["enable_tracing"] +def disable_tracing(): + """Disable YDB OpenTelemetry hooks and allow :func:`enable_tracing` to run again.""" + try: + from ydb.opentelemetry._plugin import _disable_tracing + except ImportError: + return + + _disable_tracing() + + +__all__ = ["disable_tracing", "enable_tracing"] diff --git a/ydb/opentelemetry/_plugin.py b/ydb/opentelemetry/_plugin.py index f555ea1a..4dd4b272 100644 --- a/ydb/opentelemetry/_plugin.py +++ b/ydb/opentelemetry/_plugin.py @@ -1,11 +1,34 @@ -from opentelemetry import context, trace +"""OpenTelemetry bridge for YDB (PR #786 review follow-ups). + +Review themes addressed here: + +- **error.type vs YDB status:** map ``issues.Error.status`` to ``transport_error`` vs + ``ydb_error`` using the client transport status band (``_TRANSPORT_STATUSES``), not the + status name as ``error.type`` (review: KirillKurdyukov). + +- **No long-lived ``context.attach`` on streaming execute:** attaching for the whole + result iterator + detaching from ``__del__`` or another task caused OTel warnings + (review: vgvoleg). ``ExecuteQuery`` uses ``tracing.push_otel_span_for_grpc`` with a + token cleared in the iterator ``_finish_span``; ``TracingSpan.end()`` never detaches. + +- **Explicit ``inject`` context:** when the ContextVar is set, ``inject`` uses + ``trace.set_span_in_context(otel_span)`` instead of relying on global attach for the + stream lifetime (review: vgvoleg). + +- **Tracer / reset:** ``enable_tracing(tracer=...)`` is idempotent; ``disable_tracing()`` + clears hooks so tracing can be reconfigured (review: vgvoleg / KirillKurdyukov). 
+""" + +from opentelemetry import context as otel_context +from opentelemetry import trace from opentelemetry.propagate import inject from opentelemetry.trace import StatusCode from ydb import issues from ydb.issues import StatusCode as YdbStatusCode -from ydb.opentelemetry.tracing import _registry +from ydb.opentelemetry.tracing import _registry, get_active_grpc_otel_span +# YDB client transport StatusCode values (401xxx band) -> OTel error.type transport_error. _TRANSPORT_STATUSES = frozenset( { YdbStatusCode.CONNECTION_LOST, @@ -26,9 +49,18 @@ def _otel_metadata_hook(): - """Injects W3C Trace Context (traceparent/tracestate) into gRPC metadata.""" + """Injects W3C Trace Context (traceparent/tracestate) into gRPC metadata. + + When ``get_active_grpc_otel_span()`` is set (ExecuteQuery / with-blocks), inject + uses that span explicitly so we do not depend on OTel ``context.attach`` for the + whole stream (PR review: vgvoleg). + """ headers = {} - inject(headers) + otel_span = get_active_grpc_otel_span() + if otel_span is not None: + inject(headers, context=trace.set_span_in_context(otel_span)) + else: + inject(headers) return list(headers.items()) @@ -45,14 +77,22 @@ def _set_error_on_span(span, exception): class TracingSpan: - """Wrapper around an OTel span that manages context lifecycle. + """Wrapper around an OTel span. - Can be used as a context manager or manually + **With-blocks** (CreateSession, Commit, RunWithRetry, …): ``__enter__`` does a + *short* ``context.attach`` for the block so child spans (e.g. ``ydb.Try``) get the + correct parent; ``__exit__`` detaches and ends the span (review: vgvoleg — attach + must not outlive the block / stream). + + **ExecuteQuery** does not use this context manager: the caller holds + ``push_otel_span_for_grpc`` until the result iterator finishes (see + ``SyncResponseContextIterator``); :meth:`end` does not call ``context.detach``. 
""" - def __init__(self, span, token): + def __init__(self, span): self._span = span - self._token = token + self._grpc_propagation_token = None + self._otel_context_token = None def set_error(self, exception): _set_error_on_span(self._span, exception) @@ -62,14 +102,23 @@ def set_attribute(self, key, value): def end(self): self._span.end() - if self._token is not None: - context.detach(self._token) - self._token = None def __enter__(self): + from ydb.opentelemetry.tracing import push_otel_span_for_grpc + + ctx = trace.set_span_in_context(self._span) + self._otel_context_token = otel_context.attach(ctx) + self._grpc_propagation_token = push_otel_span_for_grpc(self) return self def __exit__(self, exc_type, exc_val, exc_tb): + from ydb.opentelemetry.tracing import pop_otel_span_for_grpc + + pop_otel_span_for_grpc(self._grpc_propagation_token) + self._grpc_propagation_token = None + if self._otel_context_token is not None: + otel_context.detach(self._otel_context_token) + self._otel_context_token = None if exc_val is not None: self.set_error(exc_val) self.end() @@ -77,15 +126,12 @@ def __exit__(self, exc_type, exc_val, exc_tb): def _create_span(name, attributes=None, kind=None): - # Can be used as a context manager or manually span = _tracer.start_span( name, kind=_KIND_MAP.get(kind, trace.SpanKind.CLIENT), attributes=attributes or {}, ) - ctx = trace.set_span_in_context(span) - token = context.attach(ctx) - return TracingSpan(span, token) + return TracingSpan(span) def _enable_tracing(tracer=None): @@ -98,3 +144,17 @@ def _enable_tracing(tracer=None): _enabled = True _registry.set_metadata_hook(_otel_metadata_hook) _registry.set_create_span(_create_span) + + +def _disable_tracing(): + """Clear hooks and tracer; after this, :func:`~ydb.opentelemetry.enable_tracing` may be called again. + + Review (vgvoleg): ``enable_tracing()`` is idempotent; callers need an explicit way + to reset hooks / pass a new tracer without reaching into private module state. 
+ """ + global _enabled, _tracer + + _registry.set_create_span(None) + _registry.set_metadata_hook(None) + _enabled = False + _tracer = None diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index a936b269..bdb89608 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -1,3 +1,49 @@ +"""OpenTelemetry helpers and registry (PR #786 review). + +- **ContextVar ``_OTEL_SPAN_FOR_GRPC``:** holds the SDK span used for W3C ``inject`` on + gRPC metadata without keeping ``context.attach`` for the whole ExecuteQuery stream + (review: vgvoleg). Bound from ``push_otel_span_for_grpc`` until the result iterator + finishes (full execute lifecycle for propagation). + +- **``create_ydb_span`` early return:** avoids building attribute dicts when tracing is + off so the path stays no-op (review: vgvoleg — docs promise zero-cost when disabled). + +- **``_split_endpoint``:** strips ``grpc://`` / ``grpcs://`` and supports ``[ipv6]:port`` + so ``server.address`` matches semantic conventions (review: vgvoleg). +""" + +import contextvars +from typing import Any, Optional, Tuple + +_OTEL_SPAN_FOR_GRPC: contextvars.ContextVar[Optional[Any]] = contextvars.ContextVar( + "_OTEL_SPAN_FOR_GRPC", + default=None, +) + + +def get_active_grpc_otel_span(): + """OpenTelemetry SDK span used for W3C inject on the next gRPC metadata build, if any.""" + return _OTEL_SPAN_FOR_GRPC.get() + + +def push_otel_span_for_grpc(wrapped_span) -> Optional[contextvars.Token]: + """Bind ``wrapped_span`` (TracingSpan from _plugin) for the next gRPC metadata build. + + Returns a token for :func:`pop_otel_span_for_grpc`, or ``None`` if nothing was pushed. 
+ """ + if wrapped_span is None: + return None + raw = getattr(wrapped_span, "_span", None) + if raw is None: + return None + return _OTEL_SPAN_FOR_GRPC.set(raw) + + +def pop_otel_span_for_grpc(token: Optional[contextvars.Token]) -> None: + if token is not None: + _OTEL_SPAN_FOR_GRPC.reset(token) + + class _NoopSpan: """Returned by create_ydb_span when tracing is disabled.""" @@ -23,13 +69,16 @@ def __exit__(self, exc_type, exc_val, exc_tb): class OtelTracingRegistry: """Singleton registry for OpenTelemetry tracing. - By default everything is no-op until :func:`enable_tracing` is called. + By default everything is no-op until :func:`~ydb.opentelemetry.enable_tracing` is called. """ def __init__(self): self._metadata_hook = None self._create_span_func = None + def is_active(self) -> bool: + return self._create_span_func is not None + def create_span(self, name, attributes=None, kind=None): """Create a span. Returns a TracingSpan or _NoopSpan.""" if self._create_span_func is None: @@ -57,10 +106,23 @@ def get_trace_metadata(): return _registry.get_trace_metadata() -def _split_endpoint(endpoint): - endpoint = endpoint or "" - host, _, port = endpoint.rpartition(":") - return host, int(port) if port.isdigit() else 0 +def _split_endpoint(endpoint: Optional[str]) -> Tuple[str, int]: + """Split ``host:port`` for OTel ``server.*`` attributes (no ``grpc://`` prefix; IPv6-safe).""" + ep = endpoint or "" + if ep.startswith("grpcs://"): + ep = ep[len("grpcs://") :] + elif ep.startswith("grpc://"): + ep = ep[len("grpc://") :] + + if ep.startswith("["): + close = ep.find("]") + if close != -1 and len(ep) > close + 1 and ep[close + 1] == ":": + host = ep[: close + 1] + port_s = ep[close + 2 :] + return host, int(port_s) if port_s.isdigit() else 0 + + host, _, port_s = ep.rpartition(":") + return host, int(port_s) if port_s.isdigit() else 0 def _build_ydb_attrs(driver_config, node_id=None, peer=None): @@ -91,6 +153,9 @@ def create_ydb_span(name, driver_config, node_id=None, 
kind=None, peer=None): map for the specific node serving the call; missing fields are skipped. Can be used as a context manager or manually. """ + # Review (vgvoleg): skip _build_ydb_attrs when hooks are unset (zero-cost when disabled). + if not _registry.is_active(): + return _NOOP_SPAN attrs = _build_ydb_attrs(driver_config, node_id, peer) return _registry.create_span(name, attributes=attrs, kind=kind) diff --git a/ydb/query/base.py b/ydb/query/base.py index 1aeb4f6b..f69445c5 100644 --- a/ydb/query/base.py +++ b/ydb/query/base.py @@ -72,10 +72,18 @@ class QueryResultSetFormat(enum.IntEnum): class SyncResponseContextIterator(_utilities.SyncResponseIterator): - def __init__(self, it, wrapper, on_error=None, span=None): + """Streams ExecuteQuery results; ends the OTel span when the stream is fully consumed. + + ``grpc_propagation_token`` (PR #786): keeps W3C inject bound for the *entire* execute + (from first gRPC metadata until this iterator finishes), without a long-lived OTel + ``context.attach`` on the span (review: vgvoleg + execute lifecycle expectation). + """ + + def __init__(self, it, wrapper, on_error=None, span=None, grpc_propagation_token=None): super().__init__(it, wrapper) self._on_error = on_error self._span = span + self._grpc_propagation_token = grpc_propagation_token def __enter__(self) -> "SyncResponseContextIterator": return self @@ -93,6 +101,12 @@ def _next(self): raise e def _finish_span(self, exception=None): + # Pop gRPC propagation before ending span so metadata hooks do not outlive the span. 
+ if self._grpc_propagation_token is not None: + from ydb.opentelemetry.tracing import pop_otel_span_for_grpc + + pop_otel_span_for_grpc(self._grpc_propagation_token) + self._grpc_propagation_token = None if self._span is not None: if exception is not None: self._span.set_error(exception) diff --git a/ydb/query/session.py b/ydb/query/session.py index a7a32c40..d17ad4de 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -18,7 +18,12 @@ from .base import QueryExplainResultFormat from .. import _apis, issues, _utilities -from ..opentelemetry.tracing import create_ydb_span, set_peer_attributes +from ..opentelemetry.tracing import ( + create_ydb_span, + pop_otel_span_for_grpc, + push_otel_span_for_grpc, + set_peer_attributes, +) from ..settings import BaseRequestSettings from ..connection import _RpcState as RpcState, EndpointKey from .._grpc.grpcwrapper import common_utils @@ -472,19 +477,26 @@ def execute( ) try: - stream_it = self._execute_call( - query=query, - parameters=parameters, - commit_tx=True, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - concurrent_result_sets=concurrent_result_sets, - settings=settings, - ) + # PR #786: push before _execute_call; token lives on SyncResponseContextIterator until + # the stream is fully read so W3C inject matches the whole ExecuteQuery (vgvoleg). 
+ tok = push_otel_span_for_grpc(span) + try: + stream_it = self._execute_call( + query=query, + parameters=parameters, + commit_tx=True, + syntax=syntax, + exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + concurrent_result_sets=concurrent_result_sets, + settings=settings, + ) + except BaseException: + pop_otel_span_for_grpc(tok) + raise return base.SyncResponseContextIterator( stream_it, @@ -496,6 +508,7 @@ def execute( ), on_error=self._on_execute_stream_error, span=span, + grpc_propagation_token=tok, ) except Exception as e: if span is not None: diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index b1134c0e..782a1ccf 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -17,7 +17,7 @@ _apis, issues, ) -from ..opentelemetry.tracing import create_ydb_span +from ..opentelemetry.tracing import create_ydb_span, pop_otel_span_for_grpc, push_otel_span_for_grpc from .._grpc.grpcwrapper import ydb_topic as _ydb_topic from .._grpc.grpcwrapper import ydb_query as _ydb_query from ..connection import _RpcState as RpcState @@ -659,19 +659,25 @@ def execute( ) try: - stream_it = self._execute_call( - query=query, - commit_tx=commit_tx, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - parameters=parameters, - concurrent_result_sets=concurrent_result_sets, - settings=settings, - ) + # PR #786: same propagation contract as QuerySession.execute (see session.py). 
+ tok = push_otel_span_for_grpc(span) + try: + stream_it = self._execute_call( + query=query, + commit_tx=commit_tx, + syntax=syntax, + exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + parameters=parameters, + concurrent_result_sets=concurrent_result_sets, + settings=settings, + ) + except BaseException: + pop_otel_span_for_grpc(tok) + raise self._prev_stream = base.SyncResponseContextIterator( stream_it, @@ -685,6 +691,7 @@ def execute( ), on_error=self.session._on_execute_stream_error, span=span, + grpc_propagation_token=tok, ) return self._prev_stream except Exception as e: diff --git a/ydb/retries.py b/ydb/retries.py index bd4fc4ad..1b9ce309 100644 --- a/ydb/retries.py +++ b/ydb/retries.py @@ -3,7 +3,7 @@ import inspect import random import time -from typing import Any, Callable, Generator, Optional, Union +from typing import Any, Awaitable, Callable, Generator, Optional, Union from . import issues from ._errors import check_retriable_error @@ -15,8 +15,16 @@ _BACKOFF_ATTR = "ydb.retry.backoff_ms" -def _start_try_span(backoff_ms: int): - return _tracing_registry.create_span(_TRY_SPAN, attributes={_BACKOFF_ATTR: backoff_ms}, kind="internal") +def _start_run_with_retry_span(): + return _tracing_registry.create_span(_RUN_WITH_RETRY_SPAN, kind="internal") + + +def _start_try_span(backoff_ms: Optional[int]): + # ``backoff_ms is None`` for the first attempt — the attribute is omitted because + # there was no preceding sleep at all. For every subsequent attempt the attribute + # is set, including ``0`` on the skip-yield retry path (Aborted/BadSession/...). 
+ attrs = {_BACKOFF_ATTR: backoff_ms} if backoff_ms is not None else None + return _tracing_registry.create_span(_TRY_SPAN, attributes=attrs, kind="internal") class BackoffSettings: @@ -30,12 +38,16 @@ def __init__( self.slot_duration = slot_duration self.uncertain_ratio = uncertain_ratio - def calc_timeout(self, retry_number: int) -> float: + def calc_backoff_ms(self, retry_number: int) -> int: slots_count = 1 << min(retry_number, self.ceiling) max_duration_ms = slots_count * self.slot_duration * 1000.0 - # duration_ms = random.random() * max_duration_ms * uncertain_ratio) + max_duration_ms * (1 - uncertain_ratio) + # duration_ms = random.random() * max_duration_ms * uncertain_ratio + max_duration_ms * (1 - uncertain_ratio) duration_ms = max_duration_ms * (random.random() * self.uncertain_ratio + 1.0 - self.uncertain_ratio) - return duration_ms / 1000.0 + return int(duration_ms) + + def calc_timeout(self, retry_number: int) -> float: + """Backward-compatible alias returning seconds.""" + return self.calc_backoff_ms(retry_number) / 1000.0 class RetrySettings: @@ -82,10 +94,23 @@ def with_slow_backoff(self, backoff_settings: BackoffSettings) -> "RetrySettings class YdbRetryOperationSleepOpt: + """Yielded by :func:`retry_operation_impl` between attempts. + + ``timeout`` is the wait in seconds (``time.sleep`` / ``asyncio.sleep``); for the + "skip yield" YDB error path (``Aborted``/``BadSession``/``NotFound``/``InternalError``) + it is ``0.0`` and ``exception`` is set so consumers still emit one artefact per + attempt (e.g. a ``ydb.Try`` span). ``backoff_ms`` exposes the same value as integer + milliseconds for OpenTelemetry attributes. 
+ """ + def __init__(self, timeout: float, exception: Optional[BaseException] = None) -> None: self.timeout = timeout self.exception = exception + @property + def backoff_ms(self) -> int: + return int(self.timeout * 1000) + def __eq__(self, other: object) -> bool: return ( type(self) is type(other) and isinstance(other, YdbRetryOperationSleepOpt) and self.timeout == other.timeout @@ -121,6 +146,14 @@ def retry_operation_impl( *args: Any, **kwargs: Any, ) -> Generator[Union[YdbRetryOperationSleepOpt, YdbRetryOperationFinalResult], None, None]: + """Pure retry-policy generator. + + Yields ``YdbRetryOperationFinalResult`` (callee's return value, or coroutine for an + async callee) and, between attempts, ``YdbRetryOperationSleepOpt`` carrying the wait + time in seconds plus the original exception. OpenTelemetry spans are created by the + callers (``retry_operation_sync`` / ``retry_operation_async``), not here, so the + generator stays unaware of tracing. + """ retry_settings = RetrySettings() if retry_settings is None else retry_settings status: Optional[issues.Error] = None @@ -140,23 +173,23 @@ def retry_operation_impl( if not retriable_info.is_retriable: raise - skip_yield_error_types = [ + skip_yield_error_types = ( issues.Aborted, issues.BadSession, issues.NotFound, issues.InternalError, - ] - - yield_sleep = True - for t in skip_yield_error_types: - if isinstance(e, t): - yield_sleep = False + ) - if yield_sleep: - yield YdbRetryOperationSleepOpt(retriable_info.sleep_timeout_seconds, exception=e) + if isinstance(e, skip_yield_error_types): + # Fast retry path: no inter-attempt sleep, but we still yield a marker + # SleepOpt(0.0) so consumers (e.g. the sync wrapper) advance per-attempt + # bookkeeping such as ``ydb.Try`` spans. 
+ yield YdbRetryOperationSleepOpt(0.0, exception=e) + else: + sleep_seconds = retriable_info.sleep_timeout_seconds or 0.0 + yield YdbRetryOperationSleepOpt(sleep_seconds, exception=e) except Exception as e: - # you should provide your own handler you want retry_settings.unknown_error_handler(e) raise @@ -170,29 +203,39 @@ def retry_operation_sync( *args: Any, **kwargs: Any, ) -> Any: - with _tracing_registry.create_span(_RUN_WITH_RETRY_SPAN, kind="internal"): - opt_generator = retry_operation_impl(callee, retry_settings, *args, **kwargs) - try_span = _start_try_span(0) - try: - for next_opt in opt_generator: - if isinstance(next_opt, YdbRetryOperationSleepOpt): - exc = next_opt.exception - if exc is not None: - try_span.set_error(exc) - try_span.end() - try_span = _start_try_span(int(next_opt.timeout * 1000)) + """Drive :func:`retry_operation_impl` synchronously with OpenTelemetry spans. + + ``ydb.RunWithRetry`` is the outer ``INTERNAL`` span; each attempt runs inside a + ``ydb.Try`` whose ``ydb.retry.backoff_ms`` is the wait that preceded it. The first + ``ydb.Try`` has no such wait so the attribute is omitted; subsequent attempts + always carry it (``0`` on the skip-yield retry path). RPC spans + (``ydb.ExecuteQuery``/``ydb.Commit``/``ydb.Rollback``) nest under the active + ``ydb.Try`` because the sync callee runs while ``TracingSpan.__enter__`` has the + OTel context attached. + """ + backoff_ms: Optional[int] = None + + if inspect.iscoroutinefunction(callee): + # Async callee with a sync driver: keep current legacy behaviour — the impl just + # creates the coroutine, the caller is responsible for awaiting it. No ``ydb.Try`` + # is opened around the bare coroutine creation; tracing for that case lives in + # ``retry_operation_async``. 
+ traced_callee: Callable[..., Any] = callee + else: + + @functools.wraps(callee) + def traced_callee(*a: Any, **kw: Any) -> Any: + with _start_try_span(backoff_ms): + return callee(*a, **kw) + + with _start_run_with_retry_span(): + for next_opt in retry_operation_impl(traced_callee, retry_settings, *args, **kwargs): + if isinstance(next_opt, YdbRetryOperationSleepOpt): + backoff_ms = next_opt.backoff_ms + if next_opt.timeout > 0: time.sleep(next_opt.timeout) - else: - try_span.end() - try_span = None - return next_opt.result - except BaseException as e: - if try_span is not None: - try_span.set_error(e) - try_span.end() - raise - if try_span is not None: - try_span.end() + else: + return next_opt.result return None @@ -202,45 +245,31 @@ async def retry_operation_async( # pylint: disable=W1113 *args: Any, **kwargs: Any, ) -> Any: - """ - The retry operation helper can be used to retry a coroutine that raises YDB specific - exceptions. - - :param callee: A coroutine to retry. - :param retry_settings: An instance of ydb.RetrySettings that describes how the coroutine - should be retried. If None, default instance of retry settings will be used. - :param args: A tuple with positional arguments to be passed into the coroutine. - :param kwargs: A dictionary with keyword arguments to be passed into the coroutine. + """Drive :func:`retry_operation_impl` asynchronously with OpenTelemetry spans. - Returns awaitable result of coroutine. If retries are not succussful exception is raised. + Mirrors :func:`retry_operation_sync`. The inter-attempt ``await asyncio.sleep`` runs + *outside* ``ydb.Try`` so an `asyncio.CancelledError` during the wait is recorded on + ``ydb.RunWithRetry`` (the outer span), not on a misleading per-attempt span. 
""" - with _tracing_registry.create_span(_RUN_WITH_RETRY_SPAN, kind="internal"): - opt_generator = retry_operation_impl(callee, retry_settings, *args, **kwargs) - try_span = _start_try_span(0) - try: - for next_opt in opt_generator: - if isinstance(next_opt, YdbRetryOperationSleepOpt): - exc = next_opt.exception - if exc is not None: - try_span.set_error(exc) - try_span.end() - try_span = _start_try_span(int(next_opt.timeout * 1000)) + backoff_ms: Optional[int] = None + with _start_run_with_retry_span(): + for next_opt in retry_operation_impl(callee, retry_settings, *args, **kwargs): + if isinstance(next_opt, YdbRetryOperationSleepOpt): + backoff_ms = next_opt.backoff_ms + if next_opt.timeout > 0: await asyncio.sleep(next_opt.timeout) - else: + else: + with _start_try_span(backoff_ms) as try_span: + awaitable: Awaitable[Any] = next_opt.result try: - result = await next_opt.result - try_span.end() - try_span = None - return result - except BaseException as e: # pylint: disable=W0703 + return await awaitable + except BaseException as e: # noqa: BLE001 + # Exception is swallowed by ``next_opt.set_exception`` so the + # impl re-raises it on the next ``next()`` call; the ``with`` + # would not see it via ``__exit__``, so mark ``ydb.Try`` failed + # explicitly. 
+ try_span.set_error(e) next_opt.set_exception(e) - except BaseException as e: - if try_span is not None: - try_span.set_error(e) - try_span.end() - raise - if try_span is not None: - try_span.end() return None @@ -294,12 +323,11 @@ async def async_wrapper(*args: Any, **kwargs: Any) -> Any: return await retry_operation_async(func, retry_settings, *args, **kwargs) return async_wrapper - else: - @functools.wraps(func) - def sync_wrapper(*args: Any, **kwargs: Any) -> Any: - return retry_operation_sync(func, retry_settings, *args, **kwargs) + @functools.wraps(func) + def sync_wrapper(*args: Any, **kwargs: Any) -> Any: + return retry_operation_sync(func, retry_settings, *args, **kwargs) - return sync_wrapper + return sync_wrapper return decorator diff --git a/ydb/table_test.py b/ydb/table_test.py index b365fda1..fca8815e 100644 --- a/ydb/table_test.py +++ b/ydb/table_test.py @@ -80,7 +80,13 @@ def check_retriable_error(err_type, backoff): YdbRetryOperationSleepOpt(backoff.calc_timeout(1)), ] == yields else: - assert [] == yields + # PR #786: retry_operation_impl now yields SleepOpt(0, exc) for these types so + # ``retry_operation_sync`` matches async behaviour (one ``ydb.Try`` per attempt). 
+ assert len(yields) == 2 + assert all( + isinstance(y, YdbRetryOperationSleepOpt) and y.timeout == 0.0 and y.exception is not None + for y in yields + ) assert exc == err_type("test2") From c66205a41f0712e6cec42a3f325e28f19d40b734 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Fri, 1 May 2026 09:39:18 +0300 Subject: [PATCH 16/36] fix issue --- docs/opentelemetry.rst | 11 +-- examples/opentelemetry/README.md | 8 --- examples/opentelemetry/compose-e2e.yaml | 80 --------------------- examples/opentelemetry/ydb_config/README.md | 4 +- 4 files changed, 4 insertions(+), 99 deletions(-) delete mode 100644 examples/opentelemetry/compose-e2e.yaml diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index 4596dce9..0bcf73c6 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -230,13 +230,13 @@ Docker (YDB or the full stack) first**, then install and run on the host — see .. code-block:: sh - cd examples/opentelemetry && docker compose -f docker-compose.otel.yml up + cd examples/opentelemetry && docker compose -f docker-compose.otel.yaml up **Alternative** (same file, from the repository root): .. code-block:: sh - docker compose -f examples/opentelemetry/docker-compose.otel.yml up + docker compose -f examples/opentelemetry/docker-compose.otel.yaml up **Typical local run** (YDB in Docker, script on the host — Compose **before** ``pip`` / ``python``): @@ -245,13 +245,6 @@ Docker (YDB or the full stack) first**, then install and run on the host — see docker compose up -d pip install -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt python examples/opentelemetry/otel_example.py - -**Stack from** ``examples/opentelemetry/`` **only** (then install and run from repo root as above): - -.. code-block:: sh - - cd examples/opentelemetry - docker compose -f compose-e2e.yaml up -d cd ../.. 
pip install -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt python examples/opentelemetry/otel_example.py diff --git a/examples/opentelemetry/README.md b/examples/opentelemetry/README.md index bc246a29..3cc371a3 100644 --- a/examples/opentelemetry/README.md +++ b/examples/opentelemetry/README.md @@ -35,14 +35,6 @@ Grafana: http://localhost:3000 **Logs for `otel-example`:** the container name is prefixed (e.g. `opentelemetry-otel-example-1`); use `docker compose -f docker-compose.otel.yaml ps` or `docker ps -a` to find it. The service is one-shot (`restart: "no"`) — it may already have exited. -**Only configs from this folder** (same idea, from `examples/opentelemetry`): - -```sh -cd /path/to/ydb-python-sdk/examples/opentelemetry -docker compose -f compose-e2e.yaml up -d -cd ../.. -``` - ## 2. Install dependencies (on the host, for a local `python` run) **From the repository root** (editable SDK + pins from this example): diff --git a/examples/opentelemetry/compose-e2e.yaml b/examples/opentelemetry/compose-e2e.yaml deleted file mode 100644 index d6ee604d..00000000 --- a/examples/opentelemetry/compose-e2e.yaml +++ /dev/null @@ -1,80 +0,0 @@ -version: "3.3" -services: - ydb: - image: ydbplatform/local-ydb:trunk - restart: always - hostname: localhost - platform: linux/amd64 - environment: - YDB_DEFAULT_LOG_LEVEL: NOTICE - GRPC_TLS_PORT: "2135" - GRPC_PORT: "2136" - MON_PORT: "8765" - YDB_USE_IN_MEMORY_PDISKS: "true" - command: [ "--config-path", "/ydb_config/ydb-config-with-tracing.yaml" ] - ports: - - "2135:2135" - - "2136:2136" - - "8765:8765" - volumes: - - ./ydb_config:/ydb_config:ro - - otel-collector: - image: otel/opentelemetry-collector-contrib:latest - command: [ "--config=/etc/otelcol/config.yaml" ] - volumes: - - ./otel-collector-config.yaml:/etc/otelcol/config.yaml:ro - ports: - - "4317:4317" - - "4318:4318" - - "9464:9464" - - "13133:13133" - - "13317:55679" - - prometheus: - image: prom/prometheus:latest - volumes: - - 
./prometheus.yaml:/etc/prometheus/prometheus.yml:ro - ports: - - "9090:9090" - depends_on: [ otel-collector ] - - tempo: - image: grafana/tempo:2.4.1 - command: [ "-config.file=/etc/tempo.yaml" ] - volumes: - - ./tempo.yaml:/etc/tempo.yaml:ro - ports: - - "3200:3200" - depends_on: [ otel-collector ] - - grafana: - image: grafana/grafana:10.4.2 - environment: - GF_AUTH_ANONYMOUS_ENABLED: "true" - GF_AUTH_ANONYMOUS_ORG_ROLE: "Admin" - volumes: - - ./grafana/provisioning:/etc/grafana/provisioning:ro - - ./grafana/dashboards:/var/lib/grafana/dashboards:ro - ports: - - "3000:3000" - depends_on: [ prometheus, tempo ] - - otel-example: - image: python:3.11-slim - working_dir: /workspace - volumes: - - ../..:/workspace - environment: - YDB_ENDPOINT: grpc://ydb:2136 - YDB_DATABASE: /local - OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 - OTEL_SERVICE_NAME: ydb-otel-example - depends_on: - - ydb - - otel-collector - restart: "no" - command: > - bash -c "set -euo pipefail - pip install --no-cache-dir -e '.[opentelemetry]' -r requirements.txt - python otel_example.py" diff --git a/examples/opentelemetry/ydb_config/README.md b/examples/opentelemetry/ydb_config/README.md index cbffaaba..70c8c5a7 100644 --- a/examples/opentelemetry/ydb_config/README.md +++ b/examples/opentelemetry/ydb_config/README.md @@ -19,10 +19,10 @@ Default service name (so you can find it in Tempo/Grafana): `ydb` ## 3) Run with the overridden config -Restart YDB (the main `compose-e2e.yaml` will automatically use `--config-path` if `ydb-config.yaml` exists): +Restart YDB (the main `docker-compose.otel.yaml` will automatically use `--config-path` if `ydb-config.yaml` exists): ```bash -docker-compose -f compose-e2e.yaml up -d --force-recreate ydb +docker compose -f docker-compose.otel.yaml up -d --force-recreate ydb ``` Now you should see additional server-side traces in Tempo/Grafana (service name defaults to `ydb-local` in the snippet). 
From bce0e024d8fb096dfbe32188a1ce7ae97284ab68 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Fri, 1 May 2026 09:59:38 +0300 Subject: [PATCH 17/36] fix issue --- examples/opentelemetry/otel_example.py | 20 +++++++++----------- ydb/aio/driver.py | 2 +- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index 45b8fa77..819ef476 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -17,14 +17,14 @@ if str(_repo_root) not in sys.path: sys.path.insert(0, str(_repo_root)) -import ydb -from ydb import _utilities as _yutil -from opentelemetry import trace -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from ydb.opentelemetry import enable_tracing +import ydb # noqa: E402 +from ydb import _utilities as _yutil # noqa: E402 +from opentelemetry import trace # noqa: E402 +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter # noqa: E402 +from opentelemetry.sdk.resources import Resource # noqa: E402 +from opentelemetry.sdk.trace import TracerProvider # noqa: E402 +from opentelemetry.sdk.trace.export import BatchSpanProcessor # noqa: E402 +from ydb.opentelemetry import enable_tracing # noqa: E402 def _env(name: str, default: str) -> str: @@ -98,9 +98,7 @@ async def main() -> None: startup.set_attribute("app.message", "hello") await pool.execute_with_retries("DROP TABLE IF EXISTS bank") - await pool.execute_with_retries( - "CREATE TABLE bank (id Int32, amount Int32, PRIMARY KEY (id))" - ) + await pool.execute_with_retries("CREATE TABLE bank (id Int32, amount Int32, PRIMARY KEY (id))") print("Insert row...") await pool.execute_with_retries("INSERT INTO bank (id, amount) VALUES (1, 0)") diff --git 
a/ydb/aio/driver.py b/ydb/aio/driver.py index 241f5b00..88221e94 100644 --- a/ydb/aio/driver.py +++ b/ydb/aio/driver.py @@ -69,7 +69,7 @@ def __init__( root_certificates, credentials, config_class=DriverConfig, - **kwargs + **kwargs, ) super(Driver, self).__init__(config) From 60b9a58eb341fc1da7b89f16cc58d579c0595d30 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Fri, 1 May 2026 10:11:12 +0300 Subject: [PATCH 18/36] fix issue --- examples/opentelemetry/docker-compose.otel.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/opentelemetry/docker-compose.otel.yaml b/examples/opentelemetry/docker-compose.otel.yaml index d7cb934c..46125727 100644 --- a/examples/opentelemetry/docker-compose.otel.yaml +++ b/examples/opentelemetry/docker-compose.otel.yaml @@ -7,8 +7,6 @@ # Or from the repository root: # docker compose -f examples/opentelemetry/docker-compose.otel.yaml up -version: "3.3" - services: ydb: image: ydbplatform/local-ydb:trunk From 31b2cf206586eeb9b50936926191573cc014890d Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sat, 2 May 2026 09:21:20 +0300 Subject: [PATCH 19/36] fix issue --- .dockerignore | 4 +++- docs/opentelemetry.rst | 10 +++------ examples/opentelemetry/Dockerfile | 21 +++++++++++++++++++ examples/opentelemetry/README.md | 16 +++++++------- .../opentelemetry/docker-compose.otel.yaml | 13 ++++-------- 5 files changed, 40 insertions(+), 24 deletions(-) create mode 100644 examples/opentelemetry/Dockerfile diff --git a/.dockerignore b/.dockerignore index 90fe8e80..90dc0607 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,4 +4,6 @@ !README.md !requirements.txt !pyproject.toml -!setup.py \ No newline at end of file +!setup.py +!examples/opentelemetry/otel_example.py +!examples/opentelemetry/requirements.txt \ No newline at end of file diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index 0bcf73c6..38bcca50 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -226,17 +226,13 @@ Serializable 
transactions and ``app_startup`` / ``example_tli`` application span Docker (YDB or the full stack) first**, then install and run on the host — see ``examples/opentelemetry/README.md`` for the full order of commands and environment variables. -**Full stack in one command** (YDB + OTLP + Tempo + Grafana; includes a one-shot ``otel-example`` container; compose file in ``examples/opentelemetry/``): +**Full stack in one command** (YDB + OTLP + Tempo + Grafana; the ``otel-example`` service is built from ``examples/opentelemetry/Dockerfile`` and runs the script once): .. code-block:: sh - cd examples/opentelemetry && docker compose -f docker-compose.otel.yaml up + docker compose -f examples/opentelemetry/docker-compose.otel.yaml up --build -**Alternative** (same file, from the repository root): - -.. code-block:: sh - - docker compose -f examples/opentelemetry/docker-compose.otel.yaml up +The first run builds the ``otel-example`` image from the local SDK source; subsequent runs reuse the cached image. Pass ``--build`` again if you change the SDK or the demo script. **Typical local run** (YDB in Docker, script on the host — Compose **before** ``pip`` / ``python``): diff --git a/examples/opentelemetry/Dockerfile b/examples/opentelemetry/Dockerfile new file mode 100644 index 00000000..c7b25acb --- /dev/null +++ b/examples/opentelemetry/Dockerfile @@ -0,0 +1,21 @@ +# Isolated image for the OpenTelemetry demo. Build context is the repository root. +# +# docker compose -f examples/opentelemetry/docker-compose.otel.yaml build otel-example +# +# A separate ``.dockerignore`` at the repo root keeps the context small. + +FROM python:3.11-slim + +WORKDIR /app + +# Dependency layer: copy only what setup.py needs so changes to the demo script do +# not bust the cached pip install. 
+COPY setup.py pyproject.toml README.md requirements.txt ./ +COPY ydb ./ydb +COPY examples/opentelemetry/requirements.txt ./examples/opentelemetry/requirements.txt +RUN pip install --no-cache-dir -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt + +# Demo script. +COPY examples/opentelemetry/otel_example.py ./examples/opentelemetry/otel_example.py + +CMD ["python", "examples/opentelemetry/otel_example.py"] diff --git a/examples/opentelemetry/README.md b/examples/opentelemetry/README.md index 3cc371a3..b131f54a 100644 --- a/examples/opentelemetry/README.md +++ b/examples/opentelemetry/README.md @@ -17,23 +17,25 @@ docker compose up -d # wait until the ydb container is healthy / port 2136 is open, then continue ``` -**Full stack** (YDB + OTLP collector + Tempo + Grafana; the `otel-example` service also runs the script once inside Compose). The compose file is `docker-compose.otel.yaml` next to this README. +**Full stack** (YDB + OTLP collector + Tempo + Grafana; the `otel-example` service is built from a `Dockerfile` and runs the script once inside Compose). The compose file is `docker-compose.otel.yaml` next to this README. ```sh -cd /path/to/ydb-python-sdk/examples/opentelemetry -docker compose -f docker-compose.otel.yaml up +cd /path/to/ydb-python-sdk +docker compose -f examples/opentelemetry/docker-compose.otel.yaml up --build ``` -From the repository root you can use the same file with: +From this folder the build context is still resolved correctly (it is `../..` relative to the compose file): ```sh -cd /path/to/ydb-python-sdk -docker compose -f examples/opentelemetry/docker-compose.otel.yaml up +cd /path/to/ydb-python-sdk/examples/opentelemetry +docker compose -f docker-compose.otel.yaml up --build ``` +The first run builds the `otel-example` image from the local SDK source (`Dockerfile` in this folder, `.dockerignore` at the repo root keeps the context small). 
Subsequent runs reuse the cached image; pass `--build` if you change the SDK or the demo script. + Grafana: http://localhost:3000 -**Logs for `otel-example`:** the container name is prefixed (e.g. `opentelemetry-otel-example-1`); use `docker compose -f docker-compose.otel.yaml ps` or `docker ps -a` to find it. The service is one-shot (`restart: "no"`) — it may already have exited. +**Logs for `otel-example`:** the container name is prefixed (e.g. `opentelemetry-otel-example-1`); use `docker compose -f examples/opentelemetry/docker-compose.otel.yaml ps` or `docker ps -a` to find it. The service is one-shot (`restart: "no"`) — it may already have exited. ## 2. Install dependencies (on the host, for a local `python` run) diff --git a/examples/opentelemetry/docker-compose.otel.yaml b/examples/opentelemetry/docker-compose.otel.yaml index 46125727..023ae3fd 100644 --- a/examples/opentelemetry/docker-compose.otel.yaml +++ b/examples/opentelemetry/docker-compose.otel.yaml @@ -68,11 +68,10 @@ services: depends_on: [prometheus, tempo] otel-example: - image: python:3.11-slim - working_dir: /workspace - volumes: - # repository root (../../ from this file) for editable `pip install -e .[opentelemetry]` - - ../../:/workspace + # Built from the local SDK source (no volume mount, no relative-path tricks). + build: + context: ../.. 
+ dockerfile: examples/opentelemetry/Dockerfile environment: YDB_ENDPOINT: grpc://ydb:2136 YDB_DATABASE: /local @@ -82,7 +81,3 @@ services: - ydb - otel-collector restart: "no" - command: > - bash -c "set -euo pipefail - pip install --no-cache-dir -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt - python examples/opentelemetry/otel_example.py" From 763052a6c27762c970de64b319ffa060691ffa23 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sat, 2 May 2026 09:33:07 +0300 Subject: [PATCH 20/36] fix issue --- docs/opentelemetry.rst | 10 ++++--- tests/tracing/test_tracing_sync.py | 44 ++++++++++++++++++++++++++++++ ydb/aio/query/base.py | 7 ++++- ydb/aio/query/session.py | 2 ++ ydb/aio/query/transaction.py | 2 ++ ydb/query/base.py | 11 +++++++- ydb/query/session.py | 7 +++++ ydb/query/transaction.py | 5 ++++ 8 files changed, 82 insertions(+), 6 deletions(-) diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index 38bcca50..e91e629f 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -105,7 +105,12 @@ The following operations produce spans: - Umbrella span wrapping the whole retryable block (``retry_operation_*`` / ``retry_tx_*`` / ``execute_with_retries``). * - ``ydb.Try`` - INTERNAL - - A single retry attempt. Carries ``ydb.retry.backoff_ms`` — how long the retrier slept before starting this attempt (``0`` for the first one). + - A single retry attempt. From the **second** attempt onward carries + ``ydb.retry.backoff_ms`` — how long the retrier slept before starting this + attempt (``0`` on the skip-yield retry path: ``Aborted`` / ``BadSession`` / + ``NotFound`` / ``InternalError``, where the protocol prescribes immediate + retry without backoff). The very first ``ydb.Try`` omits the attribute + entirely because nothing preceded it. 
All spans are nested under the currently active span, so wrapping your application logic in a parent span produces a complete trace tree: @@ -241,8 +246,5 @@ The first run builds the ``otel-example`` image from the local SDK source; subse docker compose up -d pip install -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt python examples/opentelemetry/otel_example.py - cd ../.. - pip install -e '.[opentelemetry]' -r examples/opentelemetry/requirements.txt - python examples/opentelemetry/otel_example.py Open `http://localhost:3000 `_ (Grafana) to explore traces via Tempo. diff --git a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py index 636ec63a..10acbebb 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -413,6 +413,50 @@ def flaky(): assert tries[1].status.status_code == StatusCode.ERROR assert tries[2].status.status_code == StatusCode.UNSET + def test_backoff_ms_attribute_matches_actual_sleep(self, otel_setup, monkeypatch): + """Pin the closure: ``ydb.retry.backoff_ms`` on the n-th ``ydb.Try`` equals + the sleep that preceded it, regardless of which retry attempt triggered it. + + Both ``random.random`` and ``time.sleep`` are mocked so the math is fully + deterministic and the test does not actually wait. 
With + ``ceiling=0, slot_duration=0.1, uncertain_ratio=0.5`` and ``random()=0.5``:: + + slots_count = 1 + max_duration = 1 * 0.1 * 1000 = 100 ms + duration = 100 * (0.5*0.5 + 0.5) = 75 ms + """ + from ydb import issues + from ydb.retries import retry_operation_sync, RetrySettings, BackoffSettings + + monkeypatch.setattr("random.random", lambda: 0.5) + sleeps = [] + monkeypatch.setattr("time.sleep", sleeps.append) + + exporter = otel_setup + counter = {"n": 0} + + def flaky(): + counter["n"] += 1 + if counter["n"] < 3: + raise issues.Unavailable("transient") + return "ok" + + settings = RetrySettings( + max_retries=5, + fast_backoff_settings=BackoffSettings(ceiling=0, slot_duration=0.1, uncertain_ratio=0.5), + slow_backoff_settings=BackoffSettings(ceiling=0, slot_duration=0.1, uncertain_ratio=0.5), + ) + assert retry_operation_sync(flaky, settings) == "ok" + + expected_ms = 75 + + tries = _get_spans(exporter, "ydb.Try") + assert len(tries) == 3 + assert "ydb.retry.backoff_ms" not in dict(tries[0].attributes) + assert dict(tries[1].attributes)["ydb.retry.backoff_ms"] == expected_ms + assert dict(tries[2].attributes)["ydb.retry.backoff_ms"] == expected_ms + assert sleeps == [expected_ms / 1000.0, expected_ms / 1000.0] + def test_skip_backoff_errors_still_emit_one_try_per_attempt(self, otel_setup): """Aborted/BadSession path yields zero sleep but must rotate ydb.Try spans (sync loop).""" from ydb import issues diff --git a/ydb/aio/query/base.py b/ydb/aio/query/base.py index 992005df..7324bce7 100644 --- a/ydb/aio/query/base.py +++ b/ydb/aio/query/base.py @@ -47,7 +47,12 @@ def _finish_span(self, exception=None): self._span = None def __del__(self): - self._finish_span() + # See sync iterator: GC may run in a different ContextVar context, where + # ``reset(token)`` would raise ValueError. End the span only. 
+ if self._span is not None: + self._span.end() + self._span = None + self._grpc_propagation_token = None async def __aexit__(self, exc_type, exc_val, exc_tb): # To close stream on YDB it is necessary to scroll through it to the end. diff --git a/ydb/aio/query/session.py b/ydb/aio/query/session.py index 5a0824d4..39df07f0 100644 --- a/ydb/aio/query/session.py +++ b/ydb/aio/query/session.py @@ -193,6 +193,7 @@ async def execute( ) except BaseException: pop_otel_span_for_grpc(tok) + tok = None raise return AsyncResponseContextIterator( @@ -208,6 +209,7 @@ async def execute( grpc_propagation_token=tok, ) except Exception as e: + pop_otel_span_for_grpc(tok) if span is not None: span.set_error(e) span.end() diff --git a/ydb/aio/query/transaction.py b/ydb/aio/query/transaction.py index 1f865208..ff8720df 100644 --- a/ydb/aio/query/transaction.py +++ b/ydb/aio/query/transaction.py @@ -229,6 +229,7 @@ async def execute( ) except BaseException: pop_otel_span_for_grpc(tok) + tok = None raise self._prev_stream = AsyncResponseContextIterator( @@ -247,6 +248,7 @@ async def execute( ) return self._prev_stream except Exception as e: + pop_otel_span_for_grpc(tok) if span is not None: span.set_error(e) span.end() diff --git a/ydb/query/base.py b/ydb/query/base.py index b3e80736..af7056f3 100644 --- a/ydb/query/base.py +++ b/ydb/query/base.py @@ -121,7 +121,16 @@ def _finish_span(self, exception=None): self._span = None def __del__(self): - self._finish_span() + # GC may finalize this iterator in a different execution context than the + # one that produced ``_grpc_propagation_token`` (CPython runs GC in + # whichever thread triggers collection). ``ContextVar.reset`` raises + # ``ValueError`` in that case; skip the pop here — the ContextVar is + # context-local, the leaked entry is harmless, only the span needs to be + # ended cleanly. 
+ if self._span is not None: + self._span.end() + self._span = None + self._grpc_propagation_token = None def __exit__(self, exc_type, exc_val, exc_tb): # To close stream on YDB it is necessary to scroll through it to the end. diff --git a/ydb/query/session.py b/ydb/query/session.py index 99cffc3b..c7df7cde 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -519,6 +519,7 @@ def execute( ) except BaseException: pop_otel_span_for_grpc(tok) + tok = None # mark popped so the outer ``except`` is a no-op raise return base.SyncResponseContextIterator( @@ -534,6 +535,12 @@ def execute( grpc_propagation_token=tok, ) except Exception as e: + # If the iterator constructor (above) raises, the gRPC propagation + # ContextVar would otherwise leak the now-ended span into the next + # gRPC call on this context. ``tok`` is ``None`` when the inner + # ``except BaseException`` already popped it, in which case + # ``pop_otel_span_for_grpc`` is a no-op. + pop_otel_span_for_grpc(tok) if span is not None: span.set_error(e) span.end() diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index 83e088f5..78cbc551 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -669,6 +669,7 @@ def execute( ) except BaseException: pop_otel_span_for_grpc(tok) + tok = None raise self._prev_stream = base.SyncResponseContextIterator( @@ -687,6 +688,10 @@ def execute( ) return self._prev_stream except Exception as e: + # Same fall-through as in QuerySession.execute: pop the gRPC + # propagation ContextVar so a failed iterator construction does not + # leak the now-ended span into the next gRPC call on this context. 
+ pop_otel_span_for_grpc(tok) if span is not None: span.set_error(e) span.end() From dd60ff0bd99a8e32abd412c014bc4f7b8715d839 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sat, 2 May 2026 10:04:41 +0300 Subject: [PATCH 21/36] fix issue --- ydb/aio/query/base.py | 14 +----- ydb/aio/query/session.py | 23 ++------- ydb/aio/query/transaction.py | 18 ++----- ydb/opentelemetry/_plugin.py | 91 ++++++++++++++++-------------------- ydb/opentelemetry/tracing.py | 50 +++++--------------- ydb/query/base.py | 23 +-------- ydb/query/session.py | 33 ++++--------- ydb/query/transaction.py | 21 ++------- 8 files changed, 74 insertions(+), 199 deletions(-) diff --git a/ydb/aio/query/base.py b/ydb/aio/query/base.py index 7324bce7..0344683d 100644 --- a/ydb/aio/query/base.py +++ b/ydb/aio/query/base.py @@ -2,13 +2,12 @@ class AsyncResponseContextIterator(_utilities.AsyncResponseIterator): - """Async ExecuteQuery result stream; span + gRPC propagation token (see sync class doc).""" + """Async ExecuteQuery result stream; ends the attached OTel span when consumed.""" - def __init__(self, it, wrapper, on_error=None, span=None, grpc_propagation_token=None): + def __init__(self, it, wrapper, on_error=None, span=None): super().__init__(it, wrapper) self._on_error = on_error self._span = span - self._grpc_propagation_token = grpc_propagation_token async def __aenter__(self) -> "AsyncResponseContextIterator": return self @@ -34,12 +33,6 @@ async def _next(self): raise def _finish_span(self, exception=None): - # Pop gRPC propagation before ending span (same contract as sync iterator). 
- if self._grpc_propagation_token is not None: - from ydb.opentelemetry.tracing import pop_otel_span_for_grpc - - pop_otel_span_for_grpc(self._grpc_propagation_token) - self._grpc_propagation_token = None if self._span is not None: if exception is not None: self._span.set_error(exception) @@ -47,12 +40,9 @@ def _finish_span(self, exception=None): self._span = None def __del__(self): - # See sync iterator: GC may run in a different ContextVar context, where - # ``reset(token)`` would raise ValueError. End the span only. if self._span is not None: self._span.end() self._span = None - self._grpc_propagation_token = None async def __aexit__(self, exc_type, exc_val, exc_tb): # To close stream on YDB it is necessary to scroll through it to the end. diff --git a/ydb/aio/query/session.py b/ydb/aio/query/session.py index 39df07f0..b6b8ee92 100644 --- a/ydb/aio/query/session.py +++ b/ydb/aio/query/session.py @@ -19,12 +19,7 @@ from ...query import base from ...query.session import BaseQuerySession -from ...opentelemetry.tracing import ( - create_ydb_span, - pop_otel_span_for_grpc, - push_otel_span_for_grpc, - set_peer_attributes, -) +from ...opentelemetry.tracing import create_ydb_span, set_peer_attributes from ..._constants import DEFAULT_INITIAL_RESPONSE_TIMEOUT @@ -175,9 +170,7 @@ async def execute( ) try: - # PR #786: async mirror of sync session.execute propagation (vgvoleg). 
- tok = push_otel_span_for_grpc(span) - try: + with span.attach_context(): stream_it = await self._execute_call( query=query, parameters=parameters, @@ -191,11 +184,6 @@ async def execute( concurrent_result_sets=concurrent_result_sets, settings=settings, ) - except BaseException: - pop_otel_span_for_grpc(tok) - tok = None - raise - return AsyncResponseContextIterator( it=stream_it, wrapper=lambda resp: base.wrap_execute_query_response( @@ -206,13 +194,10 @@ async def execute( ), on_error=self._on_execute_stream_error, span=span, - grpc_propagation_token=tok, ) except Exception as e: - pop_otel_span_for_grpc(tok) - if span is not None: - span.set_error(e) - span.end() + span.set_error(e) + span.end() raise async def explain( diff --git a/ydb/aio/query/transaction.py b/ydb/aio/query/transaction.py index ff8720df..68868f40 100644 --- a/ydb/aio/query/transaction.py +++ b/ydb/aio/query/transaction.py @@ -12,7 +12,7 @@ BaseQueryTxContext, QueryTxStateEnum, ) -from ...opentelemetry.tracing import create_ydb_span, pop_otel_span_for_grpc, push_otel_span_for_grpc +from ...opentelemetry.tracing import create_ydb_span if TYPE_CHECKING: from .session import QuerySession @@ -211,9 +211,7 @@ async def execute( ) try: - # PR #786: async mirror of sync transaction.execute propagation. 
- tok = push_otel_span_for_grpc(span) - try: + with span.attach_context(): stream_it = await self._execute_call( query=query, parameters=parameters, @@ -227,11 +225,6 @@ async def execute( concurrent_result_sets=concurrent_result_sets, settings=settings, ) - except BaseException: - pop_otel_span_for_grpc(tok) - tok = None - raise - self._prev_stream = AsyncResponseContextIterator( it=stream_it, wrapper=lambda resp: base.wrap_execute_query_response( @@ -244,12 +237,9 @@ async def execute( ), on_error=self.session._on_execute_stream_error, span=span, - grpc_propagation_token=tok, ) return self._prev_stream except Exception as e: - pop_otel_span_for_grpc(tok) - if span is not None: - span.set_error(e) - span.end() + span.set_error(e) + span.end() raise diff --git a/ydb/opentelemetry/_plugin.py b/ydb/opentelemetry/_plugin.py index 4dd4b272..3e2dc188 100644 --- a/ydb/opentelemetry/_plugin.py +++ b/ydb/opentelemetry/_plugin.py @@ -1,23 +1,4 @@ -"""OpenTelemetry bridge for YDB (PR #786 review follow-ups). - -Review themes addressed here: - -- **error.type vs YDB status:** map ``issues.Error.status`` to ``transport_error`` vs - ``ydb_error`` using the client transport status band (``_TRANSPORT_STATUSES``), not the - status name as ``error.type`` (review: KirillKurdyukov). - -- **No long-lived ``context.attach`` on streaming execute:** attaching for the whole - result iterator + detaching from ``__del__`` or another task caused OTel warnings - (review: vgvoleg). ``ExecuteQuery`` uses ``tracing.push_otel_span_for_grpc`` with a - token cleared in the iterator ``_finish_span``; ``TracingSpan.end()`` never detaches. - -- **Explicit ``inject`` context:** when the ContextVar is set, ``inject`` uses - ``trace.set_span_in_context(otel_span)`` instead of relying on global attach for the - stream lifetime (review: vgvoleg). 
- -- **Tracer / reset:** ``enable_tracing(tracer=...)`` is idempotent; ``disable_tracing()`` - clears hooks so tracing can be reconfigured (review: vgvoleg / KirillKurdyukov). -""" +"""OpenTelemetry bridge for YDB.""" from opentelemetry import context as otel_context from opentelemetry import trace @@ -26,7 +7,7 @@ from ydb import issues from ydb.issues import StatusCode as YdbStatusCode -from ydb.opentelemetry.tracing import _registry, get_active_grpc_otel_span +from ydb.opentelemetry.tracing import _registry # YDB client transport StatusCode values (401xxx band) -> OTel error.type transport_error. _TRANSPORT_STATUSES = frozenset( @@ -49,18 +30,9 @@ def _otel_metadata_hook(): - """Injects W3C Trace Context (traceparent/tracestate) into gRPC metadata. - - When ``get_active_grpc_otel_span()`` is set (ExecuteQuery / with-blocks), inject - uses that span explicitly so we do not depend on OTel ``context.attach`` for the - whole stream (PR review: vgvoleg). - """ + """Inject W3C Trace Context into outgoing gRPC metadata using the active OTel context.""" headers = {} - otel_span = get_active_grpc_otel_span() - if otel_span is not None: - inject(headers, context=trace.set_span_in_context(otel_span)) - else: - inject(headers) + inject(headers) return list(headers.items()) @@ -76,22 +48,45 @@ def _set_error_on_span(span, exception): span.record_exception(exception) +class _AttachContext: + """Make a span the active OTel context for a ``with`` block, without ending it. + + Used around the initial gRPC call of a streaming RPC: the span outlives the + ``with`` block — the result iterator owns ``end()``. For non-streaming RPCs + use ``with create_ydb_span(...)`` directly. 
+ """ + + def __init__(self, raw_span): + self._raw = raw_span + self._token = None + + def __enter__(self): + ctx = trace.set_span_in_context(self._raw) + self._token = otel_context.attach(ctx) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._token is not None: + otel_context.detach(self._token) + self._token = None + return False + + class TracingSpan: """Wrapper around an OTel span. - **With-blocks** (CreateSession, Commit, RunWithRetry, …): ``__enter__`` does a - *short* ``context.attach`` for the block so child spans (e.g. ``ydb.Try``) get the - correct parent; ``__exit__`` detaches and ends the span (review: vgvoleg — attach - must not outlive the block / stream). + As context manager: ``__enter__`` attaches the OTel context (so child spans + nest correctly and ``inject()`` sees this span when building gRPC metadata) + and ``__exit__`` detaches and ends the span. Used by Commit / Rollback / + RunWithRetry / Try and similar single-shot operations. - **ExecuteQuery** does not use this context manager: the caller holds - ``push_otel_span_for_grpc`` until the result iterator finishes (see - ``SyncResponseContextIterator``); :meth:`end` does not call ``context.detach``. + For ExecuteQuery streams the span outlives the ``with`` block: call + :meth:`attach_context` around the initial gRPC call only, and let the result + iterator own ``end()``. 
""" def __init__(self, span): self._span = span - self._grpc_propagation_token = None self._otel_context_token = None def set_error(self, exception): @@ -103,19 +98,15 @@ def set_attribute(self, key, value): def end(self): self._span.end() - def __enter__(self): - from ydb.opentelemetry.tracing import push_otel_span_for_grpc + def attach_context(self): + return _AttachContext(self._span) + def __enter__(self): ctx = trace.set_span_in_context(self._span) self._otel_context_token = otel_context.attach(ctx) - self._grpc_propagation_token = push_otel_span_for_grpc(self) return self def __exit__(self, exc_type, exc_val, exc_tb): - from ydb.opentelemetry.tracing import pop_otel_span_for_grpc - - pop_otel_span_for_grpc(self._grpc_propagation_token) - self._grpc_propagation_token = None if self._otel_context_token is not None: otel_context.detach(self._otel_context_token) self._otel_context_token = None @@ -147,11 +138,7 @@ def _enable_tracing(tracer=None): def _disable_tracing(): - """Clear hooks and tracer; after this, :func:`~ydb.opentelemetry.enable_tracing` may be called again. - - Review (vgvoleg): ``enable_tracing()`` is idempotent; callers need an explicit way - to reset hooks / pass a new tracer without reaching into private module state. - """ + """Clear hooks and tracer; after this, :func:`~ydb.opentelemetry.enable_tracing` may be called again.""" global _enabled, _tracer _registry.set_create_span(None) diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index bdb89608..5aa39813 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -1,47 +1,17 @@ -"""OpenTelemetry helpers and registry (PR #786 review). +"""OpenTelemetry helpers and registry.""" -- **ContextVar ``_OTEL_SPAN_FOR_GRPC``:** holds the SDK span used for W3C ``inject`` on - gRPC metadata without keeping ``context.attach`` for the whole ExecuteQuery stream - (review: vgvoleg). 
Bound from ``push_otel_span_for_grpc`` until the result iterator - finishes (full execute lifecycle for propagation). +from typing import Optional, Tuple -- **``create_ydb_span`` early return:** avoids building attribute dicts when tracing is - off so the path stays no-op (review: vgvoleg — docs promise zero-cost when disabled). -- **``_split_endpoint``:** strips ``grpc://`` / ``grpcs://`` and supports ``[ipv6]:port`` - so ``server.address`` matches semantic conventions (review: vgvoleg). -""" - -import contextvars -from typing import Any, Optional, Tuple - -_OTEL_SPAN_FOR_GRPC: contextvars.ContextVar[Optional[Any]] = contextvars.ContextVar( - "_OTEL_SPAN_FOR_GRPC", - default=None, -) - - -def get_active_grpc_otel_span(): - """OpenTelemetry SDK span used for W3C inject on the next gRPC metadata build, if any.""" - return _OTEL_SPAN_FOR_GRPC.get() - - -def push_otel_span_for_grpc(wrapped_span) -> Optional[contextvars.Token]: - """Bind ``wrapped_span`` (TracingSpan from _plugin) for the next gRPC metadata build. +class _NoopCtx: + def __enter__(self): + return self - Returns a token for :func:`pop_otel_span_for_grpc`, or ``None`` if nothing was pushed. - """ - if wrapped_span is None: - return None - raw = getattr(wrapped_span, "_span", None) - if raw is None: - return None - return _OTEL_SPAN_FOR_GRPC.set(raw) + def __exit__(self, exc_type, exc_val, exc_tb): + return False -def pop_otel_span_for_grpc(token: Optional[contextvars.Token]) -> None: - if token is not None: - _OTEL_SPAN_FOR_GRPC.reset(token) +_NOOP_CTX = _NoopCtx() class _NoopSpan: @@ -56,6 +26,9 @@ def set_attribute(self, key, value): def end(self): pass + def attach_context(self): + return _NOOP_CTX + def __enter__(self): return self @@ -153,7 +126,6 @@ def create_ydb_span(name, driver_config, node_id=None, kind=None, peer=None): map for the specific node serving the call; missing fields are skipped. Can be used as a context manager or manually. 
""" - # Review (vgvoleg): skip _build_ydb_attrs when hooks are unset (zero-cost when disabled). if not _registry.is_active(): return _NOOP_SPAN attrs = _build_ydb_attrs(driver_config, node_id, peer) diff --git a/ydb/query/base.py b/ydb/query/base.py index af7056f3..e3d124ab 100644 --- a/ydb/query/base.py +++ b/ydb/query/base.py @@ -72,18 +72,12 @@ class QueryResultSetFormat(enum.IntEnum): class SyncResponseContextIterator(_utilities.SyncResponseIterator): - """Streams ExecuteQuery results; ends the OTel span when the stream is fully consumed. + """Streams ExecuteQuery results; ends the attached OTel span when the stream is consumed.""" - ``grpc_propagation_token`` (PR #786): keeps W3C inject bound for the *entire* execute - (from first gRPC metadata until this iterator finishes), without a long-lived OTel - ``context.attach`` on the span (review: vgvoleg + execute lifecycle expectation). - """ - - def __init__(self, it, wrapper, on_error=None, span=None, grpc_propagation_token=None): + def __init__(self, it, wrapper, on_error=None, span=None): super().__init__(it, wrapper) self._on_error = on_error self._span = span - self._grpc_propagation_token = grpc_propagation_token def __enter__(self) -> "SyncResponseContextIterator": return self @@ -108,12 +102,6 @@ def _next(self): raise def _finish_span(self, exception=None): - # Pop gRPC propagation before ending span so metadata hooks do not outlive the span. 
- if self._grpc_propagation_token is not None: - from ydb.opentelemetry.tracing import pop_otel_span_for_grpc - - pop_otel_span_for_grpc(self._grpc_propagation_token) - self._grpc_propagation_token = None if self._span is not None: if exception is not None: self._span.set_error(exception) @@ -121,16 +109,9 @@ def _finish_span(self, exception=None): self._span = None def __del__(self): - # GC may finalize this iterator in a different execution context than the - # one that produced ``_grpc_propagation_token`` (CPython runs GC in - # whichever thread triggers collection). ``ContextVar.reset`` raises - # ``ValueError`` in that case; skip the pop here — the ContextVar is - # context-local, the leaked entry is harmless, only the span needs to be - # ended cleanly. if self._span is not None: self._span.end() self._span = None - self._grpc_propagation_token = None def __exit__(self, exc_type, exc_val, exc_tb): # To close stream on YDB it is necessary to scroll through it to the end. diff --git a/ydb/query/session.py b/ydb/query/session.py index c7df7cde..8336d457 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -18,12 +18,7 @@ from .base import QueryExplainResultFormat from .. import _apis, issues, _utilities -from ..opentelemetry.tracing import ( - create_ydb_span, - pop_otel_span_for_grpc, - push_otel_span_for_grpc, - set_peer_attributes, -) +from ..opentelemetry.tracing import create_ydb_span, set_peer_attributes from ..settings import BaseRequestSettings from ..connection import _RpcState as RpcState, EndpointKey from .._grpc.grpcwrapper import common_utils @@ -500,10 +495,11 @@ def execute( ) try: - # PR #786: push before _execute_call; token lives on SyncResponseContextIterator until - # the stream is fully read so W3C inject matches the whole ExecuteQuery (vgvoleg). 
- tok = push_otel_span_for_grpc(span) - try: + # Make ``ydb.ExecuteQuery`` the active OTel context only around the + # initial gRPC call so ``inject()`` writes ``traceparent`` into the + # request metadata. The span itself outlives this block — the result + # iterator owns ``end()``. + with span.attach_context(): stream_it = self._execute_call( query=query, parameters=parameters, @@ -517,11 +513,6 @@ def execute( concurrent_result_sets=concurrent_result_sets, settings=settings, ) - except BaseException: - pop_otel_span_for_grpc(tok) - tok = None # mark popped so the outer ``except`` is a no-op - raise - return base.SyncResponseContextIterator( stream_it, lambda resp: base.wrap_execute_query_response( @@ -532,18 +523,10 @@ def execute( ), on_error=self._on_execute_stream_error, span=span, - grpc_propagation_token=tok, ) except Exception as e: - # If the iterator constructor (above) raises, the gRPC propagation - # ContextVar would otherwise leak the now-ended span into the next - # gRPC call on this context. ``tok`` is ``None`` when the inner - # ``except BaseException`` already popped it, in which case - # ``pop_otel_span_for_grpc`` is a no-op. - pop_otel_span_for_grpc(tok) - if span is not None: - span.set_error(e) - span.end() + span.set_error(e) + span.end() raise def explain( diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index 78cbc551..9abe3046 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -17,7 +17,7 @@ _apis, issues, ) -from ..opentelemetry.tracing import create_ydb_span, pop_otel_span_for_grpc, push_otel_span_for_grpc +from ..opentelemetry.tracing import create_ydb_span from .._grpc.grpcwrapper import ydb_topic as _ydb_topic from .._grpc.grpcwrapper import ydb_query as _ydb_query from ..connection import _RpcState as RpcState @@ -651,9 +651,7 @@ def execute( ) try: - # PR #786: same propagation contract as QuerySession.execute (see session.py). 
- tok = push_otel_span_for_grpc(span) - try: + with span.attach_context(): stream_it = self._execute_call( query=query, commit_tx=commit_tx, @@ -667,11 +665,6 @@ def execute( concurrent_result_sets=concurrent_result_sets, settings=settings, ) - except BaseException: - pop_otel_span_for_grpc(tok) - tok = None - raise - self._prev_stream = base.SyncResponseContextIterator( stream_it, lambda resp: base.wrap_execute_query_response( @@ -684,15 +677,9 @@ def execute( ), on_error=self.session._on_execute_stream_error, span=span, - grpc_propagation_token=tok, ) return self._prev_stream except Exception as e: - # Same fall-through as in QuerySession.execute: pop the gRPC - # propagation ContextVar so a failed iterator construction does not - # leak the now-ended span into the next gRPC call on this context. - pop_otel_span_for_grpc(tok) - if span is not None: - span.set_error(e) - span.end() + span.set_error(e) + span.end() raise From 93fa974e07a2d9757b2704be9e1902dd027eaf1a Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sat, 2 May 2026 10:12:13 +0300 Subject: [PATCH 22/36] fix issue --- docs/opentelemetry.rst | 2 +- examples/opentelemetry/Dockerfile | 2 +- examples/opentelemetry/README.md | 8 ++++---- .../{docker-compose.otel.yaml => compose-e2e.yaml} | 4 ++-- examples/opentelemetry/ydb_config/README.md | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) rename examples/opentelemetry/{docker-compose.otel.yaml => compose-e2e.yaml} (93%) diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index e91e629f..127f4e54 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -235,7 +235,7 @@ Docker (YDB or the full stack) first**, then install and run on the host — see .. code-block:: sh - docker compose -f examples/opentelemetry/docker-compose.otel.yaml up --build + docker compose -f examples/opentelemetry/compose-e2e.yaml up --build The first run builds the ``otel-example`` image from the local SDK source; subsequent runs reuse the cached image. 
Pass ``--build`` again if you change the SDK or the demo script. diff --git a/examples/opentelemetry/Dockerfile b/examples/opentelemetry/Dockerfile index c7b25acb..326721a1 100644 --- a/examples/opentelemetry/Dockerfile +++ b/examples/opentelemetry/Dockerfile @@ -1,6 +1,6 @@ # Isolated image for the OpenTelemetry demo. Build context is the repository root. # -# docker compose -f examples/opentelemetry/docker-compose.otel.yaml build otel-example +# docker compose -f examples/opentelemetry/compose-e2e.yaml build otel-example # # A separate ``.dockerignore`` at the repo root keeps the context small. diff --git a/examples/opentelemetry/README.md b/examples/opentelemetry/README.md index b131f54a..1af90f6d 100644 --- a/examples/opentelemetry/README.md +++ b/examples/opentelemetry/README.md @@ -17,25 +17,25 @@ docker compose up -d # wait until the ydb container is healthy / port 2136 is open, then continue ``` -**Full stack** (YDB + OTLP collector + Tempo + Grafana; the `otel-example` service is built from a `Dockerfile` and runs the script once inside Compose). The compose file is `docker-compose.otel.yaml` next to this README. +**Full stack** (YDB + OTLP collector + Tempo + Grafana; the `otel-example` service is built from a `Dockerfile` and runs the script once inside Compose). The compose file is `compose-e2e.yaml` next to this README. ```sh cd /path/to/ydb-python-sdk -docker compose -f examples/opentelemetry/docker-compose.otel.yaml up --build +docker compose -f examples/opentelemetry/compose-e2e.yaml up --build ``` From this folder the build context is still resolved correctly (it is `../..` relative to the compose file): ```sh cd /path/to/ydb-python-sdk/examples/opentelemetry -docker compose -f docker-compose.otel.yaml up --build +docker compose -f compose-e2e.yaml up --build ``` The first run builds the `otel-example` image from the local SDK source (`Dockerfile` in this folder, `.dockerignore` at the repo root keeps the context small). 
Subsequent runs reuse the cached image; pass `--build` if you change the SDK or the demo script. Grafana: http://localhost:3000 -**Logs for `otel-example`:** the container name is prefixed (e.g. `opentelemetry-otel-example-1`); use `docker compose -f examples/opentelemetry/docker-compose.otel.yaml ps` or `docker ps -a` to find it. The service is one-shot (`restart: "no"`) — it may already have exited. +**Logs for `otel-example`:** the container name is prefixed (e.g. `opentelemetry-otel-example-1`); use `docker compose -f examples/opentelemetry/compose-e2e.yaml ps` or `docker ps -a` to find it. The service is one-shot (`restart: "no"`) — it may already have exited. ## 2. Install dependencies (on the host, for a local `python` run) diff --git a/examples/opentelemetry/docker-compose.otel.yaml b/examples/opentelemetry/compose-e2e.yaml similarity index 93% rename from examples/opentelemetry/docker-compose.otel.yaml rename to examples/opentelemetry/compose-e2e.yaml index 023ae3fd..460ba8f6 100644 --- a/examples/opentelemetry/docker-compose.otel.yaml +++ b/examples/opentelemetry/compose-e2e.yaml @@ -2,10 +2,10 @@ # and a one-shot container that runs otel_example.py once. 
# # Run from this directory (paths below are relative to this file): -# cd examples/opentelemetry && docker compose -f docker-compose.otel.yaml up +# cd examples/opentelemetry && docker compose -f compose-e2e.yaml up # # Or from the repository root: -# docker compose -f examples/opentelemetry/docker-compose.otel.yaml up +# docker compose -f examples/opentelemetry/compose-e2e.yaml up services: ydb: diff --git a/examples/opentelemetry/ydb_config/README.md b/examples/opentelemetry/ydb_config/README.md index 70c8c5a7..6e36fae1 100644 --- a/examples/opentelemetry/ydb_config/README.md +++ b/examples/opentelemetry/ydb_config/README.md @@ -19,10 +19,10 @@ Default service name (so you can find it in Tempo/Grafana): `ydb` ## 3) Run with the overridden config -Restart YDB (the main `docker-compose.otel.yaml` will automatically use `--config-path` if `ydb-config.yaml` exists): +Restart YDB (the main `compose-e2e.yaml` will automatically use `--config-path` if `ydb-config.yaml` exists): ```bash -docker compose -f docker-compose.otel.yaml up -d --force-recreate ydb +docker compose -f compose-e2e.yaml up -d --force-recreate ydb ``` Now you should see additional server-side traces in Tempo/Grafana (service name defaults to `ydb-local` in the snippet). 
From c4e304db92aee9d57056a6dc69fd29536272b71c Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sat, 2 May 2026 10:29:20 +0300 Subject: [PATCH 23/36] fix issue --- examples/opentelemetry/otel_example.py | 32 ++++++++++++++++++-------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index 819ef476..67d271e3 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -62,13 +62,27 @@ async def _first_amount(tx) -> int: raise RuntimeError("no row for id=1") -async def _bank_read_update(tx) -> None: - count = await _first_amount(tx) - async with await tx.execute( - "UPDATE bank SET amount = $amt + 1 WHERE id = 1", - {"$amt": (count, ydb.PrimitiveType.Int32)}, - ): - pass +async def _bank_read_update_tx(pool: "ydb.aio.QuerySessionPool") -> None: + """Read-modify-write under a Serializable transaction. + + Begins the tx implicitly via ``TransactionControl.begin_tx`` on the first + ``tx.execute(...)``: avoids a separate ``BeginTransaction`` round trip and + keeps the trace tree clean (no client-side gap between ``ydb.Try`` and the + server-side ``BeginTransactionRequest``). 
+ """ + + async def _do() -> None: + async with pool.checkout() as session: + async with session.transaction() as tx: + count = await _first_amount(tx) + async with await tx.execute( + "UPDATE bank SET amount = $amt + 1 WHERE id = 1", + {"$amt": (count, ydb.PrimitiveType.Int32)}, + ): + pass + await tx.commit() + + await pool.retry_operation_async(_do) async def main() -> None: @@ -104,14 +118,14 @@ async def main() -> None: await pool.execute_with_retries("INSERT INTO bank (id, amount) VALUES (1, 0)") print("Preparing queries...") - await pool.retry_tx_async(_bank_read_update) + await _bank_read_update_tx(pool) print("Emulation TLI...") async def concurrent_task(task_num: int) -> None: with tracer.start_as_current_span("example_tli") as act: act.set_attribute("app.message", f"concurrent task {task_num}") - await pool.retry_tx_async(_bank_read_update) + await _bank_read_update_tx(pool) await asyncio.gather(*(concurrent_task(i) for i in range(10))) From 0beb6d09e5e4484dd7b774ac3b4e65fd80b4435c Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sat, 2 May 2026 11:01:39 +0300 Subject: [PATCH 24/36] added ydb.BeginTransaction --- examples/opentelemetry/otel_example.py | 32 +++++----------- tests/tracing/test_tracing_async.py | 52 ++++++++++++++++++++++++++ tests/tracing/test_tracing_sync.py | 50 +++++++++++++++++++++++++ ydb/aio/query/transaction.py | 8 +++- ydb/query/transaction.py | 8 +++- 5 files changed, 125 insertions(+), 25 deletions(-) diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index 67d271e3..819ef476 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -62,27 +62,13 @@ async def _first_amount(tx) -> int: raise RuntimeError("no row for id=1") -async def _bank_read_update_tx(pool: "ydb.aio.QuerySessionPool") -> None: - """Read-modify-write under a Serializable transaction. 
- - Begins the tx implicitly via ``TransactionControl.begin_tx`` on the first - ``tx.execute(...)``: avoids a separate ``BeginTransaction`` round trip and - keeps the trace tree clean (no client-side gap between ``ydb.Try`` and the - server-side ``BeginTransactionRequest``). - """ - - async def _do() -> None: - async with pool.checkout() as session: - async with session.transaction() as tx: - count = await _first_amount(tx) - async with await tx.execute( - "UPDATE bank SET amount = $amt + 1 WHERE id = 1", - {"$amt": (count, ydb.PrimitiveType.Int32)}, - ): - pass - await tx.commit() - - await pool.retry_operation_async(_do) +async def _bank_read_update(tx) -> None: + count = await _first_amount(tx) + async with await tx.execute( + "UPDATE bank SET amount = $amt + 1 WHERE id = 1", + {"$amt": (count, ydb.PrimitiveType.Int32)}, + ): + pass async def main() -> None: @@ -118,14 +104,14 @@ async def main() -> None: await pool.execute_with_retries("INSERT INTO bank (id, amount) VALUES (1, 0)") print("Preparing queries...") - await _bank_read_update_tx(pool) + await pool.retry_tx_async(_bank_read_update) print("Emulation TLI...") async def concurrent_task(task_num: int) -> None: with tracer.start_as_current_span("example_tli") as act: act.set_attribute("app.message", f"concurrent task {task_num}") - await _bank_read_update_tx(pool) + await pool.retry_tx_async(_bank_read_update) await asyncio.gather(*(concurrent_task(i) for i in range(10))) diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py index de2ab249..4bb87348 100644 --- a/tests/tracing/test_tracing_async.py +++ b/tests/tracing/test_tracing_async.py @@ -59,6 +59,14 @@ def _make_async_tx(session, driver): return tx +def _make_fresh_async_tx(session, driver): + """Create a real async QueryTxContext in NOT_INITIALIZED state (for begin()).""" + from ydb._grpc.grpcwrapper.ydb_query_public_types import QuerySerializableReadWrite + from ydb.aio.query.transaction import QueryTxContext + + 
return QueryTxContext(driver, session, QuerySerializableReadWrite()) + + class TestAsyncCreateSessionSpan: @pytest.mark.asyncio async def test_create_session_emits_span(self, otel_setup): @@ -139,6 +147,50 @@ async def test_tx_execute_emits_span(self, otel_setup): assert "ydb.session.id" not in attrs +class TestAsyncBeginTransactionSpan: + @pytest.mark.asyncio + async def test_begin_emits_span(self, otel_setup): + exporter = otel_setup + session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) + tx = _make_fresh_async_tx(session, driver) + + with patch.object(type(tx), "_begin_call", new_callable=AsyncMock): + await tx.begin() + + span = _get_single_span(exporter, "ydb.BeginTransaction") + assert span.kind == SpanKind.CLIENT + attrs = dict(span.attributes) + assert attrs["db.system.name"] == "ydb" + assert attrs["db.namespace"] == "/test_database" + assert attrs["ydb.node.id"] == 12345 + assert attrs["network.peer.address"] == "n1" + assert attrs["network.peer.port"] == 2136 + assert attrs["ydb.node.dc"] == "dc-a" + assert "ydb.session.id" not in attrs + assert "ydb.tx.id" not in attrs + assert span.status.status_code == StatusCode.UNSET + + @pytest.mark.asyncio + async def test_begin_sets_error_status_on_failure(self, otel_setup): + from ydb import issues + + exporter = otel_setup + session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) + tx = _make_fresh_async_tx(session, driver) + + exc = issues.Unavailable("bad node") + with patch.object(type(tx), "_begin_call", new_callable=AsyncMock, side_effect=exc): + with pytest.raises(issues.Unavailable): + await tx.begin() + + span = _get_single_span(exporter, "ydb.BeginTransaction") + assert span.status.status_code == StatusCode.ERROR + attrs = dict(span.attributes) + assert attrs["error.type"] == "ydb_error" + assert attrs["db.response.status_code"] == "UNAVAILABLE" + assert len(span.events) > 0 + + class TestAsyncCommitSpan: @pytest.mark.asyncio async def test_commit_emits_span(self, 
otel_setup): diff --git a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py index 10acbebb..0a16b53e 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -58,6 +58,14 @@ def _make_tx(session, driver): return tx +def _make_fresh_tx(session, driver): + """Create a real QueryTxContext in NOT_INITIALIZED state (for begin()).""" + from ydb._grpc.grpcwrapper.ydb_query_public_types import QuerySerializableReadWrite + from ydb.query.transaction import QueryTxContext + + return QueryTxContext(driver, session, QuerySerializableReadWrite()) + + class TestCreateSessionSpan: def test_create_session_emits_span(self, otel_setup): exporter = otel_setup @@ -143,6 +151,48 @@ def test_tx_execute_emits_span(self, otel_setup): assert "ydb.tx.id" not in attrs +class TestBeginTransactionSpan: + def test_begin_emits_span(self, otel_setup): + exporter = otel_setup + session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) + tx = _make_fresh_tx(session, driver) + + with patch.object(type(tx), "_begin_call", return_value=None): + tx.begin() + + span = _get_single_span(exporter, "ydb.BeginTransaction") + assert span.kind == SpanKind.CLIENT + attrs = dict(span.attributes) + assert attrs["db.system.name"] == "ydb" + assert attrs["db.namespace"] == "/test_database" + assert attrs["ydb.node.id"] == 12345 + assert attrs["network.peer.address"] == "n1" + assert attrs["network.peer.port"] == 2136 + assert attrs["ydb.node.dc"] == "dc-a" + assert "ydb.session.id" not in attrs + assert "ydb.tx.id" not in attrs + assert span.status.status_code == StatusCode.UNSET + + def test_begin_sets_error_status_on_failure(self, otel_setup): + from ydb import issues + + exporter = otel_setup + session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) + tx = _make_fresh_tx(session, driver) + + exc = issues.Unavailable("bad node") + with patch.object(type(tx), "_begin_call", side_effect=exc): + with pytest.raises(issues.Unavailable): + 
tx.begin() + + span = _get_single_span(exporter, "ydb.BeginTransaction") + assert span.status.status_code == StatusCode.ERROR + attrs = dict(span.attributes) + assert attrs["error.type"] == "ydb_error" + assert attrs["db.response.status_code"] == "UNAVAILABLE" + assert len(span.events) > 0 + + class TestCommitSpan: def test_commit_emits_span(self, otel_setup): exporter = otel_setup diff --git a/ydb/aio/query/transaction.py b/ydb/aio/query/transaction.py index 68868f40..ca7cc6da 100644 --- a/ydb/aio/query/transaction.py +++ b/ydb/aio/query/transaction.py @@ -88,7 +88,13 @@ async def begin(self, settings: Optional[BaseRequestSettings] = None) -> "QueryT :return: None or exception if begin is failed """ - await self._begin_call(settings) + with create_ydb_span( + "ydb.BeginTransaction", + self._driver_config, + node_id=self.session.node_id, + peer=getattr(self.session, "_peer", None), + ): + await self._begin_call(settings) return self async def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index 9abe3046..75f18a36 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -528,7 +528,13 @@ def begin(self, settings: Optional[BaseRequestSettings] = None) -> "QueryTxConte :return: Transaction object or exception if begin is failed """ - self._begin_call(settings) + with create_ydb_span( + "ydb.BeginTransaction", + self._driver_config, + node_id=self.session.node_id, + peer=getattr(self.session, "_peer", None), + ): + self._begin_call(settings) return self From 0220d7e32c9d9b2dcfb7b74f4655615366b6322c Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sat, 2 May 2026 19:15:44 +0300 Subject: [PATCH 25/36] added healthcheck --- examples/opentelemetry/compose-e2e.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/opentelemetry/compose-e2e.yaml b/examples/opentelemetry/compose-e2e.yaml index 460ba8f6..f8402d50 100644 --- 
a/examples/opentelemetry/compose-e2e.yaml +++ b/examples/opentelemetry/compose-e2e.yaml @@ -25,6 +25,12 @@ services: - "8765:8765" volumes: - ./ydb_config:/ydb_config:ro + healthcheck: + test: bash -c "exec 6<> /dev/tcp/localhost/2136" + interval: 5s + timeout: 2s + retries: 30 + start_period: 30s otel-collector: image: otel/opentelemetry-collector-contrib:latest @@ -78,6 +84,8 @@ services: OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 OTEL_SERVICE_NAME: ydb-otel-example depends_on: - - ydb - - otel-collector + ydb: + condition: service_healthy + otel-collector: + condition: service_started restart: "no" From e6721d06e67d0548e5e01b14e980b53842eecc35 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sat, 2 May 2026 19:28:44 +0300 Subject: [PATCH 26/36] micro refactoring --- examples/opentelemetry/otel_example.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index 819ef476..e474b723 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -31,29 +31,6 @@ def _env(name: str, default: str) -> str: v = os.environ.get(name) return v if v is not None and v != "" else default - -def _assert_tcp_reachable_for_endpoint(endpoint: str) -> None: - """Before ``Driver`` starts: fail fast if nothing listens (clearer than ``driver.wait`` timeout).""" - bare = _yutil.wrap_endpoint(endpoint) - if bare.count(":") < 1: - return - host, _, port_s = bare.rpartition(":") - if not port_s or not host: - return - try: - port = int(port_s) - except ValueError: - return - try: - with socket.create_connection((host, port), timeout=3.0): - pass - except OSError as e: - raise RuntimeError( - f"Nothing accepts TCP on {host}:{port} — start YDB first, e.g. from the repository root: " - f"docker compose up -d (then the script at grpc://{host}:{port} can connect). 
Original error: {e!s}" - ) from e - - async def _first_amount(tx) -> int: async with await tx.execute("SELECT amount FROM bank WHERE id = 1") as results: async for rs in results: @@ -84,8 +61,6 @@ async def main() -> None: tracer = trace.get_tracer(__name__) enable_tracing(tracer) - _assert_tcp_reachable_for_endpoint(endpoint) - async with ydb.aio.Driver( endpoint=endpoint, database=database, From 135916ba603055564ce1ab7c973a9bfa6116eea2 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sun, 3 May 2026 09:33:08 +0300 Subject: [PATCH 27/36] added tests --- tests/tracing/test_tracing_async.py | 47 +++++++++++++++++++++++++++++ tests/tracing/test_tracing_sync.py | 46 ++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py index 4bb87348..3ff9ec6b 100644 --- a/tests/tracing/test_tracing_async.py +++ b/tests/tracing/test_tracing_async.py @@ -229,6 +229,53 @@ async def test_rollback_emits_span(self, otel_setup): assert "ydb.session.id" not in attrs +class TestAsyncCommitRollbackErrorRecording: + """Async commit/rollback: the span must record the exception (event + + StatusCode.ERROR + error.type + db.response.status_code) when the underlying + RPC raises, just like the sync path. 
+ """ + + @pytest.mark.asyncio + async def test_commit_records_exception_on_failure(self, otel_setup): + from ydb import issues + + exporter = otel_setup + session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) + tx = _make_async_tx(session, driver) + + exc = issues.Aborted("boom") + with patch.object(type(tx), "_commit_call", new_callable=AsyncMock, side_effect=exc): + with pytest.raises(issues.Aborted): + await tx.commit() + + span = _get_single_span(exporter, "ydb.Commit") + assert span.status.status_code == StatusCode.ERROR + attrs = dict(span.attributes) + assert attrs["error.type"] == "ydb_error" + assert attrs["db.response.status_code"] == "ABORTED" + assert any(e.name == "exception" for e in span.events) + + @pytest.mark.asyncio + async def test_rollback_records_exception_on_failure(self, otel_setup): + from ydb import issues + + exporter = otel_setup + session, driver = _make_async_session_mock(peer=("n1", 2136, "dc-a")) + tx = _make_async_tx(session, driver) + + exc = issues.Unavailable("boom") + with patch.object(type(tx), "_rollback_call", new_callable=AsyncMock, side_effect=exc): + with pytest.raises(issues.Unavailable): + await tx.rollback() + + span = _get_single_span(exporter, "ydb.Rollback") + assert span.status.status_code == StatusCode.ERROR + attrs = dict(span.attributes) + assert attrs["error.type"] == "ydb_error" + assert attrs["db.response.status_code"] == "UNAVAILABLE" + assert any(e.name == "exception" for e in span.events) + + class TestAsyncErrorHandling: @pytest.mark.asyncio async def test_error_sets_error_status_and_attributes(self, otel_setup): diff --git a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py index 0a16b53e..44964d6f 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -233,6 +233,52 @@ def test_rollback_emits_span(self, otel_setup): assert "ydb.tx.id" not in attrs +class TestCommitRollbackErrorRecording: + """When the underlying RPC raises, the 
span must: + - end with ``StatusCode.ERROR`` + - have ``error.type`` and ``db.response.status_code`` set + - have the exception recorded as a span event (``record_exception``) + """ + + def test_commit_records_exception_on_failure(self, otel_setup): + from ydb import issues + + exporter = otel_setup + session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) + tx = _make_tx(session, driver) + + exc = issues.Aborted("boom") + with patch.object(type(tx), "_commit_call", side_effect=exc): + with pytest.raises(issues.Aborted): + tx.commit() + + span = _get_single_span(exporter, "ydb.Commit") + assert span.status.status_code == StatusCode.ERROR + attrs = dict(span.attributes) + assert attrs["error.type"] == "ydb_error" + assert attrs["db.response.status_code"] == "ABORTED" + assert any(e.name == "exception" for e in span.events) + + def test_rollback_records_exception_on_failure(self, otel_setup): + from ydb import issues + + exporter = otel_setup + session, driver = _make_session_mock(peer=("n1", 2136, "dc-a")) + tx = _make_tx(session, driver) + + exc = issues.Unavailable("boom") + with patch.object(type(tx), "_rollback_call", side_effect=exc): + with pytest.raises(issues.Unavailable): + tx.rollback() + + span = _get_single_span(exporter, "ydb.Rollback") + assert span.status.status_code == StatusCode.ERROR + attrs = dict(span.attributes) + assert attrs["error.type"] == "ydb_error" + assert attrs["db.response.status_code"] == "UNAVAILABLE" + assert any(e.name == "exception" for e in span.events) + + class TestErrorHandling: def test_error_sets_error_status_and_attributes(self, otel_setup): exporter = otel_setup From b5264e2a3cf3e17332b465f523680003d54e71f3 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sun, 3 May 2026 11:23:19 +0300 Subject: [PATCH 28/36] refactoring --- tests/tracing/test_tracing_sync.py | 7 +- ydb/_errors.py | 21 ++--- ydb/aio/pool.py | 34 ++------- ydb/opentelemetry/__init__.py | 6 +- ydb/query/session.py | 4 - ydb/retries.py | 118 
+++++++++-------------------- ydb/table_test.py | 11 +-- 7 files changed, 56 insertions(+), 145 deletions(-) diff --git a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py index 44964d6f..3a8d28d5 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -554,7 +554,7 @@ def flaky(): assert sleeps == [expected_ms / 1000.0, expected_ms / 1000.0] def test_skip_backoff_errors_still_emit_one_try_per_attempt(self, otel_setup): - """Aborted/BadSession path yields zero sleep but must rotate ydb.Try spans (sync loop).""" + """Aborted/BadSession path skips the inter-attempt sleep but must still rotate ydb.Try spans.""" from ydb import issues from ydb.retries import RetrySettings, retry_operation_sync @@ -575,9 +575,8 @@ def flaky(): assert tries[1].status.status_code == StatusCode.ERROR assert tries[2].status.status_code == StatusCode.UNSET # First Try has no preceding sleep -> attribute is absent. - # Skip-yield path means subsequent Tries had no real wait either, but the - # attribute is still set to 0 to make "we did go through a retry boundary" - # explicit. + # Skip-yield path means the inter-attempt sleep was zero, so backoff_ms = 0 + # is recorded on retries to make "we did go through a retry boundary" explicit. 
assert "ydb.retry.backoff_ms" not in dict(tries[0].attributes) assert dict(tries[1].attributes)["ydb.retry.backoff_ms"] == 0 assert dict(tries[2].attributes)["ydb.retry.backoff_ms"] == 0 diff --git a/ydb/_errors.py b/ydb/_errors.py index 4670024e..1e969c09 100644 --- a/ydb/_errors.py +++ b/ydb/_errors.py @@ -24,32 +24,32 @@ def check_retriable_error(err, retry_settings, attempt): if isinstance(err, issues.Cancelled): if retry_settings.retry_cancelled: - return ErrorRetryInfo(True, retry_settings.fast_backoff.calc_backoff_ms(attempt)) + return ErrorRetryInfo(True, retry_settings.fast_backoff.calc_timeout(attempt)) if isinstance(err, issues.NotFound): if retry_settings.retry_not_found: - return ErrorRetryInfo(True, retry_settings.fast_backoff.calc_backoff_ms(attempt)) + return ErrorRetryInfo(True, retry_settings.fast_backoff.calc_timeout(attempt)) else: return ErrorRetryInfo(False, None) if isinstance(err, issues.InternalError): if retry_settings.retry_internal_error: - return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_backoff_ms(attempt)) + return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_timeout(attempt)) else: return ErrorRetryInfo(False, None) for t in _errors_retriable_fast_backoff_types: if isinstance(err, t): - return ErrorRetryInfo(True, retry_settings.fast_backoff.calc_backoff_ms(attempt)) + return ErrorRetryInfo(True, retry_settings.fast_backoff.calc_timeout(attempt)) for t in _errors_retriable_slow_backoff_types: if isinstance(err, t): - return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_backoff_ms(attempt)) + return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_timeout(attempt)) if retry_settings.idempotent: for t in _errors_retriable_slow_backoff_idempotent_types: if isinstance(err, t): - return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_backoff_ms(attempt)) + return ErrorRetryInfo(True, retry_settings.slow_backoff.calc_timeout(attempt)) return ErrorRetryInfo(False, None) @@ -57,11 +57,4 @@ def 
check_retriable_error(err, retry_settings, attempt): @dataclass class ErrorRetryInfo: is_retriable: bool - # Single source: integer ms from ``BackoffSettings.calc_backoff_ms`` (before ``/ 1000`` to seconds). - sleep_backoff_ms: Optional[int] = None - - @property - def sleep_timeout_seconds(self) -> Optional[float]: - if self.sleep_backoff_ms is None: - return None - return self.sleep_backoff_ms / 1000.0 + sleep_timeout_seconds: Optional[float] diff --git a/ydb/aio/pool.py b/ydb/aio/pool.py index 7fd9a6bb..4f1b0cdd 100644 --- a/ydb/aio/pool.py +++ b/ydb/aio/pool.py @@ -39,25 +39,7 @@ async def get( # async version with different Connection type if self._fast_fail_error: raise self._fast_fail_error else: - # With ``disable_discovery``, the initial connection can fail (gRPC / TLS / etc.). - # ``_event`` is then never set; without also waiting on ``_fast_fail_event`` the caller - # would block until *wait_timeout* with no useful error. - wait_conn = asyncio.create_task(self._event.wait()) - wait_fail = asyncio.create_task(self._fast_fail_event.wait()) - try: - done, pending = await asyncio.wait( - (wait_conn, wait_fail), - timeout=wait_timeout, - return_when=asyncio.FIRST_COMPLETED, - ) - finally: - for t in (wait_conn, wait_fail): - if not t.done(): - t.cancel() - if self._fast_fail_error is not None: - raise self._fast_fail_error - if not done: - raise asyncio.TimeoutError + await asyncio.wait_for(self._event.wait(), timeout=wait_timeout) if preferred_endpoint is not None and preferred_endpoint.node_id in self.connections_by_node_id: return self.connections_by_node_id[preferred_endpoint.node_id] # type: ignore[return-value] @@ -272,15 +254,11 @@ def __init__(self, driver_config: "DriverConfig") -> None: if driver_config.disable_discovery: # If discovery is disabled, just add the initial endpoint to the store async def init_connection() -> None: - try: - ready_connection = Connection(self._driver_config.endpoint, self._driver_config) - await 
ready_connection.connection_ready( - ready_timeout=getattr(self._driver_config, "discovery_request_timeout", 10) - ) - self._store.add(ready_connection) - except Exception as e: # noqa: BLE001 — surface to wait() via complete_discovery - self._store.complete_discovery(e) - return + ready_connection = Connection(self._driver_config.endpoint, self._driver_config) + await ready_connection.connection_ready( + ready_timeout=getattr(self._driver_config, "discovery_request_timeout", 10) + ) + self._store.add(ready_connection) # Create and schedule the task to initialize the connection self._discovery_task = asyncio.get_event_loop().create_task(init_connection()) diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py index a984405f..a77e1ab7 100644 --- a/ydb/opentelemetry/__init__.py +++ b/ydb/opentelemetry/__init__.py @@ -1,8 +1,4 @@ -"""Public OpenTelemetry entrypoints for YDB (PR #786). - -``disable_tracing`` exists because ``enable_tracing`` is idempotent: reviewers asked for -either documentation or an explicit reset before changing the tracer (vgvoleg). -""" +"""Public OpenTelemetry entrypoints for YDB.""" def enable_tracing(tracer=None): diff --git a/ydb/query/session.py b/ydb/query/session.py index 8336d457..473f9276 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -495,10 +495,6 @@ def execute( ) try: - # Make ``ydb.ExecuteQuery`` the active OTel context only around the - # initial gRPC call so ``inject()`` writes ``traceparent`` into the - # request metadata. The span itself outlives this block — the result - # iterator owns ``end()``. with span.attach_context(): stream_it = self._execute_call( query=query, diff --git a/ydb/retries.py b/ydb/retries.py index 1b9ce309..aacdb5ca 100644 --- a/ydb/retries.py +++ b/ydb/retries.py @@ -3,7 +3,7 @@ import inspect import random import time -from typing import Any, Awaitable, Callable, Generator, Optional, Union +from typing import Any, Callable, Generator, Optional, Union from . 
import issues from ._errors import check_retriable_error @@ -20,9 +20,6 @@ def _start_run_with_retry_span(): def _start_try_span(backoff_ms: Optional[int]): - # ``backoff_ms is None`` for the first attempt — the attribute is omitted because - # there was no preceding sleep at all. For every subsequent attempt the attribute - # is set, including ``0`` on the skip-yield retry path (Aborted/BadSession/...). attrs = {_BACKOFF_ATTR: backoff_ms} if backoff_ms is not None else None return _tracing_registry.create_span(_TRY_SPAN, attributes=attrs, kind="internal") @@ -38,16 +35,12 @@ def __init__( self.slot_duration = slot_duration self.uncertain_ratio = uncertain_ratio - def calc_backoff_ms(self, retry_number: int) -> int: + def calc_timeout(self, retry_number: int) -> float: slots_count = 1 << min(retry_number, self.ceiling) max_duration_ms = slots_count * self.slot_duration * 1000.0 - # duration_ms = random.random() * max_duration_ms * uncertain_ratio + max_duration_ms * (1 - uncertain_ratio) + # duration_ms = random.random() * max_duration_ms * uncertain_ratio) + max_duration_ms * (1 - uncertain_ratio) duration_ms = max_duration_ms * (random.random() * self.uncertain_ratio + 1.0 - self.uncertain_ratio) - return int(duration_ms) - - def calc_timeout(self, retry_number: int) -> float: - """Backward-compatible alias returning seconds.""" - return self.calc_backoff_ms(retry_number) / 1000.0 + return duration_ms / 1000.0 class RetrySettings: @@ -94,22 +87,8 @@ def with_slow_backoff(self, backoff_settings: BackoffSettings) -> "RetrySettings class YdbRetryOperationSleepOpt: - """Yielded by :func:`retry_operation_impl` between attempts. - - ``timeout`` is the wait in seconds (``time.sleep`` / ``asyncio.sleep``); for the - "skip yield" YDB error path (``Aborted``/``BadSession``/``NotFound``/``InternalError``) - it is ``0.0`` and ``exception`` is set so consumers still emit one artefact per - attempt (e.g. a ``ydb.Try`` span). 
``backoff_ms`` exposes the same value as integer - milliseconds for OpenTelemetry attributes. - """ - - def __init__(self, timeout: float, exception: Optional[BaseException] = None) -> None: + def __init__(self, timeout: float) -> None: self.timeout = timeout - self.exception = exception - - @property - def backoff_ms(self) -> int: - return int(self.timeout * 1000) def __eq__(self, other: object) -> bool: return ( @@ -146,14 +125,6 @@ def retry_operation_impl( *args: Any, **kwargs: Any, ) -> Generator[Union[YdbRetryOperationSleepOpt, YdbRetryOperationFinalResult], None, None]: - """Pure retry-policy generator. - - Yields ``YdbRetryOperationFinalResult`` (callee's return value, or coroutine for an - async callee) and, between attempts, ``YdbRetryOperationSleepOpt`` carrying the wait - time in seconds plus the original exception. OpenTelemetry spans are created by the - callers (``retry_operation_sync`` / ``retry_operation_async``), not here, so the - generator stays unaware of tracing. - """ retry_settings = RetrySettings() if retry_settings is None else retry_settings status: Optional[issues.Error] = None @@ -181,15 +152,14 @@ def retry_operation_impl( ) if isinstance(e, skip_yield_error_types): - # Fast retry path: no inter-attempt sleep, but we still yield a marker - # SleepOpt(0.0) so consumers (e.g. the sync wrapper) advance per-attempt - # bookkeeping such as ``ydb.Try`` spans. - yield YdbRetryOperationSleepOpt(0.0, exception=e) + # Skip the inter-attempt sleep but still emit a marker so consumers + # advance per-attempt bookkeeping (e.g. ``ydb.Try`` spans get backoff=0). 
+ yield YdbRetryOperationSleepOpt(0.0) else: - sleep_seconds = retriable_info.sleep_timeout_seconds or 0.0 - yield YdbRetryOperationSleepOpt(sleep_seconds, exception=e) + yield YdbRetryOperationSleepOpt(retriable_info.sleep_timeout_seconds) except Exception as e: + # you should provide your own handler you want retry_settings.unknown_error_handler(e) raise @@ -203,37 +173,18 @@ def retry_operation_sync( *args: Any, **kwargs: Any, ) -> Any: - """Drive :func:`retry_operation_impl` synchronously with OpenTelemetry spans. - - ``ydb.RunWithRetry`` is the outer ``INTERNAL`` span; each attempt runs inside a - ``ydb.Try`` whose ``ydb.retry.backoff_ms`` is the wait that preceded it. The first - ``ydb.Try`` has no such wait so the attribute is omitted; subsequent attempts - always carry it (``0`` on the skip-yield retry path). RPC spans - (``ydb.ExecuteQuery``/``ydb.Commit``/``ydb.Rollback``) nest under the active - ``ydb.Try`` because the sync callee runs while ``TracingSpan.__enter__`` has the - OTel context attached. - """ backoff_ms: Optional[int] = None - if inspect.iscoroutinefunction(callee): - # Async callee with a sync driver: keep current legacy behaviour — the impl just - # creates the coroutine, the caller is responsible for awaiting it. No ``ydb.Try`` - # is opened around the bare coroutine creation; tracing for that case lives in - # ``retry_operation_async``. 
- traced_callee: Callable[..., Any] = callee - else: - - @functools.wraps(callee) - def traced_callee(*a: Any, **kw: Any) -> Any: - with _start_try_span(backoff_ms): - return callee(*a, **kw) + @functools.wraps(callee) + def traced_callee(*a: Any, **kw: Any) -> Any: + with _start_try_span(backoff_ms): + return callee(*a, **kw) with _start_run_with_retry_span(): for next_opt in retry_operation_impl(traced_callee, retry_settings, *args, **kwargs): if isinstance(next_opt, YdbRetryOperationSleepOpt): - backoff_ms = next_opt.backoff_ms - if next_opt.timeout > 0: - time.sleep(next_opt.timeout) + backoff_ms = int(next_opt.timeout * 1000) + time.sleep(next_opt.timeout) else: return next_opt.result return None @@ -245,29 +196,29 @@ async def retry_operation_async( # pylint: disable=W1113 *args: Any, **kwargs: Any, ) -> Any: - """Drive :func:`retry_operation_impl` asynchronously with OpenTelemetry spans. + """ + The retry operation helper can be used to retry a coroutine that raises YDB specific + exceptions. + + :param callee: A coroutine to retry. + :param retry_settings: An instance of ydb.RetrySettings that describes how the coroutine + should be retried. If None, default instance of retry settings will be used. + :param args: A tuple with positional arguments to be passed into the coroutine. + :param kwargs: A dictionary with keyword arguments to be passed into the coroutine. - Mirrors :func:`retry_operation_sync`. The inter-attempt ``await asyncio.sleep`` runs - *outside* ``ydb.Try`` so an `asyncio.CancelledError` during the wait is recorded on - ``ydb.RunWithRetry`` (the outer span), not on a misleading per-attempt span. + Returns awaitable result of coroutine. If retries are not successful exception is raised.
""" backoff_ms: Optional[int] = None with _start_run_with_retry_span(): for next_opt in retry_operation_impl(callee, retry_settings, *args, **kwargs): if isinstance(next_opt, YdbRetryOperationSleepOpt): - backoff_ms = next_opt.backoff_ms - if next_opt.timeout > 0: - await asyncio.sleep(next_opt.timeout) + backoff_ms = int(next_opt.timeout * 1000) + await asyncio.sleep(next_opt.timeout) else: with _start_try_span(backoff_ms) as try_span: - awaitable: Awaitable[Any] = next_opt.result try: - return await awaitable - except BaseException as e: # noqa: BLE001 - # Exception is swallowed by ``next_opt.set_exception`` so the - # impl re-raises it on the next ``next()`` call; the ``with`` - # would not see it via ``__exit__``, so mark ``ydb.Try`` failed - # explicitly. + return await next_opt.result + except BaseException as e: # pylint: disable=W0703 try_span.set_error(e) next_opt.set_exception(e) return None @@ -323,11 +274,12 @@ async def async_wrapper(*args: Any, **kwargs: Any) -> Any: return await retry_operation_async(func, retry_settings, *args, **kwargs) return async_wrapper + else: - @functools.wraps(func) - def sync_wrapper(*args: Any, **kwargs: Any) -> Any: - return retry_operation_sync(func, retry_settings, *args, **kwargs) + @functools.wraps(func) + def sync_wrapper(*args: Any, **kwargs: Any) -> Any: + return retry_operation_sync(func, retry_settings, *args, **kwargs) - return sync_wrapper + return sync_wrapper return decorator diff --git a/ydb/table_test.py b/ydb/table_test.py index fca8815e..03973ef5 100644 --- a/ydb/table_test.py +++ b/ydb/table_test.py @@ -80,13 +80,10 @@ def check_retriable_error(err_type, backoff): YdbRetryOperationSleepOpt(backoff.calc_timeout(1)), ] == yields else: - # PR #786: retry_operation_impl now yields SleepOpt(0, exc) for these types so - # ``retry_operation_sync`` matches async behaviour (one ``ydb.Try`` per attempt). 
- assert len(yields) == 2 - assert all( - isinstance(y, YdbRetryOperationSleepOpt) and y.timeout == 0.0 and y.exception is not None - for y in yields - ) + # Skip-yield error types (Aborted/BadSession/NotFound/InternalError): impl emits + # SleepOpt(0.0) markers so consumers can rotate per-attempt bookkeeping + # (e.g. ``ydb.Try`` spans get backoff_ms=0). + assert [YdbRetryOperationSleepOpt(0.0), YdbRetryOperationSleepOpt(0.0)] == yields assert exc == err_type("test2") From 75f95ea98721af5c2dc04e78fa8f8258576f5cf3 Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sun, 3 May 2026 11:48:07 +0300 Subject: [PATCH 29/36] fix linter --- examples/opentelemetry/otel_example.py | 27 +++++++++++++------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index e474b723..2361ba49 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -8,7 +8,6 @@ import asyncio import os -import socket import sys from pathlib import Path @@ -17,20 +16,20 @@ if str(_repo_root) not in sys.path: sys.path.insert(0, str(_repo_root)) -import ydb # noqa: E402 -from ydb import _utilities as _yutil # noqa: E402 -from opentelemetry import trace # noqa: E402 -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter # noqa: E402 -from opentelemetry.sdk.resources import Resource # noqa: E402 -from opentelemetry.sdk.trace import TracerProvider # noqa: E402 -from opentelemetry.sdk.trace.export import BatchSpanProcessor # noqa: E402 -from ydb.opentelemetry import enable_tracing # noqa: E402 +import ydb +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from ydb.opentelemetry import enable_tracing def _env(name: 
str, default: str) -> str: v = os.environ.get(name) return v if v is not None and v != "" else default + async def _first_amount(tx) -> int: async with await tx.execute("SELECT amount FROM bank WHERE id = 1") as results: async for rs in results: @@ -42,8 +41,8 @@ async def _first_amount(tx) -> int: async def _bank_read_update(tx) -> None: count = await _first_amount(tx) async with await tx.execute( - "UPDATE bank SET amount = $amt + 1 WHERE id = 1", - {"$amt": (count, ydb.PrimitiveType.Int32)}, + "UPDATE bank SET amount = $amt + 1 WHERE id = 1", + {"$amt": (count, ydb.PrimitiveType.Int32)}, ): pass @@ -62,9 +61,9 @@ async def main() -> None: enable_tracing(tracer) async with ydb.aio.Driver( - endpoint=endpoint, - database=database, - disable_discovery=True, + endpoint=endpoint, + database=database, + disable_discovery=True, ) as driver: await driver.wait(timeout=60) From 9ec69598539e3c361bb129bf5bd5de4737cf642d Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sun, 3 May 2026 11:53:03 +0300 Subject: [PATCH 30/36] fix linter --- examples/opentelemetry/otel_example.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index 2361ba49..fcb99ba5 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -41,8 +41,8 @@ async def _first_amount(tx) -> int: async def _bank_read_update(tx) -> None: count = await _first_amount(tx) async with await tx.execute( - "UPDATE bank SET amount = $amt + 1 WHERE id = 1", - {"$amt": (count, ydb.PrimitiveType.Int32)}, + "UPDATE bank SET amount = $amt + 1 WHERE id = 1", + {"$amt": (count, ydb.PrimitiveType.Int32)}, ): pass @@ -61,9 +61,9 @@ async def main() -> None: enable_tracing(tracer) async with ydb.aio.Driver( - endpoint=endpoint, - database=database, - disable_discovery=True, + endpoint=endpoint, + database=database, + disable_discovery=True, ) as driver: await driver.wait(timeout=60) 
From 515d57cae3f16f83bd253e772fc59af37c5050ba Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sun, 3 May 2026 16:34:29 +0300 Subject: [PATCH 31/36] fix linter --- examples/opentelemetry/otel_example.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index fcb99ba5..4f68643b 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -8,21 +8,13 @@ import asyncio import os -import sys -from pathlib import Path - -# For ``python otel_example.py`` in this tree without an installed ``ydb`` package. -_repo_root = Path(__file__).resolve().parent.parent.parent -if str(_repo_root) not in sys.path: - sys.path.insert(0, str(_repo_root)) - import ydb +from ydb.opentelemetry import enable_tracing from opentelemetry import trace from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor -from ydb.opentelemetry import enable_tracing def _env(name: str, default: str) -> str: @@ -41,8 +33,8 @@ async def _first_amount(tx) -> int: async def _bank_read_update(tx) -> None: count = await _first_amount(tx) async with await tx.execute( - "UPDATE bank SET amount = $amt + 1 WHERE id = 1", - {"$amt": (count, ydb.PrimitiveType.Int32)}, + "UPDATE bank SET amount = $amt + 1 WHERE id = 1", + {"$amt": (count, ydb.PrimitiveType.Int32)}, ): pass @@ -61,9 +53,9 @@ async def main() -> None: enable_tracing(tracer) async with ydb.aio.Driver( - endpoint=endpoint, - database=database, - disable_discovery=True, + endpoint=endpoint, + database=database, + disable_discovery=True, ) as driver: await driver.wait(timeout=60) From fc01331e8a5712caf02099261c5f2d007f355c3a Mon Sep 17 00:00:00 2001 From: KirillKurdyukov Date: Sun, 3 May 2026 16:40:02 +0300 Subject: 
[PATCH 32/36] fix linter --- examples/opentelemetry/otel_example.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/opentelemetry/otel_example.py b/examples/opentelemetry/otel_example.py index 4f68643b..6ec0c5a8 100644 --- a/examples/opentelemetry/otel_example.py +++ b/examples/opentelemetry/otel_example.py @@ -33,8 +33,8 @@ async def _first_amount(tx) -> int: async def _bank_read_update(tx) -> None: count = await _first_amount(tx) async with await tx.execute( - "UPDATE bank SET amount = $amt + 1 WHERE id = 1", - {"$amt": (count, ydb.PrimitiveType.Int32)}, + "UPDATE bank SET amount = $amt + 1 WHERE id = 1", + {"$amt": (count, ydb.PrimitiveType.Int32)}, ): pass @@ -53,9 +53,9 @@ async def main() -> None: enable_tracing(tracer) async with ydb.aio.Driver( - endpoint=endpoint, - database=database, - disable_discovery=True, + endpoint=endpoint, + database=database, + disable_discovery=True, ) as driver: await driver.wait(timeout=60) From e6deab1bb894ad712f5a2a8b0a02f053886c0be3 Mon Sep 17 00:00:00 2001 From: tewbo Date: Mon, 4 May 2026 21:09:46 +0300 Subject: [PATCH 33/36] fix issue --- tests/tracing/test_tracing_async.py | 42 +++++++++++++++++++++++++++++ ydb/aio/connection.py | 6 +++++ 2 files changed, 48 insertions(+) diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py index 3ff9ec6b..ae2682ff 100644 --- a/tests/tracing/test_tracing_async.py +++ b/tests/tracing/test_tracing_async.py @@ -94,6 +94,48 @@ async def test_create_session_emits_span(self, otel_setup): assert attrs["server.address"] == "test_endpoint" assert attrs["server.port"] == 1337 + def test_async_connection_peer_attributes_are_resolved(self, otel_setup): + exporter = otel_setup + + from ydb.aio.connection import Connection + from ydb.connection import EndpointOptions + from ydb.opentelemetry.tracing import create_ydb_span + from ydb.query.session import _resolve_peer + + cfg = FakeDriverConfig() + endpoint_options = 
EndpointOptions( + node_id=12345, + address="node.example.net", + port=2136, + location="dc-a", + ) + + with patch("ydb.aio.connection.channel_factory", return_value=MagicMock()): + with patch("ydb.aio.connection._stubs_list", ()): + connection = Connection( + endpoint="grpc://node.example.net:2136", + driver_config=cfg, + endpoint_options=endpoint_options, + ) + + driver = MagicMock() + driver._store.connections_by_node_id = {12345: connection} + + span = create_ydb_span( + "ydb.CreateSession", + cfg, + node_id=12345, + peer=_resolve_peer(driver, 12345), + ) + span.end() + + span = _get_single_span(exporter, "ydb.CreateSession") + attrs = dict(span.attributes) + assert attrs["ydb.node.id"] == 12345 + assert attrs["network.peer.address"] == "node.example.net" + assert attrs["network.peer.port"] == 2136 + assert attrs["ydb.node.dc"] == "dc-a" + class TestAsyncExecuteQuerySpan: @pytest.mark.asyncio diff --git a/ydb/aio/connection.py b/ydb/aio/connection.py index a3cf2ffc..e5e57e3b 100644 --- a/ydb/aio/connection.py +++ b/ydb/aio/connection.py @@ -153,6 +153,9 @@ class Connection: "closing", "endpoint_key", "node_id", + "peer_address", + "peer_port", + "peer_location", ) def __init__( @@ -164,6 +167,9 @@ def __init__( self.endpoint = endpoint self.endpoint_key = EndpointKey(self.endpoint, getattr(endpoint_options, "node_id", None)) self.node_id = getattr(endpoint_options, "node_id", None) + self.peer_address = getattr(endpoint_options, "address", None) + self.peer_port = getattr(endpoint_options, "port", None) + self.peer_location = getattr(endpoint_options, "location", None) self._channel = channel_factory(self.endpoint, driver_config, grpc.aio, endpoint_options=endpoint_options) self._driver_config = driver_config From c72eb1721d9bdea87b7dd0c90eef172d8b6ffe0a Mon Sep 17 00:00:00 2001 From: Oleg Ovcharuk Date: Thu, 7 May 2026 15:37:00 +0300 Subject: [PATCH 34/36] Refactor code --- .dockerignore | 4 +- .gitignore | 4 +- CHANGELOG.md | 9 +-- docs/opentelemetry.rst | 3 
+ setup.py | 1 - test-requirements.txt | 3 - tests/tracing/test_tracing_async.py | 43 ++++++------ tests/tracing/test_tracing_sync.py | 58 ++++++++-------- ydb/aio/pool.py | 4 +- ydb/aio/query/base.py | 26 +++---- ydb/aio/query/session.py | 59 ++++++++-------- ydb/aio/query/transaction.py | 75 ++++++++++----------- ydb/aio/table.py | 3 +- ydb/opentelemetry/__init__.py | 4 +- ydb/opentelemetry/{_plugin.py => plugin.py} | 53 ++++++--------- ydb/opentelemetry/tracing.py | 62 +++++++++++------ ydb/pool.py | 4 +- ydb/query/base.py | 26 +++---- ydb/query/session.py | 59 ++++++++-------- ydb/query/transaction.py | 75 ++++++++++----------- ydb/retries.py | 27 +++----- 21 files changed, 282 insertions(+), 320 deletions(-) rename ydb/opentelemetry/{_plugin.py => plugin.py} (67%) diff --git a/.dockerignore b/.dockerignore index 90dc0607..90fe8e80 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,6 +4,4 @@ !README.md !requirements.txt !pyproject.toml -!setup.py -!examples/opentelemetry/otel_example.py -!examples/opentelemetry/requirements.txt \ No newline at end of file +!setup.py \ No newline at end of file diff --git a/.gitignore b/.gitignore index 6ae4d6e8..36b3d2e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,10 @@ __pycache__ ydb.egg-info/ -.idea/ +/.idea /.vscode /tox /venv -.venv/ +/.venv /ydb_certs /ydb_data /tmp diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fa7d663..c0cf8703 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,4 @@ -## Unreleased ## -* OpenTelemetry: W3C trace context for gRPC stays bound for the whole ``ExecuteQuery`` stream - (until the result iterator finishes); no long-lived ``context.attach`` on the span; - ``disable_tracing()``; correct ``server.*`` from ``grpc://`` endpoints; zero work in - ``create_ydb_span`` when tracing is off; one ``ydb.Try`` per attempt for fast retriable - errors in sync retries. 
+* Added OpenTelemetry tracing support via `ydb.opentelemetry.enable_tracing()` ## 3.28.4 ## * Fix iam module lazy loading @@ -73,7 +68,7 @@ * Make DeadlineExceeded not retriable ## 3.23.4 ## -* Allow rollback after TLI +* Allow rollback after TLI ## 3.23.3 ## * Make attach session error readable diff --git a/docs/opentelemetry.rst b/docs/opentelemetry.rst index 127f4e54..c4eb810e 100644 --- a/docs/opentelemetry.rst +++ b/docs/opentelemetry.rst @@ -94,6 +94,9 @@ The following operations produce spans: * - ``ydb.ExecuteQuery`` - CLIENT - Executing a query (including ``execute_with_retries``). + * - ``ydb.BeginTransaction`` + - CLIENT + - Explicitly beginning a transaction via ``.begin()``. * - ``ydb.Commit`` - CLIENT - Committing an explicit transaction. diff --git a/setup.py b/setup.py index 634668dc..7cadc459 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,6 @@ options={"bdist_wheel": {"universal": True}}, extras_require={ "yc": ["yandexcloud", ], - # Named ``opentelemetry`` (not ``tracing``): avoids clashing with ``ydb.tracing`` (PR #786, vgvoleg). "opentelemetry": ["opentelemetry-api>=1.0.0"], } ) diff --git a/test-requirements.txt b/test-requirements.txt index 18d4cbf7..0f8d784b 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -43,10 +43,7 @@ sqlalchemy==1.4.26 pylint-protobuf cython freezegun>=1.3.0 -opentelemetry-api>=1.0.0 opentelemetry-sdk>=1.0.0 -# Namespace ``opentelemetry.exporter`` (examples, OTLP); not part of ``opentelemetry-api``. -opentelemetry-exporter-otlp-proto-grpc>=1.0.0 # pytest-cov yandexcloud -e . 
diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py index ae2682ff..c4d98c06 100644 --- a/tests/tracing/test_tracing_async.py +++ b/tests/tracing/test_tracing_async.py @@ -4,6 +4,7 @@ """ from opentelemetry.trace import StatusCode, SpanKind +from ydb.opentelemetry.tracing import SpanName, create_ydb_span from ydb.query.transaction import QueryTxStateEnum from .conftest import FakeDriverConfig from unittest.mock import AsyncMock, MagicMock, patch @@ -86,7 +87,7 @@ async def test_create_session_emits_span(self, otel_setup): with patch.object(QuerySession, "_attach", new_callable=AsyncMock): await qs.create() - span = _get_single_span(exporter, "ydb.CreateSession") + span = _get_single_span(exporter, SpanName.CREATE_SESSION) assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) assert attrs["db.system.name"] == "ydb" @@ -122,14 +123,14 @@ def test_async_connection_peer_attributes_are_resolved(self, otel_setup): driver._store.connections_by_node_id = {12345: connection} span = create_ydb_span( - "ydb.CreateSession", + SpanName.CREATE_SESSION, cfg, node_id=12345, peer=_resolve_peer(driver, 12345), ) span.end() - span = _get_single_span(exporter, "ydb.CreateSession") + span = _get_single_span(exporter, SpanName.CREATE_SESSION) attrs = dict(span.attributes) assert attrs["ydb.node.id"] == 12345 assert attrs["network.peer.address"] == "node.example.net" @@ -160,7 +161,7 @@ async def test_session_execute_emits_span(self, otel_setup): async for _ in result: pass - span = _get_single_span(exporter, "ydb.ExecuteQuery") + span = _get_single_span(exporter, SpanName.EXECUTE_QUERY) attrs = dict(span.attributes) assert attrs["ydb.node.id"] == 12345 assert attrs["network.peer.address"] == "n1" @@ -180,7 +181,7 @@ async def test_tx_execute_emits_span(self, otel_setup): async for _ in result: pass - span = _get_single_span(exporter, "ydb.ExecuteQuery") + span = _get_single_span(exporter, SpanName.EXECUTE_QUERY) attrs = 
dict(span.attributes) assert attrs["ydb.node.id"] == 12345 assert attrs["network.peer.address"] == "n1" @@ -199,7 +200,7 @@ async def test_begin_emits_span(self, otel_setup): with patch.object(type(tx), "_begin_call", new_callable=AsyncMock): await tx.begin() - span = _get_single_span(exporter, "ydb.BeginTransaction") + span = _get_single_span(exporter, SpanName.BEGIN_TRANSACTION) assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) assert attrs["db.system.name"] == "ydb" @@ -225,7 +226,7 @@ async def test_begin_sets_error_status_on_failure(self, otel_setup): with pytest.raises(issues.Unavailable): await tx.begin() - span = _get_single_span(exporter, "ydb.BeginTransaction") + span = _get_single_span(exporter, SpanName.BEGIN_TRANSACTION) assert span.status.status_code == StatusCode.ERROR attrs = dict(span.attributes) assert attrs["error.type"] == "ydb_error" @@ -243,7 +244,7 @@ async def test_commit_emits_span(self, otel_setup): with patch.object(type(tx), "_commit_call", new_callable=AsyncMock): await tx.commit() - span = _get_single_span(exporter, "ydb.Commit") + span = _get_single_span(exporter, SpanName.COMMIT) assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) assert attrs["network.peer.address"] == "n1" @@ -262,7 +263,7 @@ async def test_rollback_emits_span(self, otel_setup): with patch.object(type(tx), "_rollback_call", new_callable=AsyncMock): await tx.rollback() - span = _get_single_span(exporter, "ydb.Rollback") + span = _get_single_span(exporter, SpanName.ROLLBACK) assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) assert attrs["network.peer.address"] == "n1" @@ -290,7 +291,7 @@ async def test_commit_records_exception_on_failure(self, otel_setup): with pytest.raises(issues.Aborted): await tx.commit() - span = _get_single_span(exporter, "ydb.Commit") + span = _get_single_span(exporter, SpanName.COMMIT) assert span.status.status_code == StatusCode.ERROR attrs = dict(span.attributes) assert attrs["error.type"] == 
"ydb_error" @@ -310,7 +311,7 @@ async def test_rollback_records_exception_on_failure(self, otel_setup): with pytest.raises(issues.Unavailable): await tx.rollback() - span = _get_single_span(exporter, "ydb.Rollback") + span = _get_single_span(exporter, SpanName.ROLLBACK) assert span.status.status_code == StatusCode.ERROR attrs = dict(span.attributes) assert attrs["error.type"] == "ydb_error" @@ -342,7 +343,7 @@ async def test_error_sets_error_status_and_attributes(self, otel_setup): with pytest.raises(issues.SchemeError): await qs.execute("SELECT * FROM non_existing_table") - span = _get_single_span(exporter, "ydb.ExecuteQuery") + span = _get_single_span(exporter, SpanName.EXECUTE_QUERY) assert span.status.status_code == StatusCode.ERROR attrs = dict(span.attributes) assert attrs["error.type"] == "ydb_error" @@ -362,10 +363,10 @@ async def callee(): assert await retry_operation_async(callee) == 7 - run = _get_single_span(exporter, "ydb.RunWithRetry") + run = _get_single_span(exporter, SpanName.RUN_WITH_RETRY) assert run.kind == SpanKind.INTERNAL - tries = _get_spans(exporter, "ydb.Try") + tries = _get_spans(exporter, SpanName.TRY) assert len(tries) == 1 assert tries[0].parent.span_id == run.context.span_id assert "ydb.retry.backoff_ms" not in dict(tries[0].attributes) @@ -394,7 +395,7 @@ async def flaky(): assert await retry_operation_async(flaky, retry_settings) == "ok" - tries = _get_spans(exporter, "ydb.Try") + tries = _get_spans(exporter, SpanName.TRY) assert len(tries) == 3 assert tries[0].status.status_code == StatusCode.ERROR assert tries[1].status.status_code == StatusCode.ERROR @@ -430,12 +431,12 @@ async def flaky(): with pytest.raises(asyncio.CancelledError): await task - run = _get_single_span(exporter, "ydb.RunWithRetry") + run = _get_single_span(exporter, SpanName.RUN_WITH_RETRY) assert run.status.status_code == StatusCode.ERROR # TracingSpan / OTel will attach the cancellation as span events (record_exception) when enabled. 
assert run.events is not None # First attempt: ``ydb.Try``; cancel hits ``ydb.RunWithRetry`` during the inter-attempt sleep. - tries = _get_spans(exporter, "ydb.Try") + tries = _get_spans(exporter, SpanName.TRY) assert len(tries) >= 1 @@ -473,9 +474,9 @@ async def callee(): assert await retry_operation_async(callee) == "ok" - run = _get_single_span(exporter, "ydb.RunWithRetry") - try_span = _get_single_span(exporter, "ydb.Try") - exec_span = _get_single_span(exporter, "ydb.ExecuteQuery") + run = _get_single_span(exporter, SpanName.RUN_WITH_RETRY) + try_span = _get_single_span(exporter, SpanName.TRY) + exec_span = _get_single_span(exporter, SpanName.EXECUTE_QUERY) assert try_span.parent.span_id == run.context.span_id assert exec_span.parent.span_id == try_span.context.span_id @@ -517,7 +518,7 @@ async def do_execute(qs): qs2 = _make_session() await asyncio.gather(do_execute(qs1), do_execute(qs2)) - spans = _get_spans(exporter, "ydb.ExecuteQuery") + spans = _get_spans(exporter, SpanName.EXECUTE_QUERY) assert len(spans) == 2 ids = {s.context.span_id for s in spans} diff --git a/tests/tracing/test_tracing_sync.py b/tests/tracing/test_tracing_sync.py index 3a8d28d5..9f8bbc42 100644 --- a/tests/tracing/test_tracing_sync.py +++ b/tests/tracing/test_tracing_sync.py @@ -8,7 +8,7 @@ from unittest.mock import MagicMock, patch from opentelemetry import trace from opentelemetry.trace import StatusCode, SpanKind -from ydb.opentelemetry.tracing import _registry, create_ydb_span +from ydb.opentelemetry.tracing import SpanName, _registry, create_ydb_span from ydb.query.transaction import QueryTxStateEnum from .conftest import FakeDriverConfig @@ -84,7 +84,7 @@ def test_create_session_emits_span(self, otel_setup): with patch.object(QuerySession, "_attach", return_value=None): qs.create() - span = _get_single_span(exporter, "ydb.CreateSession") + span = _get_single_span(exporter, SpanName.CREATE_SESSION) assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) assert 
attrs["db.system.name"] == "ydb" @@ -116,7 +116,7 @@ def test_session_execute_emits_span(self, otel_setup): # Consume the iterator to finish the span list(result) - span = _get_single_span(exporter, "ydb.ExecuteQuery") + span = _get_single_span(exporter, SpanName.EXECUTE_QUERY) assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) assert attrs["db.system.name"] == "ydb" @@ -141,7 +141,7 @@ def test_tx_execute_emits_span(self, otel_setup): result = tx.execute("SELECT 1;") list(result) - span = _get_single_span(exporter, "ydb.ExecuteQuery") + span = _get_single_span(exporter, SpanName.EXECUTE_QUERY) attrs = dict(span.attributes) assert attrs["ydb.node.id"] == 12345 assert attrs["network.peer.address"] == "n1" @@ -160,7 +160,7 @@ def test_begin_emits_span(self, otel_setup): with patch.object(type(tx), "_begin_call", return_value=None): tx.begin() - span = _get_single_span(exporter, "ydb.BeginTransaction") + span = _get_single_span(exporter, SpanName.BEGIN_TRANSACTION) assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) assert attrs["db.system.name"] == "ydb" @@ -185,7 +185,7 @@ def test_begin_sets_error_status_on_failure(self, otel_setup): with pytest.raises(issues.Unavailable): tx.begin() - span = _get_single_span(exporter, "ydb.BeginTransaction") + span = _get_single_span(exporter, SpanName.BEGIN_TRANSACTION) assert span.status.status_code == StatusCode.ERROR attrs = dict(span.attributes) assert attrs["error.type"] == "ydb_error" @@ -202,7 +202,7 @@ def test_commit_emits_span(self, otel_setup): with patch.object(type(tx), "_commit_call", return_value=None): tx.commit() - span = _get_single_span(exporter, "ydb.Commit") + span = _get_single_span(exporter, SpanName.COMMIT) assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) assert attrs["db.system.name"] == "ydb" @@ -222,7 +222,7 @@ def test_rollback_emits_span(self, otel_setup): with patch.object(type(tx), "_rollback_call", return_value=None): tx.rollback() - span = 
_get_single_span(exporter, "ydb.Rollback") + span = _get_single_span(exporter, SpanName.ROLLBACK) assert span.kind == SpanKind.CLIENT attrs = dict(span.attributes) assert attrs["db.system.name"] == "ydb" @@ -252,7 +252,7 @@ def test_commit_records_exception_on_failure(self, otel_setup): with pytest.raises(issues.Aborted): tx.commit() - span = _get_single_span(exporter, "ydb.Commit") + span = _get_single_span(exporter, SpanName.COMMIT) assert span.status.status_code == StatusCode.ERROR attrs = dict(span.attributes) assert attrs["error.type"] == "ydb_error" @@ -271,7 +271,7 @@ def test_rollback_records_exception_on_failure(self, otel_setup): with pytest.raises(issues.Unavailable): tx.rollback() - span = _get_single_span(exporter, "ydb.Rollback") + span = _get_single_span(exporter, SpanName.ROLLBACK) assert span.status.status_code == StatusCode.ERROR attrs = dict(span.attributes) assert attrs["error.type"] == "ydb_error" @@ -302,7 +302,7 @@ def test_error_sets_error_status_and_attributes(self, otel_setup): with pytest.raises(issues.SchemeError): qs.execute("SELECT * FROM non_existing_table") - span = _get_single_span(exporter, "ydb.ExecuteQuery") + span = _get_single_span(exporter, SpanName.EXECUTE_QUERY) assert span.status.status_code == StatusCode.ERROR attrs = dict(span.attributes) assert attrs["error.type"] == "ydb_error" @@ -320,7 +320,7 @@ def test_no_spans_without_enable_tracing(self): _registry.set_metadata_hook(None) _exporter.clear() - with create_ydb_span("ydb.CreateSession", FakeDriverConfig()): + with create_ydb_span(SpanName.CREATE_SESSION, FakeDriverConfig()).attach_context(): pass assert len(_exporter.get_finished_spans()) == 0 @@ -333,11 +333,11 @@ def test_sdk_span_is_child_of_user_span(self, otel_setup): tracer = trace.get_tracer("test.tracer") with tracer.start_as_current_span("user.operation"): - with create_ydb_span("ydb.ExecuteQuery", FakeDriverConfig(), node_id=1): + with create_ydb_span(SpanName.EXECUTE_QUERY, FakeDriverConfig(), 
node_id=1).attach_context(): pass spans = exporter.get_finished_spans() - ydb_span = next(s for s in spans if s.name == "ydb.ExecuteQuery") + ydb_span = next(s for s in spans if s.name == SpanName.EXECUTE_QUERY) user_span = next(s for s in spans if s.name == "user.operation") assert ydb_span.parent is not None @@ -364,10 +364,10 @@ def test_driver_initialize_emits_internal_span(self, otel_setup): cfg = FakeDriverConfig() - with create_ydb_span("ydb.Driver.Initialize", cfg, kind="internal"): + with create_ydb_span(SpanName.DRIVER_INITIALIZE, cfg, kind="internal").attach_context(): pass - span = _get_single_span(exporter, "ydb.Driver.Initialize") + span = _get_single_span(exporter, SpanName.DRIVER_INITIALIZE) assert span.kind == SpanKind.INTERNAL attrs = dict(span.attributes) assert attrs["db.system.name"] == "ydb" @@ -387,7 +387,7 @@ def test_endpoint_parsing(self, otel_setup, endpoint, expected_host, expected_po exporter = otel_setup cfg = FakeDriverConfig(endpoint=endpoint, database="/mydb") - with create_ydb_span("ydb.Test", cfg): + with create_ydb_span("ydb.Test", cfg).attach_context(): pass span = _get_single_span(exporter, "ydb.Test") @@ -400,7 +400,7 @@ def test_peer_attributes_are_optional(self, otel_setup): exporter = otel_setup cfg = FakeDriverConfig() - with create_ydb_span("ydb.Test", cfg): + with create_ydb_span("ydb.Test", cfg).attach_context(): pass span = _get_single_span(exporter, "ydb.Test") @@ -412,7 +412,7 @@ def test_peer_attributes_emitted_when_known(self, otel_setup): exporter = otel_setup cfg = FakeDriverConfig() - with create_ydb_span("ydb.Test", cfg, peer=("peer.example.com", 2137, "dc-west")): + with create_ydb_span("ydb.Test", cfg, peer=("peer.example.com", 2137, "dc-west")).attach_context(): pass span = _get_single_span(exporter, "ydb.Test") @@ -464,11 +464,11 @@ def callee(): assert retry_operation_sync(callee) == 42 - run = _get_single_span(exporter, "ydb.RunWithRetry") + run = _get_single_span(exporter, SpanName.RUN_WITH_RETRY) assert 
run.kind == SpanKind.INTERNAL assert run.status.status_code == StatusCode.UNSET - tries = _get_spans(exporter, "ydb.Try") + tries = _get_spans(exporter, SpanName.TRY) assert len(tries) == 1 assert tries[0].kind == SpanKind.INTERNAL assert "ydb.retry.backoff_ms" not in dict(tries[0].attributes) @@ -496,7 +496,7 @@ def flaky(): assert retry_operation_sync(flaky, retry_settings) == "ok" - tries = _get_spans(exporter, "ydb.Try") + tries = _get_spans(exporter, SpanName.TRY) assert len(tries) == 3 # First attempt has no preceding backoff, so no attribute at all; later ones # carry a positive integer ms. @@ -546,7 +546,7 @@ def flaky(): expected_ms = 75 - tries = _get_spans(exporter, "ydb.Try") + tries = _get_spans(exporter, SpanName.TRY) assert len(tries) == 3 assert "ydb.retry.backoff_ms" not in dict(tries[0].attributes) assert dict(tries[1].attributes)["ydb.retry.backoff_ms"] == expected_ms @@ -569,7 +569,7 @@ def flaky(): assert retry_operation_sync(flaky, RetrySettings(max_retries=5)) == "ok" - tries = _get_spans(exporter, "ydb.Try") + tries = _get_spans(exporter, SpanName.TRY) assert len(tries) == 3 assert tries[0].status.status_code == StatusCode.ERROR assert tries[1].status.status_code == StatusCode.ERROR @@ -593,10 +593,10 @@ def broken(): with pytest.raises(issues.SchemeError): retry_operation_sync(broken) - run = _get_single_span(exporter, "ydb.RunWithRetry") + run = _get_single_span(exporter, SpanName.RUN_WITH_RETRY) assert run.status.status_code == StatusCode.ERROR - tries = _get_spans(exporter, "ydb.Try") + tries = _get_spans(exporter, SpanName.TRY) assert len(tries) == 1 assert tries[0].status.status_code == StatusCode.ERROR attrs = dict(tries[0].attributes) @@ -629,9 +629,9 @@ def callee(): assert retry_operation_sync(callee) == "ok" - run = _get_single_span(exporter, "ydb.RunWithRetry") - try_span = _get_single_span(exporter, "ydb.Try") - exec_span = _get_single_span(exporter, "ydb.ExecuteQuery") + run = _get_single_span(exporter, SpanName.RUN_WITH_RETRY) 
+ try_span = _get_single_span(exporter, SpanName.TRY) + exec_span = _get_single_span(exporter, SpanName.EXECUTE_QUERY) assert try_span.parent.span_id == run.context.span_id assert exec_span.parent.span_id == try_span.context.span_id diff --git a/ydb/aio/pool.py b/ydb/aio/pool.py index 4f1b0cdd..5eb51b5c 100644 --- a/ydb/aio/pool.py +++ b/ydb/aio/pool.py @@ -6,7 +6,7 @@ from typing import Any, Callable, Optional, Tuple, TYPE_CHECKING from ydb import issues -from ydb.opentelemetry.tracing import create_ydb_span +from ydb.opentelemetry.tracing import SpanName, create_ydb_span from ydb.pool import ConnectionsCache as _ConnectionsCache, IConnectionPool from .connection import Connection, EndpointKey @@ -286,7 +286,7 @@ async def __wrapper__() -> None: return __wrapper__ async def wait(self, timeout: Optional[float] = 7.0, fail_fast: bool = False) -> None: # type: ignore[override] # async override of sync method - with create_ydb_span("ydb.Driver.Initialize", self._driver_config, kind="internal"): + with create_ydb_span(SpanName.DRIVER_INITIALIZE, self._driver_config, kind="internal").attach_context(): await self._store.get(fast_fail=fail_fast, wait_timeout=timeout if timeout is not None else 7.0) def discovery_debug_details(self) -> str: diff --git a/ydb/aio/query/base.py b/ydb/aio/query/base.py index 0344683d..6c13dfd4 100644 --- a/ydb/aio/query/base.py +++ b/ydb/aio/query/base.py @@ -2,12 +2,12 @@ class AsyncResponseContextIterator(_utilities.AsyncResponseIterator): - """Async ExecuteQuery result stream; ends the attached OTel span when consumed.""" + """Async ExecuteQuery result stream.""" - def __init__(self, it, wrapper, on_error=None, span=None): + def __init__(self, it, wrapper, on_error=None, on_finish=None): super().__init__(it, wrapper) self._on_error = on_error - self._span = span + self._on_finish = on_finish async def __aenter__(self) -> "AsyncResponseContextIterator": return self @@ -18,7 +18,7 @@ async def _next(self): except StopAsyncIteration: # Normal 
stream termination is not an error and must not invalidate # the session. - self._finish_span() + self._call_on_finish() raise except BaseException as e: # BaseException (not Exception) because asyncio.CancelledError @@ -29,20 +29,16 @@ async def _next(self): # reply with SessionBusy. if self._on_error: self._on_error(e) - self._finish_span(e) + self._call_on_finish(e) raise - def _finish_span(self, exception=None): - if self._span is not None: - if exception is not None: - self._span.set_error(exception) - self._span.end() - self._span = None + def _call_on_finish(self, exception=None): + if self._on_finish is not None: + self._on_finish(exception) + self._on_finish = None def __del__(self): - if self._span is not None: - self._span.end() - self._span = None + self._call_on_finish() async def __aexit__(self, exc_type, exc_val, exc_tb): # To close stream on YDB it is necessary to scroll through it to the end. @@ -56,4 +52,4 @@ async def __aexit__(self, exc_type, exc_val, exc_tb): pass except BaseException: pass - self._finish_span() + self._call_on_finish() diff --git a/ydb/aio/query/session.py b/ydb/aio/query/session.py index b6b8ee92..b776b638 100644 --- a/ydb/aio/query/session.py +++ b/ydb/aio/query/session.py @@ -19,7 +19,7 @@ from ...query import base from ...query.session import BaseQuerySession -from ...opentelemetry.tracing import create_ydb_span, set_peer_attributes +from ...opentelemetry.tracing import SpanName, create_ydb_span, set_peer_attributes, span_finish_callback from ..._constants import DEFAULT_INITIAL_RESPONSE_TIMEOUT @@ -106,7 +106,7 @@ async def create(self, settings: Optional[BaseRequestSettings] = None) -> "Query if self._closed: raise RuntimeError("Session is already closed") - with create_ydb_span("ydb.CreateSession", self._driver_config) as span: + with create_ydb_span(SpanName.CREATE_SESSION, self._driver_config).attach_context() as span: await self._create_call(settings=settings) set_peer_attributes(span, self._peer) await 
self._attach() @@ -163,42 +163,37 @@ async def execute( self._check_session_ready_to_use() span = create_ydb_span( - "ydb.ExecuteQuery", + SpanName.EXECUTE_QUERY, self._driver_config, node_id=self._node_id, peer=self._peer, ) - try: - with span.attach_context(): - stream_it = await self._execute_call( - query=query, - parameters=parameters, - commit_tx=True, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - concurrent_result_sets=concurrent_result_sets, - settings=settings, - ) - return AsyncResponseContextIterator( - it=stream_it, - wrapper=lambda resp: base.wrap_execute_query_response( - rpc_state=None, - response_pb=resp, - session=self, - settings=self._settings, - ), - on_error=self._on_execute_stream_error, - span=span, + with span.attach_context(end_on_exit=False): + stream_it = await self._execute_call( + query=query, + parameters=parameters, + commit_tx=True, + syntax=syntax, + exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + concurrent_result_sets=concurrent_result_sets, + settings=settings, ) - except Exception as e: - span.set_error(e) - span.end() - raise + return AsyncResponseContextIterator( + it=stream_it, + wrapper=lambda resp: base.wrap_execute_query_response( + rpc_state=None, + response_pb=resp, + session=self, + settings=self._settings, + ), + on_error=self._on_execute_stream_error, + on_finish=span_finish_callback(span), + ) async def explain( self, diff --git a/ydb/aio/query/transaction.py b/ydb/aio/query/transaction.py index ca7cc6da..a05d91f2 100644 --- a/ydb/aio/query/transaction.py +++ b/ydb/aio/query/transaction.py @@ -12,7 +12,7 @@ BaseQueryTxContext, QueryTxStateEnum, ) -from ...opentelemetry.tracing import create_ydb_span +from ...opentelemetry.tracing 
import SpanName, create_ydb_span, span_finish_callback if TYPE_CHECKING: from .session import QuerySession @@ -89,11 +89,11 @@ async def begin(self, settings: Optional[BaseRequestSettings] = None) -> "QueryT :return: None or exception if begin is failed """ with create_ydb_span( - "ydb.BeginTransaction", + SpanName.BEGIN_TRANSACTION, self._driver_config, node_id=self.session.node_id, peer=getattr(self.session, "_peer", None), - ): + ).attach_context(): await self._begin_call(settings) return self @@ -117,11 +117,11 @@ async def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: await self._ensure_prev_stream_finished() with create_ydb_span( - "ydb.Commit", + SpanName.COMMIT, self._driver_config, node_id=self.session.node_id, peer=getattr(self.session, "_peer", None), - ): + ).attach_context(): try: await self._execute_callbacks_async(base.TxEvent.BEFORE_COMMIT) await self._commit_call(settings) @@ -150,11 +150,11 @@ async def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None await self._ensure_prev_stream_finished() with create_ydb_span( - "ydb.Rollback", + SpanName.ROLLBACK, self._driver_config, node_id=self.session.node_id, peer=getattr(self.session, "_peer", None), - ): + ).attach_context(): try: await self._execute_callbacks_async(base.TxEvent.BEFORE_ROLLBACK) await self._rollback_call(settings) @@ -210,42 +210,37 @@ async def execute( await self._ensure_prev_stream_finished() span = create_ydb_span( - "ydb.ExecuteQuery", + SpanName.EXECUTE_QUERY, self._driver_config, node_id=self.session.node_id, peer=getattr(self.session, "_peer", None), ) - try: - with span.attach_context(): - stream_it = await self._execute_call( - query=query, - parameters=parameters, - commit_tx=commit_tx, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - concurrent_result_sets=concurrent_result_sets, 
- settings=settings, - ) - self._prev_stream = AsyncResponseContextIterator( - it=stream_it, - wrapper=lambda resp: base.wrap_execute_query_response( - rpc_state=None, - response_pb=resp, - session=self.session, - tx=self, - commit_tx=commit_tx, - settings=self.session._settings, - ), - on_error=self.session._on_execute_stream_error, - span=span, + with span.attach_context(end_on_exit=False): + stream_it = await self._execute_call( + query=query, + parameters=parameters, + commit_tx=commit_tx, + syntax=syntax, + exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + concurrent_result_sets=concurrent_result_sets, + settings=settings, ) - return self._prev_stream - except Exception as e: - span.set_error(e) - span.end() - raise + self._prev_stream = AsyncResponseContextIterator( + it=stream_it, + wrapper=lambda resp: base.wrap_execute_query_response( + rpc_state=None, + response_pb=resp, + session=self.session, + tx=self, + commit_tx=commit_tx, + settings=self.session._settings, + ), + on_error=self.session._on_execute_stream_error, + on_finish=span_finish_callback(span), + ) + return self._prev_stream diff --git a/ydb/aio/table.py b/ydb/aio/table.py index 0d14ba2f..8d5e02c1 100644 --- a/ydb/aio/table.py +++ b/ydb/aio/table.py @@ -462,7 +462,8 @@ async def retry_operation(callee, retry_settings=None, *args, **kwargs): # pyli opt_generator = ydb.retry_operation_impl(callee, retry_settings, *args, **kwargs) for next_opt in opt_generator: if isinstance(next_opt, ydb.YdbRetryOperationSleepOpt): - await asyncio.sleep(next_opt.timeout) + if next_opt.timeout > 0: + await asyncio.sleep(next_opt.timeout) else: try: return await next_opt.result diff --git a/ydb/opentelemetry/__init__.py b/ydb/opentelemetry/__init__.py index a77e1ab7..fc058d0d 100644 --- a/ydb/opentelemetry/__init__.py +++ b/ydb/opentelemetry/__init__.py @@ -13,7 +13,7 @@ def 
enable_tracing(tracer=None): ``ydb.sdk`` from the global tracer provider will be used. """ try: - from ydb.opentelemetry._plugin import _enable_tracing + from ydb.opentelemetry.plugin import _enable_tracing except ImportError: raise ImportError( "OpenTelemetry packages are required for tracing support. " @@ -26,7 +26,7 @@ def enable_tracing(tracer=None): def disable_tracing(): """Disable YDB OpenTelemetry hooks and allow :func:`enable_tracing` to run again.""" try: - from ydb.opentelemetry._plugin import _disable_tracing + from ydb.opentelemetry.plugin import _disable_tracing except ImportError: return diff --git a/ydb/opentelemetry/_plugin.py b/ydb/opentelemetry/plugin.py similarity index 67% rename from ydb/opentelemetry/_plugin.py rename to ydb/opentelemetry/plugin.py index 3e2dc188..76942789 100644 --- a/ydb/opentelemetry/_plugin.py +++ b/ydb/opentelemetry/plugin.py @@ -49,45 +49,46 @@ def _set_error_on_span(span, exception): class _AttachContext: - """Make a span the active OTel context for a ``with`` block, without ending it. + """Make a span the active OTel context for a ``with`` block. - Used around the initial gRPC call of a streaming RPC: the span outlives the - ``with`` block — the result iterator owns ``end()``. For non-streaming RPCs - use ``with create_ydb_span(...)`` directly. + When ``end_on_exit=True`` (default) the span is ended on exit — used for + single-shot RPCs. When ``end_on_exit=False`` the span is only ended on + exception — used for streaming RPCs where the result iterator owns ``end()``. 
""" - def __init__(self, raw_span): - self._raw = raw_span + def __init__(self, span, end_on_exit): + self._span = span + self._end_on_exit = end_on_exit self._token = None def __enter__(self): - ctx = trace.set_span_in_context(self._raw) + ctx = trace.set_span_in_context(self._span._span) self._token = otel_context.attach(ctx) - return self + return self._span def __exit__(self, exc_type, exc_val, exc_tb): if self._token is not None: otel_context.detach(self._token) self._token = None + if exc_val is not None: + self._span.set_error(exc_val) + self._span.end() + elif self._end_on_exit: + self._span.end() return False class TracingSpan: """Wrapper around an OTel span. - As context manager: ``__enter__`` attaches the OTel context (so child spans - nest correctly and ``inject()`` sees this span when building gRPC metadata) - and ``__exit__`` detaches and ends the span. Used by Commit / Rollback / - RunWithRetry / Try and similar single-shot operations. - - For ExecuteQuery streams the span outlives the ``with`` block: call - :meth:`attach_context` around the initial gRPC call only, and let the result - iterator own ``end()``. + Use :meth:`attach_context` as a context manager around any RPC call. + The default (``end_on_exit=True``) is for single-shot operations; pass + ``end_on_exit=False`` for streaming RPCs where the result iterator owns + ``end()``. 
""" def __init__(self, span): self._span = span - self._otel_context_token = None def set_error(self, exception): _set_error_on_span(self._span, exception) @@ -98,22 +99,8 @@ def set_attribute(self, key, value): def end(self): self._span.end() - def attach_context(self): - return _AttachContext(self._span) - - def __enter__(self): - ctx = trace.set_span_in_context(self._span) - self._otel_context_token = otel_context.attach(ctx) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self._otel_context_token is not None: - otel_context.detach(self._otel_context_token) - self._otel_context_token = None - if exc_val is not None: - self.set_error(exc_val) - self.end() - return False + def attach_context(self, end_on_exit=True): + return _AttachContext(self, end_on_exit) def _create_span(name, attributes=None, kind=None): diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index 5aa39813..9d7f47e0 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -1,19 +1,35 @@ -"""OpenTelemetry helpers and registry.""" +"""Internal SDK tracing helpers and registry.""" +import enum from typing import Optional, Tuple +class SpanName(str, enum.Enum): + """Canonical span names used across the YDB SDK.""" + + CREATE_SESSION = "ydb.CreateSession" + EXECUTE_QUERY = "ydb.ExecuteQuery" + BEGIN_TRANSACTION = "ydb.BeginTransaction" + COMMIT = "ydb.Commit" + ROLLBACK = "ydb.Rollback" + DRIVER_INITIALIZE = "ydb.Driver.Initialize" + RUN_WITH_RETRY = "ydb.RunWithRetry" + TRY = "ydb.Try" + + class _NoopCtx: + __slots__ = ("_span",) + + def __init__(self, span): + self._span = span + def __enter__(self): - return self + return self._span def __exit__(self, exc_type, exc_val, exc_tb): return False -_NOOP_CTX = _NoopCtx() - - class _NoopSpan: """Returned by create_ydb_span when tracing is disabled.""" @@ -26,14 +42,8 @@ def set_attribute(self, key, value): def end(self): pass - def attach_context(self): - return _NOOP_CTX - - def 
__enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - return False + def attach_context(self, end_on_exit=True): + return _NoopCtx(self) _NOOP_SPAN = _NoopSpan() @@ -53,13 +63,11 @@ def is_active(self) -> bool: return self._create_span_func is not None def create_span(self, name, attributes=None, kind=None): - """Create a span. Returns a TracingSpan or _NoopSpan.""" if self._create_span_func is None: return _NOOP_SPAN return self._create_span_func(name, attributes, kind=kind) def get_trace_metadata(self): - """Return tracing metadata (e.g. W3C traceparent) for gRPC calls.""" if self._metadata_hook is not None: return self._metadata_hook() return [] @@ -80,7 +88,6 @@ def get_trace_metadata(): def _split_endpoint(endpoint: Optional[str]) -> Tuple[str, int]: - """Split ``host:port`` for OTel ``server.*`` attributes (no ``grpc://`` prefix; IPv6-safe).""" ep = endpoint or "" if ep.startswith("grpcs://"): ep = ep[len("grpcs://") :] @@ -119,13 +126,13 @@ def _build_ydb_attrs(driver_config, node_id=None, peer=None): return attrs -def create_ydb_span(name, driver_config, node_id=None, kind=None, peer=None): - """Create a span pre-filled with standard YDB attributes. +def create_span(name, attributes=None, kind="internal"): + """Create a span with no YDB-specific attributes (used for SDK-internal operations).""" + return _registry.create_span(name, attributes=attributes, kind=kind).attach_context() - ``peer`` is a ``(address, port, location)`` tuple pulled from the endpoint - map for the specific node serving the call; missing fields are skipped. - Can be used as a context manager or manually. 
- """ + +def create_ydb_span(name, driver_config, node_id=None, kind=None, peer=None): + """Create a span pre-filled with standard YDB attributes.""" if not _registry.is_active(): return _NOOP_SPAN attrs = _build_ydb_attrs(driver_config, node_id, peer) @@ -143,3 +150,14 @@ def set_peer_attributes(span, peer): span.set_attribute("network.peer.port", int(port)) if location: span.set_attribute("ydb.node.dc", location) + + +def span_finish_callback(span): + """Return an on_finish callable that ends *span* when a streaming result iterator completes.""" + + def _finish(exception=None): + if exception is not None: + span.set_error(exception) + span.end() + + return _finish diff --git a/ydb/pool.py b/ydb/pool.py index 4fef6377..31bfe8ba 100644 --- a/ydb/pool.py +++ b/ydb/pool.py @@ -10,7 +10,7 @@ from typing import Any, Callable, ContextManager, List, Optional, Set, Tuple, TYPE_CHECKING from . import connection as connection_impl, issues, resolver, _utilities, tracing -from .opentelemetry.tracing import create_ydb_span +from .opentelemetry.tracing import SpanName, create_ydb_span from abc import abstractmethod from .connection import Connection, EndpointKey @@ -454,7 +454,7 @@ def wait(self, timeout: Optional[float] = None, fail_fast: bool = False) -> None :param timeout: A timeout to wait in seconds :return: None """ - with create_ydb_span("ydb.Driver.Initialize", self._driver_config, kind="internal"): + with create_ydb_span(SpanName.DRIVER_INITIALIZE, self._driver_config, kind="internal").attach_context(): if fail_fast: self._store.add_fast_fail().result(timeout) else: diff --git a/ydb/query/base.py b/ydb/query/base.py index e3d124ab..093c7d55 100644 --- a/ydb/query/base.py +++ b/ydb/query/base.py @@ -72,12 +72,12 @@ class QueryResultSetFormat(enum.IntEnum): class SyncResponseContextIterator(_utilities.SyncResponseIterator): - """Streams ExecuteQuery results; ends the attached OTel span when the stream is consumed.""" + """Streams ExecuteQuery results.""" - def 
__init__(self, it, wrapper, on_error=None, span=None): + def __init__(self, it, wrapper, on_error=None, on_finish=None): super().__init__(it, wrapper) self._on_error = on_error - self._span = span + self._on_finish = on_finish def __enter__(self) -> "SyncResponseContextIterator": return self @@ -88,7 +88,7 @@ def _next(self): except StopIteration: # Normal stream termination is not an error and must not invalidate # the session. - self._finish_span() + self._call_on_finish() raise except BaseException as e: # BaseException (not Exception) for parity with the async iterator: @@ -98,20 +98,16 @@ def _next(self): # SessionBusy. if self._on_error: self._on_error(e) - self._finish_span(e) + self._call_on_finish(e) raise - def _finish_span(self, exception=None): - if self._span is not None: - if exception is not None: - self._span.set_error(exception) - self._span.end() - self._span = None + def _call_on_finish(self, exception=None): + if self._on_finish is not None: + self._on_finish(exception) + self._on_finish = None def __del__(self): - if self._span is not None: - self._span.end() - self._span = None + self._call_on_finish() def __exit__(self, exc_type, exc_val, exc_tb): # To close stream on YDB it is necessary to scroll through it to the end. @@ -123,7 +119,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): pass except BaseException: pass - self._finish_span() + self._call_on_finish() class QueryClientSettings: diff --git a/ydb/query/session.py b/ydb/query/session.py index 473f9276..a9c1b4a5 100644 --- a/ydb/query/session.py +++ b/ydb/query/session.py @@ -18,7 +18,7 @@ from .base import QueryExplainResultFormat from .. 
import _apis, issues, _utilities -from ..opentelemetry.tracing import create_ydb_span, set_peer_attributes +from ..opentelemetry.tracing import SpanName, create_ydb_span, set_peer_attributes, span_finish_callback from ..settings import BaseRequestSettings from ..connection import _RpcState as RpcState, EndpointKey from .._grpc.grpcwrapper import common_utils @@ -418,7 +418,7 @@ def create(self, settings: Optional[BaseRequestSettings] = None) -> "QuerySessio if self._closed: raise RuntimeError("Session is already closed.") - with create_ydb_span("ydb.CreateSession", self._driver_config) as span: + with create_ydb_span(SpanName.CREATE_SESSION, self._driver_config).attach_context() as span: self._create_call(settings=settings) set_peer_attributes(span, self._peer) self._attach() @@ -488,42 +488,37 @@ def execute( self._check_session_ready_to_use() span = create_ydb_span( - "ydb.ExecuteQuery", + SpanName.EXECUTE_QUERY, self._driver_config, node_id=self._node_id, peer=self._peer, ) - try: - with span.attach_context(): - stream_it = self._execute_call( - query=query, - parameters=parameters, - commit_tx=True, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - concurrent_result_sets=concurrent_result_sets, - settings=settings, - ) - return base.SyncResponseContextIterator( - stream_it, - lambda resp: base.wrap_execute_query_response( - rpc_state=None, - response_pb=resp, - session=self, - settings=self._settings, - ), - on_error=self._on_execute_stream_error, - span=span, + with span.attach_context(end_on_exit=False): + stream_it = self._execute_call( + query=query, + parameters=parameters, + commit_tx=True, + syntax=syntax, + exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + 
concurrent_result_sets=concurrent_result_sets, + settings=settings, ) - except Exception as e: - span.set_error(e) - span.end() - raise + return base.SyncResponseContextIterator( + stream_it, + lambda resp: base.wrap_execute_query_response( + rpc_state=None, + response_pb=resp, + session=self, + settings=self._settings, + ), + on_error=self._on_execute_stream_error, + on_finish=span_finish_callback(span), + ) def explain( self, diff --git a/ydb/query/transaction.py b/ydb/query/transaction.py index 75f18a36..1d278ac2 100644 --- a/ydb/query/transaction.py +++ b/ydb/query/transaction.py @@ -17,7 +17,7 @@ _apis, issues, ) -from ..opentelemetry.tracing import create_ydb_span +from ..opentelemetry.tracing import SpanName, create_ydb_span, span_finish_callback from .._grpc.grpcwrapper import ydb_topic as _ydb_topic from .._grpc.grpcwrapper import ydb_query as _ydb_query from ..connection import _RpcState as RpcState @@ -529,11 +529,11 @@ def begin(self, settings: Optional[BaseRequestSettings] = None) -> "QueryTxConte :return: Transaction object or exception if begin is failed """ with create_ydb_span( - "ydb.BeginTransaction", + SpanName.BEGIN_TRANSACTION, self._driver_config, node_id=self.session.node_id, peer=getattr(self.session, "_peer", None), - ): + ).attach_context(): self._begin_call(settings) return self @@ -557,11 +557,11 @@ def commit(self, settings: Optional[BaseRequestSettings] = None) -> None: self._ensure_prev_stream_finished() with create_ydb_span( - "ydb.Commit", + SpanName.COMMIT, self._driver_config, node_id=self.session.node_id, peer=getattr(self.session, "_peer", None), - ): + ).attach_context(): try: self._execute_callbacks_sync(base.TxEvent.BEFORE_COMMIT) self._commit_call(settings) @@ -589,11 +589,11 @@ def rollback(self, settings: Optional[BaseRequestSettings] = None) -> None: self._ensure_prev_stream_finished() with create_ydb_span( - "ydb.Rollback", + SpanName.ROLLBACK, self._driver_config, node_id=self.session.node_id, 
peer=getattr(self.session, "_peer", None), - ): + ).attach_context(): try: self._execute_callbacks_sync(base.TxEvent.BEFORE_ROLLBACK) self._rollback_call(settings) @@ -650,42 +650,37 @@ def execute( self._ensure_prev_stream_finished() span = create_ydb_span( - "ydb.ExecuteQuery", + SpanName.EXECUTE_QUERY, self._driver_config, node_id=self.session.node_id, peer=getattr(self.session, "_peer", None), ) - try: - with span.attach_context(): - stream_it = self._execute_call( - query=query, - commit_tx=commit_tx, - syntax=syntax, - exec_mode=exec_mode, - stats_mode=stats_mode, - schema_inclusion_mode=schema_inclusion_mode, - result_set_format=result_set_format, - arrow_format_settings=arrow_format_settings, - parameters=parameters, - concurrent_result_sets=concurrent_result_sets, - settings=settings, - ) - self._prev_stream = base.SyncResponseContextIterator( - stream_it, - lambda resp: base.wrap_execute_query_response( - rpc_state=None, - response_pb=resp, - session=self.session, - tx=self, - commit_tx=commit_tx, - settings=self.session._settings, - ), - on_error=self.session._on_execute_stream_error, - span=span, + with span.attach_context(end_on_exit=False): + stream_it = self._execute_call( + query=query, + commit_tx=commit_tx, + syntax=syntax, + exec_mode=exec_mode, + stats_mode=stats_mode, + schema_inclusion_mode=schema_inclusion_mode, + result_set_format=result_set_format, + arrow_format_settings=arrow_format_settings, + parameters=parameters, + concurrent_result_sets=concurrent_result_sets, + settings=settings, ) - return self._prev_stream - except Exception as e: - span.set_error(e) - span.end() - raise + self._prev_stream = base.SyncResponseContextIterator( + stream_it, + lambda resp: base.wrap_execute_query_response( + rpc_state=None, + response_pb=resp, + session=self.session, + tx=self, + commit_tx=commit_tx, + settings=self.session._settings, + ), + on_error=self.session._on_execute_stream_error, + on_finish=span_finish_callback(span), + ) + return 
self._prev_stream diff --git a/ydb/retries.py b/ydb/retries.py index aacdb5ca..7f1de6fa 100644 --- a/ydb/retries.py +++ b/ydb/retries.py @@ -7,21 +7,11 @@ from . import issues from ._errors import check_retriable_error -from .opentelemetry.tracing import _registry as _tracing_registry +from .opentelemetry.tracing import SpanName, create_span as _create_span -_RUN_WITH_RETRY_SPAN = "ydb.RunWithRetry" -_TRY_SPAN = "ydb.Try" -_BACKOFF_ATTR = "ydb.retry.backoff_ms" - - -def _start_run_with_retry_span(): - return _tracing_registry.create_span(_RUN_WITH_RETRY_SPAN, kind="internal") - - -def _start_try_span(backoff_ms: Optional[int]): - attrs = {_BACKOFF_ATTR: backoff_ms} if backoff_ms is not None else None - return _tracing_registry.create_span(_TRY_SPAN, attributes=attrs, kind="internal") +def _try_span_attrs(backoff_ms: Optional[int]): + return {"ydb.retry.backoff_ms": backoff_ms} if backoff_ms is not None else None class BackoffSettings: @@ -177,10 +167,10 @@ def retry_operation_sync( @functools.wraps(callee) def traced_callee(*a: Any, **kw: Any) -> Any: - with _start_try_span(backoff_ms): + with _create_span(SpanName.TRY, _try_span_attrs(backoff_ms)): return callee(*a, **kw) - with _start_run_with_retry_span(): + with _create_span(SpanName.RUN_WITH_RETRY): for next_opt in retry_operation_impl(traced_callee, retry_settings, *args, **kwargs): if isinstance(next_opt, YdbRetryOperationSleepOpt): backoff_ms = int(next_opt.timeout * 1000) @@ -209,13 +199,14 @@ async def retry_operation_async( # pylint: disable=W1113 Returns awaitable result of coroutine. If retries are not succussful exception is raised. 
""" backoff_ms: Optional[int] = None - with _start_run_with_retry_span(): + with _create_span(SpanName.RUN_WITH_RETRY): for next_opt in retry_operation_impl(callee, retry_settings, *args, **kwargs): if isinstance(next_opt, YdbRetryOperationSleepOpt): backoff_ms = int(next_opt.timeout * 1000) - await asyncio.sleep(next_opt.timeout) + if next_opt.timeout > 0: + await asyncio.sleep(next_opt.timeout) else: - with _start_try_span(backoff_ms) as try_span: + with _create_span(SpanName.TRY, _try_span_attrs(backoff_ms)) as try_span: try: return await next_opt.result except BaseException as e: # pylint: disable=W0703 From ac91ee6e77b7789fe0de00f00d019e8402af14cc Mon Sep 17 00:00:00 2001 From: Oleg Ovcharuk Date: Thu, 7 May 2026 16:05:34 +0300 Subject: [PATCH 35/36] Update test_tracing_async.py --- tests/tracing/test_tracing_async.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tracing/test_tracing_async.py b/tests/tracing/test_tracing_async.py index c4d98c06..6b4e96ad 100644 --- a/tests/tracing/test_tracing_async.py +++ b/tests/tracing/test_tracing_async.py @@ -4,7 +4,7 @@ """ from opentelemetry.trace import StatusCode, SpanKind -from ydb.opentelemetry.tracing import SpanName, create_ydb_span +from ydb.opentelemetry.tracing import SpanName from ydb.query.transaction import QueryTxStateEnum from .conftest import FakeDriverConfig from unittest.mock import AsyncMock, MagicMock, patch From febc9e323b942e84f30e8dda454ac3a6fc31920d Mon Sep 17 00:00:00 2001 From: Oleg Ovcharuk Date: Thu, 7 May 2026 17:26:19 +0300 Subject: [PATCH 36/36] review fixes --- ydb/opentelemetry/tracing.py | 6 ++++-- ydb/retries.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ydb/opentelemetry/tracing.py b/ydb/opentelemetry/tracing.py index 9d7f47e0..1d0995df 100644 --- a/ydb/opentelemetry/tracing.py +++ b/ydb/opentelemetry/tracing.py @@ -101,7 +101,9 @@ def _split_endpoint(endpoint: Optional[str]) -> Tuple[str, int]: port_s = ep[close + 2 :] return host, 
int(port_s) if port_s.isdigit() else 0 - host, _, port_s = ep.rpartition(":") + host, sep, port_s = ep.rpartition(":") + if not sep: + return ep, 0 return host, int(port_s) if port_s.isdigit() else 0 @@ -122,7 +124,7 @@ def _build_ydb_attrs(driver_config, node_id=None, peer=None): if location: attrs["ydb.node.dc"] = location if node_id is not None: - attrs["ydb.node.id"] = node_id or 0 + attrs["ydb.node.id"] = node_id return attrs diff --git a/ydb/retries.py b/ydb/retries.py index 7f1de6fa..4b7c137f 100644 --- a/ydb/retries.py +++ b/ydb/retries.py @@ -174,7 +174,8 @@ def traced_callee(*a: Any, **kw: Any) -> Any: for next_opt in retry_operation_impl(traced_callee, retry_settings, *args, **kwargs): if isinstance(next_opt, YdbRetryOperationSleepOpt): backoff_ms = int(next_opt.timeout * 1000) - time.sleep(next_opt.timeout) + if next_opt.timeout > 0: + time.sleep(next_opt.timeout) else: return next_opt.result return None