CVE-2026-41486
Description
Ray is an AI compute engine. From version 2.49.0 to before version 2.55.0, Ray Data registers custom Arrow extension types (ray.data.arrow_tensor, ray.data.arrow_tensor_v2, ray.data.arrow_variable_shaped_tensor) globally in PyArrow. When PyArrow reads a Parquet file containing one of these extension types, it calls __arrow_ext_deserialize__ on the field's metadata bytes. Ray's implementation passes these bytes directly to cloudpickle.loads(), which yields arbitrary code execution during schema parsing, before any row data is read. This issue has been patched in version 2.55.0.
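The unsafe step can be illustrated in isolation. The sketch below is not Ray's actual code; the class and the extension name "example.unsafe_tensor" are hypothetical stand-ins for the pattern the advisory describes: an __arrow_ext_deserialize__ hook that unpickles metadata bytes taken verbatim from a file's schema.

```python
# A minimal sketch of the vulnerable pattern (NOT Ray's actual code; the
# class and extension name "example.unsafe_tensor" are hypothetical).
import cloudpickle
import pyarrow as pa


class UnsafeTensorType(pa.ExtensionType):
    def __init__(self, storage_type: pa.DataType):
        super().__init__(storage_type, "example.unsafe_tensor")

    def __arrow_ext_serialize__(self) -> bytes:
        # Writers embed pickled metadata in the file's schema.
        return cloudpickle.dumps({"storage": str(self.storage_type)})

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized: bytes):
        # VULNERABLE: `serialized` comes verbatim from the file being read,
        # and unpickling attacker-controlled bytes executes arbitrary code.
        cloudpickle.loads(serialized)
        return cls(storage_type)


# Global registration: every PyArrow Parquet read in this process will now
# invoke the hook above whenever a file's schema names this type.
pa.register_extension_type(UnsafeTensorType(pa.binary()))
```

Because the registration is global, merely importing the library arms the hook: any subsequent pyarrow.parquet.read_schema or read_table call on a file naming the extension type runs attacker-controlled bytes through cloudpickle.loads before any row data is touched.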
Affected packages
Versions sourced from the GitHub Security Advisory.
| Package | Affected versions | Patched versions |
|---|---|---|
| ray (PyPI) | >= 2.49.0, < 2.55.0 | 2.55.0 |
Affected products
- Range: >= 2.49.0, < 2.55.0
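To check whether an installed environment falls in this range, a minimal sketch using importlib.metadata and the packaging library (assumed to be available; it ships with most Python environments via pip/setuptools):

```python
# Quick check of the installed Ray version against the affected range.
from importlib.metadata import version

from packaging.version import Version

installed = Version(version("ray"))
affected = Version("2.49.0") <= installed < Version("2.55.0")
print(f"ray {installed}: "
      f"{'AFFECTED, upgrade to >= 2.55.0' if affected else 'not affected'}")
```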
Patches
c02bd31ae319 · [Data] Fix RCE in Arrow extension type deserialization from Parquet (#62056)
2 files changed · +29 −37
python/ray/data/_internal/tensor_extensions/arrow.py · +28 −19 · modified

```diff
@@ -2,6 +2,7 @@
 import functools
 import json
 import logging
+import os
 import sys
 import threading
 import warnings
@@ -56,16 +57,20 @@ class _SerializationFormat(Enum):
-    # JSON format is legacy and inefficient, only kept for backward compatibility
     JSON = 0
     CLOUDPICKLE = 1


 # Set the default serialization format for Arrow extension types.
+# JSON is the default (safe). Cloudpickle is opt-in for backward compatibility.
 ARROW_EXTENSION_SERIALIZATION_FORMAT = _SerializationFormat(
-    _SerializationFormat.JSON  # legacy
-    if env_integer("RAY_DATA_ARROW_EXTENSION_SERIALIZATION_LEGACY_JSON_FORMAT", 0) == 1
-    else _SerializationFormat.CLOUDPICKLE  # default
+    _SerializationFormat.CLOUDPICKLE
+    if env_integer("RAY_DATA_ARROW_EXTENSION_SERIALIZATION_CLOUDPICKLE", 0) == 1
+    else _SerializationFormat.JSON
+)
+
+_AUTOLOAD_CLOUDPICKLE_TENSOR_METADATA = (
+    os.environ.get("RAY_DATA_AUTOLOAD_CLOUDPICKLE_TENSOR_METADATA", "0") == "1"
 )


 # Conditional imports for PyArrow features that are only available in newer versions
@@ -125,18 +130,23 @@ def _extension_array_concat_supported() -> bool:
 def _deserialize_with_fallback(serialized: bytes, field_name: str = "data"):
-    """Deserialize data with cloudpickle first, fallback to JSON."""
+    """Deserialize extension type metadata from Parquet field metadata.
+
+    Uses JSON only by default. cloudpickle deserialization is available as an
+    opt-in for files written by Ray 2.49-2.54, but MUST NOT be used with
+    untrusted Parquet files.
+    """
     try:
-        # Try cloudpickle first (new format)
-        return cloudpickle.loads(serialized)
-    except Exception:
-        # Fallback to JSON format (legacy)
-        try:
-            return json.loads(serialized)
-        except json.JSONDecodeError:
-            raise ValueError(
-                f"Unable to deserialize {field_name} from {type(serialized)}"
-            )
+        return json.loads(serialized)
+    except (json.JSONDecodeError, UnicodeDecodeError, ValueError):
+        if _AUTOLOAD_CLOUDPICKLE_TENSOR_METADATA:
+            # Opt-in only: files written by Ray 2.49-2.54 used cloudpickle.
+            # WARNING: Do not enable this for files from untrusted sources.
+            return cloudpickle.loads(serialized)
+        raise ValueError(
+            f"Unable to deserialize {field_name}. If this file was written by "
+            f"Ray 2.49-2.54, set RAY_DATA_AUTOLOAD_CLOUDPICKLE_TENSOR_METADATA=1 "
+            f"(trusted sources only)."
+        )


 @DeveloperAPI(stability="beta")
@@ -418,7 +428,7 @@ def _coerce_np_datetime_to_pa_timestamp_precision(
 def _infer_pyarrow_type(
-    column_values: Union[List[Any], np.ndarray]
+    column_values: Union[List[Any], np.ndarray],
 ) -> Optional[pa.DataType]:
     """Infers target Pyarrow `DataType` based on the provided columnar values.
@@ -500,7 +510,7 @@ def _len_gt_overflow_threshold(obj: Any) -> bool:
 def _try_infer_pa_timestamp_type(
-    column_values: Union[List[Any], np.ndarray]
+    column_values: Union[List[Any], np.ndarray],
 ) -> Optional[pa.DataType]:
     if isinstance(column_values, list) and len(column_values) > 0:
         # In case provided column values is a list of elements, this
@@ -1321,8 +1331,7 @@ def from_numpy(
             dtype.byteorder == "=" and sys.byteorder == "big"
         ):
            raise ValueError(
-                "Only little-endian string tensors are supported, "
-                f"but got: {dtype}"
+                f"Only little-endian string tensors are supported, but got: {dtype}"
            )
        pa_value_type = pa.binary(dtype.itemsize)
```
python/ray/data/tests/datasource/test_daft.py · +1 −18 · modified

```diff
@@ -1,4 +1,3 @@
-import os
 from unittest.mock import patch

 import pyarrow as pa
@@ -8,26 +7,10 @@
 @pytest.fixture(scope="module")
 def ray_start(request):
-    """Initialize Ray with proper serialization format."""
-    # TODO: Remove this once Daft issue is fixed to default to Cloudpickle
-    # serialization format.
-    # Force the serialization format to JSON for this test.
-    # Refer Daft issue https://github.com/Eventual-Inc/Daft/issues/4828
-    # and Ray issue https://github.com/ray-project/ray/issues/54837
-    # for more details.
-
-    # Set environment variable before importing ray
-    os.environ["RAY_DATA_ARROW_EXTENSION_SERIALIZATION_LEGACY_JSON_FORMAT"] = "1"
-
+    """Initialize Ray for Daft tests."""
     import ray
-    import ray.data._internal.tensor_extensions.arrow as arrow_module
-    from ray.data._internal.tensor_extensions.arrow import _SerializationFormat
-
-    # Force the serialization format to JSON after import
-    arrow_module.ARROW_EXTENSION_SERIALIZATION_FORMAT = _SerializationFormat.JSON

     try:
-        # Set environment variable for Ray workers
         yield ray.init(
             num_cpus=16,
         )
```
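Per the diff above, a deployment upgrading to 2.55.0 that still needs to read tensor Parquet files written by Ray 2.49-2.54 must opt in explicitly. A minimal sketch follows; the dataset path is hypothetical, and the variable must be set before ray is imported because the flag is read at module load time:

```python
# Opting in to legacy cloudpickle tensor metadata after upgrading to
# Ray >= 2.55.0. Only use this for Parquet files from trusted sources
# (e.g. files your own Ray 2.49-2.54 jobs wrote).
import os

# Must be set before `ray` is imported: the flag is read at module load.
os.environ["RAY_DATA_AUTOLOAD_CLOUDPICKLE_TENSOR_METADATA"] = "1"

import ray

# Hypothetical path to a trusted dataset written by Ray 2.49-2.54.
ds = ray.data.read_parquet("s3://my-trusted-bucket/tensor-data/")
print(ds.schema())
```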
Vulnerability mechanics
Importing ray.data globally registers the ray.data.arrow_tensor, ray.data.arrow_tensor_v2, and ray.data.arrow_variable_shaped_tensor extension types with PyArrow. When a Parquet file whose schema names one of these types is opened, PyArrow invokes the type's __arrow_ext_deserialize__ hook on the field's metadata bytes during schema parsing. In Ray 2.49.0 through 2.54.x that hook passed the bytes straight to cloudpickle.loads(), so a crafted pickle payload in the file's metadata executes arbitrary code before a single row is read. The fix in 2.55.0 makes JSON the default metadata format and gates cloudpickle deserialization behind the RAY_DATA_AUTOLOAD_CLOUDPICKLE_TENSOR_METADATA=1 opt-in for trusted legacy files.
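Since the hook only fires once Ray's types are registered, a suspect file can be triaged safely with plain PyArrow in a process where ray has not been imported. A sketch (the filename is hypothetical):

```python
# Triage a suspect file with plain PyArrow, in a process where `ray` has
# NOT been imported, so Ray's extension types are unregistered and no
# deserialize hook can fire during schema parsing.
import pyarrow.parquet as pq

schema = pq.read_schema("suspect.parquet")  # hypothetical filename
for field in schema:
    meta = field.metadata or {}
    ext_name = meta.get(b"ARROW:extension:name", b"")
    if ext_name.startswith(b"ray.data."):
        print(
            f"field {field.name!r} carries extension type "
            f"{ext_name.decode()}; vulnerable Ray versions would "
            f"cloudpickle.loads() its metadata on open"
        )
```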
References
- github.com/advisories/GHSA-mw35-8rx3-xf9r (advisory, via GHSA)
- nvd.nist.gov/vuln/detail/CVE-2026-41486 (advisory, via GHSA)
- github.com/ray-project/ray/commit/c02bd31ae31996805868baa446a131a8d304525f (web, via NVD)
- github.com/ray-project/ray/pull/54831 (web, via GHSA)
- github.com/ray-project/ray/pull/62056 (web, via NVD)
- github.com/ray-project/ray/releases/tag/ray-2.55.0 (web, via NVD)
- github.com/ray-project/ray/security/advisories/GHSA-mw35-8rx3-xf9r (web, via NVD)