VYPR
High severity · GHSA Advisory · Published May 8, 2026 · Updated May 13, 2026

CVE-2026-41486

CVE-2026-41486

Description

Ray is an AI compute engine. From version 2.49.0 to before version 2.55.0, Ray Data registers custom Arrow extension types (ray.data.arrow_tensor, ray.data.arrow_tensor_v2, ray.data.arrow_variable_shaped_tensor) globally in PyArrow. When PyArrow reads a Parquet file containing one of these extension types, it calls __arrow_ext_deserialize__ on the field's metadata bytes. Ray's implementation passes these bytes directly to cloudpickle.loads(), achieving arbitrary code execution during schema parsing, before any row data is read. This issue has been patched in version 2.55.0.

Affected packages

Versions sourced from the GitHub Security Advisory.

Package: ray (PyPI)
Affected versions: >= 2.49.0, < 2.55.0
Patched versions: 2.55.0

Affected products

1

Patches

1
c02bd31ae319

[Data] Fix RCE in Arrow extension type deserialization from Parquet (#62056)

https://github.com/ray-project/ray · Goutam · Mar 27, 2026 · via GHSA
2 files changed · +29 −37
  • python/ray/data/_internal/tensor_extensions/arrow.py · +28 −19 · modified
    @@ -2,6 +2,7 @@
     import functools
     import json
     import logging
    +import os
     import sys
     import threading
     import warnings
    @@ -56,16 +57,20 @@
     
     
     class _SerializationFormat(Enum):
    -    # JSON format is legacy and inefficient, only kept for backward compatibility
         JSON = 0
         CLOUDPICKLE = 1
     
     
     # Set the default serialization format for Arrow extension types.
    +# JSON is the default (safe). Cloudpickle is opt-in for backward compatibility.
     ARROW_EXTENSION_SERIALIZATION_FORMAT = _SerializationFormat(
    -    _SerializationFormat.JSON  # legacy
    -    if env_integer("RAY_DATA_ARROW_EXTENSION_SERIALIZATION_LEGACY_JSON_FORMAT", 0) == 1
    -    else _SerializationFormat.CLOUDPICKLE  # default
    +    _SerializationFormat.CLOUDPICKLE
    +    if env_integer("RAY_DATA_ARROW_EXTENSION_SERIALIZATION_CLOUDPICKLE", 0) == 1
    +    else _SerializationFormat.JSON
    +)
    +
    +_AUTOLOAD_CLOUDPICKLE_TENSOR_METADATA = (
    +    os.environ.get("RAY_DATA_AUTOLOAD_CLOUDPICKLE_TENSOR_METADATA", "0") == "1"
     )
     
     # Conditional imports for PyArrow features that are only available in newer versions
    @@ -125,18 +130,23 @@ def _extension_array_concat_supported() -> bool:
     
     
     def _deserialize_with_fallback(serialized: bytes, field_name: str = "data"):
    -    """Deserialize data with cloudpickle first, fallback to JSON."""
    +    """Deserialize extension type metadata from Parquet field metadata.
    +    Uses JSON only by default. cloudpickle deserialization is available as an
    +    opt-in for files written by Ray 2.49-2.54, but MUST NOT be used with
    +    untrusted Parquet files.
    +    """
         try:
    -        # Try cloudpickle first (new format)
    -        return cloudpickle.loads(serialized)
    -    except Exception:
    -        # Fallback to JSON format (legacy)
    -        try:
    -            return json.loads(serialized)
    -        except json.JSONDecodeError:
    -            raise ValueError(
    -                f"Unable to deserialize {field_name} from {type(serialized)}"
    -            )
    +        return json.loads(serialized)
    +    except (json.JSONDecodeError, UnicodeDecodeError, ValueError):
    +        if _AUTOLOAD_CLOUDPICKLE_TENSOR_METADATA:
    +            # Opt-in only: files written by Ray 2.49-2.54 used cloudpickle.
    +            # WARNING: Do not enable this for files from untrusted sources.
    +            return cloudpickle.loads(serialized)
    +        raise ValueError(
    +            f"Unable to deserialize {field_name}. If this file was written by "
    +            f"Ray 2.49-2.54, set RAY_DATA_AUTOLOAD_CLOUDPICKLE_TENSOR_METADATA=1 "
    +            f"(trusted sources only)."
    +        )
     
     
     @DeveloperAPI(stability="beta")
    @@ -418,7 +428,7 @@ def _coerce_np_datetime_to_pa_timestamp_precision(
     
     
     def _infer_pyarrow_type(
    -    column_values: Union[List[Any], np.ndarray]
    +    column_values: Union[List[Any], np.ndarray],
     ) -> Optional[pa.DataType]:
         """Infers target Pyarrow `DataType` based on the provided
         columnar values.
    @@ -500,7 +510,7 @@ def _len_gt_overflow_threshold(obj: Any) -> bool:
     
     
     def _try_infer_pa_timestamp_type(
    -    column_values: Union[List[Any], np.ndarray]
    +    column_values: Union[List[Any], np.ndarray],
     ) -> Optional[pa.DataType]:
         if isinstance(column_values, list) and len(column_values) > 0:
             # In case provided column values is a list of elements, this
    @@ -1321,8 +1331,7 @@ def from_numpy(
                     dtype.byteorder == "=" and sys.byteorder == "big"
                 ):
                     raise ValueError(
    -                    "Only little-endian string tensors are supported, "
    -                    f"but got: {dtype}"
    +                    f"Only little-endian string tensors are supported, but got: {dtype}"
                     )
                 pa_value_type = pa.binary(dtype.itemsize)
     
    
  • python/ray/data/tests/datasource/test_daft.py · +1 −18 · modified
    @@ -1,4 +1,3 @@
    -import os
     from unittest.mock import patch
     
     import pyarrow as pa
    @@ -8,26 +7,10 @@
     
     @pytest.fixture(scope="module")
     def ray_start(request):
    -    """Initialize Ray with proper serialization format."""
    -    # TODO: Remove this once Daft issue is fixed to default to Cloudpickle
    -    # serialization format.
    -    # Force the serialization format to JSON for this test.
    -    # Refer Daft issue https://github.com/Eventual-Inc/Daft/issues/4828
    -    # and Ray issue https://github.com/ray-project/ray/issues/54837
    -    # for more details.
    -
    -    # Set environment variable before importing ray
    -    os.environ["RAY_DATA_ARROW_EXTENSION_SERIALIZATION_LEGACY_JSON_FORMAT"] = "1"
    -
    +    """Initialize Ray for Daft tests."""
         import ray
    -    import ray.data._internal.tensor_extensions.arrow as arrow_module
    -    from ray.data._internal.tensor_extensions.arrow import _SerializationFormat
    -
    -    # Force the serialization format to JSON after import
    -    arrow_module.ARROW_EXTENSION_SERIALIZATION_FORMAT = _SerializationFormat.JSON
     
         try:
    -        # Set environment variable for Ray workers
             yield ray.init(
                 num_cpus=16,
             )
    

Vulnerability mechanics

AI mechanics synthesis has not run for this CVE yet.

References

7

News mentions

0

No linked articles in our index yet.