Label Studio allows Server-Side Request Forgery in the S3 Storage Endpoint
Description
Label Studio is an open source data labeling tool. Prior to version 1.16.0, Label Studio's S3 storage integration feature contains a Server-Side Request Forgery (SSRF) vulnerability in its endpoint configuration. When creating an S3 storage connection, the application allows users to specify a custom S3 endpoint URL via the s3_endpoint parameter. This endpoint URL is passed directly to the boto3 AWS SDK without proper validation or restrictions on the protocol or destination. The vulnerability allows an attacker to make the application send HTTP requests to arbitrary internal services by specifying them as the S3 endpoint. When the storage sync operation is triggered, the application attempts to make S3 API calls to the specified endpoint, effectively making HTTP requests to the target service and returning the response in error messages. This SSRF vulnerability enables attackers to bypass network segmentation and access internal services that should not be accessible from the external network. The vulnerability is particularly severe because error messages from failed requests contain the full response body, allowing data exfiltration from internal services. Version 1.16.0 contains a patch for the issue.
Affected packages
Versions sourced from the GitHub Security Advisory.
| Package | Affected versions | Patched versions |
|---|---|---|
| label-studio (PyPI) | < 1.16.0 | 1.16.0 |
Affected products (1)
- Range: < 1.16.0
Patches (1)
- 06a2b29c1208 — fix: LEAP-1805: revisit S3 exceptions behavior (#7015)
6 files changed · +147 −6
label_studio/core/settings/base.py+18 −0 modified@@ -713,6 +713,24 @@ def collect_versions_dummy(**kwargs): if CSRF_TRUSTED_ORIGINS: CSRF_TRUSTED_ORIGINS = CSRF_TRUSTED_ORIGINS.split(',') +# Custom S3 endpoints on these domains will get detailed error reporting +S3_TRUSTED_STORAGE_DOMAINS = get_env_list( + 'S3_TRUSTED_STORAGE_DOMAINS', + [ + 'amazonaws.com', + 'scw.cloud', + 'yandexcloud.net', + 'digitaloceanspaces.com', + 'orange-business.com', + 'computecanada.ca', + 'cloudflarestorage.com', + 'wasabisys.com', + 'oracle.com', + 'amazon.com', + 'appdomain.cloud', + ], +) + REAL_HOSTNAME = os.getenv('HOSTNAME') # we have to use getenv, because we don't use LABEL_STUDIO_ prefix GCS_CLOUD_STORAGE_FORCE_DEFAULT_CREDENTIALS = get_bool_env('GCS_CLOUD_STORAGE_FORCE_DEFAULT_CREDENTIALS', False) PUBLIC_API_DOCS = get_bool_env('PUBLIC_API_DOCS', False)
label_studio/io_storages/s3/models.py+11 −1 modified@@ -20,7 +20,7 @@ ImportStorageLink, ProjectStorageMixin, ) -from io_storages.s3.utils import get_client_and_resource, resolve_s3_url +from io_storages.s3.utils import catch_and_reraise_from_none, get_client_and_resource, resolve_s3_url from io_storages.utils import storage_can_resolve_bucket_url from tasks.models import Annotation from tasks.validation import ValidationError as TaskValidationError @@ -54,6 +54,7 @@ class S3StorageMixin(models.Model): region_name = models.TextField(_('region_name'), null=True, blank=True, help_text='AWS Region') s3_endpoint = models.TextField(_('s3_endpoint'), null=True, blank=True, help_text='S3 Endpoint') + @catch_and_reraise_from_none def get_client_and_resource(self): # s3 client initialization ~ 100 ms, for 30 tasks it's a 3 seconds, so we need to cache it cache_key = f'{self.aws_access_key_id}:{self.aws_secret_access_key}:{self.aws_session_token}:{self.region_name}:{self.s3_endpoint}' @@ -80,6 +81,7 @@ def get_client_and_bucket(self, validate_connection=True): self.validate_connection(client) return client, s3.Bucket(self.bucket) + @catch_and_reraise_from_none def validate_connection(self, client=None): logger.debug('validate_connection') if client is None: @@ -126,6 +128,7 @@ class S3ImportStorageBase(S3StorageMixin, ImportStorage): _('recursive scan'), default=False, help_text=_('Perform recursive scan over the bucket content') ) + @catch_and_reraise_from_none def iterkeys(self): client, bucket = self.get_client_and_bucket() if self.prefix: @@ -146,6 +149,7 @@ def iterkeys(self): continue yield key + @catch_and_reraise_from_none def scan_and_create_links(self): return self._scan_and_create_links(S3ImportStorageLink) @@ -157,6 +161,7 @@ def _get_validated_task(self, parsed_data, key): ) return parsed_data + @catch_and_reraise_from_none def get_data(self, key): uri = f'{self.url_scheme}://{self.bucket}/{key}' if self.use_blob_urls: @@ -174,12 +179,15 @@ def get_data(self, 
key): value = self._get_validated_task(value, key) return value + @catch_and_reraise_from_none def generate_http_url(self, url): return resolve_s3_url(url, self.get_client(), self.presign, expires_in=self.presign_ttl * 60) + @catch_and_reraise_from_none def can_resolve_url(self, url: Union[str, None]) -> bool: return storage_can_resolve_bucket_url(self, url) + @catch_and_reraise_from_none def get_blob_metadata(self, key): return AWS.get_blob_metadata( key, @@ -201,6 +209,7 @@ class Meta: class S3ExportStorage(S3StorageMixin, ExportStorage): + @catch_and_reraise_from_none def save_annotation(self, annotation): client, s3 = self.get_client_and_resource() logger.debug(f'Creating new object on {self.__class__.__name__} Storage {self} for annotation {annotation}') @@ -228,6 +237,7 @@ def save_annotation(self, annotation): # create link if everything ok S3ExportStorageLink.create(annotation, self) + @catch_and_reraise_from_none def delete_annotation(self, annotation): client, s3 = self.get_client_and_resource() logger.debug(f'Deleting object on {self.__class__.__name__} Storage {self} for annotation {annotation}')
label_studio/io_storages/s3/utils.py+34 −0 modified@@ -10,6 +10,7 @@ from botocore.exceptions import ClientError from core.utils.params import get_env from django.conf import settings +from tldextract import TLDExtract logger = logging.getLogger(__name__) @@ -135,3 +136,36 @@ def validate_pattern(cls, storage, pattern, glob_pattern=True): logger.debug(key + ' matches file pattern') return '' return 'No objects found matching the provided glob pattern' + + +class S3StorageError(Exception): + pass + + +# see https://github.com/john-kurkowski/tldextract?tab=readme-ov-file#note-about-caching +# prevents network call on first use +extractor = TLDExtract(suffix_list_urls=()) + + +def catch_and_reraise_from_none(func): + """ + For S3 storages - if s3_endpoint is not on a known domain, catch exception and + raise a new one with the previous context suppressed. See also: https://peps.python.org/pep-0409/ + """ + + def wrapper(self, *args, **kwargs): + try: + return func(self, *args, **kwargs) + except Exception as e: + if self.s3_endpoint and ( + domain := extractor.extract_urllib(urlparse(self.s3_endpoint)).registered_domain.lower() + ) not in [trusted_domain.lower() for trusted_domain in settings.S3_TRUSTED_STORAGE_DOMAINS]: + logger.error(f'Exception from unrecognized S3 domain: {e}', exc_info=True) + raise S3StorageError( + f'Debugging info is not available for s3 endpoints on domain: {domain}. ' + 'Please contact your Label Studio devops team if you require detailed error reporting for this domain.' + ) from None + else: + raise e + + return wrapper
label_studio/tests/io_storages/s3/test_utils.py+41 −0 added@@ -0,0 +1,41 @@ +from unittest.mock import patch + +import pytest +from django.test import override_settings +from io_storages.s3.utils import S3StorageError, catch_and_reraise_from_none + + +@override_settings(S3_TRUSTED_STORAGE_DOMAINS=['trusted-domain.com']) +def test_catch_and_reraise_from_none_with_untrusted_domain(): + class TestClass: + s3_endpoint = 'http://untrusted-domain.com' + + instance = TestClass() + + @catch_and_reraise_from_none + def function_to_test(self): + raise Exception('Original Exception') + + with patch('io_storages.s3.utils.extractor.extract_urllib') as mock_extract: + mock_extract.return_value.registered_domain = 'untrusted-domain.com' + with pytest.raises(S3StorageError) as excinfo: + function_to_test(instance) + assert 'Debugging info is not available for s3 endpoints on domain: untrusted-domain.com' in str(excinfo.value) + + +@override_settings(S3_TRUSTED_STORAGE_DOMAINS=['trusted-domain.com']) +def test_catch_and_reraise_from_none_with_trusted_domain(): + class TestClass: + s3_endpoint = 'http://trusted-domain.com' + + instance = TestClass() + + @catch_and_reraise_from_none + def function_to_test(self): + raise Exception('Original Exception') + + with patch('io_storages.s3.utils.extractor.extract_urllib') as mock_extract: + mock_extract.return_value.registered_domain = 'trusted-domain.com' + with pytest.raises(Exception) as excinfo: + function_to_test(instance) + assert 'Original Exception' in str(excinfo.value)
poetry.lock+42 −5 modified@@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand. [[package]] name = "annotated-types" @@ -1212,7 +1212,7 @@ version = "3.13.1" description = "A platform independent file lock." optional = false python-versions = ">=3.8" -groups = ["test"] +groups = ["main", "test"] markers = "python_version >= \"3.12\" or python_version <= \"3.11\"" files = [ {file = "filelock-3.13.1-py3-none-any.whl", hash = "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c"}, @@ -3175,7 +3175,6 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, - {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -3792,7 +3791,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = 
"PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -3998,6 +3996,22 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-file" +version = "2.1.0" +description = "File transport adapter for Requests" +optional = false +python-versions = "*" +groups = ["main"] +markers = "python_version >= \"3.12\" or python_version <= \"3.11\"" +files = [ + {file = "requests_file-2.1.0-py2.py3-none-any.whl", hash = "sha256:cf270de5a4c5874e84599fc5778303d496c10ae5e870bfa378818f35d21bda5c"}, + {file = "requests_file-2.1.0.tar.gz", hash = "sha256:0f549a3f3b0699415ac04d167e9cb39bccfb730cb832b4d20be3d9867356e658"}, +] + +[package.dependencies] +requests = ">=1.0.0" + [[package]] name = "requests-mock" version = "1.12.1" @@ -4597,6 +4611,29 @@ stevedore = ">=4,<5" [package.extras] dev = ["Faker", "allure-pytest", "bump2version", "colorlog", "coverage[toml]", "flask (>=2.2.3)", "flit (>=3.2,<4)", "fluent-logger", "itsdangerous", "mypy", "mypy-extensions", "pip-tools", "pre-commit", "py", "pygments", "pytest-cov", "pytest-xdist", "tox (>=4,<5)", "twine", "types-PyYAML", 
"types-requests", "types-setuptools", "wheel"] +[[package]] +name = "tldextract" +version = "5.1.3" +description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well." +optional = false +python-versions = ">=3.9" +groups = ["main"] +markers = "python_version >= \"3.12\" or python_version <= \"3.11\"" +files = [ + {file = "tldextract-5.1.3-py3-none-any.whl", hash = "sha256:78de310cc2ca018692de5ddf320f9d6bd7c5cf857d0fd4f2175f0cdf4440ea75"}, + {file = "tldextract-5.1.3.tar.gz", hash = "sha256:d43c7284c23f5dc8a42fd0fee2abede2ff74cc622674e4cb07f514ab3330c338"}, +] + +[package.dependencies] +filelock = ">=3.0.8" +idna = "*" +requests = ">=2.1.0" +requests-file = ">=1.4" + +[package.extras] +release = ["build", "twine"] +testing = ["mypy", "pytest", "pytest-gitignore", "pytest-mock", "responses", "ruff", "syrupy", "tox", "tox-uv", "types-filelock", "types-requests"] + [[package]] name = "toml" version = "0.10.2" @@ -5034,4 +5071,4 @@ uwsgi = ["pyuwsgi", "uwsgitop"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4" -content-hash = "d2cdf4a39058b618cb91b25b0b58584521858cf240dad8a6b382f4c2750d0633" +content-hash = "c995db7696980df7683659e09ae771e9961854a0b345dbd40c9cd1eff632d7cb"
pyproject.toml+1 −0 modified@@ -206,6 +206,7 @@ django-csp = "3.7" openai = "^1.10.0" django-migration-linter = "^5.1.0" setuptools = ">=75.4.0" +tldextract = ">=5.1.3" # Humansignal repo dependencies label-studio-sdk = {url = "https://github.com/HumanSignal/label-studio-sdk/archive/09995cf0c72398322af949bb13de034a7bbb785f.zip"}
Vulnerability mechanics
Generated by null/stub on May 9, 2026. Inputs: CWE entries + fix-commit diffs from this CVE's patches. Citations validated against bundle.
References (4)
- github.com/advisories/GHSA-m238-fmcw-wh58 [ghsa, ADVISORY]
- nvd.nist.gov/vuln/detail/CVE-2025-25297 [ghsa, ADVISORY]
- github.com/HumanSignal/label-studio/commit/06a2b29c1208e1878ccae66e6b84c8b24598fa79 [ghsa, x_refsource_MISC, WEB]
- github.com/HumanSignal/label-studio/security/advisories/GHSA-m238-fmcw-wh58 [ghsa, x_refsource_CONFIRM, WEB]
News mentions (0)
No linked articles in our index yet.